In [1]:
import warnings
warnings.filterwarnings('ignore')

from gensim.models.wrappers import FastText
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle

import pandas as pd
import numpy as np
from enum import Enum
import itertools

Using TensorFlow backend.


## Load the dirty dataset (the clean ground-truth dataset is loaded further below, for verification)

In [2]:
# Read every column as text (dtype=object) so that values such as ZipCode can be
# compared against string literals like "0" further down.
df = pd.read_csv(
    'HospitalErrors.csv',
    encoding='utf8',
    dtype=object,
)

In [3]:
df.head(2)

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,ZipCode,CountyName,PhoneNumber,HospitalOwner,Condition,Sample,label
0,10046,RIVERVIEW REGIONAL MEDICAL CENTER,600 SOUTH THIRD STREET,GADSDEN,35901,ETOWAH,2565435200,,Heart Failure,268 patients,0
1,10008,CRENSHAW COMMUNITY HOSPITAL,101 HOSPITAL CIRCLE,LUVERNE,36049,CRENSHAW,3343353374,,Heart Failure,8 patients,0


In [4]:
len(df)

742

In [5]:
# Total number of missing cells in the whole table.
df.isnull().values.sum()

206

In [12]:
# Keep only the rows that contain at least one missing (NaN) cell.
dfEmpty = df.loc[df.isnull().any(axis='columns')]

In [13]:
# Rows whose ZipCode, PhoneNumber or ProviderNumber is the literal string "0".
# NOTE(review): "0" is presumably a placeholder for a missing value in these columns — confirm.
dfNone = df.query('ZipCode == "0" or PhoneNumber == "0" or ProviderNumber == "0"')

In [14]:
# Append only the "0"-placeholder rows that are not already in dfEmpty. A row that has
# both a NaN and a "0" would otherwise be listed (and later imputed) twice, and
# re-running this cell would keep growing dfEmpty.
dfEmpty = pd.concat([dfEmpty, dfNone.loc[~dfNone.index.isin(dfEmpty.index)]])

In [15]:
len(dfEmpty)

287

In [16]:
# Rebind instead of mutating in place, and tolerate an already-removed column, so the
# cell can be re-run without raising KeyError. `columns=` already names the axis.
dfEmpty = dfEmpty.drop(columns=['label'], errors='ignore')

In [17]:
dfEmpty.head(2)

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,ZipCode,CountyName,PhoneNumber,HospitalOwner,Condition,Sample
0,10046,RIVERVIEW REGIONAL MEDICAL CENTER,600 SOUTH THIRD STREET,GADSDEN,35901,ETOWAH,2565435200,,Heart Failure,268 patients
1,10008,CRENSHAW COMMUNITY HOSPITAL,101 HOSPITAL CIRCLE,LUVERNE,36049,CRENSHAW,3343353374,,Heart Failure,8 patients


## Word2Vec Model

In [18]:
# Load the saved embedding from disk.
# NOTE(review): confirm whether this file holds bare keyed vectors or a full Word2Vec
# model; downstream code looks for a `.wv` attribute on the loaded object.
path = 'HospitalMultiAttributeWord2Vec.w2v'
word2vecModel = KeyedVectors.load(path)

In [19]:
# One enum member per attribute (column) of the hospital table. The functional Enum
# API numbers the members from 1 in the order listed, i.e. ProviderNumber = 1 ... Sample = 10.
Attributes = Enum(
    'Attributes',
    [
        'ProviderNumber',
        'HospitalName',
        'Address1',
        'City',
        'ZipCode',
        'CountyName',
        'PhoneNumber',
        'HospitalOwner',
        'Condition',
        'Sample',
    ],
)

In [20]:
# Column name -> Attributes member, built directly from the enum so that the two
# can never drift apart (iteration follows the enum's definition order).
attributeMapping = {attribute.name: attribute for attribute in Attributes}

In [21]:
attributeMapping

{'Address1': <Attributes.Address1: 3>,
 'City': <Attributes.City: 4>,
 'Condition': <Attributes.Condition: 9>,
 'CountyName': <Attributes.CountyName: 6>,
 'HospitalName': <Attributes.HospitalName: 2>,
 'HospitalOwner': <Attributes.HospitalOwner: 8>,
 'PhoneNumber': <Attributes.PhoneNumber: 7>,
 'ProviderNumber': <Attributes.ProviderNumber: 1>,
 'Sample': <Attributes.Sample: 10>,
 'ZipCode': <Attributes.ZipCode: 5>}

Load Model

In [22]:
# Trained Keras classifier; predictAttribute() calls its predict() to score a value
# against the attribute classes listed in `mapping`.
model = load_model('HospitalMultiAttributeClassifier.h5')

Load tokenizer

In [23]:
# Restore the pickled tokenizer (presumably the one fitted when the classifier was
# trained — confirm).
# NOTE: pickle.load can execute arbitrary code; only load pickle files from a trusted source.
with open('HospitalMultiAttributeClassifierTokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [24]:
# Classifier output index -> attribute name.
# NOTE(review): the order is presumably the label order used at training time — confirm.
mapping = dict(enumerate([
    'Address1',
    'City',
    'Condition',
    'CountyName',
    'HospitalName',
    'HospitalOwner',
    'PhoneNumber',
    'ProviderNumber',
    'Sample',
    'ZipCode',
]))

## Prediction pipeline

In [25]:
def impute(word2vecModel, model, tokenizer, rowWithMissingValue, missingType, topN=10):
    """
    Returns the closest matches for the missing attribute value.

    For every known value of the row, the ``topN`` most similar entries of the
    embedding are looked up. An entry is kept as a candidate when
    ``predictAttribute`` classifies it as ``missingType``; if the same candidate
    is reached from several row values, its highest similarity is kept.

    Parameters:
        word2vecModel: embedding to query; either an object that offers
            ``most_similar`` itself or a model that exposes it under ``.wv``.
        model, tokenizer: passed through unchanged to ``predictAttribute``.
        rowWithMissingValue: iterable with the known values of the row.
        missingType: attribute type (as returned by ``predictAttribute``) that
            a candidate must have.
        topN: number of neighbours requested per row value.

    Returns:
        A dict items view of ``(candidate, similarity)`` pairs; empty when no
        candidate of the requested type was found.
    """
    # Accept both a full model (vectors under ``.wv``) and bare keyed vectors.
    vectors = getattr(word2vecModel, 'wv', word2vecModel)
    output = dict()
    # candidate -> predicted attribute, so every candidate is classified only once
    predictedTypes = dict()
    for value in rowWithMissingValue:
        try:
            results = vectors.most_similar(value, topn=topN)
        except KeyError:
            # The value is not part of the embedding vocabulary; it cannot contribute.
            # Only the lookup is guarded, so errors raised by the classifier are not hidden.
            continue
        for match, confidence in results:
            if match not in predictedTypes:
                predictedTypes[match] = predictAttribute(model, tokenizer, match)
            # Predicted type has to be equal to the type of the missing value
            if predictedTypes[match] != missingType:
                continue
            if match not in output or confidence > output[match]:
                output[match] = confidence
    return output.items()

In [26]:
def predictAttribute(model, tokenizer, value):
    """
    Predict which attribute a single value belongs to.

    The value is turned into a token sequence, padded, and scored by ``model``;
    the index of the highest score is translated through ``mapping`` (index to
    attribute name) and ``attributeMapping`` (name to ``Attributes`` member).
    """
    encoded = tokenizer.texts_to_sequences([value])
    # maxlen=200 is presumably the input length the classifier was trained with — TODO confirm
    scores = model.predict(pad_sequences(encoded, maxlen=200))[0]
    attributeName = mapping[np.argmax(scores)]
    return attributeMapping[attributeName]

Convert each row to a list

In [27]:
# One inner list per row of dfEmpty, holding that row's cell values in column order.
nullRows = dfEmpty.values.tolist()

Create a mapping of the form **row: missingAttribute**

In [28]:
rows = []
columnNames = ['ProviderNumber', 'HospitalName', 'Address1', 'City', 'ZipCode', 'CountyName', 'PhoneNumber','HospitalOwner','Condition','Sample']
# Columns in which the literal "0" marks a missing value. Rows carrying such a "0" were
# selected into the data on purpose, so the "0" has to be treated like NaN here; otherwise
# these rows end up with no missing attribute and with "0" listed as a known value.
placeholderColumns = {'ZipCode', 'PhoneNumber', 'ProviderNumber'}
for row in nullRows:
    missingAttribute = None
    knownValues = []
    for columnName, value in zip(columnNames, row):
        isPlaceholder = columnName in placeholderColumns and value == '0'
        if pd.isnull(value) or isPlaceholder:
            # Only one attribute is recorded per row; with several gaps the last one wins.
            missingAttribute = attributeMapping[columnName]
        else:
            knownValues.append(value)
    # Single-entry dict: tuple of the known values -> attribute that has to be imputed
    rows.append({tuple(knownValues): missingAttribute})

In [29]:
rows[2]

{('10033',
  'UNIVERSITY OF ALABAMA HOSPITAL',
  '619 SOUTH 19TH STREET',
  'BIRMINGHAM',
  '35233',
  'JEFFERSON',
  '2059344011',
  'Government - State',
  'Heart Attack'): <Attributes.Sample: 10>}

## Perform imputation

In [36]:
def getImputedValue(missingRow, attributeType, topN=25):
    """
    Return the best candidate for the missing attribute of a row.

    Parameters:
        missingRow: the known values of the row.
        attributeType: the attribute type that has to be imputed.
        topN: number of neighbours requested per known value (default 25).

    Returns:
        The ``(candidate, similarity)`` pair with the highest similarity, or
        ``None`` when ``impute`` found no candidate.
    """
    results = impute(word2vecModel, model, tokenizer, missingRow, attributeType, topN)
    # max() returns the first pair with the highest similarity, which is the pair a
    # stable descending sort would put first; default covers the empty case.
    return max(results, key=lambda candidate: candidate[1], default=None)

In [37]:
# Run the pipeline on the first 20 rows and print the best candidate for each
# (None is printed when no candidate was found).
for row in rows[:20]:
    # every entry of `rows` is a single-item dict: known values -> missing attribute
    missingRow, attribute = next(iter(row.items()))
    predicted = getImputedValue(missingRow, attribute)
    print(predicted)

None
('Government - Federal', 0.5728280544281006)
('357 patients', 0.7957759499549866)
None
('GADSDEN', 0.7519332766532898)
('PRATTVILLE BAPTIST HOSPITAL', 0.7754225134849548)
('101 SIVLEY RD', 0.721992015838623)
('UNIV OF SOUTH ALABAMA MEDICAL CENTER', 0.7747100591659546)
None
('UNIV OF SOUTH ALABAMA MEDICAL CENTER', 0.7747100591659546)
('126 HOSPITAL AVE', 0.7268096208572388)
('38 patients', 0.5959551334381104)
('MARION REGIONAL MEDICAL CENTER', 0.7534360885620117)
None
None
None
('COFFEE', 0.6083629727363586)
('MARSHALL MEDICAL CENTER NORTH', 0.7338075637817383)
('FRANKLIN', 0.45614656805992126)
('OPELIKA', 0.7578517198562622)


In [54]:
rows[1]

{('10008',
  'CRENSHAW COMMUNITY HOSPITAL',
  '101 HOSPITAL CIRCLE',
  'LUVERNE',
  '36049',
  'CRENSHAW',
  '3343353374',
  'Heart Failure',
  '8 patients'): <Attributes.HospitalOwner: 8>}

In [83]:
# Ground-truth table; every column is read as text, like the dirty dataset,
# so values can be compared as strings.
dfClean = pd.read_csv(
    'truth_values_1100_tuples.csv',
    encoding='utf8',
    dtype='object',
)

In [84]:
# Rebind instead of mutating in place, and tolerate an already-removed column, so the
# cell can be re-run without raising KeyError. `columns=` already names the axis.
dfClean = dfClean.drop(columns=['label'], errors='ignore')

In [85]:
len(dfClean)

1100

In [86]:
dfClean.head(2)

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,ZipCode,CountyName,PhoneNumber,HospitalOwner,Condition,Sample
0,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100,Voluntary non-profit - Private,Surgical Infection Prevention,Empty
1,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100,Voluntary non-profit - Private,Surgical Infection Prevention,Empty


In [87]:
# Look up the ground-truth record for rows[1]. HospitalOwner is the attribute being
# imputed for that row, so it is deliberately left out of the filter.
dfClean.query('ProviderNumber == "10008" and HospitalName == "CRENSHAW COMMUNITY HOSPITAL"  and Address1 == "101 HOSPITAL CIRCLE"  and City == "LUVERNE"  and ZipCode == "36049"  and CountyName == "CRENSHAW"  and PhoneNumber == "3343353374"  and Condition == "Heart Failure"  and Sample == "8 patients"')

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,ZipCode,CountyName,PhoneNumber,HospitalOwner,Condition,Sample
128,10008,CRENSHAW COMMUNITY HOSPITAL,101 HOSPITAL CIRCLE,LUVERNE,36049,CRENSHAW,3343353374,Government - Federal,Heart Failure,8 patients


In [77]:
rows[4]

{('10040',
  'GADSDEN REGIONAL MEDICAL CENTER',
  '1007 GOODYEAR AVENUE',
  '35903',
  'ETOWAH',
  '2564944000',
  'Proprietary',
  'Surgical Infection Prevention',
  '564 patients'): <Attributes.City: 4>}

In [89]:
# Look up the ground-truth record for rows[4]. City is the attribute being imputed
# for that row, so it is deliberately left out of the filter.
dfClean.query('ProviderNumber == "10040" and HospitalName == "GADSDEN REGIONAL MEDICAL CENTER"  and Address1 == "1007 GOODYEAR AVENUE" and ZipCode == "35903"  and CountyName == "ETOWAH" and HospitalOwner=="Proprietary"  and PhoneNumber == "2564944000"  and Condition == "Surgical Infection Prevention"  and Sample == "564 patients"')

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,ZipCode,CountyName,PhoneNumber,HospitalOwner,Condition,Sample
821,10040,GADSDEN REGIONAL MEDICAL CENTER,1007 GOODYEAR AVENUE,GADSDEN,35903,ETOWAH,2564944000,Proprietary,Surgical Infection Prevention,564 patients
