In [2]:
import warnings
warnings.filterwarnings('ignore')

from gensim.models.wrappers import FastText
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle

import pandas as pd
import numpy as np
from enum import Enum
import itertools

## Load the dirty dataset (the clean dataset is loaded in the verification section below)

In [3]:
# Read every column with dtype=object so that identifier-like fields such as
# ZipCode and PhoneNumber are not parsed as numbers; later cells compare them
# against the string "0".
df = pd.read_csv('HospitalErrors.csv', encoding='utf8', dtype=object)

In [4]:
df.head(2)

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,ZipCode,CountyName,PhoneNumber,HospitalOwner,Condition,Sample,label
0,10046,RIVERVIEW REGIONAL MEDICAL CENTER,600 SOUTH THIRD STREET,GADSDEN,35901,ETOWAH,2565435200,,Heart Failure,268 patients,0
1,10008,CRENSHAW COMMUNITY HOSPITAL,101 HOSPITAL CIRCLE,LUVERNE,36049,CRENSHAW,3343353374,,Heart Failure,8 patients,0


In [5]:
len(df)

742

In [6]:
sum(df.isnull().values.ravel())

206

In [7]:
# Keep only the rows in which at least one column is null.
dfEmpty = df.loc[df.isna().any(axis='columns')]

In [8]:
# Rows where ZipCode, PhoneNumber or ProviderNumber holds the literal string
# "0". The row-preparation loop further down treats a "0" cell the same way as
# a null cell, i.e. as the value to impute.
dfNone = df.query('ZipCode == "0" or PhoneNumber == "0" or ProviderNumber == "0"')

In [9]:
# Merge the null rows with the "0"-placeholder rows. A row that qualifies on
# both counts, or a re-run of this cell, would otherwise list the same source
# row more than once, so only the first occurrence of each index label is kept
# (the index comes straight from read_csv, one label per source row).
dfEmpty = pd.concat([dfEmpty, dfNone])
dfEmpty = dfEmpty[~dfEmpty.index.duplicated(keep='first')]

In [10]:
len(dfEmpty)

287

In [11]:
# Remove the 'label' column. Rebinding the name (rather than inplace=True) and
# errors='ignore' keep this cell safe to re-run once the column is already gone.
dfEmpty = dfEmpty.drop(columns=['label'], errors='ignore')

In [12]:
dfEmpty.head(2)

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,ZipCode,CountyName,PhoneNumber,HospitalOwner,Condition,Sample
0,10046,RIVERVIEW REGIONAL MEDICAL CENTER,600 SOUTH THIRD STREET,GADSDEN,35901,ETOWAH,2565435200,,Heart Failure,268 patients
1,10008,CRENSHAW COMMUNITY HOSPITAL,101 HOSPITAL CIRCLE,LUVERNE,36049,CRENSHAW,3343353374,,Heart Failure,8 patients


## Word2Vec Model

In [13]:
# Saved embedding object; impute() later calls .wv.most_similar() on it to
# find values that are close to the values present in a row.
path = 'HospitalMultiAttributeWord2Vec.w2v'
word2vecModel = KeyedVectors.load(path)

In [14]:
# Column identifiers of the hospital table, numbered from 1 in column order.
Attributes = Enum(
    'Attributes',
    [
        'ProviderNumber',
        'HospitalName',
        'Address1',
        'City',
        'ZipCode',
        'CountyName',
        'PhoneNumber',
        'HospitalOwner',
        'Condition',
        'Sample',
    ],
    start=1,
)

In [15]:
# Column name -> enum member. Every member is named after its column, so the
# lookup table is derived from the enum, in declaration order.
attributeMapping = {attribute.name: attribute for attribute in Attributes}

In [16]:
attributeMapping

{'Address1': <Attributes.Address1: 3>,
 'City': <Attributes.City: 4>,
 'Condition': <Attributes.Condition: 9>,
 'CountyName': <Attributes.CountyName: 6>,
 'HospitalName': <Attributes.HospitalName: 2>,
 'HospitalOwner': <Attributes.HospitalOwner: 8>,
 'PhoneNumber': <Attributes.PhoneNumber: 7>,
 'ProviderNumber': <Attributes.ProviderNumber: 1>,
 'Sample': <Attributes.Sample: 10>,
 'ZipCode': <Attributes.ZipCode: 5>}

In [17]:
# Reverse lookup: enum member -> column name.
inverseMapping = dict(
    (attribute, columnName) for columnName, attribute in attributeMapping.items()
)

In [18]:
inverseMapping

{<Attributes.Address1: 3>: 'Address1',
 <Attributes.City: 4>: 'City',
 <Attributes.Condition: 9>: 'Condition',
 <Attributes.CountyName: 6>: 'CountyName',
 <Attributes.HospitalName: 2>: 'HospitalName',
 <Attributes.HospitalOwner: 8>: 'HospitalOwner',
 <Attributes.PhoneNumber: 7>: 'PhoneNumber',
 <Attributes.ProviderNumber: 1>: 'ProviderNumber',
 <Attributes.Sample: 10>: 'Sample',
 <Attributes.ZipCode: 5>: 'ZipCode'}

Load Model

In [19]:
# Saved Keras classifier; predictAttribute() feeds it padded token sequences
# and uses the argmax of its output as the attribute class index.
model = load_model('HospitalMultiAttributeClassifier.h5')

Load tokenizer

In [20]:
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only load a tokenizer file that comes from a trusted source.
with open('HospitalMultiAttributeClassifierTokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [21]:
# Classifier output index -> column name (the names are listed alphabetically).
mapping = dict(enumerate([
    'Address1',
    'City',
    'Condition',
    'CountyName',
    'HospitalName',
    'HospitalOwner',
    'PhoneNumber',
    'ProviderNumber',
    'Sample',
    'ZipCode',
]))

## Prediction pipeline

In [22]:
def impute(word2vecModel, model, tokenizer, rowWithMissingValue, missingType, topN=10):
    """
    Returns candidate values for the missing attribute of a row.

    For every value present in the row, the topN nearest neighbours in the
    embedding are fetched; a neighbour is kept when the classifier assigns it
    the same attribute type as the missing one.

    Parameters
    ----------
    word2vecModel : embedding object queried through ``.wv.most_similar``
    model, tokenizer : classifier and tokenizer handed to ``predictAttribute``
    rowWithMissingValue : iterable with the values that are present in the row
    missingType : attribute type a candidate must be classified as
    topN : number of neighbours requested per present value

    Returns
    -------
    A view of ``(candidate, similarity)`` pairs; for a candidate reached from
    several row values, the highest similarity is kept.

    Only the KeyError raised by the neighbour lookup (a value the embedding
    does not know) is skipped; errors raised while classifying a candidate are
    no longer swallowed.
    """
    output = dict()
    # Classifier verdict per candidate, so each candidate is classified once.
    predictedTypes = dict()
    for value in rowWithMissingValue:
        try:
            results = word2vecModel.wv.most_similar(value, topn=topN)
        except KeyError:
            # The value is unknown to the embedding: nothing to suggest from it.
            continue
        for match, confidence in results:
            # Cheap check first: a candidate already stored with an equal or
            # better score cannot change the result, so skip the classifier.
            if match in output and confidence <= output[match]:
                continue
            if match not in predictedTypes:
                predictedTypes[match] = predictAttribute(model, tokenizer, match)
            # Predicted type to be equal to the missing value
            if predictedTypes[match] == missingType:
                output[match] = confidence
    return output.items()

In [23]:
def predictAttribute(model, tokenizer, value, maxLen=45):
    """
    Classifies value parameter as its corresponding attribute

    Parameters
    ----------
    model : classifier exposing ``predict``
    tokenizer : tokenizer exposing ``texts_to_sequences``
    value : the single text value to classify
    maxLen : length every token sequence is padded to. Defaults to 45, the
        value this notebook has always used (presumably the input length the
        classifier was trained with - confirm before changing it).

    Returns
    -------
    The ``Attributes`` member whose class index has the highest score.
    """
    # The tokenizer works on a batch of texts, so wrap the single value.
    sequences = tokenizer.texts_to_sequences([value])
    testData = pad_sequences(sequences, maxlen=maxLen)
    predictions = model.predict(testData)
    # Only one text was submitted, hence only the first prediction row is read.
    predictedIndex = int(np.argmax(predictions[0]))
    return attributeMapping[mapping[predictedIndex]]

Convert each row to a list

In [24]:
# One plain Python list per row of dfEmpty, cells in column order.
nullRows = dfEmpty.values.tolist()

Create a mapping of the form **row: missingAttribute**

In [55]:
# Build one single-entry dict per row: {tuple of present values: missing attribute}.
rows = []
columnNames = ['ProviderNumber', 'HospitalName', 'Address1', 'City', 'ZipCode', 'CountyName', 'PhoneNumber','HospitalOwner','Condition','Sample']
for row in nullRows:
    presentValues = []
    missingAttribute = None
    for position, cell in enumerate(row):
        # A cell is missing when it renders as 'nan' or as the placeholder '0'.
        if str(cell) == 'nan' or str(cell) == '0':
            missingAttribute = attributeMapping[columnNames[position]]
        else:
            presentValues.append(cell)
    if missingAttribute is None:
        # Nothing to impute in this row: show it instead of queueing it.
        print(row)
    else:
        rows.append({tuple(presentValues): missingAttribute})

## Perform imputation

In [56]:
def getImputedValue(missingRow, attributeType, topN=25):
    """
    Returns the best imputation candidate for a row, or None.

    Parameters
    ----------
    missingRow : values that are present in the row
    attributeType : attribute type of the value to impute
    topN : neighbours requested per present value (25 by default, the value
        this notebook has always used)

    Returns
    -------
    The ``(candidate, similarity)`` pair with the highest similarity, or None
    when ``impute`` yields no candidate.
    """
    results = impute(word2vecModel, model, tokenizer, missingRow, attributeType, topN)
    # Only the top-scoring pair is needed, so take the maximum instead of
    # sorting; on ties the first pair wins, as with a stable descending sort.
    return max(results, key=lambda candidate: candidate[1], default=None)

### Build the verification pipeline

In [57]:
# Ground-truth table; the verification loop queries it to look up the true
# value of each missing cell. Read with dtype='object' like the dirty dataset.
dfClean = pd.read_csv('truth_values_1100_tuples.csv', encoding='utf8', dtype='object')

In [58]:
len(dfClean)

1100

In [59]:
# De-duplicated copy, used for inspection only in the next cells.
# NOTE(review): the verification loop queries dfClean, not dfUnique.
dfUnique = dfClean.drop_duplicates()

In [60]:
len(dfUnique)

824

In [61]:
dfUnique.head(2)

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,ZipCode,CountyName,PhoneNumber,HospitalOwner,Condition,Sample
0,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100,Voluntary non-profit - Private,Surgical Infection Prevention,0 patients
8,10019,HELEN KELLER MEMORIAL HOSPITAL,1300 SOUTH MONTGOMERY AVENUE,SHEFFIELD,35660,JEFFERSON,2563864556,Government - Hospital District or Authority,Heart Attack,33 patients


In [62]:
# One DataFrame.query template per attribute that can be missing: an equality
# test on every *other* column, in column order, each with a "{}" placeholder
# that is later filled with the values present in the row. The templates are
# generated from the enum (whose member names are the column names) instead of
# being written out ten times by hand.
# NOTE: values are interpolated verbatim between double quotes, so a value
# that itself contains a double quote would produce an invalid query.
queryMapping = {
    missing: ' and '.join(
        '{}=="{{}}"'.format(other.name)
        for other in Attributes
        if other is not missing
    )
    for missing in Attributes
}


In [63]:
# Evaluation tallies: hit counter, miss counter, and the (actual, predicted)
# pairs of every miss.
correct = inCorrect = 0
incorrectPredictions = list()

In [64]:
# Start from clean tallies so that re-running this cell does not double count.
correct = 0
inCorrect = 0
incorrectPredictions = []

for row in rows:
    # Each entry is a single-item dict: {tuple of present values: missing attribute}.
    missingRow, attribute = next(iter(row.items()))

    missingRow = [i.strip() for i in missingRow]
    # Run the query
    query = (queryMapping[attribute]).format(*missingRow)
    outputDf = dfClean.query(query)
    truthValues = outputDf[inverseMapping[attribute]]
    # Read the raw cell instead of Series.to_string(): to_string is a display
    # formatter, so its text is subject to formatting and an empty selection is
    # rendered as a description of the empty Series rather than as a value.
    truthFound = len(truthValues) > 0
    # Kept as text so that downstream string checks on the pairs keep working.
    actual = str(truthValues.iloc[0]) if truthFound else ''

    predicted = getImputedValue(missingRow, attribute)
    predictedValue = predicted[0] if predicted else None
    if truthFound and predicted and actual == predictedValue:
        correct += 1
    else:
        inCorrect += 1
        incorrectPredictions.append((actual, predictedValue))
        

In [65]:
correct

206

In [66]:
inCorrect

81

In [67]:
incorrectPredictions

[('Proprietary', None),
 ('229 patients', '357 patients'),
 ('Voluntary non-profit - Private', None),
 ('Proprietary', None),
 ('19 patients', '38 patients'),
 ('Voluntary non-profit - Private', None),
 ('Proprietary', None),
 ('Voluntary non-profit - Other', None),
 ('CULLMAN', 'FRANKLIN'),
 ('Government - Hospital District or Authority', None),
 ('Voluntary non-profit - Private', None),
 ('46 patients', '179 patients'),
 ('Surgical Infection Prevention', None),
 ('2 patients', '38 patients'),
 ('Proprietary', None),
 ('77 patients', '141 patients'),
 ('3 patients', '97 patients'),
 ('3 patients', '205 patients'),
 ('Government - Hospital District or Authority', 'Government - Local'),
 ('106 patients', '521 patients'),
 ('0 patients', '137 patients'),
 ('89 patients', '392 patients'),
 ('239 patients', '404 patients'),
 ('CULLMAN', 'FRANKLIN'),
 ('Proprietary', None),
 ('Voluntary non-profit - Private', 'Government - Local'),
 ('172 patients', '120 patients'),
 ('Heart Attack', None),

In [68]:
# Share of rows whose imputed value equals the ground truth. The numerator is
# converted to float before dividing, and an empty evaluation yields NaN
# instead of raising ZeroDivisionError.
evaluated = correct + inCorrect
accuracy = float(correct) / evaluated if evaluated else float('nan')

In [69]:
accuracy

0.7177700348432056

In [70]:
# Number of wrong imputations whose true value contains 'patients'.
count = sum(1 for entry in incorrectPredictions if 'patients' in entry[0])

In [71]:
count

31

In [72]:
len(incorrectPredictions)

81

In [73]:
31/81.0

0.38271604938271603