# Record Embedding

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import os
import calendar
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from keras.models import load_model
import h5py
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn import metrics
from gensim.models import KeyedVectors


import gensim
from gensim.models import FastText

%matplotlib inline

Using TensorFlow backend.


## Read data
Background on fastText, the embedding library behind the similarity model used below: https://fasttext.cc/

In [2]:
# Ground-truth records. dtype=object keeps every value exactly as read from the
# file (no numeric parsing of ZIP codes, phone numbers, ...).
df_truth = pd.read_csv('clean_hosp_dataset.csv', dtype=object, encoding='utf8')

# Records containing errors; the 'label' column is removed so only the
# attribute columns remain.
df_dirty = (
    pd.read_csv('HospitalErrorsWithoutNan.csv', dtype=object, encoding='utf8')
    .drop(columns=['label'])
)

In [3]:
df_truth.head(2)

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,ZipCode,CountyName,PhoneNumber,HospitalOwner,Condition,Sample
0,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100,Voluntary non-profit - Private,Surgical Infection Prevention,0 patients
1,10019,HELEN KELLER MEMORIAL HOSPITAL,1300 SOUTH MONTGOMERY AVENUE,SHEFFIELD,35660,JEFFERSON,2563864556,Government - Hospital District or Authority,Heart Attack,33 patients


In [4]:
df_dirty.head(2)

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,ZipCode,CountyName,PhoneNumber,HospitalOwner,Condition,Sample
0,10011,ST VINCENP'S EAST,50 MEDICAL PARK EAST DRIVE,BIRMINGHAM,35235,JEFFERSON,2058383122,Voluntary non-profit - Private,Pneumonia,69 patients
1,10027,ELBA GENERAL HOSPITAL,987 DRAYTON STREET,ELBA,36323,COFFEE,3348972257,Voluntary non-profit - Other,Surgical Infection Prevention,0 patients


## Preprocessing

In [5]:
# Both frames as plain nested lists: one inner list of cell values per record.
combined_hosp, combined_dirty = (
    frame.values.tolist() for frame in (df_truth, df_dirty)
)

In [6]:
# Attribute names in file order; later cells index this array with a cell's
# position inside a row list to recover the attribute name.
columns = df_truth.columns.values

In [7]:
# Set of values observed in the clean dataset for each attribute. A dirty cell
# whose value is missing from its attribute's set is treated as an error.
(
    ProviderNumberSet,
    HospitalNameSet,
    AddressSet,
    CitySet,
    ZipCodeSet,
    CountyNameSet,
    PhoneNumberSet,
    HospitalOwnerSet,
    ConditionSet,
    SampleSet,
) = (
    set(df_truth[attributeName].tolist())
    for attributeName in (
        'ProviderNumber',
        'HospitalName',
        'Address1',
        'City',
        'ZipCode',
        'CountyName',
        'PhoneNumber',
        'HospitalOwner',
        'Condition',
        'Sample',
    )
)

In [8]:
# Per-attribute value sets, listed in the same order as the columns of a row.
# correctCell looks a set up by the cell's position, so this order must not change.
combinedSet = [
    ProviderNumberSet,
    HospitalNameSet,
    AddressSet,
    CitySet,
    ZipCodeSet,
    CountyNameSet,
    PhoneNumberSet,
    HospitalOwnerSet,
    ConditionSet,
    SampleSet,
]

In [9]:
columns

array(['ProviderNumber', 'HospitalName', 'Address1', 'City', 'ZipCode',
       'CountyName', 'PhoneNumber', 'HospitalOwner', 'Condition',
       'Sample'], dtype=object)

## Model

In [10]:
# Saved embedding vectors; queried below with `most_similar` to find the values
# closest to an erroneous cell value.
path = 'HospitalMultiAttributeFastText.w2v'
fastTextModel = KeyedVectors.load(path)

In [11]:
# Keras classifier and the tokenizer used to encode text for it; together they
# predict which attribute a value belongs to (see predictAttribute).
model = load_model('HospitalMultiAttributeClassifier.h5')
# NOTE(review): pickle.load can execute arbitrary code contained in the file —
# only load a tokenizer pickle that comes from a trusted source.
with open('HospitalMultiAttributeClassifierTokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [12]:
# Class index -> attribute name, with attribute names in alphabetical order.
# NOTE(review): presumably mirrors the label encoding the classifier was trained
# with — confirm against the training notebook.
mapping = {
    classIndex: attributeName
    for classIndex, attributeName in enumerate(sorted(df_truth.columns.values))
}

In [13]:
mapping

{0: 'Address1',
 1: 'City',
 2: 'Condition',
 3: 'CountyName',
 4: 'HospitalName',
 5: 'HospitalOwner',
 6: 'PhoneNumber',
 7: 'ProviderNumber',
 8: 'Sample',
 9: 'ZipCode'}

## Correction Pipeline

In [14]:
def predictAttribute(model, tokenizer, value):
    """
    Return the name of the attribute the classifier assigns to `value`.

    The value is tokenized, padded to a length of 45, scored by `model`, and
    the highest-scoring class index is translated to an attribute name via the
    module-level `mapping`.
    """
    encoded = tokenizer.texts_to_sequences([value])
    padded = pad_sequences(encoded, maxlen=45)
    classScores = model.predict(padded)[0]
    bestClass = np.argmax(classScores)
    return mapping[bestClass]

In [15]:
def correctCell(fastTextModel, model, tokenizer, row, topN=10):
    """
    Detect an erroneous cell in `row` and propose a replacement value.

    A cell is an error when its value is not in the known-value set for its
    position (`combinedSet[cellIndex]`). For such a cell the `topN` nearest
    neighbours of the value are fetched from `fastTextModel`, and only those
    the classifier assigns to the cell's own attribute are kept. The kept
    neighbour with the highest similarity becomes the proposal.

    Returns None when every cell is valid. Otherwise returns a dict with:
      - 'mistakeDetected': the erroneous value (the last one in the row when
        several cells are erroneous),
      - 'predictedValue': the proposed replacement for that same cell, or None
        when no neighbour has the right attribute,
      - 'cellIndex': position of that cell within `row`.
    """
    cellValues = {}
    for cellIndex, currentCellValue in enumerate(row):
        if currentCellValue in combinedSet[cellIndex]:
            continue

        # Candidates are collected per erroneous cell. Sharing them across
        # cells would let a neighbour of one bad cell be returned as the fix
        # for another.
        candidates = {}
        neighbours = fastTextModel.most_similar(currentCellValue, topn=topN)
        for match, confidence in neighbours:
            # Keep only neighbours whose predicted attribute is the attribute
            # of the erroneous cell.
            if predictAttribute(model, tokenizer, match) != columns[cellIndex]:
                continue
            if match not in candidates or confidence > candidates[match]:
                candidates[match] = confidence

        cellValues['mistakeDetected'] = currentCellValue
        cellValues['cellIndex'] = cellIndex
        # max() returns the first best entry, matching a stable descending sort.
        cellValues['predictedValue'] = (
            max(candidates, key=candidates.get) if candidates else None
        )

    return cellValues if cellValues else None

## Verification pipeline

In [16]:
# Distinct ground-truth records; the verification queries below run against this frame.
dfUnique = df_truth.drop_duplicates()

In [17]:
len(dfUnique)

824

In [18]:
# Attribute names in the positional order of a record. The templates are filled
# positionally with a row's remaining values, so clause order must follow this order.
_QUERY_COLUMNS = (
    'ProviderNumber',
    'HospitalName',
    'Address1',
    'City',
    'ZipCode',
    'CountyName',
    'PhoneNumber',
    'HospitalOwner',
    'Condition',
    'Sample',
)


def _buildQueryTemplate(excludedColumn, allColumns=_QUERY_COLUMNS):
    """
    Build a `DataFrame.query` template matching every column except one.

    The result has one `Column=="{}"` clause per remaining column, joined with
    ` and `, in the order of `allColumns`; each `{}` is a placeholder for
    `str.format`.
    """
    return ' and '.join(
        '{}=="{{}}"'.format(columnName)
        for columnName in allColumns
        if columnName != excludedColumn
    )


# Attribute name -> query template that identifies a record by all of its
# other attributes. Generated rather than hand-written so the ten templates
# cannot drift apart or out of column order.
queryMapping = {
    columnName: _buildQueryTemplate(columnName) for columnName in _QUERY_COLUMNS
}

In [19]:
# Tallies for the verification run: matching predictions, non-matching
# predictions, and (actual, detected error, predicted) triples for the latter.
correct, inCorrect = 0, 0
incorrectPredictions = list()

In [20]:
# Reset the tallies here so that re-running this cell does not double count.
correct = 0
inCorrect = 0
incorrectPredictions = []
# Rows whose ground-truth record could not be found, so no verdict is possible.
unverifiableRows = []

for row in combined_dirty:
    output = correctCell(fastTextModel, model, tokenizer, row, 15)
    if not output:
        continue

    detectedError = output['mistakeDetected']
    predictedValue = output['predictedValue']

    # Locate the flagged cell by position, not just by value: the same string
    # may legitimately occur in an earlier column of the row. correctCell
    # reports the last erroneous cell, hence the last matching position.
    flaggedPositions = [
        index
        for index, value in enumerate(row)
        if value == detectedError and value not in combinedSet[index]
    ]
    errorIndex = flaggedPositions[-1] if flaggedPositions else row.index(detectedError)

    # Every value except the erroneous cell, still in column order.
    tempRow = row[:errorIndex] + row[errorIndex + 1:]

    # Look up the ground-truth record that agrees on all remaining attributes.
    # NOTE(review): values are interpolated into the query string, so a value
    # containing a double quote would produce an invalid query.
    query = (queryMapping[columns[errorIndex]]).format(*tempRow)
    outputDf = dfUnique.query(query)
    if outputDf.empty:
        # No clean record matches (e.g. the row has more than one error).
        unverifiableRows.append(row)
        continue
    actual = outputDf.iloc[0, errorIndex]

    if actual == predictedValue:
        correct += 1
    else:
        inCorrect += 1
        incorrectPredictions.append((actual, detectedError, predictedValue))

## Results

In [21]:
correct

187

In [22]:
inCorrect

6

In [23]:
incorrectPredictions

[('MONTGOMERY', 'MONTGOMEBY', 'BUTLER'),
 ('MONTGOMERY', 'MGNTGOMERY', 'BUTLER'),
 ('MONTGOMERY', 'MONTGOMQRY', 'BUTLER'),
 ('FAYETTE', 'FAMETTE', 'FLORENCE'),
 ('MOBILE', 'MOGILE', 'DALE'),
 ('CULLMAN', 'CULLMUN', 'FAYETTE')]

In [24]:
# Share of verified corrections that matched the ground truth.
# NaN when no row was evaluated, instead of raising ZeroDivisionError.
evaluated = correct + inCorrect
accuracy = correct / evaluated if evaluated else float('nan')

In [25]:
accuracy

0.9689119170984456

In [26]:
fastTextModel.most_similar("MGNTGOMERY")

[('MONTGOMERY', 0.9960721731185913),
 ('1300 SOUTH MONTGOMERY AVENUE', 0.6285814046859741),
 ('2105 EAST SOUTH BOULEVARD', 0.36631783843040466),
 ('JACKSON HOSPITAL & CLINIC INC', 0.3442744016647339),
 ('35631', 0.2793944180011749),
 ('35660', 0.2650303244590759),
 ('WINFIELD', 0.24821051955223083),
 ('1720 UNIVERSITY BLVD', 0.2395210862159729),
 ('SHEFFIELD', 0.2367148995399475),
 ('ONEONTA', 0.22580210864543915)]

In [27]:
predictAttribute(model, tokenizer, "FAYETTE")

'CountyName'

In [28]:
df_truth[df_truth['City'] == "FAYETTE"]

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,ZipCode,CountyName,PhoneNumber,HospitalOwner,Condition,Sample
729,10045,FAYETTE MEDICAL CENTER,1653 TEMPLE AVENUE NORTH,FAYETTE,35555,FAYETTE,2059325966,Voluntary non-profit - Other,Heart Attack,18 patients
730,10045,FAYETTE MEDICAL CENTER,1653 TEMPLE AVENUE NORTH,FAYETTE,35555,FAYETTE,2059325966,Voluntary non-profit - Other,Heart Attack,12 patients
731,10045,FAYETTE MEDICAL CENTER,1653 TEMPLE AVENUE NORTH,FAYETTE,35555,FAYETTE,2059325966,Voluntary non-profit - Other,Heart Attack,1 patients
732,10045,FAYETTE MEDICAL CENTER,1653 TEMPLE AVENUE NORTH,FAYETTE,35555,FAYETTE,2059325966,Voluntary non-profit - Other,Heart Attack,11 patients
733,10045,FAYETTE MEDICAL CENTER,1653 TEMPLE AVENUE NORTH,FAYETTE,35555,FAYETTE,2059325966,Voluntary non-profit - Other,Heart Attack,0 patients
734,10045,FAYETTE MEDICAL CENTER,1653 TEMPLE AVENUE NORTH,FAYETTE,35555,FAYETTE,2059325966,Voluntary non-profit - Other,Heart Failure,67 patients
735,10045,FAYETTE MEDICAL CENTER,1653 TEMPLE AVENUE NORTH,FAYETTE,35555,FAYETTE,2059325966,Voluntary non-profit - Other,Heart Failure,85 patients
736,10045,FAYETTE MEDICAL CENTER,1653 TEMPLE AVENUE NORTH,FAYETTE,35555,FAYETTE,2059325966,Voluntary non-profit - Other,Heart Failure,19 patients
737,10045,FAYETTE MEDICAL CENTER,1653 TEMPLE AVENUE NORTH,FAYETTE,35555,FAYETTE,2059325966,Voluntary non-profit - Other,Heart Failure,16 patients
738,10045,FAYETTE MEDICAL CENTER,1653 TEMPLE AVENUE NORTH,FAYETTE,35555,FAYETTE,2059325966,Voluntary non-profit - Other,Pneumonia,117 patients
