In [123]:
import warnings
warnings.filterwarnings('ignore')

from gensim.models.wrappers import FastText
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle

import pandas as pd
import numpy as np
from enum import Enum
import itertools

## Load the clean and the dirty dataset

In [91]:
# Load A.csv as the clean dataset; dtype=object disables pandas' per-column type inference.
dfClean = pd.read_csv('A.csv', encoding='utf8', dtype=object)

In [92]:
# Number of rows in the clean dataset.
len(dfClean)

315

In [93]:
# Preview the first three clean rows.
dfClean.head(3)

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,ZipCode,CountyName,PhoneNumber
0,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100
1,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100
2,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100


In [94]:
# Load B.csv as the dirty dataset (the one containing empty cells); dtype=object disables type inference.
dfDirty = pd.read_csv('B.csv', encoding='utf8', dtype=object)

In [95]:
# Number of rows in the dirty dataset.
len(dfDirty)

315

In [98]:
# Preview the first eight dirty rows to see where the empty cells fall.
dfDirty.head(8)

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,ZipCode,CountyName,PhoneNumber
0,,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233.0,JEFFERSON,2053258100.0
1,10018.0,,1720 UNIVERSITY BLVD,BIRMINGHAM,35233.0,JEFFERSON,2053258100.0
2,10018.0,CALLAHAN EYE FOUNDATION HOSPITAL,,BIRMINGHAM,35233.0,JEFFERSON,2053258100.0
3,10018.0,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,,35233.0,JEFFERSON,2053258100.0
4,10018.0,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,,JEFFERSON,2053258100.0
5,10018.0,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233.0,,2053258100.0
6,10018.0,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233.0,JEFFERSON,
7,,HELEN KELLER MEMORIAL HOSPITAL,1300 SOUTH MONTGOMERY AVENUE,SHEFFIELD,35660.0,JEFFERSON,2563864556.0


Get number of empty cells

In [99]:
# Total number of null cells in the dirty dataset. The boolean mask is summed
# inside NumPy rather than iterated element by element with the builtin sum();
# int() keeps the displayed result a plain Python integer.
int(dfDirty.isnull().values.sum())

315

Select the rows with empty cells

In [101]:
# Keep only the rows that have at least one null cell (any(axis=1) reduces across columns).
dfEmpty = dfDirty[dfDirty.isnull().any(axis=1)]

In [102]:
# Number of rows that contain at least one null cell.
len(dfEmpty)

315

In [103]:
# Preview the first two rows with a missing value.
dfEmpty.head(2)

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,ZipCode,CountyName,PhoneNumber
0,,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100
1,10018.0,,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100


## Word2Vec Model

In [104]:
# Load the saved embedding model from disk. The imputation code below queries it
# through `word2vecModel.wv.most_similar(...)`.
path = 'HospitalWord2Vec.w2v'
word2vecModel = KeyedVectors.load(path)

Define attributes

In [105]:
class Attributes(Enum):
    """Attribute (column) types a value can be classified as.

    Members are numbered 1 through 8 in declaration order.
    """
    (ProviderNumber,
     HospitalName,
     Address1,
     City,
     State,
     ZipCode,
     CountyName,
     PhoneNumber) = range(1, 9)

In [106]:
# Map each attribute's name to its Attributes member, in declaration order.
attributeMapping = {attribute.name: attribute for attribute in Attributes}

In [107]:
# Display the name -> Attributes lookup table.
attributeMapping

{'Address1': <Attributes.Address1: 3>,
 'City': <Attributes.City: 4>,
 'CountyName': <Attributes.CountyName: 7>,
 'HospitalName': <Attributes.HospitalName: 2>,
 'PhoneNumber': <Attributes.PhoneNumber: 8>,
 'ProviderNumber': <Attributes.ProviderNumber: 1>,
 'State': <Attributes.State: 5>,
 'ZipCode': <Attributes.ZipCode: 6>}

Load Model

In [108]:
# Load the saved Keras classifier; predictAttribute() calls model.predict on it.
model = load_model('HospitalClassifier.h5')

Load tokenizer

In [109]:
# Restore the pickled tokenizer used to turn text into integer sequences.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open('HospitalAttributeClassifierTokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [110]:
# Classifier output index -> attribute name.
# NOTE(review): presumably this is the label order used when the classifier was
# trained; confirm against the training code.
mapping = dict(enumerate([
    'Address1',
    'City',
    'CountyName',
    'HospitalName',
    'PhoneNumber',
    'ProviderNumber',
    'ZipCode',
]))

## Prediction pipeline

In [111]:
def impute(word2vecModel, model, tokenizer, rowWithMissingValue, missingType, topN=10):
    """
    Returns candidate values for the missing attribute together with their
    best similarity score.

    For every value present in the row, the ``topN`` most similar vocabulary
    entries are fetched from the embedding model; a neighbour is kept only
    when ``predictAttribute`` classifies it as ``missingType``. If the same
    neighbour is reached from several row values, its highest score wins.

    Parameters:
        word2vecModel: object exposing ``wv.most_similar(value, topn=...)``.
        model, tokenizer: forwarded unchanged to ``predictAttribute``.
        rowWithMissingValue: iterable of the values present in the row.
        missingType: attribute type the candidates must be classified as.
        topN: number of neighbours requested per row value.

    Returns:
        A view of ``(candidate, score)`` pairs (``dict.items()``); empty when
        no candidate qualifies.
    """
    output = dict()
    for value in rowWithMissingValue:
        try:
            results = word2vecModel.wv.most_similar(value, topn=topN)
        except KeyError:
            # The value is not in the embedding vocabulary: nothing to look up.
            continue
        # Classification is deliberately outside the try block so that a
        # KeyError raised by the classifier's label lookup is not mistaken
        # for an out-of-vocabulary word and silently swallowed.
        for match, confidence in results:
            if predictAttribute(model, tokenizer, match) != missingType:
                continue
            if match not in output or confidence > output[match]:
                output[match] = confidence
    return output.items()

In [112]:
def predictAttribute(model, tokenizer, value):
    """
    Classify a single value and return the matching Attributes member.

    The value is tokenized, padded to length 40, scored by the model, and the
    highest-scoring class index is translated through ``mapping`` and
    ``attributeMapping``.
    """
    encoded = tokenizer.texts_to_sequences([value])
    padded = pad_sequences(encoded, maxlen=40)
    scores = model.predict(padded)[0]
    attributeName = mapping[np.argmax(scores)]
    return attributeMapping[attributeName]

Convert each row to a list

In [113]:
# One plain Python list of cell values per row that has a missing value.
nullRows = dfEmpty.values.tolist()

Create a mapping of the form **row: missingAttribute**

e.g., ('10018','CALLAHAN EYE FOUNDATION HOSPITAL','1720 UNIVERSITY BLVD','BIRMINGHAM','35233','2053258100'): Attributes.CountyName

In [114]:
# Build one single-entry dict per incomplete row:
#   {tuple of the values that are present: Attributes member of the missing column}
rows = []
columnNames = ['ProviderNumber', 'HospitalName', 'Address1', 'City', 'ZipCode', 'CountyName', 'PhoneNumber']
for row in nullRows:
    missingAttribute = None
    removedNan = []
    for i, cell in enumerate(row):
        # pd.isnull recognises NaN/None directly instead of comparing the
        # cell's string representation with the literal 'nan'.
        if pd.isnull(cell):
            # If several cells are missing, only the last one is recorded.
            missingAttribute = attributeMapping[columnNames[i]]
        else:
            removedNan.append(cell)
    rows.append({tuple(removedNan): missingAttribute})

## Perform imputation

In [115]:
def getImputedValue(missingRow, attributeType):
    """
    Return the best ``(value, score)`` candidate for the missing attribute.

    Candidates come from ``impute`` using the notebook-level ``word2vecModel``,
    ``model`` and ``tokenizer`` with the 10 nearest neighbours per row value.
    Returns ``None`` when there is no candidate. When scores tie, the
    candidate produced first is returned.
    """
    candidates = impute(word2vecModel, model, tokenizer, missingRow, attributeType, 10)
    if not candidates:
        return None
    # max() picks the top-scoring pair in one pass; no need to sort everything.
    return max(candidates, key=lambda candidate: candidate[1])

Determine if the predicted values are accurate

In [127]:
# Collapse repeated clean rows so that each distinct record appears once.
dfUnique = dfClean.drop_duplicates()

In [128]:
# Number of distinct clean records.
len(dfUnique)

45

In [129]:
# Preview the first three distinct records.
dfUnique.head(3)

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,ZipCode,CountyName,PhoneNumber
0,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100
7,10019,HELEN KELLER MEMORIAL HOSPITAL,1300 SOUTH MONTGOMERY AVENUE,SHEFFIELD,35660,JEFFERSON,2563864556
14,10001,SOUTHEAST ALABAMA MEDICAL CENTER,1108 ROSS CLARK CIRCLE,DOTHAN,36302,HOUSTON,3347938701


In [130]:
# Flatten the distinct clean records row by row into one list of cell values.
# The evaluation loop consumes this list one value per dirty row, so it relies
# on the dirty rows being ordered to match this flattening.
truthValues = [cell for record in dfUnique.values.tolist() for cell in record]

In [131]:
# Number of ground-truth values; should equal the number of rows to impute.
len(truthValues)

315

Impute

In [142]:
# Reset the evaluation state: hit/miss counters, the list of misses, and a
# fresh iterator over the ground-truth values.
correct, inCorrect = 0, 0
incorrectPredictions = list()
truthValuesIter = iter(truthValues)

In [143]:
# Impute every incomplete row and compare the result with the next ground-truth value.
for row in rows:
    # Each entry is a single-item dict: {present values: missing attribute}.
    missingRow, attribute = next(iter(row.items()))

    predicted = getImputedValue(missingRow, attribute)
    actual = next(truthValuesIter)
    predictedValue = predicted[0] if predicted else None

    if predicted and predictedValue == actual:
        correct += 1
        continue

    inCorrect += 1
    incorrectPredictions.append((actual, predictedValue))
        

In [144]:
# Number of imputed values that matched the ground truth.
correct

304

In [145]:
# Fraction of rows imputed correctly. True division already yields a float, so
# the trailing `* 1.0` was dropped; the guard avoids ZeroDivisionError when no
# rows were evaluated.
evaluated = correct + inCorrect
accuracy = correct / evaluated if evaluated else 0.0

In [146]:
# Display the imputation accuracy.
accuracy

0.9650793650793651

In [147]:
# List the misses as (actual, predicted) pairs; predicted is None when no candidate was found.
incorrectPredictions

[('MOBILE', 'DE KALB'),
 ('FRANKLIN', None),
 ('ANCHORAGE', 'HUNTSVILLE'),
 ('20018', '10085'),
 ('PO BOX 287', None),
 ('BETHEL', '20018'),
 ('BETHEL', 'PO BOX 287'),
 ('MONTGOMERY', 'ANCHORAGE'),
 ('MONTGOMERY', None),
 ('CULLMAN', 'PO BOX 287'),
 ('FAYETTE', 'CHEROKEE')]

In [148]:
# Load truthvalue.csv with every column as object dtype (no type inference).
df = pd.read_csv('truthvalue.csv', encoding='utf8',dtype=object)

In [149]:
# Total number of null cells in the frame. The boolean mask is summed inside
# NumPy rather than iterated element by element with the builtin sum();
# int() keeps the displayed result a plain Python integer.
int(df.isnull().values.sum())

0