# Record Embedding

In [34]:
import pandas as pd
import numpy as np
import os
import calendar
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from keras.models import load_model
import h5py
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

%matplotlib inline

## Embedding using fastText
Details here: https://fasttext.cc/

In [20]:
import warnings
warnings.filterwarnings('ignore')

import gensim
from gensim.models import FastText

In [21]:

df_truth = pd.read_csv("truthvalue.csv",dtype=object, encoding='utf8')
df_truth.drop(columns=['City', 'State','HospitalType','HospitalOwner','EmergencyService','Condition','MeasureCode', 'MeasureName' ,'Score' ,'Sample' ,'Stateavg'], axis=1, inplace=True)

df_dirty = pd.read_csv("dirty_data_transformed.csv",dtype=object, encoding='utf8')
df_dirty.drop(columns=['State', 'Address2', 'Address3', 'HospitalType','HospitalOwner','EmergencyService','Condition','MeasureCode', 'MeasureName' ,'Score' ,'Sample' ,'Stateavg'], axis=1, inplace=True)

provider = df_truth['ProviderNumber'].tolist()
hospital = df_truth['HospitalName'].tolist()
address = df_truth['Address1'].tolist()
city = df_truth['City.1'].tolist()
zipcode = df_truth['ZipCode'].tolist()
county = df_truth['CountyName'].tolist()
phone = df_truth['PhoneNumber'].tolist()

combined_hosp = list(zip(provider, hospital, address, city, zipcode, county))
df_truth.head()


Unnamed: 0,ProviderNumber,HospitalName,Address1,City.1,ZipCode,CountyName,PhoneNumber
0,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100
1,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100
2,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100
3,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100
4,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100


In [22]:
df_dirty.head()

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,ZipCode,CountyName,PhoneNumber
0,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100
1,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100
2,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100
3,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHxM,35233,JEFFERSON,2053258100
4,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100


In [23]:
len(df_truth)

1000

In [24]:
len(combined_hosp[1])

6

In [25]:
#hospital dirty dataset; maintain dirty dataset separately
provider = df_dirty['ProviderNumber'].tolist()
hospital = df_dirty['HospitalName'].tolist()
address = df_dirty['Address1'].tolist()
city = df_dirty['City'].tolist()
zipcode = df_dirty['ZipCode'].tolist()
county = df_dirty['CountyName'].tolist()
phone = df_dirty['PhoneNumber'].tolist()

combined_dirty = list(zip(provider, hospital, address, city, zipcode, county))
len(combined_dirty[1])

6

In [26]:
#Calculating error or dirty cells
dirty_list = []
truth_list = []
for i in range(len(combined_dirty)):
    for j in range(len(combined_dirty[0])):
        if combined_dirty[i][j] != combined_hosp[i][j]:
            dirty_list.append(combined_dirty[i][j])
            truth_list.append(combined_hosp[i][j])

In [49]:
len(dirty_list)

202

## Mapping to get column information

In [68]:
mapping = dict(enumerate(df_truth.columns.values))

In [69]:
mapping

{0: 'ProviderNumber',
 1: 'HospitalName',
 2: 'Address1',
 3: 'City.1',
 4: 'ZipCode',
 5: 'CountyName',
 6: 'PhoneNumber'}

## Error Cleaning using fastText

Convert each tuple into a row. We load hospital dataset. The fastText model is trained using groundtruth dataset for hospital and then the dirty dataset is used for correcting spelling errors. The fastText algorithm outputs top 10 matches and loses column information. Here we use Attribute classifier to understand the column info of the erroroneous tuple and the first match with same attribute in the predicted list is assumed as the right cell value for the cell. 

In [70]:
#training the model with truth data.
model_hosp = FastText(combined_hosp, min_count=1, workers=8, iter=1000)

## Module to predict the top value by the model. Some values are not broken into tokens.

In [None]:
excluded_list = []
result = []
predictedValues = []

## Loading attribute classifier

In [90]:
model = load_model('HospitalClassifier.h5')
with open('HospitalAttributeClassifierTokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

## Error Cleaning using Attribute Classifer over FastText

In [86]:
for i in range(len(dirty_list)):
    sequences = tokenizer.texts_to_sequences(dirty_list[i])
    dirtyData = pad_sequences(sequences, maxlen=40)
    dirtyCol = model.predict(dirtyData)
    if isinstance(dirty_list[i], str):
        try:
            predictedValues = model_hosp.most_similar(dirty_list[i])
            match = 0
            for j in range(len(predictedValues)):
                predSequences = tokenizer.texts_to_sequences(predictedValues[j][0])
                testData = pad_sequences(predSequences, maxlen=40)
                correctCol = model.predict(testData)
                if mapping[np.argmax(correctCol[0])] == mapping[np.argmax(dirtyCol[0])]:
                    result.append((predictedValues[j][0],dirty_list[i],truth_list[i]))
                    match = 1
                    break
            if match == 0:
                result.append((predictedValues[0][0],dirty_list[i],truth_list[i]))
                print("No match")
        except KeyError:
            excluded_list.append(dirty_list[i])
            #print(dirty_list[i])
            pass

In [87]:
print (len(excluded_list))
print (len(result))
print (len(excluded_list) + len(result))
print (len(dirty_list))
excluded_list[0]


12
190
202
202


'3x23x'

In [88]:
#identify the true positives and false positives. Store them in the seperate list for further analysis.
true_pos = 0
false_pos = 0
correctly_predicted = []
falsely_predicted = []
for i in range(len(result)):
    if result[i][0] == result[i][2]:
        true_pos = true_pos + 1
        correctly_predicted.append(result[i])
    else:
        false_pos = false_pos + 1
        falsely_predicted.append(result[i])

In [89]:
print(len(result))
print(len(correctly_predicted))
print(len(falsely_predicted))

190
154
36


In [91]:
for i in falsely_predicted:
    print (i)

('36067', '360x9', '36049')
('10086', 'Voluntary non-profit - Private', '10011')
('1007 GOODYEAR AVENUE', '60%', 'BIRMINGHAM')
('ANCHORAGE', 'AL_SCIP-INF-4', '35235')
('10016', '10011', 'JEFFERSON')
('35957', '359x8', '35968')
('36116', '1xx16', '10016')
('10085', '100x8', '10038')
('36801', '36x01', '36201')
('10085', '100x6', '10086')
('EAST ALABAMA MEDICAL CENTER AND SNF', 'xNIV OF SOxTH ALABAMA MEDICAL CENTER', 'UNIV OF SOUTH ALABAMA MEDICAL CENTER')
('PO BOX 287', '3x0x7', '36067')
('99508', 'x0x08', '10108')
('FAYETTE MEDICAL CENTER', 'DALE MEDICAL CENTER', '10021')
('CALLAHAN EYE FOUNDATION HOSPITAL', '126 HOSPITAL AVE', 'DALE MEDICAL CENTER')
('ALASKA REGIONAL HOSPITAL', 'AL', 'OZARK')
('LAUDERDALE', 'DALE', '36360')
('36801', '3347742601', 'DALE')
('35960', 'x6x60', '36360')
('JACKSON HOSPITAL & CLINIC INC', '1xx24', '10024')
('TALLADEGA', 'TALLAxxEE', 'TALLASSEE')
('10085', '100x4', '10034')
('36278', '36x78', '36078')
('10085', '100x5', '10035')
('35235', '1xx35', '10035')
(

In [83]:
for i in correctly_predicted:
    print (i)

('BIRMINGHAM', 'BIRMINGHxM', 'BIRMINGHAM')
('BIRMINGHAM', 'BIRMINGxAM', 'BIRMINGHAM')
('SHEFFIELD', 'SHEFFxELD', 'SHEFFIELD')
('10019', '1xx19', '10019')
('SOUTHEAST ALABAMA MEDICAL CENTER', 'SOUTHEAST ALABAMA MEDxCAL CENTER', 'SOUTHEAST ALABAMA MEDICAL CENTER')
('35957', 'x5957', '35957')
('10005', 'x0005', '10005')
('BOAZ', 'BOxZ', 'BOAZ')
('2505 U S HIGHWAY 431 NORTH', '2505xUxSxHIGHWAYx431xNORTH', '2505 U S HIGHWAY 431 NORTH')
('35957', '3595x', '35957')
('MARSHALL', 'MxRSHxLL', 'MARSHALL')
('35631', '3563x', '35631')
('LAUDERDALE', 'LAUDxRDALx', 'LAUDERDALE')
('LAUDERDALE', 'LAUDExDALE', 'LAUDERDALE')
('FLORENCE', 'FxORENCE', 'FLORENCE')
('10006', '1000x', '10006')
('FLORENCE', 'FLORxNCx', 'FLORENCE')
('FLORENCE', 'FLxRENCE', 'FLORENCE')
('35631', '3563x', '35631')
('702 N MAIN ST', '702xNxMAINxST', '702 N MAIN ST')
('702 N MAIN ST', '702 x MAIx ST', '702 N MAIN ST')
('CRENSHAW COMMUNITY HOSPITAL', 'CRENSHAW CxMMUNITY HxSPITAL', 'CRENSHAW COMMUNITY HOSPITAL')
('CRENSHAW', 'CRxNSHA

In [84]:
falsely_predicted[1]

('10086', 'Voluntary non-profit - Private', '10011')

In [85]:
model_hosp.most_similar('3595x')

[('35957', 0.9885784387588501),
 ('35968', 0.8877739310264587),
 ('35976', 0.885722815990448),
 ('35960', 0.8576430082321167),
 ('35903', 0.830594539642334),
 ('35901', 0.8117714524269104),
 ('35058', 0.6940221190452576),
 ('35007', 0.6867320537567139),
 ('35640', 0.627211332321167),
 ('35653', 0.6189522743225098)]