# Record Embedding

In [1]:
import pandas as pd
import numpy as np
import os
import calendar
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from keras.models import load_model
import h5py
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn import metrics

%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Embedding using fastText
Details here: https://fasttext.cc/

In [2]:
import warnings
# Blanket filter: every warning category is ignored from here on.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — consider narrowing the filter to specific categories/modules.
warnings.filterwarnings('ignore')

import gensim
from gensim.models import FastText

In [3]:

# Ground-truth table and its error-injected counterpart. dtype=object disables
# numeric type inference, so every cell is kept as read from the file.
df_truth = pd.read_csv('truth_values_1100_tuples.csv', dtype=object, encoding='utf8')
df_dirty = pd.read_csv('HospitalErrorsWithoutNan.csv', dtype=object, encoding='utf8')

# One plain Python list per ground-truth column.
(providerT, hospitalT, addressT, cityT, zipcodeT,
 countyT, phoneT, OwnerT, ConditionT, SampleT) = (
    df_truth[column].tolist()
    for column in ('ProviderNumber', 'HospitalName', 'Address1', 'City', 'ZipCode',
                   'CountyName', 'PhoneNumber', 'HospitalOwner', 'Condition', 'Sample')
)

# Row-wise tuples of the ground truth, later used as the fastText training corpus.
# NOTE(review): phoneT is not part of the zip, so each tuple has 9 fields and
# phone numbers never reach the corpus — confirm this omission is intentional.
combined_hosp = list(
    zip(providerT, hospitalT, addressT, cityT, zipcodeT,
        countyT, OwnerT, ConditionT, SampleT)
)
df_truth.head(2)


Unnamed: 0,ProviderNumber,HospitalName,Address1,City,ZipCode,CountyName,PhoneNumber,HospitalOwner,Condition,Sample
0,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100,Voluntary non-profit - Private,Surgical Infection Prevention,0 patients
1,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100,Voluntary non-profit - Private,Surgical Infection Prevention,0 patients


In [4]:
# Preview the first rows of the dirty dataset.
df_dirty.head()

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,ZipCode,CountyName,PhoneNumber,HospitalOwner,Condition,Sample,label
0,10011,ST VINCENP'S EAST,50 MEDICAL PARK EAST DRIVE,BIRMINGHAM,35235,JEFFERSON,2058383122,Voluntary non-profit - Private,Pneumonia,69 patients,0
1,10027,ELBA GENERAL HOSPITAL,987 DRAYTON STREET,ELBA,36323,COFFEE,3348972257,Voluntary non-profit - Other,Surgical Infection Prevention,0 patients,0
2,10044,MARION REGIONAL MEDICAL CENTER,1256 MILITARY STREET SOUTH,HAMILTON,35570,MARIYN,2059216200,Voluntary non-profit - Private,Pneumonia,42 patients,0
3,10036,ANDALUSIA REGIONAL HOSPITAL,849 SOUTH THREE NOTCH STREET,ANDTLUSIA,36420,COVINGTON,3342228466,Proprietary,Heart Attack,3 patients,0
4,10023,BAPTIST MEDICAL CENTER SOUTH,2105 EVST SOUTH BOULEVARD,MONTGOMERY,36116,MONTGOMERY,3342882100,Voluntary non-profit - Church,Surgical Infection Prevention,392 patients,0


In [5]:
# Number of rows in the ground-truth frame.
len(df_truth)

1100

In [6]:
# Width of one training tuple: combined_hosp zips 9 column lists (phoneT is not among them).
len(combined_hosp[1])

9

In [107]:
#hospital dirty dataset; maintain dirty dataset separately
provider = df_dirty['ProviderNumber'].tolist()
hospital = df_dirty['HospitalName'].tolist()
address = df_dirty['Address1'].tolist()
city = df_dirty['City'].tolist()
zipcode = df_dirty['ZipCode'].tolist()
county = df_dirty['CountyName'].tolist()
phone = df_dirty['PhoneNumber'].tolist()
Owner = df_dirty['HospitalOwner'].tolist()
Condition = df_dirty['Condition'].tolist()
Sample = df_dirty['Sample'].tolist()

combined_dirty = list(zip(address, city, Condition, county, hospital, Owner, phone, provider, Sample, zipcode))
combined_dirty[1][9]

'36323'

In [119]:
# Distinct ground-truth values per column, listed in alphabetical column-name
# order (Address1 ... ZipCode) — the same order the fields of combined_dirty use.
uniqueTokens = [
    set(column_values)
    for column_values in (
        addressT, cityT, ConditionT, countyT, hospitalT,
        OwnerT, phoneT, providerT, SampleT, zipcodeT,
    )
]
len(uniqueTokens)

10

In [120]:
# Width of one dirty tuple; the loop that builds dirty_list indexes each tuple
# by the positions of uniqueTokens, so the two lengths are expected to agree.
len(combined_dirty[0])

10

In [121]:
# A dirty cell is any value that never occurs in the ground truth for the same
# column position. Only the value is kept; its column position is discarded here.
dirty_list = [
    cell
    for row in combined_dirty
    for cell, known_values in zip(row, uniqueTokens)
    if cell not in known_values
]

In [122]:
# Number of cell values flagged as dirty.
len(dirty_list)

193

In [123]:
# Displays every flagged value.
# NOTE(review): this dumps the whole list into the output; a slice such as
# dirty_list[:10] would keep the notebook shorter.
dirty_list

["ST VINCENP'S EAST",
 'MARIYN',
 'ANDTLUSIA',
 '2105 EVST SOUTH BOULEVARD',
 '1725 PSNE STREET',
 'SYLACXUGA',
 'PRATTVILOE',
 '209 NORTH MAIN STREZT',
 'ZUVERNE',
 '1256 MILITARP STREET SOUTH',
 'EFOWAH',
 'MONTGOMEBY',
 'SHEKBY BAPTIST MEDICAL CENTER',
 'WEOOWEE',
 'ALABAETER',
 'FAYKTTE',
 '1178 ROSS CLARK CIRCLE',
 'ANCWORAGE',
 'MARSHALL MEDICAR CENTER SOUTH',
 'MARSHALL MEDICAL CEMTER SOUTH',
 '315 W HICKOQY ST',
 'QLBA',
 'ANDAXUSIA',
 'FLORKNCE',
 'AUVERNE',
 'MSRGAN',
 'GADSDEN REUIONAL MEDICAL CENTER',
 'COMMUNITY HOSPFTAL INC',
 'G H LANGER MEMORIAL HOSPITAL',
 '1000 FIRST JTREET NORTH',
 'ELOORE',
 'DE KAMB',
 'CHAMBERF',
 'CARION',
 'WEDOWEE HOSEITAL',
 '200 DED CENTER DRIVE',
 'VAKLEY',
 '209 NORTH MAIN STREEP',
 '315 W HICKOQY ST',
 'THYMASVILLE',
 '1000 FIRST STREET NOJTH',
 'MORWAN',
 'JEFFSRSON',
 'LAUDERFALE',
 '301 EXST 18TH ST',
 '1000 FIRST STREET NOQTH',
 'SALE',
 '406 NORTHWOOD DR',
 'VANLEY',
 'JJFFERSON',
 'LUVEQNE',
 '400 N EDGARDS STREET',
 'WEDOWEJ HOSPITA

## Mapping to get column information

In [124]:
# Index -> column name, numbering the ground-truth columns in sorted (alphabetical) order.
# NOTE(review): presumably this numbering matches the output classes of the
# attribute classifier loaded later — verify against how that model was trained.
mapping = {
    position: column
    for position, column in enumerate(sorted(df_truth.columns.values))
}

In [125]:
# Display the index -> column-name lookup.
mapping

{0: 'Address1',
 1: 'City',
 2: 'Condition',
 3: 'CountyName',
 4: 'HospitalName',
 5: 'HospitalOwner',
 6: 'PhoneNumber',
 7: 'ProviderNumber',
 8: 'Sample',
 9: 'ZipCode'}

## Error Cleaning using fastText

Convert each tuple into a row. We load the hospital dataset. The fastText model is trained on the ground-truth hospital dataset, and the dirty dataset is then used for correcting spelling errors. fastText outputs the top 10 matches but loses column information. Here we use the attribute classifier to infer the column of the erroneous value, and the first match in the predicted list with the same attribute is assumed to be the right value for the cell.

In [38]:
#training the model with truth data.
# Train the fastText model on the ground-truth tuples only. Each tuple of
# combined_hosp is passed as one training "sentence", so a whole cell value
# (e.g. a full address) is a single token.
# NOTE(review): no seed is passed and workers=8, so the learned vectors — and
# therefore the neighbour rankings used later — may differ between runs; confirm.
# NOTE(review): the `iter` keyword is presumably the gensim 3.x spelling of the
# epoch count — verify against the installed gensim version.
model_hosp = FastText(combined_hosp, min_count=1, workers=8, iter=1000)

## Module to predict the top value by the model. Some values are not broken into tokens.

In [126]:
# Fresh, independent containers for the repair step:
#   excluded_list   - dirty values the fastText lookup could not handle
#   result          - (attribute, repaired value) pairs
#   predictedValues - most recent list of fastText neighbours
excluded_list, result, predictedValues = [], [], []

## Loading attribute classifier

In [127]:
# Attribute classifier (Keras model) and its pickled tokenizer.
model = load_model('HospitalMultiAttributeClassifier.h5')

# NOTE(review): pickle.load executes arbitrary code from the file — only load
# a tokenizer pickle that comes from a trusted source.
with open('HospitalMultiAttributeClassifierTokenizer.pickle', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

## Error Cleaning using Attribute Classifier over FastText

In [128]:
# Sequence length the classifier input is padded to.
# NOTE(review): presumably the length used when the classifier was trained — confirm.
MAX_SEQUENCE_LEN = 45


def _predict_attribute(value):
    """Return the column name the attribute classifier assigns to one cell value.

    The value is wrapped in a list because ``texts_to_sequences`` expects a
    list of texts; a bare string would be treated as a sequence of
    one-character texts.
    """
    sequences = tokenizer.texts_to_sequences([value])
    padded = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LEN)
    scores = model.predict(padded)
    return mapping[np.argmax(scores[0])]


# Start from empty containers so that re-running this cell does not append to
# the results of a previous run.
result = []
excluded_list = []

# For every dirty value: ask fastText for its nearest ground-truth neighbours and
# keep the first neighbour the classifier places in the same column as the dirty
# value. If none agrees, fall back to the top neighbour and its own column.
# NOTE(review): later cells pair result[i] with dirty_list[i]; that pairing only
# holds while excluded_list stays empty.
for dirty_value in dirty_list:
    if not isinstance(dirty_value, str):
        # Cannot be looked up; record it so that
        # len(excluded_list) + len(result) == len(dirty_list) still holds.
        excluded_list.append(dirty_value)
        continue

    dirty_attr = _predict_attribute(dirty_value)

    try:
        # `.wv.most_similar` replaces the deprecated `model.most_similar` wrapper.
        predictedValues = model_hosp.wv.most_similar(dirty_value)
    except KeyError:
        # Only the fastText lookup is guarded: errors from the classifier or
        # from `mapping` should surface instead of being counted as exclusions.
        excluded_list.append(dirty_value)
        continue

    if not predictedValues:
        excluded_list.append(dirty_value)
        continue

    first_attr = None
    for candidate, _similarity in predictedValues:
        candidate_attr = _predict_attribute(candidate)
        if first_attr is None:
            first_attr = candidate_attr
        if candidate_attr == dirty_attr:
            result.append((candidate_attr, candidate))
            break
    else:
        # No neighbour shares the dirty value's predicted column.
        result.append((first_attr, predictedValues[0][0]))

In [129]:
# Sanity check: excluded + repaired should add up to the number of dirty values.
n_excluded = len(excluded_list)
n_repaired = len(result)
for count in (n_excluded, n_repaired, n_excluded + n_repaired, len(dirty_list)):
    print(count)

0
193
193
193


In [130]:
# Column name -> index; the positions follow the alphabetical column order.
inverseMapping = {
    column: position
    for position, column in enumerate((
        'Address1',
        'City',
        'Condition',
        'CountyName',
        'HospitalName',
        'HospitalOwner',
        'PhoneNumber',
        'ProviderNumber',
        'Sample',
        'ZipCode',
    ))
}

In [131]:
# Tally the repairs. A repair counts as correct when the proposed value occurs
# anywhere in the predicted column of df_truth; it is not compared against the
# true value of the specific row the dirty cell came from.
y_pred = 0             # number of repairs found in the ground truth
incorrect = 0          # number of repairs not found
incorrectAttr = []     # predicted column of each repair not found
incorrectValues = []   # proposed value of each repair not found

for position, (attribute, strng) in enumerate(result):
    found_in_truth = (df_truth[attribute] == strng).any()
    if found_in_truth:
        # dirty_list[position] is the matching dirty value only if every dirty
        # value produced exactly one entry in result.
        print(attribute, strng, dirty_list[position])
        y_pred += 1
    else:
        incorrect += 1
        print(strng)
        incorrectAttr.append(attribute)
        incorrectValues.append(strng)

HospitalName ST VINCENT'S EAST ST VINCENP'S EAST
CountyName MARION MARIYN
City ANDALUSIA ANDTLUSIA
Address1 2105 EAST SOUTH BOULEVARD 2105 EVST SOUTH BOULEVARD
Address1 1725 PINE STREET 1725 PSNE STREET
City SYLACAUGA SYLACXUGA
City PRATTVILLE PRATTVILOE
Address1 209 NORTH MAIN STREET 209 NORTH MAIN STREZT
City LUVERNE ZUVERNE
Address1 1256 MILITARY STREET SOUTH 1256 MILITARP STREET SOUTH
CountyName ETOWAH EFOWAH
City MONTGOMERY MONTGOMEBY
HospitalName SHELBY BAPTIST MEDICAL CENTER SHEKBY BAPTIST MEDICAL CENTER
City WEDOWEE WEOOWEE
City ALABASTER ALABAETER
City FAYETTE FAYKTTE
Address1 1108 ROSS CLARK CIRCLE 1178 ROSS CLARK CIRCLE
City ANCHORAGE ANCWORAGE
HospitalName MARSHALL MEDICAL CENTER SOUTH MARSHALL MEDICAR CENTER SOUTH
HospitalName MARSHALL MEDICAL CENTER SOUTH MARSHALL MEDICAL CEMTER SOUTH
Address1 315 W HICKORY ST 315 W HICKOQY ST
City ELBA QLBA
City ANDALUSIA ANDAXUSIA
City FLORENCE FLORKNCE
City LUVERNE AUVERNE
CountyName MORGAN MSRGAN
HospitalName GADSDEN REGIONAL MEDICAL 

In [132]:
# Found / not-found repair counts, one per line.
print(y_pred, incorrect, sep='\n')

193
0


In [103]:
# List each repair that was not found in the ground truth as "<column> <value>".
for wrong_attr, wrong_value in zip(incorrectAttr, incorrectValues):
    print(wrong_attr, wrong_value)

## Precision, Recall, F1-score

In [91]:
# Print every entry of falsely_predicted.
# NOTE(review): `falsely_predicted` is not assigned in any cell visible in this
# notebook, so this cell fails on a fresh kernel (Restart & Run All) unless it
# is defined elsewhere — restore the cell that builds it or remove this one.
for i in falsely_predicted:
    print (i)

('36067', '360x9', '36049')
('10086', 'Voluntary non-profit - Private', '10011')
('1007 GOODYEAR AVENUE', '60%', 'BIRMINGHAM')
('ANCHORAGE', 'AL_SCIP-INF-4', '35235')
('10016', '10011', 'JEFFERSON')
('35957', '359x8', '35968')
('36116', '1xx16', '10016')
('10085', '100x8', '10038')
('36801', '36x01', '36201')
('10085', '100x6', '10086')
('EAST ALABAMA MEDICAL CENTER AND SNF', 'xNIV OF SOxTH ALABAMA MEDICAL CENTER', 'UNIV OF SOUTH ALABAMA MEDICAL CENTER')
('PO BOX 287', '3x0x7', '36067')
('99508', 'x0x08', '10108')
('FAYETTE MEDICAL CENTER', 'DALE MEDICAL CENTER', '10021')
('CALLAHAN EYE FOUNDATION HOSPITAL', '126 HOSPITAL AVE', 'DALE MEDICAL CENTER')
('ALASKA REGIONAL HOSPITAL', 'AL', 'OZARK')
('LAUDERDALE', 'DALE', '36360')
('36801', '3347742601', 'DALE')
('35960', 'x6x60', '36360')
('JACKSON HOSPITAL & CLINIC INC', '1xx24', '10024')
('TALLADEGA', 'TALLAxxEE', 'TALLASSEE')
('10085', '100x4', '10034')
('36278', '36x78', '36078')
('10085', '100x5', '10035')
('35235', '1xx35', '10035')
(

In [83]:
# Print every entry of correctly_predicted.
# NOTE(review): `correctly_predicted` is not assigned in any cell visible in this
# notebook, so this cell fails on a fresh kernel (Restart & Run All) unless it
# is defined elsewhere — restore the cell that builds it or remove this one.
for i in correctly_predicted:
    print (i)

('BIRMINGHAM', 'BIRMINGHxM', 'BIRMINGHAM')
('BIRMINGHAM', 'BIRMINGxAM', 'BIRMINGHAM')
('SHEFFIELD', 'SHEFFxELD', 'SHEFFIELD')
('10019', '1xx19', '10019')
('SOUTHEAST ALABAMA MEDICAL CENTER', 'SOUTHEAST ALABAMA MEDxCAL CENTER', 'SOUTHEAST ALABAMA MEDICAL CENTER')
('35957', 'x5957', '35957')
('10005', 'x0005', '10005')
('BOAZ', 'BOxZ', 'BOAZ')
('2505 U S HIGHWAY 431 NORTH', '2505xUxSxHIGHWAYx431xNORTH', '2505 U S HIGHWAY 431 NORTH')
('35957', '3595x', '35957')
('MARSHALL', 'MxRSHxLL', 'MARSHALL')
('35631', '3563x', '35631')
('LAUDERDALE', 'LAUDxRDALx', 'LAUDERDALE')
('LAUDERDALE', 'LAUDExDALE', 'LAUDERDALE')
('FLORENCE', 'FxORENCE', 'FLORENCE')
('10006', '1000x', '10006')
('FLORENCE', 'FLORxNCx', 'FLORENCE')
('FLORENCE', 'FLxRENCE', 'FLORENCE')
('35631', '3563x', '35631')
('702 N MAIN ST', '702xNxMAINxST', '702 N MAIN ST')
('702 N MAIN ST', '702 x MAIx ST', '702 N MAIN ST')
('CRENSHAW COMMUNITY HOSPITAL', 'CRENSHAW CxMMUNITY HxSPITAL', 'CRENSHAW COMMUNITY HOSPITAL')
('CRENSHAW', 'CRxNSHA

In [84]:
# Inspect the second entry of falsely_predicted.
# NOTE(review): `falsely_predicted` is not assigned in any visible cell — see
# whether the cell that builds it was deleted.
falsely_predicted[1]

('10086', 'Voluntary non-profit - Private', '10011')

In [85]:
# Show the nearest ground-truth neighbours of one corrupted value.
# The lookup goes through `.wv` because calling `most_similar` directly on the
# model is a deprecated wrapper around the same keyed-vectors method.
model_hosp.wv.most_similar('3595x')

[('35957', 0.9885784387588501),
 ('35968', 0.8877739310264587),
 ('35976', 0.885722815990448),
 ('35960', 0.8576430082321167),
 ('35903', 0.830594539642334),
 ('35901', 0.8117714524269104),
 ('35058', 0.6940221190452576),
 ('35007', 0.6867320537567139),
 ('35640', 0.627211332321167),
 ('35653', 0.6189522743225098)]