# Record Embedding

In [81]:
import pandas as pd
import numpy as np
import os
import calendar
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from keras.models import load_model
import h5py
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn import metrics

%matplotlib inline

## Embedding using fastText
Details here: https://fasttext.cc/

In [82]:
import warnings
warnings.filterwarnings('ignore')

import gensim
from gensim.models import FastText

In [98]:

df_truth = pd.read_csv('truth_values_1100_tuples.csv',dtype=object, encoding='utf8')

df_dirty = pd.read_csv('HospitalErrorsWithoutNan.csv',dtype=object, encoding='utf8')

providerT = df_truth['ProviderNumber'].tolist()
hospitalT = df_truth['HospitalName'].tolist()
addressT = df_truth['Address1'].tolist()
cityT = df_truth['City'].tolist()
zipcodeT = df_truth['ZipCode'].tolist()
countyT = df_truth['CountyName'].tolist()
phoneT = df_truth['PhoneNumber'].tolist()
OwnerT = df_truth['HospitalOwner'].tolist()
ConditionT = df_truth['Condition'].tolist()
SampleT = df_truth['Sample'].tolist()

combined_hosp = list(zip(providerT, hospitalT, addressT, cityT, zipcodeT, countyT, OwnerT, ConditionT, SampleT))
df_truth.head(2)


Unnamed: 0,ProviderNumber,HospitalName,Address1,City,ZipCode,CountyName,PhoneNumber,HospitalOwner,Condition,Sample
0,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100,Voluntary non-profit - Private,Surgical Infection Prevention,0 patients
1,10018,CALLAHAN EYE FOUNDATION HOSPITAL,1720 UNIVERSITY BLVD,BIRMINGHAM,35233,JEFFERSON,2053258100,Voluntary non-profit - Private,Surgical Infection Prevention,0 patients


In [99]:
df_dirty.head()

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,ZipCode,CountyName,PhoneNumber,HospitalOwner,Condition,Sample,label
0,10011,ST VINCENP'S EAST,50 MEDICAL PARK EAST DRIVE,BIRMINGHAM,35235,JEFFERSON,2058383122,Voluntary non-profit - Private,Pneumonia,69 patients,0
1,10027,ELBA GENERAL HOSPITAL,987 DRAYTON STREET,ELBA,36323,COFFEE,3348972257,Voluntary non-profit - Other,Surgical Infection Prevention,0 patients,0
2,10044,MARION REGIONAL MEDICAL CENTER,1256 MILITARY STREET SOUTH,HAMILTON,35570,MARIYN,2059216200,Voluntary non-profit - Private,Pneumonia,42 patients,0
3,10036,ANDALUSIA REGIONAL HOSPITAL,849 SOUTH THREE NOTCH STREET,ANDTLUSIA,36420,COVINGTON,3342228466,Proprietary,Heart Attack,3 patients,0
4,10023,BAPTIST MEDICAL CENTER SOUTH,2105 EVST SOUTH BOULEVARD,MONTGOMERY,36116,MONTGOMERY,3342882100,Voluntary non-profit - Church,Surgical Infection Prevention,392 patients,0


In [100]:
len(df_truth)

1100

In [101]:
len(combined_hosp[1])

9

In [102]:
#hospital dirty dataset; maintain dirty dataset separately
provider = df_dirty['ProviderNumber'].tolist()
hospital = df_dirty['HospitalName'].tolist()
address = df_dirty['Address1'].tolist()
city = df_dirty['City'].tolist()
zipcode = df_dirty['ZipCode'].tolist()
county = df_dirty['CountyName'].tolist()
phone = df_dirty['PhoneNumber'].tolist()
Owner = df_dirty['HospitalOwner'].tolist()
Condition = df_dirty['Condition'].tolist()
Sample = df_dirty['Sample'].tolist()

combined_dirty = list(zip(provider, hospital, address, city, zipcode, county, Owner, Condition, Sample))
len(combined_dirty[1])

9

In [103]:
df_dirty.drop(['label'], axis=1, inplace=True)

In [107]:
#Calculating error or dirty cells
dirty_list = []
truth_list = []
for i in range(len(combined_dirty)):
    for j in range(len(combined_dirty[0])):
        if combined_dirty[i][j] != combined_hosp[i][j]:
            dirty_list.append(combined_dirty[i][j])
            truth_list.append(combined_hosp[i][j])

In [106]:
uniqueTokens = []
uniqueTokens.append(set(providerT))
uniqueTokens.append(set(hospitalT))
uniqueTokens.append(set(addressT))
uniqueTokens.append(set(cityT))
uniqueTokens.append(set(zipcodeT))
uniqueTokens.append(set(countyT))
uniqueTokens.append(set(OwnerT))
uniqueTokens.append(set(ConditionT)) 
uniqueTokens.append(set(SampleT))

45

In [112]:
len(combined_dirty[0])

9

In [113]:
for i in range(0, len(combined_dirty)):
    for j in range(0, len(combined_dirty[0])):
        if combined_dirty[i][j] not in uniqueTokens[j]:
            dirty_list.append(combined_dirty[i][j])

In [114]:
len(dirty_list)

4008

In [115]:
dirty_list

['10011',
 "ST VINCENP'S EAST",
 '50 MEDICAL PARK EAST DRIVE',
 '35235',
 'Pneumonia',
 '69 patients',
 '10027',
 'ELBA GENERAL HOSPITAL',
 '987 DRAYTON STREET',
 'ELBA',
 '36323',
 'COFFEE',
 'Voluntary non-profit - Other',
 '10044',
 'MARION REGIONAL MEDICAL CENTER',
 '1256 MILITARY STREET SOUTH',
 'HAMILTON',
 '35570',
 'MARIYN',
 'Pneumonia',
 '42 patients',
 '10036',
 'ANDALUSIA REGIONAL HOSPITAL',
 '849 SOUTH THREE NOTCH STREET',
 'ANDTLUSIA',
 '36420',
 'COVINGTON',
 'Proprietary',
 'Heart Attack',
 '3 patients',
 '10023',
 'BAPTIST MEDICAL CENTER SOUTH',
 '2105 EVST SOUTH BOULEVARD',
 'MONTGOMERY',
 '36116',
 'MONTGOMERY',
 'Voluntary non-profit - Church',
 '392 patients',
 '10038',
 'STRINGFELLOW MEMORIAL HOSPITAL',
 '301 EAST 18TH ST',
 'HARTSELLE',
 '36201',
 'CALHOUN',
 'Proprietary',
 'Heart Attack',
 '73 patients',
 '10024',
 'JACKSON HOSPITAL & CLINIC INC',
 '1725 PSNE STREET',
 'MONTGOMERY',
 '36106',
 'MONTGOMERY',
 'Pneumonia',
 '184 patients',
 '10025',
 'G H LANIER 

## Mapping to get column information

In [135]:
mapping = dict(enumerate(df_truth.columns.values))

In [136]:
mapping

{0: 'ProviderNumber',
 1: 'HospitalName',
 2: 'Address1',
 3: 'City',
 4: 'ZipCode',
 5: 'CountyName',
 6: 'PhoneNumber',
 7: 'HospitalOwner',
 8: 'Condition',
 9: 'Sample'}

## Error Cleaning using fastText

Convert each tuple into a row. We load hospital dataset. The fastText model is trained using groundtruth dataset for hospital and then the dirty dataset is used for correcting spelling errors. The fastText algorithm outputs top 10 matches and loses column information. Here we use Attribute classifier to understand the column info of the erroroneous tuple and the first match with same attribute in the predicted list is assumed as the right cell value for the cell. 

In [137]:
#training the model with truth data.
model_hosp = FastText(combined_hosp, min_count=1, workers=8, iter=1000)

## Module to predict the top value by the model. Some values are not broken into tokens.

In [138]:
excluded_list = []
result = []
predictedValues = []

## Loading attribute classifier

In [143]:
model = load_model('HospitalMultiAttributeClassifier.h5')
with open('HospitalMultiAttributeClassifierTokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

## Error Cleaning using Attribute Classifer over FastText

In [148]:
for i in range(len(dirty_list)):
    print(dirty_list[i])
    testStr = [dirty_list[i]]
    sequences = tokenizer.texts_to_sequences(testStr)
    dirtyData = pad_sequences(sequences, maxlen=200)
    dirtyCol = model.predict(dirtyData)
    if isinstance(dirty_list[i], str):
        try:
            predictedValues = model_hosp.most_similar(dirty_list[i])
            match = 0
            for j in range(0, len(predictedValues)):
                tryStr = [predictedValues[j][0]]
                predSequences = tokenizer.texts_to_sequences(tryStr)
                testData = pad_sequences(predSequences, maxlen=200)
                correctCol = model.predict(testData)
                if mapping[np.argmax(correctCol[0])] == mapping[np.argmax(dirtyCol[0])]:
                    result.append((mapping[np.argmax(correctCol[0])], predictedValues[j][0]))
                    print(mapping[np.argmax(dirtyCol[0])], predictedValues[j][0])
                    match = 1
                    break
            if match == 0:
                predSequences0 = tokenizer.texts_to_sequences(predictedValues[0][0])
                testData0 = pad_sequences(predSequences0, maxlen=200)
                correctCol0 = model.predict(testData0)
                result.append((mapping[np.argmax(correctCol[0])],predictedValues[0][0]))
                print("No match")
        except KeyError:
            excluded_list.append(dirty_list[i])
            #print(dirty_list[i])
            pass

10011
HospitalOwner 10015
ST VINCENP'S EAST
No match
50 MEDICAL PARK EAST DRIVE
HospitalOwner 150 GILBREATH DRIVE
35235
Sample 35233
Pneumonia
Address1 Heart Failure
69 patients
Condition 49 patients
10027
HospitalOwner 10022
ELBA GENERAL HOSPITAL
ProviderNumber DECATUR GENERAL HOSPITAL
987 DRAYTON STREET
ProviderNumber 2451 FILLINGIM STREET
ELBA
HospitalName SYLACAUGA
36323
Sample 36330
COFFEE
City CALHOUN
Voluntary non-profit - Other
CountyName Voluntary non-profit - Church
10044
HospitalOwner 10047
MARION REGIONAL MEDICAL CENTER
ZipCode CULLMAN REGIONAL MEDICAL CENTER
1256 MILITARY STREET SOUTH
ZipCode 849 SOUTH THREE NOTCH STREET
HAMILTON
HospitalName ANNISTON
35570
Sample 35594
MARIYN
HospitalName OPP
Pneumonia
Address1 Heart Failure
42 patients
Condition 342 patients
10036
HospitalOwner 10034
ANDALUSIA REGIONAL HOSPITAL
ZipCode ALASKA REGIONAL HOSPITAL
849 SOUTH THREE NOTCH STREET
ZipCode 201 PINE STREET NORTHWEST
ANDTLUSIA
HospitalName ANDALUSIA
36420
Sample 36467
COVINGTON
City

No match
1653 TEMPLE AVENUE NORTH
ZipCode 209 NORTH MAIN STREET
FAYETTE
HospitalName FLORENCE
35555
Sample 35570
FAYKTTE
HospitalName FAYETTE
Voluntary non-profit - Other
CountyName Voluntary non-profit - Church
Pneumonia
Address1 Heart Failure
117 patients
Condition 317 patients
10029
HospitalOwner 10022
RIVERVIEW REGIONAL MEDICAL CENTER
ZipCode GADSDEN REGIONAL MEDICAL CENTER
600 SOUTH THIRD STREET
No match
GADSDEN
HospitalName RUSSELLVILLE
35901
Sample 35903
ETOWAH
City MORGAN
Proprietary
No match
Heart Attack
Address1 Surgical Infection Prevention
145 patients
Condition 45 patients
1178 ROSS CLARK CIRCLE
ProviderNumber 1108 ROSS CLARK CIRCLE
191 patients
Condition 135 patients
20017
HospitalOwner 10012
ALASKA REGIONAL HOSPITAL
ZipCode ANDALUSIA REGIONAL HOSPITAL
2801 DEBARR ROAD
ProviderNumber 805 FRIENDSHIP ROAD
ANCWORAGE
HospitalName ANCHORAGE
99508
Sample 99559
ANCHORAGE
HospitalName ENTERPRISE
Proprietary
No match
39 patients
Condition 169 patients
10008
HospitalOwner 10009
CRE

ZipCode CRENSHAW COMMUNITY HOSPITAL
805 FRIENDSHIP ROAD
ProviderNumber 2801 DEBARR ROAD
TALLASSEE
HospitalName VALLEY
36078
Sample 36067
ELOORE
HospitalName CENTRE
Voluntary non-profit - Private
CountyName Voluntary non-profit - Other
9 patients
Condition 269 patients
10012
HospitalOwner 10016
DEKALB REGIONAL MEDICAL CENTER
ZipCode MARION REGIONAL MEDICAL CENTER
200 MED CENTER DRIVE
No match
FORT PAYNE
HospitalName FAYETTE
35968
Sample 35960
DE KAMB
HospitalName ALABASTER
Voluntary non-profit - Church
CountyName Voluntary non-profit - Other
139 patients
Condition 269 patients
10025
HospitalOwner 10022
G H LANIER MEMORIAL HOSPITAL
ZipCode HELEN KELLER MEMORIAL HOSPITAL
4800 48TH ST
ProviderNumber 1201 7TH STREET SE
VALLEY
HospitalName TALLASSEE
36854
Sample 36278
CHAMBERF
HospitalName ELBA
Voluntary non-profit - Other
CountyName Voluntary non-profit - Church
172 patients
Condition 175 patients
10086
HospitalOwner 10087
NORTHWEST MEDICAL CENTER
ZipCode HARTSELLE MEDICAL CENTER
1530 U S H

Sample 35653
LAUDERFALE
HospitalName FAYETTE
Voluntary non-profit - Other
CountyName Voluntary non-profit - Church
Surgical Infection Prevention
Address1 Heart Attack
462 patients
Condition 962 patients
10047
HospitalOwner 10044
GEORGIANA HOSPITAL
ProviderNumber DECATUR GENERAL HOSPITAL
515 MIRANDA ST
ProviderNumber 315 W HICKORY ST
ANNISTON
HospitalName HAMILTON
36033
Sample 36067
BUTLER
City CHEROKEE
16 patients
Condition 86 patients
10025
HospitalOwner 10022
G H LANIER MEMORIAL HOSPITAL
ZipCode HELEN KELLER MEMORIAL HOSPITAL
4800 48TH ST
ProviderNumber 1201 7TH STREET SE
VALLEY
HospitalName TALLASSEE
36854
Sample 36278
CHAMBERS
City ELMORE
Voluntary non-profit - Other
CountyName Voluntary non-profit - Church
Pneumonia
Address1 Heart Failure
43 patients
Condition 243 patients
10025
HospitalOwner 10022
G H LANIER MEMORIAL HOSPITAL
ZipCode HELEN KELLER MEMORIAL HOSPITAL
4800 48TH ST
ProviderNumber 1201 7TH STREET SE
VALLEY
HospitalName TALLASSEE
36106
Sample 36116
CHAMBERS
City ELMORE


Sample 36078
AUTAUGA
City TALLADEGA
Voluntary non-profit - Private
CountyName Voluntary non-profit - Other
Heart Attack
Address1 Surgical Infection Prevention
0 patients
Condition 2 patients
10025
HospitalOwner 10022
G H LANIER MEMORIAL HOSPITAL
ZipCode HELEN KELLER MEMORIAL HOSPITAL
4800 48TH ST
ProviderNumber 1201 7TH STREET SE
VALLEY
HospitalName TALLASSEE
36854
Sample 36278
CHAMBERS
City ELMORE
Voluntary non-profit - Other
CountyName Voluntary non-profit - Church
Heart Attack
Address1 Surgical Infection Prevention
137 patients
Condition 37 patients
CRENSHAD
HospitalName LUVERNE
Heart Failure
Address1 Pneumonia
39 patients
Condition 169 patients
10045
HospitalOwner 10047
FAYETTE MEDICAL CENTER
No match
1653 TEMPLE AVENUE NORTH
ZipCode 209 NORTH MAIN STREET
FAYETTE
HospitalName FLORENCE
35555
Sample 35570
FAYETTE
HospitalName FLORENCE
Voluntary non-profit - Other
CountyName Voluntary non-profit - Church
Surgical Infection Prevention
Address1 Heart Attack
0 patients
Condition 2 patien

HospitalName OPP
36049
Sample 36067
CRENSHPW
HospitalName LUVERNE
Government - Federal
CountyName Government - Local
Surgical Infection Prevention
Address1 Heart Attack
0 patients
Condition 2 patients
10034
HospitalOwner 10039
COMMUNITY HOSPITAL INC
ZipCode CRENSHAW COMMUNITY HOSPITAL
805 FRIEEDSHIP ROAD
No match
TALLASSEE
HospitalName VALLEY
36078
Sample 36067
ELMORE
City COVINGTON
Voluntary non-profit - Private
CountyName Voluntary non-profit - Other
4 patients
Condition 73 patients
10009
HospitalOwner 10008
HARTSELLE MEDICAL CENTER
ZipCode DALE MEDICAL CENTER
201 PINE STREET NORTHWEST
ZipCode 209 NORTH MAIN STREET
HARTSELLE
HospitalName RUSSELLVILLE
35640
Sample 35609
MORGAN
City ETOWAH
Proprietary
No match
Pneumonia
Address1 Heart Failure
44 patients
Condition 24 patients
10009
HospitalOwner 10008
HARTSELLE MEDICAL CENTER
ZipCode DALE MEDICAL CENTER
201 PINE STREET NORTHWEST
ZipCode 209 NORTH MAIN STREET
HARTSELLE
HospitalName RUSSELLVILLE
35640
Sample 35609
MORGAN
City ETOWAH
Prop

No match
1653 TEMPLE AVENUE NORTH
ZipCode 209 NORTH MAIN STREET
FAYETTE
HospitalName FLORENCE
35555
Sample 35570
FAYETTE
HospitalName FLORENCE
Voluntary non-profit - Other
CountyName Voluntary non-profit - Church
Surgical Infection Prevention
Address1 Heart Attack
0 patients
Condition 2 patients
10012
HospitalOwner 10016
DEKALB REGIONAL MEDICAL CENTER
ZipCode MARION REGIONAL MEDICAL CENTER
200 MED CENTER DRIVE
No match
FORT PAYNE
HospitalName FAYETTE
35968
Sample 35960
DE KALB
City SHELBY
Voluntary non-profit - Church
CountyName Voluntary non-profit - Other
Surgical Infection Prevention
Address1 Heart Attack
41 patients
Condition 681 patients
10009
HospitalOwner 10008
HARTSELLE MEDICAL CENTER
ZipCode DALE MEDICAL CENTER
201 PINE STREET NORTHWEST
ZipCode 209 NORTH MAIN STREET
HARTSELLE
HospitalName RUSSELLVILLE
35640
Sample 35609
MORGAN
City ETOWAH
Proprietary
No match
Surgical Infection Prevention
Address1 Heart Attack
11 patients
Condition 111 patients
10009
HospitalOwner 10008
HARTSE

ZipCode 209 NORTH MAIN STREET
HARTSELLE
HospitalName RUSSELLVILLE
35640
Sample 35609
MORGAN
City ETOWAH
Proprietary
No match
25 patients
Condition 145 patients
10044
HospitalOwner 10047
MARION REGIONAL MEDICAL CENTER
ZipCode CULLMAN REGIONAL MEDICAL CENTER
1256 MILITARY STREET SOUTH
ZipCode 849 SOUTH THREE NOTCH STREET
HAMILTON
HospitalName ANNISTON
35570
Sample 35594
MARION
City CHEROKEE
Voluntary non-profit - Private
CountyName Voluntary non-profit - Other
Heart Attack
Address1 Surgical Infection Prevention
4 patients
Condition 73 patients
10025
HospitalOwner 10022
G H LANIER MEMORIAL HOSPITAL
ZipCode HELEN KELLER MEMORIAL HOSPITAL
4800 48TH ST
ProviderNumber 1201 7TH STREET SE
VALLEY
HospitalName TALLASSEE
36854
Sample 36278
CHAMBERS
City ELMORE
Voluntary non-profit - Other
CountyName Voluntary non-profit - Church
Heart Failure
Address1 Pneumonia
137 patients
Condition 37 patients
10008
HospitalOwner 10009
CRENSHAW COMMUNITY HOSPITAL
ZipCode COMMUNITY HOSPITAL INC
101 HOSPITAL CIRCL

Sample 36305
HOUSTON
City COVINGTON
Government - Hospital District or Authority
CountyName Government - Local
405 patients
Condition 385 patients
10006
HospitalOwner 10008
ELIZA COFFEE MEMORIOL HOSPITAL
City LAUDERDALE
205 MARENGO STREET
ProviderNumber 400 N EDWARDS STREET
FLORENCE
HospitalName FAYETTE
35631
Sample 35653
LAUDERDALE
City DALE
Voluntary non-profit - Other
CountyName Voluntary non-profit - Church
242 patients
Condition 462 patients
10006
HospitalOwner 10008
ELIZA COFFEE MEMORIAL HOSPITAL
ZipCode G H LANIER MEMORIAL HOSPITAL
205 MARENGO STREET
ProviderNumber 400 N EDWARDS STREET
FLORENCE
HospitalName FAYETTE
35631
Sample 35653
LQUDERDALE
HospitalName FAYETTE
Voluntary non-profit - Other
CountyName Voluntary non-profit - Church
Pneumonia
Address1 Heart Failure
174 patients
Condition 74 patients
10021
HospitalOwner 10024
DALE MEDICAL CENTER
ZipCode HARTSELLE MEDICAL CENTER
126 HOSPOTAL AVE
No match
OZARK
HospitalName WEDOWEE
36360
Sample 36302
DALE
City LAUDERDALE
Government

ZipCode ANDALUSIA REGIONAL HOSPITAL
2801 DEBARR ROAD
ProviderNumber 805 FRIENDSHIP ROAD
ANCHORAGE
HospitalName ENTERPRISE
99508
Sample 99559
ANCHORAGE
HospitalName ENTERPRISE
Proprietary
No match
Surgical Infection Prevention
Address1 Heart Attack
513 patients
Condition 73 patients
10023
HospitalOwner 10024
BAPTIST MEDICAL CENTER SOUTH
ZipCode SHELBY BAPTIST MEDICAL CENTER
2105 EAST SOUTH BOULEVARD
No match
MONTGOZERY
HospitalName ONEONTA
36116
Sample 36106
MONTGOMERY
City ELMORE
Voluntary non-profit - Church
CountyName Voluntary non-profit - Other
53 patients
Condition 303 patients
10025
HospitalOwner 10022
G H LANIER MEMORIAL HOSPITAL
ZipCode HELEN KELLER MEMORIAL HOSPITAL
4800 48TH ST
ProviderNumber 1201 7TH STREET SE
SHEFFIELD
HospitalName WINFIELD
36854
Sample 36278
CHAMBERS
City ELMORE
Voluntary non-profit - Other
CountyName Voluntary non-profit - Church
75 patients
Condition 575 patients
10008
HospitalOwner 10009
CRENSHAW COMMUNITY HOSPITAL
ZipCode COMMUNITY HOSPITAL INC
101 HOS

HospitalName RUSSELLVILLE
35640
Sample 35609
MORGAN
City ETOWAH
Heart Attack
Address1 Surgical Infection Prevention
1 patients
Condition 3 patients
10046
HospitalOwner 10040
RIVERVIEW REGIONAL MEDICAL CENTER
ZipCode GADSDEN REGIONAL MEDICAL CENTER
600 SOUTH THIRD STREET
No match
GADSDEN
HospitalName RUSSELLVILLE
35901
Sample 35903
VTOWAH
HospitalName ANCHORAGE
239 patients
Condition 509 patients
10001
HospitalOwner 10009
BAPTIST MEDICAL CENTER SOUTH
ZipCode SHELBY BAPTIST MEDICAL CENTER
1108 ROSS CLARK CIRCLE
ProviderNumber 101 HOSPITAL CIRCLE
36302
Sample 36305
Government - Hospital District or Authority
CountyName Government - Local
Heart Attack
Address1 Surgical Infection Prevention
244 patients
Condition 441 patients
10016
HospitalOwner 10012
SHELBY BAPTJST MEDICAL CENTER
ZipCode SHELBY BAPTIST MEDICAL CENTER
1000 FIRST STREET NORTH
ZipCode 209 NORTH MAIN STREET
ALABASTER
HospitalName DOTHAN
35007
Sample 35045
SHELBY
City DE KALB
Voluntary non-profit - Church
CountyName Voluntary n

No match
1653 TEMPLE AVENUE NORTH
ZipCode 209 NORTH MAIN STREET
FAYETTE
HospitalName FLORENCE
35555
Sample 35570
FAYETTE
HospitalName FLORENCE
Voluntary non-profit - Other
CountyName Voluntary non-profit - Church
Heart Attack
Address1 Surgical Infection Prevention
11 patients
Condition 111 patients
10008
HospitalOwner 10009
CRENSHAW COMMUNITY HOSPITAL
ZipCode COMMUNITY HOSPITAL INC
101 HOSPITAL CIRCLE
ProviderNumber 126 HOSPITAL AVE
LUVERNE
HospitalName OPP
36049
Sample 36067
CRENSHAW
City CLARKE
Government - Federal
CountyName Government - Local
Surgical Infection Prevention
Address1 Heart Attack
2 patients
Condition 5 patients
10055
HospitalOwner 10056
FLOWERS HOSPITAL
ProviderNumber RUSSELLVILLE HOSPITAL
4370 WEST MAIN STREET
ProviderNumber 1725 PINE STREET
DOTHAN
HospitalName CULLMAN
36305
Sample 36302
HOUSTON
City COVINGTON
62 patients
Condition 92 patients
10012
HospitalOwner 10016
DEKALB REGIONAL MEDICAL CENTER
ZipCode MARION REGIONAL MEDICAL CENTER
200 MED CENTER DRIVE
No match

Condition 16 patients
10001
HospitalOwner 10009
SOUTHEAST ALABAMA MEDICAL CENTER
ZipCode SOUTHWEST ALABAMA MEDICAL CENTER
1108 ROSS CLARK CIRCLE
ProviderNumber 101 HOSPITAL CIRCLE
DOTHAN
HospitalName CULLMAN
36302
Sample 36305
HOUSTON
City COVINGTON
Government - Hospital District or Authority
CountyName Government - Local
Surgical Infection Prevention
Address1 Heart Attack
184 patients
Condition 64 patients
10011
HospitalOwner 10015
HELEN KELLER MEMORIAL HOSPITAL
ZipCode G H LANIER MEMORIAL HOSPITAL
50 MEDICAL PARK EAST DRIVE
HospitalOwner 150 GILBREATH DRIVE
BIRMINGHAM
HospitalName SYLACAUGA
35235
Sample 35233
JEFFERSON
City MADISON
Heart Failure
Address1 Pneumonia
316 patients
Condition 16 patients
10007
HospitalOwner 10008
MIZELL MEMORIAL ZOSPITAL
HospitalName OPP
702 N MAIN ST
ZipCode 209 NORTH MAIN STREET
OPP
HospitalName ANDALUSIA
36467
Sample 36420
COVINGTON
City HOUSTON
Pneumonia
Address1 Heart Failure
85 patients
Condition 45 patients
10027
HospitalOwner 10022
ELBA GENERAL HOS

HospitalOwner 10022
ELBA GENERAL HOSPITAL
ProviderNumber DECATUR GENERAL HOSPITAL
987 DRAYTON STREET
ProviderNumber 2451 FILLINGIM STREET
ELBA
HospitalName SYLACAUGA
36323
Sample 36330
COFFEE
City CALHOUN
Heart Attack
Address1 Surgical Infection Prevention
0 patients
Condition 2 patients
10087
HospitalOwner 10086
UNIV OF SOUTH ALABAMA MEDICAL CENTER
ZipCode SOUTHWEST ALABAMA MEDICAL CENTER
2451 FILLINGIM STREET
ProviderNumber 987 DRAYTON STREET
MOBIWE
HospitalName BIRMINGHAM
36617
Sample 36784
MOBILE
City LAUDERDALE
Government - State
CountyName Government - Local
Heart Attack
Address1 Surgical Infection Prevention
58 patients
Condition 68 patients
10009
HospitalOwner 10008
HARTSELLE MEDICAL CENTER
ZipCode DALE MEDICAL CENTER
201 PINE STREET NORTHWEST
ZipCode 209 NORTH MAIN STREET
HARTSELLE
HospitalName RUSSELLVILLE
35640
Sample 35609
MORGAN
City ETOWAH
Proprietary
No match
Heart Attack
Address1 Surgical Infection Prevention
112 patients
Condition 212 patients
10005
HospitalOwner 10008

HospitalName SYLACAUGA
35235
Sample 35233
JEFFERSON
City MADISON
Voluntary non-profit - Private
CountyName Voluntary non-profit - Other
Pneumonia
Address1 Heart Failure
69 patients
Condition 49 patients
ST VINCENP'S EAST
No match
MARIYN
HospitalName OPP
ANDTLUSIA
HospitalName ANDALUSIA
2105 EVST SOUTH BOULEVARD
No match
1725 PSNE STREET
No match
SYLACXUGA
HospitalName SYLACAUGA
PRATTVILOE
HospitalName PRATTVILLE
209 NORTH MAIN STREZT
HospitalName WEDOWEE
ZUVERNE
HospitalName LUVERNE
1256 MILITARP STREET SOUTH
No match
EFOWAH
HospitalName ANCHORAGE
MONTGOMEBY
HospitalName ONEONTA
SHEKBY BAPTIST MEDICAL CENTER
ZipCode SHELBY BAPTIST MEDICAL CENTER
WEOOWEE
HospitalName WEDOWEE
ALABAETER
HospitalName ALABASTER
FAYKTTE
HospitalName FAYETTE
1178 ROSS CLARK CIRCLE
ProviderNumber 1108 ROSS CLARK CIRCLE
ANCWORAGE
HospitalName ANCHORAGE
MARSHALL MEDICAR CENTER SOUTH
ProviderNumber FAYETTE MEDICAL CENTER
MARSHALL MEDICAL CEMTER SOUTH
ProviderNumber FAYETTE MEDICAL CENTER
315 W HICKOQY ST
No match

In [141]:
print (len(excluded_list))
print (len(result))
print (len(excluded_list) + len(result))
print (len(dirty_list))

0
4008
4008
4008


In [None]:
#write SQL query to validate predictions
for i in range(0, len(result)):
    

In [88]:
#identify the true positives and false positives. Store them in the seperate list for further analysis.
true_pos = 0
false_pos = 0
correctly_predicted = []
falsely_predicted = []
for i in range(len(result)):
    if result[i][0] == result[i][2]:
        true_pos = true_pos + 1
        correctly_predicted.append(result[i])
    else:
        false_pos = false_pos + 1
        falsely_predicted.append(result[i])

In [89]:
print(len(result))
print(len(correctly_predicted))
print(len(falsely_predicted))

190
154
36


## Precision, Recall, F1-score

In [None]:
classification_report(truth_list, dirty_list)

In [91]:
for i in falsely_predicted:
    print (i)

('36067', '360x9', '36049')
('10086', 'Voluntary non-profit - Private', '10011')
('1007 GOODYEAR AVENUE', '60%', 'BIRMINGHAM')
('ANCHORAGE', 'AL_SCIP-INF-4', '35235')
('10016', '10011', 'JEFFERSON')
('35957', '359x8', '35968')
('36116', '1xx16', '10016')
('10085', '100x8', '10038')
('36801', '36x01', '36201')
('10085', '100x6', '10086')
('EAST ALABAMA MEDICAL CENTER AND SNF', 'xNIV OF SOxTH ALABAMA MEDICAL CENTER', 'UNIV OF SOUTH ALABAMA MEDICAL CENTER')
('PO BOX 287', '3x0x7', '36067')
('99508', 'x0x08', '10108')
('FAYETTE MEDICAL CENTER', 'DALE MEDICAL CENTER', '10021')
('CALLAHAN EYE FOUNDATION HOSPITAL', '126 HOSPITAL AVE', 'DALE MEDICAL CENTER')
('ALASKA REGIONAL HOSPITAL', 'AL', 'OZARK')
('LAUDERDALE', 'DALE', '36360')
('36801', '3347742601', 'DALE')
('35960', 'x6x60', '36360')
('JACKSON HOSPITAL & CLINIC INC', '1xx24', '10024')
('TALLADEGA', 'TALLAxxEE', 'TALLASSEE')
('10085', '100x4', '10034')
('36278', '36x78', '36078')
('10085', '100x5', '10035')
('35235', '1xx35', '10035')
(

In [83]:
for i in correctly_predicted:
    print (i)

('BIRMINGHAM', 'BIRMINGHxM', 'BIRMINGHAM')
('BIRMINGHAM', 'BIRMINGxAM', 'BIRMINGHAM')
('SHEFFIELD', 'SHEFFxELD', 'SHEFFIELD')
('10019', '1xx19', '10019')
('SOUTHEAST ALABAMA MEDICAL CENTER', 'SOUTHEAST ALABAMA MEDxCAL CENTER', 'SOUTHEAST ALABAMA MEDICAL CENTER')
('35957', 'x5957', '35957')
('10005', 'x0005', '10005')
('BOAZ', 'BOxZ', 'BOAZ')
('2505 U S HIGHWAY 431 NORTH', '2505xUxSxHIGHWAYx431xNORTH', '2505 U S HIGHWAY 431 NORTH')
('35957', '3595x', '35957')
('MARSHALL', 'MxRSHxLL', 'MARSHALL')
('35631', '3563x', '35631')
('LAUDERDALE', 'LAUDxRDALx', 'LAUDERDALE')
('LAUDERDALE', 'LAUDExDALE', 'LAUDERDALE')
('FLORENCE', 'FxORENCE', 'FLORENCE')
('10006', '1000x', '10006')
('FLORENCE', 'FLORxNCx', 'FLORENCE')
('FLORENCE', 'FLxRENCE', 'FLORENCE')
('35631', '3563x', '35631')
('702 N MAIN ST', '702xNxMAINxST', '702 N MAIN ST')
('702 N MAIN ST', '702 x MAIx ST', '702 N MAIN ST')
('CRENSHAW COMMUNITY HOSPITAL', 'CRENSHAW CxMMUNITY HxSPITAL', 'CRENSHAW COMMUNITY HOSPITAL')
('CRENSHAW', 'CRxNSHA

In [84]:
falsely_predicted[1]

('10086', 'Voluntary non-profit - Private', '10011')

In [85]:
model_hosp.most_similar('3595x')

[('35957', 0.9885784387588501),
 ('35968', 0.8877739310264587),
 ('35976', 0.885722815990448),
 ('35960', 0.8576430082321167),
 ('35903', 0.830594539642334),
 ('35901', 0.8117714524269104),
 ('35058', 0.6940221190452576),
 ('35007', 0.6867320537567139),
 ('35640', 0.627211332321167),
 ('35653', 0.6189522743225098)]