In [1]:
import pandas as pd
import hashlib
from datetime import datetime, timedelta
import fuzzy
import re #regular expressions
# Module-level Soundex encoder, called by the confidence functions further down.
# NOTE(review): the argument 4 is presumably the length of the generated code — confirm against the fuzzy docs.
soundex = fuzzy.Soundex(4)

## Load Patient Data

In [2]:
# Read the patient records; the relative path is resolved against the working directory.
df_patient = pd.read_csv('Patient Matching Data.csv')

In [3]:
# Show the freshly loaded frame as this cell's output.
df_patient

Unnamed: 0,GroupID,PatientID,Patient Acct #,First Name,MI,Last Name,Date of Birth,Sex,Current Street 1,Current Street 2,...,Current State,Current Zip Code,Previous First Name,Previous MI,Previous Last Name,Previous Street 1,Previous Street 2,Previous City,Previous State,Previous Zip Code
0,1,1,247028705-7,Sutton,J,Power,9/20/1945,Male,1858 Sullivan Parkway,,...,California,93726.0,,,,2 Erie Crossing,Apt 9,Mount Vernon,New York,10557.0
1,1,2,,Suttin,James,Power,9/21/1945,Male,1859 Sullivan Parkway,#2,...,California,93726.0,,,,2 Erie Crossing,Apartment # 9,Mount Vernon,New York,10557.0
2,1,3,247028705-7,Sutton,J,Power,9/20/1945,Male,1858 Sullivan Parkway,,...,CA,93726.0,,,,,,,,
3,1,4,,Sutton,,Power,9/20/1954,Male,1858 Sullivan Parkway,,...,California,93726.0,,,,,,,,
4,1,5,,SUTTON,,POWER,9/20/1954,Male,1858 SULLIVAN PKWAY,APT 2,...,California,93726.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,61,197,,Bill,,Smith,5/16/1972,Male,16595 City View Lane,,...,WA,98684.0,,,,,,,,
197,62,198,,Bill,,Smith,5/16/1972,Male,,,...,,,,,,,,,,
198,63,199,,Sarah,,Field,3/17/2010,Female,9850 Kelso Road,,...,WA,98626.0,,,,,,,,
199,64,200,,Sara,,Field,3/17/2010,Female,,,...,,,,,,,,,,


In [4]:
def convert_date_string(x):
    """Add a ``dob_string`` entry (``YYYYMMDD``) derived from ``x['Date of Birth']``.

    The date is first parsed as ``month/day/year``. If that fails, the value
    is treated as ``day/month/year`` and re-assembled from its
    ``/``-separated parts, zero-padding day and month so the result keeps the
    same fixed-width layout as the primary path.

    Parameters
    ----------
    x : mapping-like row
        Anything supporting ``x[key]`` get/set, e.g. the ``pandas.Series``
        passed by ``DataFrame.apply(..., axis=1)``. Must hold a string under
        ``'Date of Birth'``.

    Returns
    -------
    The same ``x`` object, with ``x['dob_string']`` set.

    Raises
    ------
    TypeError
        If the date of birth is not a string (for example a missing value).
    IndexError
        If the fallback path finds fewer than three ``/``-separated parts.
    """
    raw_dob = x['Date of Birth']
    try:
        x['dob_string'] = datetime.strptime(raw_dob, '%m/%d/%Y').strftime('%Y%m%d')
    except ValueError:
        # Only a failed parse triggers the fallback; anything else (wrong
        # type, missing key) is a real error and should surface unchanged.
        parts = [part.strip() for part in raw_dob.split('/')]
        # parts = [day, month, year]; pad so '20/9/1945' -> '19450920'
        # instead of the 7-character '1945920'.
        x['dob_string'] = parts[2] + parts[1].zfill(2) + parts[0].zfill(2)
    return x

def clean_date_data(df):
    """Run ``convert_date_string`` over every row of ``df`` and return the result.

    The frame returned by ``DataFrame.apply`` is handed back to the caller;
    ``df`` itself is not reassigned here.
    """
    rows_with_dob_string = df.apply(func=convert_date_string, axis=1)
    return rows_with_dob_string

# Rebind df_patient to the frame returned by the row-wise date conversion.
df_patient = clean_date_data(df_patient)

In [5]:
def clean_sex_data(df):
    """Normalize the ``Sex`` column of ``df`` in place to single-letter codes.

    Missing values become ``'U'``, ``'Male'`` becomes ``'M'`` and ``'Female'``
    becomes ``'F'``. Any other value is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Sex'`` column; the column is replaced on this object.

    Returns
    -------
    None
    """
    # Assign the result back to the column instead of calling
    # ``df['Sex'].fillna(..., inplace=True)``: an in-place method on a
    # selected column acts on an intermediate object, which is deprecated and
    # does not update ``df`` under pandas Copy-on-Write.
    df['Sex'] = df['Sex'].fillna('U').replace({'Male': 'M', 'Female': 'F'})

def fill_empty_name_data(df):
    """Replace missing ``First Name`` / ``Last Name`` values in ``df`` with ``''``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``'First Name'`` and ``'Last Name'`` columns; both columns
        are replaced on this object.

    Returns
    -------
    None
    """
    for column in ('First Name', 'Last Name'):
        # Re-assign rather than ``fillna(..., inplace=True)`` on the selected
        # column, which would not reach ``df`` under pandas Copy-on-Write.
        df[column] = df[column].fillna('')

# Both helpers modify df_patient directly, so no reassignment is needed here.
clean_sex_data(df_patient)
fill_empty_name_data(df_patient)

In [6]:
def normalize_patient_first_and_last_names(df):
    """Add lower-cased, letters-only name columns to ``df`` in place.

    ``rnaFirstName`` and ``rnaLastName`` are built from ``First Name`` and
    ``Last Name`` by deleting every character outside ``a-z`` / ``A-Z`` and
    lower-casing what remains.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string ``'First Name'`` and ``'Last Name'`` columns.

    Returns
    -------
    None
    """
    column_pairs = (('First Name', 'rnaFirstName'), ('Last Name', 'rnaLastName'))
    for source, target in column_pairs:
        # regex=True must be explicit: since pandas 2.0 ``str.replace``
        # treats the pattern as a literal string by default, which would
        # leave punctuation, digits and spaces in place.
        df[target] = df[source].str.replace('[^a-zA-Z]', '', regex=True).str.lower()

# Adds the rnaFirstName / rnaLastName columns to df_patient.
normalize_patient_first_and_last_names(df_patient)

In [7]:
def full_name_hash(first_name, last_name, gender, dob):
    """Build a token of the form ``<sha1 hex digest>~<last_name>``.

    The digest covers the salt, ``dob``, a ``~`` separator, ``gender`` and
    ``first_name`` (UTF-8 encoded). The last name is appended to the token in
    clear text and is not part of the hashed material.
    """
    SALT = 'OATEST'
    hashed_material = '{}{}~{}{}'.format(SALT, dob, gender, first_name)
    digest = hashlib.sha1(hashed_material.encode('utf-8')).hexdigest()
    return '{}~{}'.format(digest, last_name)

def partial_hash(first_name, last_name, gender, dob):
    """Build a ``<sha1 hex digest>~<last-name key>`` token from 3-character name keys.

    Each name is cut to its first three characters; names shorter than three
    characters are left-padded with ``'X'`` up to length three. The digest
    covers the salt, ``dob``, a ``~`` separator, ``gender`` and the first-name
    key; the last-name key is appended in clear text.
    """
    SALT = 'OATEST'
    # Slice-then-pad: a slice of length 3 is unchanged by rjust, a shorter
    # name gets its missing leading characters filled with 'X'.
    first_key = first_name[:3].rjust(3, 'X')
    last_key = last_name[:3].rjust(3, 'X')
    hashed_material = '{}{}~{}{}'.format(SALT, dob, gender, first_key)
    digest = hashlib.sha1(hashed_material.encode('utf-8')).hexdigest()
    return '{}~{}'.format(digest, last_key)

def df_full_name_hash(x):
    """Row adapter: compute ``full_name_hash`` from one patient row.

    Reads ``'First Name'``, ``'Last Name'``, ``'Sex'`` and ``'dob_string'``
    from ``x`` and forwards them in that order.

    NOTE(review): the raw name fields are hashed, so differently-cased
    spellings of the same name produce different tokens — confirm this is
    intended.
    """
    first, last, sex, dob = (
        x[field] for field in ('First Name', 'Last Name', 'Sex', 'dob_string')
    )
    return full_name_hash(first, last, sex, dob)

def df_partial_hash(x):
    """Row adapter: compute ``partial_hash`` from one patient row.

    Reads ``'First Name'``, ``'Last Name'``, ``'Sex'`` and ``'dob_string'``
    from ``x`` and forwards them in that order.
    """
    first, last, sex, dob = (
        x[field] for field in ('First Name', 'Last Name', 'Sex', 'dob_string')
    )
    return partial_hash(first, last, sex, dob)

def create_hash_tokens(df):
    """Add ``full_name_hash`` and ``partial_name_hash`` columns to ``df`` in place.

    Each column is filled by applying the matching row adapter to every row.
    Returns ``None``.
    """
    token_columns = (
        ('full_name_hash', df_full_name_hash),
        ('partial_name_hash', df_partial_hash),
    )
    for column_name, row_hasher in token_columns:
        df[column_name] = df.apply(row_hasher, axis=1)
    
# Adds the two hash-token columns to df_patient.
create_hash_tokens(df_patient)

In [8]:
# Show the frame again, now including the derived columns.
df_patient

Unnamed: 0,GroupID,PatientID,Patient Acct #,First Name,MI,Last Name,Date of Birth,Sex,Current Street 1,Current Street 2,...,Previous Street 1,Previous Street 2,Previous City,Previous State,Previous Zip Code,dob_string,rnaFirstName,rnaLastName,full_name_hash,partial_name_hash
0,1,1,247028705-7,Sutton,J,Power,9/20/1945,M,1858 Sullivan Parkway,,...,2 Erie Crossing,Apt 9,Mount Vernon,New York,10557.0,19450920,sutton,power,f513a340f370e0d1e213cbe5a7e030f3c3729928~Power,8904f84c4af7db2bba24968b9628c21a678941bb~Pow
1,1,2,,Suttin,James,Power,9/21/1945,M,1859 Sullivan Parkway,#2,...,2 Erie Crossing,Apartment # 9,Mount Vernon,New York,10557.0,19450921,suttin,power,cb4eb0277fecfbdee00d5951d5e4e4f1cabbe283~Power,4e82ecca5923b548d8f4e9583e378401046302b4~Pow
2,1,3,247028705-7,Sutton,J,Power,9/20/1945,M,1858 Sullivan Parkway,,...,,,,,,19450920,sutton,power,f513a340f370e0d1e213cbe5a7e030f3c3729928~Power,8904f84c4af7db2bba24968b9628c21a678941bb~Pow
3,1,4,,Sutton,,Power,9/20/1954,M,1858 Sullivan Parkway,,...,,,,,,19540920,sutton,power,d31c28f50fed0591d910c358b3d1fa93b05f49e0~Power,944a54f27ba70e232d8ed8c541e5af2df06d2dc1~Pow
4,1,5,,SUTTON,,POWER,9/20/1954,M,1858 SULLIVAN PKWAY,APT 2,...,,,,,,19540920,sutton,power,63fae202660839596baf56a46c6396dbd629705b~POWER,73cdae46281d17b0df93453d065f05a419f85cb2~POW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,61,197,,Bill,,Smith,5/16/1972,M,16595 City View Lane,,...,,,,,,19720516,bill,smith,4c04d8977da3a043584cd6190536d8f576e6b9b2~Smith,12ac8663b465bc1eabfd52456161e3587a332fc1~Smi
197,62,198,,Bill,,Smith,5/16/1972,M,,,...,,,,,,19720516,bill,smith,4c04d8977da3a043584cd6190536d8f576e6b9b2~Smith,12ac8663b465bc1eabfd52456161e3587a332fc1~Smi
198,63,199,,Sarah,,Field,3/17/2010,F,9850 Kelso Road,,...,,,,,,20100317,sarah,field,caf30d9710cf2852506b6752a27a3c84e5180271~Field,c0d4a6a68381657e4b4ab1a7abde60cbb1358e48~Fie
199,64,200,,Sara,,Field,3/17/2010,F,,,...,,,,,,20100317,sara,field,10d69a413a0841682292498fceef1edf6498641e~Field,c0d4a6a68381657e4b4ab1a7abde60cbb1358e48~Fie


In [9]:
def findConfidenceLevel(first_name1, last_name1, rna_first_name1, rna_last_name1, first_name2, last_name2, rna_first_name2, rna_last_name2):
    """Score how closely two people's names agree, from 50 (nothing matched) to 100.

    The rules are tried from strictest to loosest and the first one that
    holds decides the score:

    * 100 - normalized first and last names are identical
    * 90  - normalized last names identical, first 4 chars of first names match
    * 85  - normalized last names identical
    * 80  - first 5 chars of last names and first 4 chars of first names match
    * 79  - Soundex of the raw last names and raw first names match
    * 77  - Soundex of the normalized last names and first names match
    * 76  - normalized first names identical, Soundex of the last names'
            first 4 chars match
    * 60  - normalized first names identical
    * 50  - otherwise
    """
    same_first = rna_first_name1 == rna_first_name2
    same_last = rna_last_name1 == rna_last_name2

    if same_first and same_last:
        return 100
    if same_last and rna_first_name1[:4] == rna_first_name2[:4]:
        return 90
    if same_last:
        return 85
    if rna_last_name1[0:5] == rna_last_name2[0:5] and rna_first_name1[:4] == rna_first_name2[:4]:
        return 80
    # Soundex is only evaluated once every cheaper exact/prefix rule has failed.
    if soundex(last_name1) == soundex(last_name2) and soundex(first_name1) == soundex(first_name2):
        return 79
    if soundex(rna_last_name1) == soundex(rna_last_name2) and soundex(rna_first_name1) == soundex(rna_first_name2):
        return 77
    if same_first and soundex(rna_last_name1[:4]) == soundex(rna_last_name2[:4]):
        return 76
    if same_first:
        return 60
    return 50
    
def findConfidenceLevel2(first_name1, last_name1, rna_first_name1, rna_last_name1, first_name2, last_name2, rna_first_name2, rna_last_name2):
    """Score how closely two people's names agree, from 50 (nothing matched) to 100.

    Variant of the name-confidence ladder in which an identical last name
    only earns 85 when the raw first names also share a Soundex code, and
    otherwise falls to 75. The first rule that holds decides the score:

    * 100 - normalized first and last names are identical
    * 90  - normalized last names identical, first 4 chars of first names match
    * 85  - normalized last names identical, Soundex of raw first names match
    * 80  - first 5 chars of last names and first 4 chars of first names match
    * 79  - Soundex of the raw last names and raw first names match
    * 77  - Soundex of the normalized last names and first names match
    * 76  - normalized first names identical, Soundex of the last names'
            first 4 chars match
    * 75  - normalized last names identical
    * 60  - normalized first names identical
    * 50  - otherwise
    """
    same_first = rna_first_name1 == rna_first_name2
    same_last = rna_last_name1 == rna_last_name2

    if same_first and same_last:
        return 100
    if same_last and rna_first_name1[:4] == rna_first_name2[:4]:
        return 90
    if same_last and soundex(first_name1) == soundex(first_name2):
        return 85
    if rna_last_name1[0:5] == rna_last_name2[0:5] and rna_first_name1[:4] == rna_first_name2[:4]:
        return 80
    if soundex(last_name1) == soundex(last_name2) and soundex(first_name1) == soundex(first_name2):
        return 79
    if soundex(rna_last_name1) == soundex(rna_last_name2) and soundex(rna_first_name1) == soundex(rna_first_name2):
        return 77
    if same_first and soundex(rna_last_name1[:4]) == soundex(rna_last_name2[:4]):
        return 76
    if same_last:
        return 75
    if same_first:
        return 60
    return 50

In [10]:
def generate_confidence_df(confidence_type, confidence_func, df):
    """Score every ordered pair of rows in ``df`` with ``confidence_func``.

    Every row is paired with every row (itself included). For each pair the
    four name fields of the first row followed by the four name fields of the
    second row are passed to ``confidence_func``.

    Returns a DataFrame with one record per pair, numbered from 0, holding
    ``index1``, ``index2``, ``confidence`` and ``confidence_type`` (the
    ``confidence_type`` argument, repeated on every record).
    """
    name_columns = ('First Name', 'Last Name', 'rnaFirstName', 'rnaLastName')

    def score(left_row, right_row):
        # Argument order: all four fields of the left row, then the right row.
        left_names = [left_row[column] for column in name_columns]
        right_names = [right_row[column] for column in name_columns]
        return confidence_func(*left_names, *right_names)

    row_pairs = (
        (left_label, left_row, right_label, right_row)
        for left_label, left_row in df.iterrows()
        for right_label, right_row in df.iterrows()
    )
    confidences = {
        pair_number: {
            'index1': left_label,
            'index2': right_label,
            'confidence': score(left_row, right_row),
            'confidence_type': confidence_type,
        }
        for pair_number, (left_label, left_row, right_label, right_row) in enumerate(row_pairs)
    }
    return pd.DataFrame.from_dict(confidences, orient='index')

In [11]:
#generate_confidence_df(1, findConfidenceLevel, df_patient)

In [12]:
#generate_confidence_df(2, findConfidenceLevel2, df_patient)

In [55]:
from Levenshtein import distance as levenshtein_distance


def generate_Ldist_df(df, column, include_self=True):
    """Compute the Levenshtein distance of ``column`` for every ordered pair of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``column`` and a ``'GroupID'`` column.
    column : str
        Name of the column whose values are compared pairwise.
    include_self : bool, default True
        When True (the original behavior) each row is also paired with
        itself. Such pairs always have distance 0 and ``isSame == 1``, which
        inflates the same-group share at distance 0; pass False to leave
        them out.

    Returns
    -------
    pandas.DataFrame
        One record per pair, numbered from 0, with ``index1``, ``index2``
        (index labels of the two rows), ``Ldist`` (the distance) and
        ``isSame`` (1 when both rows share a ``GroupID``, else 0).
    """
    # Pull the needed columns out once instead of rebuilding a row Series
    # for every one of the n*n pairs via nested ``iterrows()``.
    labels = df.index.tolist()
    values = df[column].tolist()
    groups = df["GroupID"].tolist()
    rows = list(zip(labels, values, groups))

    confidences = dict()
    for pos1, (label1, value1, group1) in enumerate(rows):
        for pos2, (label2, value2, group2) in enumerate(rows):
            # Compare positions, not labels, so duplicate index labels are
            # never mistaken for a self-pair.
            if not include_self and pos1 == pos2:
                continue
            confidences[len(confidences)] = {
                'index1': label1,
                'index2': label2,
                'Ldist': levenshtein_distance(value1, value2),
                'isSame': 1 if group1 == group2 else 0,
            }
    return pd.DataFrame.from_dict(confidences, orient='index')

In [59]:
# One pairwise-distance table per compared field (all row pairs each time).
dob_conf_stats = generate_Ldist_df(df_patient, "dob_string")
first_name_conf_stats = generate_Ldist_df(df_patient, "rnaFirstName")
last_name_conf_stats = generate_Ldist_df(df_patient, "rnaLastName")
gender_conf_stats = generate_Ldist_df(df_patient, "Sex")


In [72]:

def printConfvLDist(df, i):
    """Print the same-group share for each distance ``0 .. i-1``.

    For every distance ``j`` the rows of ``df`` with ``Ldist == j`` are
    selected and the fraction of them with ``isSame == 1`` is printed on its
    own line.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Ldist`` and ``isSame`` columns.
    i : int
        Number of distances to report, starting at 0.

    Notes
    -----
    Prints ``nan`` when no pair has the given distance, and ``0.0`` when
    pairs exist but none of them has ``isSame == 1``. Indexing the
    ``value_counts`` result with ``[1]`` would raise ``KeyError`` in both
    situations.
    """
    for j in range(i):
        at_distance = df[df.Ldist == j]
        if at_distance.empty:
            # A share is undefined without any pairs at this distance.
            print(float('nan'))
            continue
        shares = at_distance.isSame.value_counts(normalize=True)
        print(shares.get(1, 0.0))




In [74]:
# For each field, print the share of same-GroupID pairs at each Levenshtein
# distance, starting from distance 0 (one value per line).
print("DOB stats")
printConfvLDist(dob_conf_stats, 5)

print("Fname stats")
printConfvLDist(first_name_conf_stats, 5)

print("Lname stats")
printConfvLDist(last_name_conf_stats, 5)

print("Gender stats")
printConfvLDist(gender_conf_stats, 2)

DOB stats
0.8081471747700394
0.6627906976744186
0.056962025316455694
0.002883922134102379
0.0012186205215695832
Fname stats
0.6854256854256854
0.5869565217391305
0.288135593220339
0.036637931034482756
0.0031413612565445027
Lname stats
0.7108433734939759
0.5862068965517241
0.46153846153846156
0.038461538461538464
0.0030959752321981426
Gender stats
0.03863623619133068
0.0033381450739805125
