In [None]:
import pandas as pd
import hashlib
from datetime import datetime, timedelta
import fuzzy
import re #regular expressions
import numpy as np
from Levenshtein import distance as levenshtein_distance
# Module-level Soundex encoder shared by the name-confidence functions in this notebook.
# NOTE(review): the argument 4 is presumably the length of the generated code — confirm against the fuzzy docs.
soundex = fuzzy.Soundex(4)

## Load Patient Data

In [None]:
# Read the patient records; the path has no directory part, so it is resolved against the working directory.
df_patient = pd.read_csv('Patient Matching Data.csv')

In [None]:
# Display the freshly loaded frame for visual inspection.
df_patient

In [None]:
def swap_full_state_name(x):
    """Map a state value to a two-letter postal abbreviation.

    Surrounding whitespace is ignored. Values shorter than three characters
    are assumed to already be abbreviations and are returned upper-cased
    without validation. Longer values are compared case-insensitively with
    the full state names: an exact match wins, otherwise the name with the
    smallest Levenshtein distance is chosen (the first one in table order
    when several names tie).

    Parameters
    ----------
    x : str
        Raw state value, e.g. ``"ny"``, ``"New York"`` or ``"NEW YROK"``.

    Returns
    -------
    str
        The upper-cased input for short values, otherwise the abbreviation of
        the closest full state name.
    """
    states = {
        "AK" : "Alaska",
        "AL" : "Alabama",
        "AR" : "Arkansas",
        "AS" : "American Samoa",
        "AZ" : "Arizona",
        "CA" : "California",
        "CO" : "Colorado",
        "CT" : "Connecticut",
        "DC" : "District of Columbia",
        "DE" : "Delaware",
        "FL" : "Florida",
        "GA" : "Georgia",
        "GU" : "Guam",
        "HI" : "Hawaii",
        "IA" : "Iowa",
        "ID" : "Idaho",
        "IL" : "Illinois",
        "IN" : "Indiana",
        "KS" : "Kansas",
        "KY" : "Kentucky",
        "LA" : "Louisiana",
        "MA" : "Massachusetts",
        "MD" : "Maryland",
        "ME" : "Maine",
        "MI" : "Michigan",
        "MN" : "Minnesota",
        "MO" : "Missouri",
        "MS" : "Mississippi",
        "MT" : "Montana",
        "NC" : "North Carolina",
        "ND" : "North Dakota",
        "NE" : "Nebraska",
        "NH" : "New Hampshire",
        "NJ" : "New Jersey",
        "NM" : "New Mexico",
        "NV" : "Nevada",
        "NY" : "New York",
        "OH" : "Ohio",
        "OK" : "Oklahoma",
        "OR" : "Oregon",
        "PA" : "Pennsylvania",
        "PR" : "Puerto Rico",
        "RI" : "Rhode Island",
        "SC" : "South Carolina",
        "SD" : "South Dakota",
        "TN" : "Tennessee",
        "TX" : "Texas",
        "UT" : "Utah",
        "VA" : "Virginia",
        "VI" : "Virgin Islands",
        "VT" : "Vermont",
        "WA" : "Washington",
        "WI" : "Wisconsin",
        "WV" : "West Virginia",
        "WY" : "Wyoming"
    }
    value = x.strip()
    if len(value) < 3:
        return value.upper()

    # Compare in lower case so that "NEW YORK" is not penalised for every
    # letter whose case differs from the table entry "New York".
    needle = value.lower()
    abb_by_name = {state.lower(): abb for abb, state in states.items()}

    # Exact (case-insensitive) names need no fuzzy search.
    if needle in abb_by_name:
        return abb_by_name[needle]

    min_lev_dist_state = 'XX'  # placeholder; always replaced because the table is non-empty
    min_dist = 1000
    for name, abb in abb_by_name.items():
        dist = levenshtein_distance(needle, name)
        if dist < min_dist:  # strict '<' keeps the first name on ties
            min_lev_dist_state = abb
            min_dist = dist
    return min_lev_dist_state

def swap_abb_state_name(x):
    """Snap a state code to the closest known two-letter abbreviation.

    The input is trimmed and upper-cased first. A blank value maps to the
    ``'XX'`` "unknown" code, a recognised code is returned as is, and any
    other value is replaced by the code with the smallest Levenshtein
    distance (the first one in table order when several codes tie).

    Parameters
    ----------
    x : str
        Candidate abbreviation, typically the output of
        ``swap_full_state_name``.

    Returns
    -------
    str
        One of the codes in the table below, including ``'XX'``.
    """
    # Only the codes are needed here; the order matches the original table so
    # that ties in the fuzzy search resolve the same way.
    codes = (
        "AK", "AL", "AR", "AS", "AZ", "CA", "CO", "CT", "DC", "DE",
        "FL", "GA", "GU", "HI", "IA", "ID", "IL", "IN", "KS", "KY",
        "LA", "MA", "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NC",
        "ND", "NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR",
        "PA", "PR", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VI",
        "VT", "WA", "WI", "WV", "WY", "XX",
    )
    code = x.strip().upper()

    # A blank value is at distance 2 from every code, so the fuzzy search
    # would arbitrarily pick the first entry ('AK'); report it as unknown.
    if not code:
        return 'XX'

    # Exact codes are their own nearest neighbour (distance 0).
    if code in codes:
        return code

    # min() returns the first minimal element, matching a strict '<' scan.
    return min(codes, key=lambda candidate: levenshtein_distance(code, candidate))
    
def clean_state_data(df):
    """Normalise ``df['Current State']`` to two-letter abbreviations, in place.

    Missing values become ``'XX'``; every value is then passed through
    ``swap_full_state_name`` followed by ``swap_abb_state_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Current State'`` column. The column is overwritten.

    Returns
    -------
    None
    """
    # Assign the result back instead of calling fillna(inplace=True) on
    # df[...]: the in-place call acts on an intermediate Series and does not
    # update df when pandas Copy-on-Write is active.
    df['Current State'] = (
        df['Current State']
        .fillna('XX')
        .apply(swap_full_state_name)
        .apply(swap_abb_state_name)
    )

def clean_zip_code(df):
    """Derive zero-padded ZIP strings and their components, in place.

    Missing ZIP codes are replaced by 0. Each value is converted to an
    integer, rendered as text and left-padded with zeros to five characters,
    so ZIP codes with leading zeros (read from CSV as e.g. ``2134.0``) keep
    their position-based components.

    Columns written:
      * ``'Current Zip Code'`` - original values with missing entries set to 0
      * ``'Zip Code String'``  - text form, at least five characters
      * ``'National Area'``    - first digit as int
      * ``'Sectional Center'`` - digits 2-3 as int
      * ``'Delivery Area'``    - remaining digits as int

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Current Zip Code'`` column whose values are accepted
        by ``int()``.

    Returns
    -------
    None
    """
    # Assign back rather than fillna(inplace=True) on df[...], which does not
    # update df when pandas Copy-on-Write is active.
    df['Current Zip Code'] = df['Current Zip Code'].fillna(0)

    # zfill(5) restores leading zeros lost by the numeric round-trip and also
    # turns the missing-value marker 0 into '00000'.
    zip_strings = df['Current Zip Code'].apply(lambda x: str(int(x)).zfill(5))

    df['Zip Code String'] = zip_strings
    df['National Area'] = zip_strings.apply(lambda z: int(z[0]))
    df['Sectional Center'] = zip_strings.apply(lambda z: int(z[1:3]))
    df['Delivery Area'] = zip_strings.apply(lambda z: int(z[3:]))
    
# Normalise the state and ZIP columns of df_patient; both helpers modify the frame and return nothing.
clean_state_data(df_patient)
clean_zip_code(df_patient)

In [None]:
# Clustering discrete and mixed data (adapted from Data Science Stack Exchange: https://datascience.stackexchange.com/questions/8681/clustering-for-mixed-numeric-and-nominal-discrete-data)
from sklearn.neighbors import DistanceMetric
def gower_distance(X):
    """
    Compute the pairwise Gower distance between the rows of a pandas DataFrame.

    Each column of ``X`` is one feature. A distance matrix is computed per
    feature and the matrices are averaged with equal weight.

    Distance metrics used for:
    Nominal variables (every non-numeric dtype): Dice distance on the one-hot
        encoding (https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)
    Numeric variables: Manhattan distance normalized by the range of the variable
        (https://en.wikipedia.org/wiki/Taxicab_geometry)

    A numeric column with a single distinct value has range 0; its distances
    are all zero already and are left unscaled instead of being divided by 0.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table, one row per observation.

    Returns
    -------
    numpy.ndarray
        Square array of shape ``(len(X), len(X))`` with the mean per-feature
        distance for every pair of rows.
    """
    individual_variable_distances = []

    for i in range(X.shape[1]):
        feature = X.iloc[:, [i]]
        # .iloc[0] is explicit positional access; the dtype check replaces the
        # `np.object` alias, which was removed in NumPy 1.24.
        if pd.api.types.is_numeric_dtype(feature.dtypes.iloc[0]):
            feature_dist = DistanceMetric.get_metric('manhattan').pairwise(feature)
            value_range = np.ptp(feature.values)
            if value_range > 0:
                feature_dist = feature_dist / value_range
        else:
            feature_dist = DistanceMetric.get_metric('dice').pairwise(pd.get_dummies(feature))

        individual_variable_distances.append(feature_dist)

    return np.array(individual_variable_distances).mean(0)

# Build the clustering features on a copy so df_patient keeps its missing values:
# filling the whole frame in place would turn a missing Sex, name or date of birth
# into the integer 0 and defeat the column-specific cleaning in the later cells.
# GroupID is the ground truth used by measure_accuracy, so it must not be a feature.
# NOTE(review): PatientID looks like a row identifier as well — confirm whether it
# should also be excluded.
clustering_features = df_patient.drop(columns=['GroupID'], errors='ignore').fillna(0)
X = gower_distance(clustering_features)

In [None]:
from sklearn.cluster import AgglomerativeClustering
# OPTICS is imported but not used in the cells shown here.
from sklearn.cluster import OPTICS
# Agglomerative clustering without a fixed cluster count; distance_threshold=0.8 controls where merging stops.
# NOTE(review): X is the pairwise Gower distance matrix, but no precomputed metric is requested,
# so the rows of X are presumably treated as feature vectors — confirm this is intended.
clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=0.8).fit(X)
# Display the cluster label assigned to each row.
clustering.labels_

In [None]:
def convert_date_string(x):
    """Add a ``'dob_string'`` entry (``YYYYMMDD``) derived from ``x['Date of Birth']``.

    The value is parsed as ``month/day/year``. If that fails, the value is
    split on ``'/'`` and read as ``day/month/year``; day and month are
    zero-padded to two digits so the result keeps a fixed layout. A missing or
    blank value yields an empty ``'dob_string'``.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``'Date of Birth'``; ``'dob_string'`` is written to it.

    Returns
    -------
    The same object ``x``, with ``'dob_string'`` set.

    Raises
    ------
    ValueError
        If the value is text but has fewer than three ``'/'``-separated parts.
    """
    raw = x['Date of Birth']

    # Missing values (NaN/None) or blanks cannot be parsed; record "no date".
    if not isinstance(raw, str) or not raw.strip():
        x['dob_string'] = ''
        return x

    raw = raw.strip()
    try:
        x['dob_string'] = datetime.strptime(raw, '%m/%d/%Y').strftime('%Y%m%d')
    except ValueError:
        # Only parse failures are handled here; other errors should surface.
        parts = [part.strip() for part in raw.split('/')]
        if len(parts) < 3:
            raise ValueError('Unrecognised Date of Birth: {!r}'.format(raw))
        day, month, year = parts[0], parts[1], parts[2]
        x['dob_string'] = year + month.zfill(2) + day.zfill(2)
    return x

def clean_date_data(df):
    """Apply ``convert_date_string`` to every row of ``df`` and return the result.

    The returned frame is whatever ``DataFrame.apply`` builds from the rows
    handed back by ``convert_date_string``, i.e. the input columns plus
    ``'dob_string'``.
    """
    with_dob_string = df.apply(convert_date_string, axis='columns')
    return with_dob_string

# Rebind df_patient to the frame returned by clean_date_data (rows now carry 'dob_string').
df_patient = clean_date_data(df_patient)

In [None]:
def clean_sex_data(df):
    """Collapse ``df['Sex']`` to ``'M'``, ``'F'`` or ``'U'`` (unknown), in place.

    The first non-blank character of a text value decides the result,
    case-insensitively. Missing values, empty strings, non-text values and
    anything not starting with M or F become ``'U'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Sex'`` column. The column is overwritten.

    Returns
    -------
    None
    """
    def _normalise(value):
        # [:1] instead of [0] so an empty string yields '' rather than raising.
        initial = value.strip()[:1].upper() if isinstance(value, str) else ''
        return initial if initial in {'M', 'F'} else 'U'

    # Assigning the mapped column back also covers missing values, so no
    # chained fillna(inplace=True) (ineffective under Copy-on-Write) is needed.
    df['Sex'] = df['Sex'].apply(_normalise)

def fill_empty_name_data(df):
    """Replace missing ``'First Name'`` / ``'Last Name'`` values with ``''``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``'First Name'`` and ``'Last Name'`` columns.

    Returns
    -------
    None
    """
    for column in ('First Name', 'Last Name'):
        # Assign back: fillna(inplace=True) on df[column] works on an
        # intermediate Series and does not update df under Copy-on-Write.
        df[column] = df[column].fillna('')

# Normalise the Sex column and blank out missing names on df_patient.
clean_sex_data(df_patient)
fill_empty_name_data(df_patient)
# Sanity check: list the distinct Sex values that remain after cleaning.
df_patient['Sex'].unique()

In [None]:
def normalize_patient_first_and_last_names(df):
    """Add lower-case, letters-only versions of the name columns, in place.

    Writes ``'rnaFirstName'`` and ``'rnaLastName'``, obtained from
    ``'First Name'`` and ``'Last Name'`` by removing every character outside
    ``a-z`` / ``A-Z`` and lower-casing the rest.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with text columns ``'First Name'`` and ``'Last Name'``.

    Returns
    -------
    None
    """
    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default, which would leave the names unchanged.
    non_letters = '[^a-zA-Z]'
    df['rnaFirstName'] = df['First Name'].str.replace(non_letters, '', regex=True).str.lower()
    df['rnaLastName'] = df['Last Name'].str.replace(non_letters, '', regex=True).str.lower()

# Add the rnaFirstName / rnaLastName columns to df_patient.
normalize_patient_first_and_last_names(df_patient)

In [None]:
def full_name_hash(first_name, last_name, gender, dob):
    """Build a matching token from the full first name.

    The SHA-1 hex digest of ``SALT + dob + '~' + gender + first_name``
    (UTF-8 encoded) is joined to the unhashed ``last_name`` with ``'~'``.
    """
    SALT = 'OATEST'
    salted_text = '{}{}~{}{}'.format(SALT, dob, gender, first_name)
    digest = hashlib.sha1(salted_text.encode('utf-8')).hexdigest()
    return '{}~{}'.format(digest, last_name)

def partial_hash(first_name, last_name, gender, dob):
    """Build a matching token from the first three letters of each name.

    Names are cut to three characters; shorter names are left-padded with
    ``'X'`` to length three. The SHA-1 hex digest of
    ``SALT + dob + '~' + gender + <first three>`` (UTF-8 encoded) is joined to
    the unhashed three-character last-name prefix with ``'~'``.
    """
    SALT = 'OATEST'
    # Slicing caps the length at three; rjust pads shorter names on the left.
    first_three = first_name[:3].rjust(3, 'X')
    last_three = last_name[:3].rjust(3, 'X')
    salted_text = '{}{}~{}{}'.format(SALT, dob, gender, first_three)
    digest = hashlib.sha1(salted_text.encode('utf-8')).hexdigest()
    return '{}~{}'.format(digest, last_three)

def df_full_name_hash(x):
    """Row adapter for ``full_name_hash``.

    Reads ``'First Name'``, ``'Last Name'``, ``'Sex'`` and ``'dob_string'``
    from the row ``x`` and returns the resulting token. The raw name columns
    are used, not the normalised ``rna*`` ones.
    """
    first_name = x['First Name']
    last_name = x['Last Name']
    gender = x['Sex']
    dob = x['dob_string']
    return full_name_hash(first_name, last_name, gender, dob)

def df_partial_hash(x):
    """Row adapter for ``partial_hash``.

    Reads ``'First Name'``, ``'Last Name'``, ``'Sex'`` and ``'dob_string'``
    from the row ``x`` and returns the resulting token. The raw name columns
    are used, not the normalised ``rna*`` ones.
    """
    first_name = x['First Name']
    last_name = x['Last Name']
    gender = x['Sex']
    dob = x['dob_string']
    return partial_hash(first_name, last_name, gender, dob)

def create_hash_tokens(df):
    """Add ``'full_name_hash'`` and ``'partial_name_hash'`` columns to ``df`` in place.

    Each column is produced by applying the matching row adapter to every row;
    the full-name column is written before the partial one is computed.
    """
    token_columns = (
        ('full_name_hash', df_full_name_hash),
        ('partial_name_hash', df_partial_hash),
    )
    for column, row_hasher in token_columns:
        df[column] = df.apply(row_hasher, axis=1)
    
# Add the full_name_hash / partial_name_hash token columns to df_patient.
create_hash_tokens(df_patient)

In [None]:
# Display the frame again to inspect the columns added by the cleaning and hashing steps.
df_patient

In [None]:
def findConfidenceLevel(first_name1, last_name1, rna_first_name1, rna_last_name1, first_name2, last_name2, rna_first_name2, rna_last_name2):
    """Score how likely two name pairs belong to the same person (50-100).

    The rules are tried from strongest to weakest and the first hit wins:

    * 100 - normalised first and last names identical
    * 90  - normalised last names identical, first four letters of the first names agree
    * 85  - normalised last names identical
    * 80  - first five letters of last names and first four of first names agree
    * 79  - Soundex of raw last and raw first names agree
    * 77  - Soundex of normalised last and first names agree
    * 76  - normalised first names identical, Soundex of last-name prefixes (4 chars) agree
    * 60  - normalised first names identical
    * 50  - none of the above

    ``*_name1``/``*_name2`` are the raw names and ``rna_*`` their normalised
    forms for the two records. Relies on the module-level ``soundex`` encoder.
    """
    same_first = rna_first_name1 == rna_first_name2
    same_last = rna_last_name1 == rna_last_name2

    if same_first and same_last:
        return 100

    if same_last:
        # Identical last names: a shared first-name prefix lifts the score.
        return 90 if rna_first_name1[:4] == rna_first_name2[:4] else 85

    if rna_last_name1[0:5] == rna_last_name2[0:5] and rna_first_name1[:4] == rna_first_name2[:4]:
        return 80

    # The phonetic comparisons are only reached once the cheap checks have failed.
    if soundex(last_name1) == soundex(last_name2) and soundex(first_name1) == soundex(first_name2):
        return 79

    if soundex(rna_last_name1) == soundex(rna_last_name2) and soundex(rna_first_name1) == soundex(rna_first_name2):
        return 77

    if same_first and soundex(rna_last_name1[:4]) == soundex(rna_last_name2[:4]):
        return 76

    return 60 if same_first else 50
    
def findConfidenceLevel2(first_name1, last_name1, rna_first_name1, rna_last_name1, first_name2, last_name2, rna_first_name2, rna_last_name2):
    """Stricter variant of the name-confidence score (50-100).

    The rules are tried from strongest to weakest and the first hit wins:

    * 100 - normalised first and last names identical
    * 90  - normalised last names identical, first four letters of the first names agree
    * 85  - normalised last names identical, Soundex of raw first names agree
    * 80  - first five letters of last names and first four of first names agree
    * 79  - Soundex of raw last and raw first names agree
    * 77  - Soundex of normalised last and first names agree
    * 76  - normalised first names identical, Soundex of last-name prefixes (4 chars) agree
    * 75  - normalised last names identical
    * 60  - normalised first names identical
    * 50  - none of the above

    ``*_name1``/``*_name2`` are the raw names and ``rna_*`` their normalised
    forms for the two records. Relies on the module-level ``soundex`` encoder.
    """
    same_first = rna_first_name1 == rna_first_name2
    same_last = rna_last_name1 == rna_last_name2

    if same_last:
        # Strong rules that all require identical normalised last names.
        if same_first:
            return 100
        if rna_first_name1[:4] == rna_first_name2[:4]:
            return 90
        if soundex(first_name1) == soundex(first_name2):
            return 85

    if rna_last_name1[0:5] == rna_last_name2[0:5] and rna_first_name1[:4] == rna_first_name2[:4]:
        return 80

    if soundex(last_name1) == soundex(last_name2) and soundex(first_name1) == soundex(first_name2):
        return 79

    if soundex(rna_last_name1) == soundex(rna_last_name2) and soundex(rna_first_name1) == soundex(rna_first_name2):
        return 77

    if same_first and soundex(rna_last_name1[:4]) == soundex(rna_last_name2[:4]):
        return 76

    if same_last:
        return 75

    return 60 if same_first else 50

In [None]:
def generate_confidence_matrix(confidence_type, confidence_func, df, threshold):
    """Build a 0/1 matrix marking record pairs whose name confidence exceeds ``threshold``.

    Parameters
    ----------
    confidence_type
        Not used by the computation; kept so existing calls keep working.
    confidence_func : callable
        Called as ``confidence_func(first1, last1, rna_first1, rna_last1,
        first2, last2, rna_first2, rna_last2)`` and expected to return a
        number.
    df : pandas.DataFrame
        Frame with ``'First Name'``, ``'Last Name'``, ``'rnaFirstName'`` and
        ``'rnaLastName'`` columns.
    threshold : number
        A pair is marked 1 only when its confidence is strictly greater.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(df), len(df))``. Entry ``[i, j]`` refers
        to the i-th and j-th rows of ``df`` by position, whatever the index
        labels are.
    """
    name_columns = ['First Name', 'Last Name', 'rnaFirstName', 'rnaLastName']
    # Pull the four fields out once as plain tuples; this avoids rebuilding a
    # Series per row inside the quadratic loop.
    records = list(df[name_columns].itertuples(index=False, name=None))

    size = len(records)
    matrix = np.zeros((size, size))
    # Positions, not index labels, address the matrix so that frames with a
    # filtered or otherwise non-0..n-1 index work as well.
    for pos1, record1 in enumerate(records):
        for pos2, record2 in enumerate(records):
            if confidence_func(*record1, *record2) > threshold:
                matrix[pos1, pos2] = 1
    return matrix

In [None]:
# Pairwise 0/1 match matrix from findConfidenceLevel2; pairs scoring above 70 are marked 1.
confidence_matrix = generate_confidence_matrix(2, findConfidenceLevel2, df_patient, 70)

In [None]:
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components
# Treat the 0/1 confidence matrix as the adjacency matrix of an undirected graph
# and label its connected components; each component is one candidate patient group.
graph = csr_matrix(confidence_matrix)
n_components, labels = connected_components(csgraph=graph, directed=False, return_labels=True)

In [None]:
# Number of connected components found in the confidence graph.
n_components

In [None]:
# Component label per row of the confidence matrix.
# NOTE(review): the scoring cell in this notebook passes clustering.labels_, not these labels — confirm which result is meant to be evaluated.
labels

In [None]:
def measure_accuracy(labels, df):
    """Count pairwise agreement between predicted labels and ``df['GroupID']``.

    Every unordered pair of rows is classified once:

    * true positive  - same GroupID, same predicted label
    * false negative - same GroupID, different predicted label
    * false positive - different GroupID, same predicted label
    * true negative  - different GroupID, different predicted label

    Parameters
    ----------
    labels : sequence
        Predicted label per row, in the row order of ``df``.
    df : pandas.DataFrame
        Frame with a ``'GroupID'`` column holding the true grouping.

    Returns
    -------
    tuple of int
        ``(tp, fp, tn, fn)``.

    Raises
    ------
    ValueError
        If ``labels`` and ``df`` differ in length.
    """
    group_ids = df['GroupID'].tolist()
    size = len(group_ids)
    if len(labels) != size:
        raise ValueError(
            'labels has {} entries but df has {} rows'.format(len(labels), size)
        )

    tp = fp = tn = fn = 0
    # Rows are addressed by position so the result does not depend on the
    # frame's index labels; j > i visits each unordered pair exactly once.
    for i in range(size):
        for j in range(i + 1, size):
            same_group = group_ids[i] == group_ids[j]
            same_label = labels[i] == labels[j]
            if same_group and same_label:
                tp += 1
            elif same_group:
                fn += 1
            elif same_label:
                fp += 1
            else:
                tn += 1
    return (tp, fp, tn, fn)

In [None]:
# Score the agglomerative clustering labels against GroupID (pairwise counts).
tp, fp, tn, fn = measure_accuracy(clustering.labels_, df_patient)

In [None]:
# Pairwise accuracy: share of record pairs classified correctly.
(tp + tn) / (tp + fp + tn + fn)

In [None]:
# Pairwise precision: of the pairs placed in the same cluster, the share that truly share a GroupID.
# Raises ZeroDivisionError when tp + fp == 0.
tp / (tp + fp)

In [None]:
# Pairwise recall: of the pairs that truly share a GroupID, the share placed in the same cluster.
# Raises ZeroDivisionError when tp + fn == 0.
tp / (tp + fn)

In [None]:
# Show the raw pair counts behind the ratios above.
tp, fp, tn, fn

In [1]:
# NOTE(review): the star import hides which names come from clean_data; the only call visible
# here is load_and_clean_data — consider importing the needed names explicitly.
from clean_data import *

In [2]:
# Load and clean the CSV with load_and_clean_data (presumably provided by clean_data via the star import).
# The result is displayed only; it is not bound to a name.
load_and_clean_data('Patient Matching Data.csv')

Unnamed: 0,GroupID,PatientID,Patient Acct #,First Name,MI,Last Name,Date of Birth,Sex,Current Street 1,Current Street 2,...,Previous City,Previous State,Previous Zip Code,Zip Code String,National Area,Sectional Center,Delivery Area,dob_string,rnaFirstName,rnaLastName
0,1,1,247028705-7,Sutton,J,Power,9/20/1945,M,1858 Sullivan Parkway,,...,Mount Vernon,New York,10557.0,93726,9,37,26,19450920,sutton,power
1,1,2,,Suttin,James,Power,9/21/1945,M,1859 Sullivan Parkway,#2,...,Mount Vernon,New York,10557.0,93726,9,37,26,19450921,suttin,power
2,1,3,247028705-7,Sutton,J,Power,9/20/1945,M,1858 Sullivan Parkway,,...,,,,93726,9,37,26,19450920,sutton,power
3,1,4,,Sutton,,Power,9/20/1954,M,1858 Sullivan Parkway,,...,,,,93726,9,37,26,19540920,sutton,power
4,1,5,,SUTTON,,POWER,9/20/1954,M,1858 SULLIVAN PKWAY,APT 2,...,,,,93726,9,37,26,19540920,sutton,power
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,61,197,,Bill,,Smith,5/16/1972,M,16595 City View Lane,,...,,,,98684,9,86,84,19720516,bill,smith
197,62,198,,Bill,,Smith,5/16/1972,M,,,...,,,,00000,0,0,0,19720516,bill,smith
198,63,199,,Sarah,,Field,3/17/2010,F,9850 Kelso Road,,...,,,,98626,9,86,26,20100317,sarah,field
199,64,200,,Sara,,Field,3/17/2010,F,,,...,,,,00000,0,0,0,20100317,sara,field
