# API

Source: recordlinkage guide on linking two DataFrames — https://recordlinkage.readthedocs.io/en/latest/guides/link_two_dataframes.html

In [None]:
import recordlinkage
from recordlinkage.datasets import load_febrl4

# Load the two frames returned by load_febrl4(); they are linked below.
dfA, dfB = load_febrl4()

# Indexation step: build candidate pairs by blocking on "given_name".
indexer = recordlinkage.Index()
indexer.block("given_name")
candidate_links = indexer.index(dfA, dfB)

# Comparison step: one feature per column, registered in the order listed.
# Each entry is (Compare method name, column, extra keyword arguments).
compare_cl = recordlinkage.Compare()
comparison_specs = [
    ("exact", "given_name", {}),
    ("string", "surname", {"method": "jarowinkler", "threshold": 0.85}),
    ("exact", "date_of_birth", {}),
    ("exact", "suburb", {}),
    ("exact", "state", {}),
    ("string", "address_1", {"threshold": 0.85}),
]
for kind, column, options in comparison_specs:
    # The same column name is used for both sides and as the feature label.
    getattr(compare_cl, kind)(column, column, label=column, **options)

features = compare_cl.compute(candidate_links, dfA, dfB)

# Classification step: keep the pairs whose summed feature values exceed 3.
pair_scores = features.sum(axis=1)
matches = features[pair_scores > 3]
print(len(matches))
print(matches.head())

Source: recordlinkage guide on data deduplication — https://recordlinkage.readthedocs.io/en/latest/guides/data_deduplication.html

In [None]:
import recordlinkage
from recordlinkage.datasets import load_febrl1

# Load the single frame returned by load_febrl1(); it is compared with itself.
dfA = load_febrl1()

# Indexation step: build candidate pairs within dfA by blocking on "given_name".
indexer = recordlinkage.Index()
indexer.block(left_on="given_name")
candidate_links = indexer.index(dfA)

# Comparison step: one feature per column, registered in the order listed.
# Each entry is (Compare method name, column, extra keyword arguments).
compare_cl = recordlinkage.Compare()
comparison_specs = [
    ("exact", "given_name", {}),
    ("string", "surname", {"method": "jarowinkler", "threshold": 0.85}),
    ("exact", "date_of_birth", {}),
    ("exact", "suburb", {}),
    ("exact", "state", {}),
    ("string", "address_1", {"threshold": 0.85}),
]
for kind, column, options in comparison_specs:
    # The same column name is used for both sides and as the feature label.
    getattr(compare_cl, kind)(column, column, label=column, **options)

features = compare_cl.compute(candidate_links, dfA)

# Classification step: keep the pairs whose summed feature values exceed 3.
pair_scores = features.sum(axis=1)
matches = features[pair_scores > 3]
print(len(matches))
print(matches.head())

In [None]:
import pandas
from recordlinkage.preprocessing import clean

# Sample inputs for clean(): a hyphenated name, an emoticon, a plain name,
# a bracketed alias and a missing value.
names = ["Mary-ann", "Bob :)", "Angel", "Bob (alias Billy)", None]
print(names)

# Run the names through clean() as a Series and print the result as a plain list.
s = pandas.Series(names)
cleaned = clean(s)
print(cleaned.values.tolist())

# example

In [1]:
import pandas as pd
import recordlinkage

# Hospital account records, indexed by their account number column.
hospital_accounts = pd.read_csv(
    "./sample/hospital_account_info.csv",
    index_col="Account_Num",
)

# Hospital reimbursement records, indexed by their provider number column.
hospital_reimbursement = pd.read_csv(
    "./sample/hospital_reimbursement.csv",
    index_col="Provider_Num",
)

In [2]:
# Preview the first rows of each table.
display(hospital_accounts.head())
display(hospital_reimbursement.head())

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" after each summary.
hospital_accounts.info()
hospital_reimbursement.info()

Unnamed: 0_level_0,Facility Name,Address,City,State,ZIP Code,County Name,Phone Number,Hospital Type,Hospital Ownership
Account_Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10605,SAGE MEMORIAL HOSPITAL,STATE ROUTE 264 SOUTH 191,GANADO,AZ,86505,APACHE,(928) 755-4541,Critical Access Hospitals,Voluntary non-profit - Private
24250,WOODRIDGE BEHAVIORAL CENTER,600 NORTH 7TH STREET,WEST MEMPHIS,AR,72301,CRITTENDEN,(870) 394-4113,Psychiatric,Proprietary
10341,DOUGLAS GARDENS HOSPITAL,5200 NE 2ND AVE,MIAMI,FL,33137,MIAMI-DADE,(305) 751-8626,Acute Care Hospitals,Voluntary non-profit - Private
81101,SUNCOAST BEHAVIORAL HEALTH CENTER,4480 51ST ST W,BRADENTON,FL,34210,MANATEE,(941) 792-2222,Psychiatric,Proprietary
39835,TREASURE VALLEY HOSPITAL,8800 WEST EMERALD STREET,BOISE,ID,83704,ADA,(208) 373-5000,Acute Care Hospitals,Proprietary


Unnamed: 0_level_0,Provider Name,Provider Street Address,Provider City,Provider State,Provider Zip Code,Total Discharges,Average Covered Charges,Average Total Payments,Average Medicare Payments
Provider_Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
839987,SOUTHEAST ALABAMA MEDICAL CENTER,1108 ROSS CLARK CIRCLE,DOTHAN,AL,36301,118,20855.61,5026.19,4115.52
519118,MARSHALL MEDICAL CENTER SOUTH,2505 U S HIGHWAY 431 NORTH,BOAZ,AL,35957,43,13289.09,5413.63,4490.93
733073,ELIZA COFFEE MEMORIAL HOSPITAL,205 MARENGO STREET,FLORENCE,AL,35631,73,22261.6,4922.18,4021.79
201752,MIZELL MEMORIAL HOSPITAL,702 N MAIN ST,OPP,AL,36467,12,10901.33,5343.5,4284.17
678488,ST VINCENT'S EAST,50 MEDICAL PARK EAST DRIVE,BIRMINGHAM,AL,35235,74,28117.95,5947.12,4819.53


<class 'pandas.core.frame.DataFrame'>
Int64Index: 5339 entries, 10605 to 51586
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Facility Name       5339 non-null   object
 1   Address             5339 non-null   object
 2   City                5339 non-null   object
 3   State               5339 non-null   object
 4   ZIP Code            5339 non-null   int64 
 5   County Name         5339 non-null   object
 6   Phone Number        5339 non-null   object
 7   Hospital Type       5339 non-null   object
 8   Hospital Ownership  5339 non-null   object
dtypes: int64(1), object(8)
memory usage: 417.1+ KB


None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2697 entries, 839987 to 322584
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Provider Name              2697 non-null   object 
 1   Provider Street Address    2697 non-null   object 
 2   Provider City              2697 non-null   object 
 3   Provider State             2697 non-null   object 
 4   Provider Zip Code          2697 non-null   int64  
 5   Total Discharges           2697 non-null   int64  
 6   Average Covered Charges    2697 non-null   float64
 7   Average Total Payments     2697 non-null   float64
 8   Average Medicare Payments  2697 non-null   float64
dtypes: float64(3), int64(2), object(4)
memory usage: 210.7+ KB


None

In [5]:
# Full index: candidate pairs built without any blocking key.
indexer = recordlinkage.Index()
indexer.full()
candidates = indexer.index(hospital_accounts, hospital_reimbursement)

# Print the number of candidate pairs next to the product of the two table
# sizes; the recorded run shows the two numbers are equal.
print(len(candidates))
print(len(hospital_accounts) * len(hospital_reimbursement))

14399283
14399283


In [6]:
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt. It
# presumably ran on the full index (14,399,283 pairs in the recorded output) -
# confirm which `candidates` is intended before re-running.
compare = recordlinkage.Compare()

# Exact match on city; thresholded string comparison on the hospital name.
compare.exact("City", "Provider City", label="City")
compare.string("Facility Name", "Provider Name", threshold=0.85, label="Hosp_Name")

# Address comparison is currently disabled:
# compare.string('Address',
#             'Provider Street Address',
#             method='jarowinkler',
#             threshold=0.85,
#             label='Hosp_Address')

features = compare.compute(candidates, hospital_accounts, hospital_reimbursement)

KeyboardInterrupt: 

In [3]:
# Block index keyed on the state columns of the two tables.
indexer = recordlinkage.Index()
indexer.block(
    left_on="State",
    right_on="Provider State",
)
candidates = indexer.index(hospital_accounts, hospital_reimbursement)

# Number of candidate pairs (475,830 in the recorded run).
print(len(candidates))

475830


In [4]:
# Sorted-neighbourhood index keyed on the state columns of the two tables.
indexer = recordlinkage.Index()
indexer.sortedneighbourhood(
    left_on="State",
    right_on="Provider State",
)
candidates = indexer.index(hospital_accounts, hospital_reimbursement)

# Number of candidate pairs (998,860 in the recorded run).
print(len(candidates))

998860
