In [None]:
import pandas as pd
from splink.exploratory import completeness_chart
from splink import DuckDBAPI, block_on
from splink import SettingsCreator, Linker
import splink.comparison_library as cl
from splink.blocking_analysis import (
    cumulative_comparisons_to_be_scored_from_blocking_rules_chart,
)
import numpy as np
# Single DuckDB backend handle; it is passed as `db_api=` to the Splink chart
# helpers and to the Linker further down in this notebook.
db_api = DuckDBAPI()



# Load Datasets

In [2]:
# Load dataset "a" from its pickle and convert empty strings to pd.NA so they
# are handled as missing values rather than as real (blank) text.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
a_clean = pd.read_pickle("data/a_clean").replace('', pd.NA)
a_clean.head()

Unnamed: 0,unique_id,parent_id,top_id,cnt_children,orgtype_id,geo_id,name,email,phone,fax,...,region,division,congress_district,congress_land_area,country_y,continent,iso,zipcode,address_number,street_name
9651,18802368,,18802368.0,0,0,100239,lohberger engineering pty ltdsjf,,,,...,,,,,usa,america,us,,,
67057,18802818,288468727.0,288468696.0,0,0,34197,lockheed martin,,,,...,midwest,central,02,243791,usa,america,us,55121.0,,eagan
31323,18802831,,18802831.0,2,0,0,logus,,,,...,not_defined,not_defined,not_defined,not_defined,usa,america,us,34409.0,,palm beach
22669,18804109,156221130.0,288468696.0,0,0,4502,lockheed martin,,,,...,,middle atlantic,0103,350028997,usa,america,us,8057.0,,moorestown
18784,18883986,47362264.0,47362264.0,0,0,58834,lugokdc,,,,...,,pacific,080910,735971834382667,usa,america,us,98424.0,5210.0,12th cfifewa


In [3]:
# Load dataset "b" from its pickle and preview the first rows.
# NOTE(review): unlike the "a" load, empty strings are not replaced with pd.NA
# here — confirm that "b" has no blank-string values that should count as missing.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
b_clean = pd.read_pickle("data/b_clean")
b_clean.head()

Unnamed: 0,unique_id,parent_id,top_id,name,entity_proper_name,primary_sic_code,industry_code,sector_code,iso_country_x,metro_area,...,fax_area,fax,fax_full,hq,address,iso,state,zipcode,address_number,street_name
0,000bfge,,000bfge,lotsoff,lotoff,5331.0,3520.0,3500.0,us,san antoniotx metro,...,,,,1.0,1201 austin 116,us,tx,782094859,1201,austin 116
14,000ff7e,,000ff7e,legacy,legacy,7819.0,3430.0,3400.0,us,san franciscoca metro,...,310.0,4173500.0,13104173500.0,1.0,4160 4th,us,ca,94538,4160,4th
7,000fjle,,000fjle,leadingside,leadingside,7371.0,3308.0,3300.0,us,bostonma metro,...,,,,1.0,1 canal park 3300,us,ma,2141,1,canal park 3300
15,000hnqe,,000hnqe,lci industries,lci industries,3716.0,1250.0,1200.0,us,chicagoil metro,...,,,,1.0,3501 county 6,us,in,46514,3501,county 6
16,000hnwe,05hjcve,05hjcve,lasercard,lasercard,3695.0,1315.0,1300.0,us,san franciscoca metro,...,650.0,9693140.0,16509693140.0,1.0,1875 shoreline,us,ca,94043,1875,shoreline


# Visualize completeness for column selection

In [4]:
# Chart how complete each column of dataset "a" is, to guide column selection.
completeness_chart(a_clean, db_api=db_api)

In [5]:
# Chart how complete each column of dataset "b" is, to guide column selection.
completeness_chart(b_clean, db_api=db_api)

In [6]:
# Columns kept for the linkage; the same list is applied to both datasets.
used_columns = [
    'unique_id',
    'name',
    'iso',
    'state',
    'city',
    'zipcode',
    'address_number',
    'street_name',
    'websiteurl',
    'area_code',
]

# Restrict dataset "a" to the linkage columns and preview the first 50 rows.
a_used = a_clean.loc[:, used_columns]
a_used.head(50)

Unnamed: 0,unique_id,name,iso,state,city,zipcode,address_number,street_name,websiteurl,area_code
9651,18802368,lohberger engineering pty ltdsjf,us,,,,,,,
67057,18802818,lockheed martin,us,mn,saint paul,55121,,eagan,httpwwwlockheedmartincom,651
31323,18802831,logus,us,not_defined,not_defined,34409,,palm beach,,not
22669,18804109,lockheed martin,us,nj,moorestown,08057,,moorestown,,856
18784,18883986,lugokdc,us,wa,tacoma,98424,5210.0,12th cfifewa,,253
9397,18884006,lockheed martin gyrocam,us,,sarasota,34243,7345.0,16th 101sarasotafl,httpwwwgyrocamsystemscomcgibinext_redirpl,941
20023,18885103,l3 global communications,us,ny,victor,14564,7640.0,omnitech victor,,585
72198,18889982,levin professional,us,md,silver spring,20902,,washington prof 11242 grandview wheaton md,,301
21208,18890128,lepier oil,us,mn,fosston,56542,320.0,1st fosston,,218
3401,18890131,linc government,us,ky,hopkinsville,42240,101.0,walton hopkinsville,,270


In [7]:
# Restrict dataset "b" to the same linkage columns and preview the first 50 rows.
# NOTE(review): the saved previews show zipcode/area_code as text in a_used
# (e.g. 08057, 651) but as floats in b_used (e.g. 2141.0, 617.0) — confirm the
# dtypes and zero-padding are aligned before blocking or comparing on them.
b_used = b_clean[used_columns]
b_used.head(50)

Unnamed: 0,unique_id,name,iso,state,city,zipcode,address_number,street_name,websiteurl,area_code
0,000bfge,lotsoff,us,tx,san antonio,782094859.0,1201.0,austin 116,,210.0
14,000ff7e,legacy,us,ca,fremont,94538.0,4160.0,4th,,510.0
7,000fjle,leadingside,us,ma,cambridge,2141.0,1.0,canal park 3300,,617.0
15,000hnqe,lci industries,us,in,elkhart,46514.0,3501.0,county 6,httpwwwlci1com,574.0
16,000hnwe,lasercard,us,ca,mountain,94043.0,1875.0,shoreline,httpwwwlasercardcom,650.0
19,000jf6e,longwen,us,az,scottsdale,85258.0,7702.0,doubletree ranch 300,,480.0
3,000jp9e,leonardo spa,it,,rome,195.0,,piazza monte grappa 4,httpwwwleonardocompanycom,6.0
6,000l15e,lisi sa,fr,fc,belfort,90000.0,,le millenium 18 rue albert camus,httpwwwlisigroupcom,3.0
22,000ny4e,lumara health,us,mo,chesterfield,63005.0,16640.0,chesterfield 200,,314.0
8,000ny7e,la gear,us,ca,los angeles,90049.0,844.0,moraga,httpwwwlagearcom,310.0


# Determine Blocking Methodology

From the completeness charts above, the following columns look like good candidates:
- `iso` would make a good blocking variable
- comparing company names is key
- comparing addresses might add information (although much of this data is missing)
- `city`, `state`, and `zipcode` also have missing values but might still be useful

What do I want to see from the blocking rules?
- Fewer than 10M comparisons added by each blocking rule
- Compare on highly specific blocks first, then move to more general blocks

In [None]:
# Blocking rules used to generate candidate pairs, ordered from the most
# selective keys to the broadest fallbacks. Each tuple lists the columns (or SQL
# expressions) that must be equal on both sides for a pair to be compared.
blocking_rules = [
    block_on(*columns)
    for columns in (
        # Highly specific blocks
        ("zipcode", "area_code"),
        ("state", "city", "area_code"),
        ("state", "city"),
        ("name", "iso"),
        ("street_name", "iso"),
        # Moderately specific blocks
        ("state", "area_code"),
        ("city", "area_code"),
        ("city",),
        ("zipcode",),
        # General fallback blocks
        ("LEFT(zipcode, 3)",),
        ("area_code",),
        ("LEFT(name, 8)", "iso"),
    )
]

# Chart the cumulative number of comparisons each rule contributes when the two
# datasets are linked to each other (link_only).
cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
    table_or_tables=[a_used, b_used],
    blocking_rules=blocking_rules,
    link_type="link_only",
    db_api=db_api,
)

# Initialize Linkage Model

In [None]:
# Prior probability that a randomly drawn cross-dataset pair is a match:
# one over the row count of the smaller dataset.
# NOTE(review): with n_a * n_b possible pairs this implies about max(n_a, n_b)
# true matches, i.e. every record of the larger dataset has a counterpart —
# confirm that this is the intended assumption.
p_2_records = 1 / np.min([len(a_used), len(b_used)])

settings = SettingsCreator(
    link_type="link_only",
    blocking_rules_to_generate_predictions=blocking_rules,
    probability_two_random_records_match=p_2_records,
    # Keep the intermediate calculation columns in the prediction output.
    retain_intermediate_calculation_columns=True,
    comparisons=[
        # Fuzzy text comparisons on names, streets, house numbers and URLs.
        cl.JaroWinklerAtThresholds(
            "name", score_threshold_or_thresholds=[0.92, 0.88, 0.7]
        ),
        cl.JaroWinklerAtThresholds(
            "street_name", score_threshold_or_thresholds=[0.92, 0.88, 0.7]
        ),
        cl.DamerauLevenshteinAtThresholds("address_number"),
        cl.JaroAtThresholds("websiteurl"),
        # Edit-distance comparisons with term-frequency adjustments enabled.
        cl.DamerauLevenshteinAtThresholds("zipcode").configure(
            term_frequency_adjustments=True
        ),
        cl.DamerauLevenshteinAtThresholds("area_code").configure(
            term_frequency_adjustments=True
        ),
        # Exact country-code match, also term-frequency adjusted.
        cl.ExactMatch("iso").configure(term_frequency_adjustments=True),
    ],
)

# Linker over the two column-restricted frames, using the shared DuckDB backend.
linker = Linker([a_used, b_used], settings, db_api=db_api)

# Train Linkage Model

In [10]:
# Estimate the u probabilities from randomly sampled record pairs (up to 1e8 pairs).
# NOTE(review): no seed is passed, so the sample — and therefore the estimates —
# may differ between runs; consider fixing one if reproducibility matters.
linker.training.estimate_u_using_random_sampling(max_pairs=1e8)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - name (no m values are trained).
    - street_name (no m values are trained).
    - address_number (no m values are trained).
    - websiteurl (no m values are trained).
    - zipcode (no m values are trained).
    - area_code (no m values are trained).
    - iso (no m values are trained).


In [11]:
# EM session 1: block on state + city. Neither column is a model comparison,
# so m probabilities for every comparison are estimated in this pass.
training_blocking_rule = block_on("state", "city")
training_session_geo = linker.training.estimate_parameters_using_expectation_maximisation(
    training_blocking_rule
)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."state" = r."state") AND (l."city" = r."city")

Parameter estimates will be made for the following comparison(s):
    - name
    - street_name
    - address_number
    - websiteurl
    - zipcode
    - area_code
    - iso

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 

Iteration 1: Largest change in params was -0.677 in the m_probability of name, level `Exact match on name`
Iteration 2: Largest change in params was 0.68 in the m_probability of street_name, level `All other comparisons`
Iteration 3: Largest change in params was 0.502 in probability_two_random_records_match
Iteration 4: Largest change in params was 0.246 in probability_two_random_records_match
Iteration 5: Largest change in params was 0.0689 in probability_two_random_records_match
Iteration 6: Largest change in params was 0.0315 in probability_two_random

In [12]:
# EM session 2: block on zipcode. The zipcode comparison cannot be estimated
# in this pass because it is fixed by the blocking rule; all others are.
training_blocking_rule = block_on("zipcode")
training_session_zipcode = (
    linker.training.estimate_parameters_using_expectation_maximisation(
        training_blocking_rule
    )
)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l."zipcode" = r."zipcode"

Parameter estimates will be made for the following comparison(s):
    - name
    - street_name
    - address_number
    - websiteurl
    - area_code
    - iso

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - zipcode

Iteration 1: Largest change in params was 0.31 in probability_two_random_records_match
Iteration 2: Largest change in params was 0.317 in probability_two_random_records_match
Iteration 3: Largest change in params was 0.106 in probability_two_random_records_match
Iteration 4: Largest change in params was 0.0344 in probability_two_random_records_match
Iteration 5: Largest change in params was -0.0212 in the m_probability of area_code, level `Exact match on area_code`
Iteration 6: Largest change in params was -0.0146 in the m_probability of area_code, level `Exact match on area_co

In [13]:
# EM session 3: block on area_code. The area_code comparison cannot be estimated
# in this pass because it is fixed by the blocking rule; all others are.
training_blocking_rule = block_on("area_code")
training_session_areacode = (
    linker.training.estimate_parameters_using_expectation_maximisation(
        training_blocking_rule
    )
)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l."area_code" = r."area_code"

Parameter estimates will be made for the following comparison(s):
    - name
    - street_name
    - address_number
    - websiteurl
    - zipcode
    - iso

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - area_code

Iteration 1: Largest change in params was -0.226 in the m_probability of zipcode, level `All other comparisons`
Iteration 2: Largest change in params was 0.231 in the m_probability of zipcode, level `Damerau-Levenshtein distance of zipcode <= 2`
Iteration 3: Largest change in params was 0.0207 in probability_two_random_records_match
Iteration 4: Largest change in params was 0.0096 in the m_probability of zipcode, level `All other comparisons`
Iteration 5: Largest change in params was 0.0147 in the m_probability of zipcode, level `All other comparisons`
Iteration 6: Largest 

In [14]:
# Display the match-weights chart for the linker's current (trained) parameters.
linker.visualisations.match_weights_chart()

In [15]:
# Display Splink's unlinkables chart for the trained model.
linker.evaluation.unlinkables_chart()

# Predict using trained model

In [16]:
# Score the candidate pairs produced by the blocking rules configured in `settings`.
# No threshold argument is passed to predict().
predictions = linker.inference.predict()
# Convert the predictions to a pandas DataFrame; later cells filter this frame.
pred_df = predictions.as_pandas_dataframe()
pred_df.head()

Blocking time: 11.08 seconds
Predict time: 19.13 seconds


Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_l,unique_id_r,name_l,name_r,gamma_name,bf_name,...,gamma_iso,tf_iso_l,tf_iso_r,bf_iso,bf_tf_adj_iso,state_l,state_r,city_l,city_r,match_key
0,-3.334911,0.090168,__splink__input_table_0,__splink__input_table_1,156280820,0dtrwme,lou moody,cold storage,0,0.972405,...,1,0.503065,0.503065,3.086101,0.643988,tx,tx,san antonio,san antonio,1
1,-9.106291,0.001811,__splink__input_table_0,__splink__input_table_1,156280913,061sl0e,lou marks and sons,louisville of,0,0.972405,...,1,0.503065,0.503065,3.086101,0.643988,ky,ky,louisville,louisville,1
2,-2.589175,0.1425,__splink__input_table_0,__splink__input_table_1,156280941,0081wqe,laughlin industries,lighthouse document technologies,0,0.972405,...,1,0.503065,0.503065,3.086101,0.643988,wa,wa,seattle,seattle,1
3,-5.055099,0.029201,__splink__input_table_0,__splink__input_table_1,156281340,003jhge,leonard parker associates,larsonallen,0,0.972405,...,1,0.503065,0.503065,3.086101,0.643988,mn,mn,minneapolis,minneapolis,1
4,-0.495859,0.41491,__splink__input_table_0,__splink__input_table_1,156281398,0g798ce,locks gallery,locust wny,0,0.972405,...,1,0.503065,0.503065,3.086101,0.643988,pa,pa,philadelphia,philadelphia,1


In [17]:
# Take up to 30 scored pairs as record dicts and draw their waterfall charts.
records_to_view = predictions.as_record_dict(limit=30)
linker.visualisations.waterfall_chart(records_to_view, filter_nulls=False)

In [None]:

def filter_highest_match_probability(pred_df):
    """Keep, for every left-hand record, only its highest-probability candidate.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Pairwise predictions containing at least the columns ``unique_id_l``
        and ``match_probability``.

    Returns
    -------
    pandas.DataFrame
        One complete row per distinct ``unique_id_l`` — the row with the largest
        ``match_probability`` — sorted by ``unique_id_l`` ascending, with
        ``unique_id_l`` as the first column and a fresh 0..n-1 index. Rows whose
        ``unique_id_l`` is missing are dropped.

    Raises
    ------
    KeyError
        If ``unique_id_l`` or ``match_probability`` is not a column of ``pred_df``.
    """
    key = 'unique_id_l'

    # Rows without a left-hand id cannot be assigned to a group; drop them.
    # Then order each id's candidates from best to worst.
    ranked = pred_df.dropna(subset=[key]).sort_values(
        [key, 'match_probability'], ascending=[True, False]
    )

    # Keep the whole top-ranked row per id. groupby(...).first() must not be
    # used here: it returns the first NON-NULL value of each column, so a null
    # in the best row would be filled in from a lower-probability candidate,
    # producing a row that mixes values from different pairs.
    best = ranked.drop_duplicates(subset=key, keep='first')

    # Put the id column first and renumber the rows from 0.
    ordered_columns = [key] + [col for col in best.columns if col != key]
    return best[ordered_columns].reset_index(drop=True)

In [1]:
# Reduce the predictions to one candidate per left-hand record and preview 100 rows.
# NOTE(review): the saved output of this cell is a NameError and the cell that
# defines filter_highest_match_probability has no execution count — run that
# definition first (Restart Kernel -> Run All) before sharing the notebook.
filtered_predictions = filter_highest_match_probability(pred_df)
filtered_predictions.head(100)

NameError: name 'filter_highest_match_probability' is not defined