In [None]:
# I got the 'Getting Started' section running here:

import pandas as pd
import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets

In [None]:
# Create the DuckDB backend object that is handed to the Linker further down.
db_api = DuckDBAPI()

In [None]:
# Load the cleaned records; dtype=str keeps every column as text.
# NOTE(review): the settings cell references a social_security_number column,
# so rendered previews of this frame may contain PII - clear outputs before sharing.
df = pd.read_csv(
    'clean_data.csv',
    dtype=str,
)

# Preview the first rows.
df.head()

In [None]:
# Show the (rows, columns) size of the loaded frame.
df.shape

In [None]:
# Create the row number column (1 through N)
row_numbers = range(1, len(df) + 1)

# Insert as the first column (at index 0).
# Guarded so the cell can be re-run on a live kernel: DataFrame.insert raises
# ValueError when a column with that name already exists.
if 'unique_id' not in df.columns:
    df.insert(0, 'unique_id', row_numbers)

# Whether the column was just created or was already present, each row must
# carry a distinct id for the linkage steps that follow.
assert df['unique_id'].is_unique, "unique_id must be unique per row"

df.head()

In [None]:
def _exact(column, term_frequency):
    """Exact-match comparison on `column` with term-frequency adjustments set to `term_frequency`."""
    return cl.ExactMatch(column).configure(term_frequency_adjustments=term_frequency)


# One comparison per field, in the order they are passed to the model.
field_comparisons = [
    cl.NameComparison("first_name"),
    cl.JaroAtThresholds("last_name"),
    _exact("sex", False),
    # Comparison on "race" is currently disabled:
    #   cl.ExactMatch("race").configure(term_frequency_adjustments=False),
    cl.DateOfBirthComparison("birth_date", input_is_string=True),
    cl.DateOfBirthComparison("death_date", input_is_string=True),
    _exact("social_security_number", False),
    cl.LevenshteinAtThresholds("address", 1),
    _exact("city", True),
    _exact("state", True),
    _exact("zip_code", True),
    _exact("phone", False),
]

# Column combinations that each become one block_on(...) rule for generating
# candidate pairs at prediction time.
# NOTE(review): ("first_name", "last_name", "birth_date") is a stricter version
# of ("first_name", "birth_date"), so it looks like it cannot contribute pairs
# the first rule missed - confirm before removing.
prediction_block_columns = [
    ("first_name", "birth_date"),
    ("last_name", "birth_date"),
    ("social_security_number",),
    ("phone",),
    ("zip_code", "birth_date"),
    ("last_name", "death_date"),
    ("address", "birth_date"),
    ("first_name", "last_name", "birth_date"),
    ("first_name", "last_name", "zip_code"),
]

# Deduplication of a single table ("dedupe_only").
settings = SettingsCreator(
    link_type="dedupe_only",
    comparisons=field_comparisons,
    blocking_rules_to_generate_predictions=[
        block_on(*columns) for columns in prediction_block_columns
    ],
)

In [None]:
# Build the linker from the input frame, the model settings and the DuckDB backend.
linker = Linker(df, settings, db_api)

In [None]:
# Rule(s) treated as deterministic matches when estimating the prior
# probability that two random records match.
deterministic_rules = [block_on("first_name", "last_name")]

# recall=0.7 is the value supplied for the share of true matches these rules
# are expected to capture.
linker.training.estimate_probability_two_random_records_match(
    deterministic_rules,
    recall=0.7,
)

In [None]:
# Estimate the u probabilities from randomly sampled record pairs.
# The seed pins the random sample so the estimates (and everything downstream)
# are reproducible across kernel restarts; without it each run samples differently.
U_SAMPLING_SEED = 42
linker.training.estimate_u_using_random_sampling(max_pairs=1e6, seed=U_SAMPLING_SEED)

In [None]:
# Column combinations to block on during EM training; each tuple becomes one
# block_on(...) rule.
training_block_columns = [
    ("first_name", "last_name"),
    ("phone",),
    ("address", "birth_date"),
    ("last_name", "birth_date"),
    ("social_security_number",),
    ("zip_code", "birth_date"),
]
training_blocks = [block_on(*columns) for columns in training_block_columns]

# Run one expectation-maximisation session per blocking rule, in list order.
for blocking_rule in training_blocks:
    linker.training.estimate_parameters_using_expectation_maximisation(blocking_rule)

In [None]:
# Score the candidate pairs, passing a match-weight threshold of -5 to predict().
pairwise_predictions = linker.inference.predict(threshold_match_weight=-5)

In [None]:
# Pull the pairwise predictions into a pandas DataFrame for inspection.
dd = pairwise_predictions.as_pandas_dataframe()

In [None]:
# Count how many scored pairs share each distinct match_probability value.
dd['match_probability'].value_counts()

In [None]:
# Group records into clusters from the scored pairs, using a match-probability
# threshold of 0.95.
clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
    pairwise_predictions,
    threshold_match_probability=0.95,
)

In [None]:
# Pull the cluster assignments into a pandas DataFrame.
df_clusters = clusters.as_pandas_dataframe()

In [None]:
# Preview the first rows of the cluster table.
df_clusters.head()

In [None]:
# Show the (rows, columns) size of the cluster table.
df_clusters.shape

In [None]:
# Show the distinct unique_id values present in the cluster table.
# The call parentheses matter: the bare attribute only displays the
# bound-method repr, not the values.
df_clusters['unique_id'].unique()

In [None]:
# Number of distinct unique_id values in the input frame.
df['unique_id'].nunique()

In [None]:
# Number of distinct unique_id values in the cluster table.
df_clusters['unique_id'].nunique()

In [None]:
# Spot-check: show the cluster row(s) for the record whose unique_id is 12.
is_record_12 = df_clusters['unique_id'] == 12
df_clusters.loc[is_record_12]

In [None]:
# Number of distinct clusters produced.
df_clusters['cluster_id'].nunique()

In [None]:
# For each source_person_id, count how many distinct cluster_ids its rows fall into.
# NOTE(review): assumes clean_data.csv supplies a source_person_id column that
# is carried through into the cluster table - confirm.
cluster_ids_by_person = df_clusters.groupby("source_person_id")["cluster_id"]
dupes = cluster_ids_by_person.nunique()

In [None]:
# Display the number of distinct cluster_ids per source_person_id.
dupes

In [None]:
# Keep the source_person_id values whose rows were split across more than one cluster.
spans_multiple_clusters = dupes > 1
conflicted_ids = dupes[spans_multiple_clusters].index

In [None]:
# Select every cluster-table row that belongs to one of the conflicted source_person_ids.
in_conflict = df_clusters["source_person_id"].isin(conflicted_ids)
df_conflicts = df_clusters[in_conflict]

In [None]:
# Display the rows for source_person_ids that were assigned to more than one cluster.
df_conflicts