In [None]:
# I got the 'Getting Started' section running here:

import pandas as pd
import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets

In [None]:
# Backend handle for DuckDB; it is handed to Linker further down as the
# database API the linkage runs against.
db_api = DuckDBAPI()

In [None]:
# Load the cleaned input records. dtype=str tells pandas to read every column
# as text instead of inferring numeric types.
df = pd.read_csv("clean_data.csv", dtype=str)

# Preview the first rows to confirm the file parsed as expected.
df.head()

In [None]:
# (rows, columns) of the loaded frame, as a quick sanity check on the read.
df.shape

In [None]:
# Give every record an integer id, 1 through N, in a leading "unique_id" column.
row_numbers = range(1, len(df) + 1)

# DataFrame.insert raises ValueError when the column already exists, so an
# unconditional insert would fail as soon as this cell is run a second time on
# the same kernel. The guard makes the cell safe to re-run. If the CSV ever
# ships its own "unique_id" column, that column is kept as-is.
if "unique_id" not in df.columns:
    df.insert(0, "unique_id", row_numbers)

# Whichever path was taken, the id column must not contain duplicates.
assert df["unique_id"].is_unique, "unique_id must identify exactly one row"

df.head()

In [None]:
# Model definition for de-duplicating a single table (link_type="dedupe_only").
settings = SettingsCreator(
    link_type="dedupe_only",
    comparisons=[
        # Names and date of birth use the library's fuzzy comparisons.
        cl.JaroAtThresholds("first_name"),
        cl.NameComparison("last_name"),
        # birth_date is passed as text, hence input_is_string=True.
        cl.DateOfBirthComparison("birth_date", input_is_string=True),
        # The remaining columns are all compared the same way: exact match
        # with term-frequency adjustments switched on.
        *[
            cl.ExactMatch(column).configure(term_frequency_adjustments=True)
            for column in ("city", "state", "zip_code", "social_security_number")
        ],
    ],
    # Blocking rules passed for prediction: same first_name and birth_date,
    # or same last_name.
    blocking_rules_to_generate_predictions=[
        block_on("first_name", "birth_date"),
        block_on("last_name"),
    ],
)

# `cl` is already imported in the notebook's first cell, so it is not
# re-imported here.
#
# NOTE(review): this comparison is not passed to SettingsCreator (the model
# compares first_name with cl.JaroAtThresholds) and nothing else in the visible
# notebook reads it. Delete it if it is a leftover, or swap it into the
# `comparisons` list if exact matching with term-frequency adjustments was the
# intent for first_name.
first_name_comparison = cl.ExactMatch("first_name").configure(
    term_frequency_adjustments=True
)

In [None]:
# Bind the input frame, the model settings and the DuckDB backend into a Linker;
# the training, inference and clustering steps below all go through this object.
linker = Linker(df, settings, db_api)

In [None]:
# Estimate the "probability two random records match" from a deterministic
# rule: records agreeing on both first_name and last_name.
# NOTE(review): recall=0.7 looks like an assumed share of true matches that this
# rule finds, i.e. a guess rather than a measured figure; confirm it suits this
# data set.
linker.training.estimate_probability_two_random_records_match(
    [block_on("first_name", "last_name")],
    recall=0.7,
)

In [None]:
# Estimate the u probabilities from randomly sampled record pairs, with
# max_pairs bounding the sample size. A fixed seed is passed so that a
# Restart & Run All does not draw a fresh sample (and so shift every downstream
# score) on each run; Splink documents `seed` as supported on DuckDB, the
# backend this notebook uses.
linker.training.estimate_u_using_random_sampling(max_pairs=1e6, seed=42)

In [None]:
# First expectation-maximisation pass, blocking on first_name + last_name.
# NOTE(review): presumably the comparisons on the blocked columns cannot be
# estimated in this pass and are picked up by the pass that blocks on
# social_security_number instead; confirm against the Splink training docs.
linker.training.estimate_parameters_using_expectation_maximisation(
    block_on("first_name", "last_name")
)

In [None]:
# Another expectation-maximisation pass, this one blocking on
# social_security_number rather than on the name columns.
linker.training.estimate_parameters_using_expectation_maximisation(
    block_on("social_security_number")
)

In [None]:
# Run prediction with the trained model, passing a match-weight threshold of -5.
# Presumably pairs scoring below that weight are dropped from the result; verify
# against the Splink docs if the exact cut-off matters.
pairwise_predictions = linker.inference.predict(threshold_match_weight=-5)

In [None]:
# Group the scored pairs into clusters. The threshold is named explicitly
# because it is a match *probability* (0.95), whereas the prediction step was
# filtered on a match *weight*.
clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
    pairwise_predictions,
    threshold_match_probability=0.95,
)

In [None]:
# Pull the clustering result into pandas. limit=5 is passed, so presumably
# df_clusters holds only a five-row preview, not the full cluster table; verify
# before using it for anything beyond display.
df_clusters = clusters.as_pandas_dataframe(limit=5)

In [None]:
# Display the frame built from the clustering result.
df_clusters