Election records Splink demonstration

In [1]:
import altair as alt
import pandas as pd
from splink.datasets import splink_datasets
from splink.duckdb.linker import DuckDBLinker

# Raise the pandas display cap so tables of up to 1000 rows render in full.
pd.set_option("display.max_rows", 1000)

# Load the transformed election results table into a DataFrame.
election_df = pd.read_csv(
    filepath_or_buffer="/project/output/transformed/election_results_table.csv"
)

In [2]:
from splink.duckdb.blocking_rule_library import block_on

# Minimal deduplication settings: candidate pairs are generated only for
# records that agree on first name, last name, year, month and state.
settings = dict(
    link_type="dedupe_only",
    blocking_rules_to_generate_predictions=[
        block_on(["first_name", "last_name", "year", "month", "state"]),
    ],
)
linker = DuckDBLinker(election_df, settings)

# Profile the name and state columns; the call is kept as the cell's last
# expression so its result is displayed as the cell output.
columns_to_profile = ["first_name", "last_name", "state"]
linker.profile_columns(columns_to_profile, top_n=10, bottom_n=5)

In [3]:
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_template_library as ctl

# Full model settings: the same blocking rule as before plus one comparison
# per column, every comparison using term-frequency adjustments.
#
# NOTE(review): the prediction blocking rule requires exact agreement on all
# five columns that are also compared below, so every candidate pair would
# appear to fall into the exact-match level of each comparison. Confirm
# whether a looser blocking rule is intended.
settings = dict(
    link_type="dedupe_only",
    blocking_rules_to_generate_predictions=[
        block_on(["first_name", "last_name", "year", "month", "state"]),
    ],
    comparisons=[
        # Name-template comparisons for the two name columns.
        *(
            ctl.name_comparison(name_column, term_frequency_adjustments=True)
            for name_column in ("first_name", "last_name")
        ),
        # Plain exact-match comparisons for the remaining columns.
        *(
            cl.exact_match(exact_column, term_frequency_adjustments=True)
            for exact_column in ("year", "month", "state")
        ),
    ],
    retain_matching_columns=True,
    retain_intermediate_calculation_columns=True,
    max_iterations=10,
    em_convergence=0.01,
)

# Rebuild the linker so it picks up the full settings.
linker = DuckDBLinker(election_df, settings)

In [4]:
# Deterministic matching rule: left and right records must agree exactly on
# every one of these columns. The joined SQL reads
# "l.first_name = r.first_name and l.last_name = r.last_name and ...".
exact_match_columns = ("first_name", "last_name", "state", "year", "month")
deterministic_rule = " and ".join(
    f"l.{column} = r.{column}" for column in exact_match_columns
)

# NOTE(review): recall=0.6 is an unexplained guess at how many true matches
# the deterministic rule captures; confirm the value.
linker.estimate_probability_two_random_records_match(
    [deterministic_rule],
    recall=0.6,
)

Probability two random records match is estimated to be  5.51e-05.
This means that amongst all possible pairwise record comparisons, one in 18,139.80 are expected to match.  With 1,343,977,935 total possible comparisons, we expect a total of around 74,090.00 matching pairs
