In [15]:
import re
import uuid

# TODO: #92 Make orgs classification script into more well-defined pipeline
import numpy as np
import pandas as pd
import requests
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

from utils.constants import BASE_FILEPATH, organizations_settings, organizations_blocking
from utils.linkage import standardize_corp_names, splink_dedupe
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_template_library as ctl


nlp = spacy.load("en_core_web_sm")

In [4]:
aggregated_classification_csv = (
    BASE_FILEPATH / "data" / "classification" / "merged_cleaned_company_classification.csv"
)

In [5]:
data = pd.read_csv(aggregated_classification_csv)
data.head()

Unnamed: 0,company_name,stock_symbol,legal_name,address,city,state,zipcode,area_code,ABI,primary_SIC_code,SIC6_description,primary_NAICS_code,NAICS8_description,parent_company_ABI,classification,unique_id,parent_company_unique_id
0,3R PETROLEUM OLEO E GAS,RRRP3,3R Petroleum Oleo E Gas SA,,,,,,,,,,,,f,29cc3767-ea5d-49d3-a10e-f248951f646c,
1,88 ENERGY,88E,88 Energy Ltd,,,,,,,,,,,,f,40053e24-f387-4f1a-bc7d-382e80d063c0,
2,A.B.P. NOCIVELLI,ABP,A.b.p. Nocivelli SpA,,,,,,,,,,,,f,fb2181b7-e2d8-495b-af38-667b060b896b,
3,A2A,A2A,A2A SpA,,,,,,,,,,,,f,dff4fe0b-a3f2-4702-9404-a0dde1bb7002,
4,ABRAJ ENERGY SERVS,ABRJ,Abraj Energy Services,,,,,,,,,,,,f,5fcbcc39-4f8d-4e31-9812-06414ea1ef5e,


In [8]:
# Initialise the linker, passing in the input dataset(s)
from splink.duckdb.linker import DuckDBLinker
linker = DuckDBLinker(data)

In [9]:
linker.missingness_chart()


In [10]:
linker.profile_columns(top_n=10, bottom_n=5)


* Lots of 770 area code --> use term_frequency_adjustments to weigh this differently
* company_name is a good linking variable along with zipcode

In [11]:

def convert_duplicates_to_dict(df_with_matches: pd.DataFrame) -> None:
    """Map each uuid to all other uuids for which it has been deemed a match

    Given a dataframe where the uuids of all rows deemed similar are stored in a
    list and all but the first row of each paired uuid is dropped, this function
    maps the matched uuids to a single uuid.

    Args:
        df_with_matches: A pandas df containing a column called 'duplicated',
            where each row is a list of all uuids deemed a match. In each list,
            all uuids but the first have their rows already dropped.

    Returns:
        None. However it outputs a file to the output directory, with 2
        columns. The first lists all the uuids in df, and is labeled
        'original_uuids.' The 2nd shows the uuids to which each entry is mapped
        to, and is labeled 'mapped_uuids'.
    """
    deduped_dict = {}
    for i in range(len(df_with_matches)):
        deduped_uudis = df_with_matches.iloc[i]["duplicated"]
        for j in range(len(deduped_uudis)):
            deduped_dict.update({deduped_uudis[j]: df_with_matches.iloc[i]["id"]})

    # now convert dictionary into a csv file
    deduped_df = pd.DataFrame.from_dict(deduped_dict, "index")
    deduped_df = deduped_df.reset_index().rename(
        columns={"index": "original_uuids", 0: "mapped_uuid"}
    )
    deduped_df.to_csv(
        BASE_FILEPATH / "output" / "deduplicated_UUIDs.csv",
        index=False,
        mode="a",
    )


In [12]:
def splink_dedupe(df: pd.DataFrame, settings: dict, blocking: list) -> pd.DataFrame:
    """Use splink to deduplicate dataframe based on settings

    Configuration settings and blocking can be found in constants.py as
    individuals_settings, indivduals_blocking, organizations_settings,
    organizations_blocking

    Uses the splink library which employs probabilistic matching for
    record linkage
    https://moj-analytical-services.github.io/splink/index.html


    Args:
        df: dataframe
        settings: configuration settings
            (based on splink documentation and dataframe columns)
        blocking: list of columns to block on for the table
            (cuts dataframe into parts based on columns labeled blocks)

    Returns:
        deduplicated version of initial dataframe with column 'matching_id'
        that holds list of matching unique_ids
    """
    linker = DuckDBLinker(df, settings)
    linker.estimate_probability_two_random_records_match(
        blocking, recall=0.6
    )  # default
    linker.estimate_u_using_random_sampling(max_pairs=5e6)

    for i in blocking:
        linker.estimate_parameters_using_expectation_maximisation(i)

    df_predict = linker.predict()
    clusters = linker.cluster_pairwise_predictions_at_threshold(
        df_predict, threshold_match_probability=0.7
    )  # default
    clusters_df = clusters.as_pandas_dataframe()

    match_list_df = (
        clusters_df.groupby("cluster_id")["unique_id"].agg(list).reset_index()
    )  # dataframe where cluster_id maps unique_id to initial instance of row
    match_list_df = match_list_df.rename(columns={"unique_id": "duplicated"})

    first_instance_df = clusters_df.drop_duplicates(subset="cluster_id")
    col_names = np.append("cluster_id", df.columns)
    first_instance_df = first_instance_df[col_names]

    deduped_df = first_instance_df.merge(
        match_list_df[["cluster_id", "duplicated"]],
        on="cluster_id",
        how="left",
    )
    deduped_df = deduped_df.rename(columns={"cluster_id": "unique_id"})

    deduped_df["duplicated"] = deduped_df["duplicated"].apply(
        lambda x: x if isinstance(x, list) else [x]
    )
    convert_duplicates_to_dict(deduped_df)

    deduped_df = deduped_df.drop(columns=["duplicated"])

    return deduped_df


In [33]:
organizations_settings = {
    "link_type": "dedupe_only",
    "blocking_rules_to_generate_predictions": [
        "l.company_name = r.company_name",
        "l.zipcode = r.zipcode"
    ],
    "retain_matching_columns": True,
    "retain_intermediate_calculation_columns": True,
    "max_iterations": 10,
    "em_convergence": 0.01,
}

In [34]:
organizations_blocking = ["l.company_name = r.company_name", "l.zipcode = r.zipcode"]

In [35]:
splink_dedupe(data, settings=organizations_settings, blocking = organizations_blocking)

Probability two random records match is estimated to be  0.0155.
This means that amongst all possible pairwise record comparisons, one in 64.67 are expected to match.  With 1,212,903 total possible comparisons, we expect a total of around 18,756.67 matching pairs
----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is fully trained. All comparisons have at least one estimate for their m and u values

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.company_name = r.company_name

Parameter estimates will be made for the following comparison(s):

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 



SplinkException: Error executing the following sql for table `__splink__m_u_counts`(__splink__m_u_counts_d3d7043ef):

        CREATE TABLE __splink__m_u_counts_d3d7043ef
        AS
        (WITH __splink__df_comparison_vectors as (select * from __splink__df_comparison_vectors_db8087bba), 
__splink__df_match_weight_parts as (
    select "unique_id_l","unique_id_r","zipcode_l","zipcode_r","company_name_l","company_name_r",match_key 
    from __splink__df_comparison_vectors
    ), 
__splink__df_predict as (
    select
    log2(cast(0.015707176032864763 as float8) * ) as match_weight,
    CASE WHEN  THEN 1.0 ELSE (cast(0.015707176032864763 as float8) * )/(1+(cast(0.015707176032864763 as float8) * )) END as match_probability,
    "unique_id_l","unique_id_r","zipcode_l","zipcode_r","company_name_l","company_name_r",match_key 
    from __splink__df_match_weight_parts
    
    order by 1
    ) 
    select 0 as comparison_vector_value,
           sum(match_probability * 1) /
               sum(1) as m_count,
           sum((1-match_probability) * 1) /
               sum(1) as u_count,
           '_probability_two_random_records_match' as output_column_name
    from __splink__df_predict
    )
        

Error was: Parser Error: syntax error at or near ")"