# End to End Matching Example

Before running this notebook on Google Colab, please make sure to set the runtime type to "GPU". Do that by going to the "Runtime" menu and selecting "Change runtime type".

Also, please run [Record-Linkage-Example.ipynb](https://colab.research.google.com/github/vintasoftware/entity-embed/blob/google-collab-notebooks/notebooks/google-colab/Record-Linkage-Example.ipynb) before this one in order to get the trained model and data.

## Boilerplate

In [None]:
# Install entity-embed (version not pinned) followed by pinned versions of
# supporting libraries. These are shell escapes, so they run in the runtime's shell.
!pip install entity-embed
!pip install "matplotlib==3.1.1" \
             "pynndescent==0.5.2" \
             "scikit-learn==0.24.1" \
             "seaborn==0.11.1" \
             "unidecode==1.1.2"

In [None]:
from importlib import reload
import logging
# Reload the logging module before configuring it.
# NOTE(review): presumably needed because the hosted runtime has already attached
# handlers to the root logger, which would turn basicConfig into a no-op — confirm.
reload(logging)
# INFO and above, formatted as "HH:MM:SS LEVEL:message".
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%H:%M:%S')

In [None]:
import sys

# Put the parent directory first on the import path, so modules found there take
# precedence over installed packages of the same name.
sys.path.insert(0, '..')

In [None]:
import entity_embed

In [None]:
import torch
import numpy as np

# One seed for every source of randomness used here; also passed to the
# RandomForestClassifier in the classification section.
random_seed = 42
torch.manual_seed(random_seed)
np.random.seed(random_seed)

## Loading Test Data

We'll use Google Drive to read the Record Linkage output files. Please make sure to authorize Google Colab to use your Google Drive account once the cell below runs, and ensure that all of the files exist.

In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime (triggers the authorization prompt).
drive.mount('/content/drive')

# Folder the pair, record and checkpoint files below are read from.
rl_output_path = "/content/drive/My Drive/trained-models/notebooks/rl"

In [None]:
import json
from ordered_set import OrderedSet

def load_pair_set(filepath):
    """Load pairs from a JSON file into an ``OrderedSet`` of tuples.

    Each element of the parsed JSON is converted with ``tuple`` so that it is
    hashable; file order is kept by the ``OrderedSet``.
    """
    with open(filepath, 'r') as pair_file:
        raw_pairs = json.load(pair_file)
    return OrderedSet(map(tuple, raw_pairs))

# Positive (matching) pairs for each split, read from the files under rl_output_path.
train_pos_pair_set = load_pair_set(f'{rl_output_path}/rl-train-pos-pairs.json')
valid_pos_pair_set = load_pair_set(f'{rl_output_path}/rl-valid-pos-pairs.json')
test_pos_pair_set = load_pair_set(f'{rl_output_path}/rl-test-pos-pairs.json')

In [None]:
import json

def load_record_dict(filepath):
    """Load a JSON object of records and return it keyed by ``int`` ids.

    JSON object keys are always strings, so every key is converted with ``int``;
    the record values are returned as parsed.
    """
    with open(filepath, 'r') as record_file:
        raw_records = json.load(record_file)
    records_by_id = {}
    for raw_id, record in raw_records.items():
        records_by_id[int(raw_id)] = record
    return records_by_id

# Records of each split, keyed by integer id, read from the files under rl_output_path.
train_record_dict = load_record_dict(f'{rl_output_path}/rl-train-records.json')
valid_record_dict = load_record_dict(f'{rl_output_path}/rl-valid-records.json')
test_record_dict = load_record_dict(f'{rl_output_path}/rl-test-records.json')

## Loading Model

In [None]:
from entity_embed import LinkageEmbed

# Restore the trained model from its checkpoint and move it to the GPU in one step.
model = LinkageEmbed.load_from_checkpoint(
    f'{rl_output_path}/rl-model.ckpt'
).to(torch.device('cuda'))

## Blocking

Use `sim_threshold = 0.375` to have ~6k pairs in `train_found_pair_set` and have a fair comparison with [baseline-models/End-to-End-Matching-Baseline.ipynb](./baseline-models/End-to-End-Matching-Baseline.ipynb)

In [None]:
%%time

# Blocking settings; the valid and test cells below reuse the same three names.
eval_batch_size = 64
ann_k = 100
sim_threshold = 0.375

# With return_field_embeddings=True the result is unpacked into the found pairs and
# two per-side field-vector dicts, which add_embed_cos_features consumes later.
train_found_pair_set, train_left_field_vector_dict, train_right_field_vector_dict = model.predict_pairs(
    record_dict=train_record_dict,
    batch_size=eval_batch_size,
    ann_k=ann_k,
    sim_threshold=sim_threshold,
    show_progress=True,
    return_field_embeddings=True
)
# Cell output: how many candidate pairs were found for the train split.
len(train_found_pair_set)

In [None]:
%%time

# Same blocking call as for the train split, applied to the validation records.
valid_found_pair_set, valid_left_field_vector_dict, valid_right_field_vector_dict = model.predict_pairs(
    record_dict=valid_record_dict,
    batch_size=eval_batch_size,
    ann_k=ann_k,
    sim_threshold=sim_threshold,
    show_progress=True,
    return_field_embeddings=True
)
# Cell output: how many candidate pairs were found for the validation split.
len(valid_found_pair_set)

In [None]:
%%time

# Same blocking call as for the train split, applied to the test records.
test_found_pair_set, test_left_field_vector_dict, test_right_field_vector_dict = model.predict_pairs(
    record_dict=test_record_dict,
    batch_size=eval_batch_size,
    ann_k=ann_k,
    sim_threshold=sim_threshold,
    show_progress=True,
    return_field_embeddings=True
)
# Cell output: how many candidate pairs were found for the test split.
len(test_found_pair_set)

In [None]:
# Attention scores for the 'title' field of the test records; display_attention
# later looks them up by record id.
test_attn_scores_dict = model.interpret_attention(
    record_dict=test_record_dict,
    batch_size=eval_batch_size,
    field='title',
)

In [None]:
from entity_embed.evaluation import pair_entity_ratio

# Relates the number of candidate pairs to the number of test records
# (presumably found pairs / records — see entity_embed.evaluation to confirm).
pair_entity_ratio(len(test_found_pair_set), len(test_record_dict))

In [None]:
from entity_embed.evaluation import precision_and_recall

# Blocking quality on the test split: candidate pairs versus the known positive pairs.
precision_and_recall(test_found_pair_set, test_pos_pair_set)

Complement the train/valid `found_pair_set` with `pos_pair_set` for training.  
Leave test untouched, to reproduce production behavior:

In [None]:
# In-place union: positive pairs that blocking did not find are added to the
# train/valid candidates. Re-running the cell changes nothing further.
train_found_pair_set |= train_pos_pair_set
valid_found_pair_set |= valid_pos_pair_set

## Matching: Compare

Make a dataframe `df` with all records (train, valid, test) to add additional features:

In [None]:
# Merge the three splits into one id -> record mapping; if an id occurred in more
# than one split, the later split's record would win.
record_dict = {**train_record_dict, **valid_record_dict, **test_record_dict}

In [None]:
import pandas as pd

# One row per record id; the 'id' column is dropped, leaving the id only in the index.
df = (
    pd.DataFrame
    .from_dict(record_dict, orient='index')
    .drop(columns='id')
)

In [None]:
# Join title, manufacturer and price into one text field. This runs before the
# price column is converted below, so the original price value is what gets formatted.
df['all'] = df.agg('{0[title]} - {0[manufacturer]} - {0[price]}'.format, axis=1)
# Remove spaces from the price strings and parse them; values that fail to parse become NaN.
df['price'] = pd.to_numeric(df['price'].str.replace(' ', ''), errors='coerce')
# Remove the 'cluster' column (presumably the ground-truth label — confirm).
# NOTE: this cell mutates df in place and cannot be re-run as is: on a second run
# 'cluster' no longer exists, so this `del` raises KeyError.
del df['cluster']
df.head(3)

Replace all `record_dict`s (train, valid, test) to add additional features:

In [None]:
# Rebuild each split's records from the enriched frame, selecting rows by the
# ids the split already contains.
train_record_dict = df.loc[list(train_record_dict)].to_dict(orient='index')
valid_record_dict = df.loc[list(valid_record_dict)].to_dict(orient='index')
test_record_dict = df.loc[list(test_record_dict)].to_dict(orient='index')

In [None]:
import textdistance as td
import math

def exact_eq(x, y):
    """Return ``1.0`` if ``x == y``, otherwise ``0.0``."""
    return 1.0 if x == y else 0.0

def token_ops(func):
    """Adapt ``func`` to take two strings and compare their whitespace-split tokens."""
    def tokenized(x, y):
        x_tokens, y_tokens = x.split(), y.split()
        return func(x_tokens, y_tokens)
    return tokenized

def abs_diff(x, y):
    """Return the absolute difference ``|x - y|``."""
    difference = x - y
    return abs(difference)

def abs_diff_log10(x, y):
    """Return ``log10(|x - y|)``, or ``0.0`` when the difference is not above 1.

    The floor keeps the result non-negative and avoids taking the log of zero.
    """
    diff = abs_diff(x, y)
    return math.log10(diff) if diff > 1 else 0.0

# (field, similarity name) -> function taking the two field values of a pair.
# The three text fields share the same four string similarities; the token-based
# ones (jaccard, overlap) are wrapped with token_ops. Key order is all,
# manufacturer, title, then the two price features.
SIM_FUNC_DICT = {
    **{
        (text_field, sim_name): sim_func
        for text_field in ("all", "manufacturer", "title")
        for sim_name, sim_func in (
            ("jaccard", token_ops(td.jaccard.normalized_similarity)),
            ("overlap", token_ops(td.overlap.normalized_similarity)),
            ("damerau_levenshtein", td.damerau_levenshtein.normalized_similarity),
            ("jaro_winkler", td.jaro_winkler.normalized_similarity),
        )
    },
    ("price", "abs_diff"): abs_diff,
    ("price", "abs_diff_log10"): abs_diff_log10,
}

def record_sim_func(record_pair):
    """Compute every ``SIM_FUNC_DICT`` feature for one ``(left, right)`` record pair.

    Returns a dict keyed ``"<field>_<similarity name>"``. A feature is ``-1.0``
    when either field value is falsy (e.g. ``None``, ``""`` or ``0``) or, for the
    ``abs_diff*`` features, when either value is NaN.
    """
    missing = -1.0
    left, right = record_pair
    features = {}

    for (field, sim_name), sim_func in SIM_FUNC_DICT.items():
        x, y = left[field], right[field]
        if not (x and y):
            sim = missing
        elif sim_name.startswith('abs_diff') and (math.isnan(x) or math.isnan(y)):
            # NaN is truthy, so it passes the check above and is caught here.
            sim = missing
        else:
            sim = sim_func(x, y)
        features[f"{field}_{sim_name}"] = sim

    return features

In [None]:
%%time

# Try record_sim_func on the first known-positive test pair and show its features.
id_left, id_right = pair = next(iter(test_pos_pair_set))
feature_dict = record_sim_func(tuple(test_record_dict[id_] for id_ in pair))

feature_dict

In [None]:
from collections import defaultdict
import multiprocessing
from tqdm.auto import tqdm

def compare_pairs(record_dict, found_pair_set):
    """Build the similarity-feature frame for every pair in ``found_pair_set``.

    ``record_sim_func`` is evaluated in a process pool. ``imap`` yields results in
    submission order, so row *i* of the returned frame belongs to the *i*-th pair
    of ``found_pair_set``; the frame is indexed by a ``MultiIndex`` of the pairs.
    """
    record_pairs = (
        (record_dict[left_id], record_dict[right_id])
        for left_id, right_id in found_pair_set
    )
    columns = defaultdict(list)

    with multiprocessing.Pool() as pool:
        results = pool.imap(record_sim_func, record_pairs, chunksize=100)
        for pair_features in tqdm(results, total=len(found_pair_set)):
            for name, value in pair_features.items():
                columns[name].append(value)

        pool.close()
        pool.join()

    pair_index = pd.MultiIndex.from_tuples(found_pair_set)
    return pd.DataFrame(columns, index=pair_index)

In [None]:
%%time

# Similarity features for every train candidate pair, one row per pair.
train_feature_df = compare_pairs(train_record_dict, train_found_pair_set)
assert len(train_feature_df) == len(train_found_pair_set)
len(train_found_pair_set)

In [None]:
%%time

# Similarity features for every validation candidate pair, one row per pair.
valid_feature_df = compare_pairs(valid_record_dict, valid_found_pair_set)
assert len(valid_feature_df) == len(valid_found_pair_set)
len(valid_found_pair_set)

In [None]:
%%time

# Similarity features for every test candidate pair, one row per pair.
test_feature_df = compare_pairs(test_record_dict, test_found_pair_set)
assert len(test_feature_df) == len(test_found_pair_set)
len(test_found_pair_set)

## Matching: Compare - TFIDF Feature

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tfidf_vectorizer(train_record_dict, valid_record_dict, field='all'):
    """Fit a character n-gram (2 to 4) TF-IDF vectorizer on train + valid records.

    The two dicts are merged first (validation entries override train entries
    with the same id), then ``record[field]`` of every merged record is used as
    the fitting corpus. The test split is not part of the fit.
    """
    fit_records = dict(train_record_dict)
    fit_records.update(valid_record_dict)
    corpus = (record[field] for record in fit_records.values())

    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4), min_df=2)
    vectorizer.fit(corpus)
    return vectorizer

# Fitted once on train + valid (default field 'all'); reused for all three splits below.
tfidf_vectorizer = get_tfidf_vectorizer(train_record_dict, valid_record_dict)
tfidf_vectorizer

In [None]:
import numpy as np

def add_tfidf_feature(tfidf_vectorizer, feature_df, record_dict, found_pair_set, field='all'):
    """Add a ``'<field>_tfidf'`` column to ``feature_df`` in place.

    Every record's ``field`` text is vectorized once; each pair's value is the dot
    product of its two TF-IDF rows. That equals the cosine similarity when the
    vectorizer emits L2-normalised rows (the ``TfidfVectorizer`` default).

    Parameters
    ----------
    tfidf_vectorizer : fitted object with ``transform(iterable_of_str)`` returning
        a SciPy sparse matrix with one row per input text.
    feature_df : pandas.DataFrame with one row per pair, in the iteration order
        of ``found_pair_set``. Mutated in place; nothing is returned.
    record_dict : mapping of record id -> record (a mapping containing ``field``).
    found_pair_set : iterable of ``(left_id, right_id)`` pairs.
    field : name of the text field to vectorize.
    """
    tfidf_matrix = tfidf_vectorizer.transform(record[field] for record in record_dict.values())

    # Row i of tfidf_matrix corresponds to the i-th key of record_dict.
    row_of_id = {id_: row for row, id_ in enumerate(record_dict)}
    # Materialise once so the left and right lookups see the same sequence even
    # if found_pair_set is a one-shot iterable.
    pairs = list(found_pair_set)
    left_rows = [row_of_id[left_id] for left_id, __ in pairs]
    right_rows = [row_of_id[right_id] for __, right_id in pairs]

    pair_products = tfidf_matrix[left_rows].multiply(tfidf_matrix[right_rows])
    # The row-wise sum of a sparse matrix is an (n, 1) np.matrix (1-D for sparse
    # arrays); flatten to a plain 1-D array so the column assignment never depends
    # on pandas coercing a 2-D value.
    feature_df[f'{field}_tfidf'] = np.asarray(pair_products.sum(axis=1)).ravel()

In [None]:
%%time

# Adds the 'all_tfidf' column to train_feature_df in place.
add_tfidf_feature(tfidf_vectorizer, train_feature_df, train_record_dict, train_found_pair_set)

In [None]:
%%time

# Adds the 'all_tfidf' column to valid_feature_df in place.
add_tfidf_feature(tfidf_vectorizer, valid_feature_df, valid_record_dict, valid_found_pair_set)

In [None]:
%%time

# Adds the 'all_tfidf' column to test_feature_df in place.
add_tfidf_feature(tfidf_vectorizer, test_feature_df, test_record_dict, test_found_pair_set)

## Matching: Compare - Embedding Cosine Feature 

In [None]:
import numpy as np
from sklearn.preprocessing import normalize

def add_embed_cos_features(feature_df, left_field_vector_dict, right_field_vector_dict, found_pair_set):
    """Add one ``'embed_<field>_cos'`` column per embedded field to ``feature_df`` in place.

    For each field, the left and right vectors of every pair are stacked into two
    matrices, L2-normalised row-wise, and multiplied element-wise; the row sums
    are the cosine similarities.

    Parameters
    ----------
    feature_df : pandas.DataFrame with one row per pair, in the iteration order
        of ``found_pair_set``. Mutated in place; nothing is returned.
    left_field_vector_dict, right_field_vector_dict : mappings of
        record id -> {field name -> vector}. The field names are taken from the
        first entry of ``left_field_vector_dict``.
    found_pair_set : iterable of ``(left_id, right_id)`` pairs.
    """
    fields = next(iter(left_field_vector_dict.values())).keys()
    # Materialise once so every field sees the same pair sequence.
    pairs = list(found_pair_set)

    for field in fields:
        left_embed = np.stack([left_field_vector_dict[left_id][field] for left_id, __ in pairs])
        right_embed = np.stack([right_field_vector_dict[right_id][field] for __, right_id in pairs])
        # Use the returned arrays: copy=False only avoids a copy when scikit-learn
        # can work on the input directly. If it has to copy (e.g. for a dtype
        # conversion), the input stays un-normalised, and ignoring the return
        # value would silently turn the cosine into a raw dot product.
        left_embed = normalize(left_embed, copy=False)
        right_embed = normalize(right_embed, copy=False)
        feature_df[f'embed_{field}_cos'] = np.multiply(left_embed, right_embed).sum(axis=1)  # cos per row

In [None]:
%%time

# Adds the per-field embedding cosine columns to train_feature_df in place.
add_embed_cos_features(
    train_feature_df,
    train_left_field_vector_dict,
    train_right_field_vector_dict,
    train_found_pair_set)

In [None]:
%%time

# Adds the per-field embedding cosine columns to valid_feature_df in place.
add_embed_cos_features(
    valid_feature_df,
    valid_left_field_vector_dict,
    valid_right_field_vector_dict,
    valid_found_pair_set)

In [None]:
%%time

# Adds the per-field embedding cosine columns to test_feature_df in place.
add_embed_cos_features(
    test_feature_df,
    test_left_field_vector_dict,
    test_right_field_vector_dict,
    test_found_pair_set)

## Matching: Classify

In [None]:
# Train rows first, then validation rows. The PredefinedSplit fold array and
# train_valid_true_y are built in this same order.
train_valid_feature_df = pd.concat([train_feature_df, valid_feature_df])
train_valid_feature_df.head(3)

In [None]:
# Label 1 for candidate pairs that are known positives, 0 otherwise. Labels follow
# the iteration order of each found_pair_set, which is the row order compare_pairs
# used when it built the index of the matching feature frame.
train_true_y = np.array([pair in train_pos_pair_set for pair in train_found_pair_set], dtype='i4')
valid_true_y = np.array([pair in valid_pos_pair_set for pair in valid_found_pair_set], dtype='i4')
test_true_y = np.array([pair in test_pos_pair_set for pair in test_found_pair_set], dtype='i4')

In [None]:
# Same train-then-valid order as train_valid_feature_df.
train_valid_true_y = np.concatenate([train_true_y, valid_true_y])

In [None]:
%%time

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import PredefinedSplit, GridSearchCV

# A single predefined fold: -1 keeps the train rows in the training part for
# every split, 0 assigns the validation rows to the one and only test fold.
cv = PredefinedSplit(
    test_fold=np.concatenate([
        np.full(len(train_true_y), -1, dtype='i4'),
        np.zeros(len(valid_true_y), dtype='i4'),
    ])
)
param_grid = {
    'n_estimators': [10, 100, 200],
    'max_depth': [5, 10, 25, 50, None],
    'min_samples_leaf': [1, 3, 5],
}
# Grid search over random forests, scored by F1 on the validation fold.
clf = GridSearchCV(
    estimator=RandomForestClassifier(oob_score=True, random_state=random_seed),
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    verbose=10,
    n_jobs=-1,
)
clf.fit(train_valid_feature_df, train_valid_true_y);

In [None]:
# Hyper-parameter combination selected by the grid search.
clf.best_params_

In [None]:
# Score of the selected combination under the search's scoring='f1' on the validation fold.
clf.best_score_

In [None]:
# Out-of-bag score of the refitted best forest (available because oob_score=True).
clf.best_estimator_.oob_score_

In [None]:
# Pair each feature column with its importance in the best forest and list them
# from most to least important.
feature_importances = dict(zip(train_valid_feature_df.columns, clf.best_estimator_.feature_importances_))
sorted(feature_importances.items(), key=lambda kv: kv[1], reverse=True)

In [None]:
from sklearn.metrics import accuracy_score

# Decision threshold on the positive-class probability; later cells reuse it.
cls_threshold = 0.3
# Threshold only the positive-class column (column 1, since the labels are 0/1)
# rather than overwriting both probability columns and slicing afterwards.
train_valid_pred_y = (
    clf.predict_proba(train_valid_feature_df)[:, 1] >= cls_threshold
).astype('i4')
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the value is
# unchanged, but the call now matches the documented argument order.
accuracy_score(train_valid_true_y, train_valid_pred_y)

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# prob_y keeps both class-probability columns (later cells read column 1);
# pred_y is the 0/1 decision on the positive class, in prob_y's float dtype.
prob_y = clf.predict_proba(test_feature_df)
pred_y = (prob_y[:, 1] >= cls_threshold).astype(prob_y.dtype)
precision_recall_fscore_support(test_true_y, pred_y, labels=[1])

In [None]:
# Pairs the classifier accepted, taken from the feature frame's pair index.
cls_found_pair_set = OrderedSet(test_feature_df.index[pred_y.astype(bool)])

# End-to-end quality: accepted pairs versus all known positive test pairs.
precision_and_recall(cls_found_pair_set, test_pos_pair_set)

False negatives:

In [None]:
# Show up to five positive pairs that the pipeline did not return.
# A false negative can come from either stage: the classifier rejected the pair,
# or blocking never proposed it. In the second case the pair has no row in
# test_feature_df, so looking it up with .loc would raise KeyError.
for x, y in list(test_pos_pair_set - cls_found_pair_set)[:5]:
    if (x, y) in test_feature_df.index:
        print(clf.predict_proba(test_feature_df.loc[[(x, y)]]))
    else:
        print(f"pair ({x}, {y}) was missed by blocking; no features to classify")
    display(df.loc[[x, y]])

In [None]:
# Inspect the features and predicted probabilities of one hand-picked pair.
# NOTE(review): the ids are hard-coded; .loc raises KeyError if this pair is not
# in test_feature_df's index (e.g. when blocking returns a different candidate set).
pair = (725, 1803)
display(test_feature_df.loc[[pair]])
clf.predict_proba(test_feature_df.loc[[pair]])

Hard cases attention:

In [None]:
# Positive-class probability per test candidate pair.
prob_df = pd.DataFrame({'prob': prob_y[:, 1]}, index=test_feature_df.index)
# "Hard" cases: accepted pairs whose probability is within 0.01 above the threshold
# (both bounds inclusive) ...
hard_prob_df = prob_df[prob_df['prob'].between(cls_threshold, cls_threshold + 0.01)]
# ... restricted to pairs that are true positives.
hard_prob_df = hard_prob_df.loc[hard_prob_df.index.intersection(test_pos_pair_set)]
hard_prob_df

In [None]:
import seaborn as sns
from entity_embed import default_tokenizer

def display_attention(id_, field):
    """Display a one-row heat-map of attention scores over a test record's tokens.

    Reads the notebook-level ``test_record_dict`` and ``test_attn_scores_dict``.
    The scores are truncated to the number of tokens. Because the row is built
    from a token -> score dict, a token that occurs more than once appears as a
    single column holding its last score.
    """
    tokens = default_tokenizer(test_record_dict[id_][field])
    scores = test_attn_scores_dict[id_][:len(tokens)]
    score_by_token = {token: score for token, score in zip(tokens, scores)}
    attn_df = pd.DataFrame(score_by_token, index=[id_])
    red_cmap = sns.light_palette("red", as_cmap=True)
    display(attn_df.style.background_gradient(cmap=red_cmap, axis=1))

def display_pair_attention(pair, field):
    """Display the attention heat-map of both records of ``pair``, left one first."""
    left_id, right_id = pair
    for record_id in (left_id, right_id):
        display_attention(record_id, field)

# Attention over the 'title' tokens for both records of every hard case.
for pair in hard_prob_df.index:
    display_pair_attention(pair, 'title')

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import average_precision_score

# Create the axes explicitly and pass them to scikit-learn. Without ax=,
# plot_precision_recall_curve opens its own default-sized figure and the 16x10
# figure created here is left empty.
fig, ax = plt.subplots(figsize=(16, 10))

disp = plot_precision_recall_curve(clf, test_feature_df, test_true_y, ax=ax)