# Record Linkage Baseline with TF-IDF

## Boilerplate

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from importlib import reload
import logging
reload(logging)
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%H:%M:%S')

In [3]:
import sys

sys.path.insert(0, '../..')

In [4]:
import entity_embed

In [5]:
import torch
import numpy as np

random_seed = 42
torch.manual_seed(random_seed)
np.random.seed(random_seed)

## Load Dataset

In [6]:
import urllib
import tempfile

dataset_url = 'https://dbs.uni-leipzig.de/file/Amazon-GoogleProducts.zip'
tf = tempfile.NamedTemporaryFile(mode='r', delete=False)
tf.close()

urllib.request.urlretrieve(dataset_url, tf.name);

In [7]:
import zipfile
import os

td = tempfile.TemporaryDirectory()

with zipfile.ZipFile(tf.name, "r") as zf:
    zf.extractall(td.name)

os.listdir(td.name)

['Amazon.csv', 'GoogleProducts.csv', 'Amzon_GoogleProducts_perfectMapping.csv']

In [8]:
import csv

from entity_embed.data_utils.utils import Enumerator

id_enumerator = Enumerator()
record_dict = {}
source_field = '__source'
left_source = 'amazon'
right_source = 'google'

with open(f'{td.name}/Amazon.csv', newline='', encoding="latin1") as f:
    for record in csv.DictReader(f):
        record['id'] = id_enumerator[record["id"]]
        record['name'] = record.pop('title')  # in Amazon, name is called title
        record[source_field] = left_source
        record_dict[record['id']] = record

with open(f'{td.name}/GoogleProducts.csv', newline='', encoding="latin1") as f:
    for record in csv.DictReader(f):
        record['id'] = id_enumerator[record["id"]]
        record[source_field] = right_source
        record_dict[record['id']] = record

In [9]:
pos_pair_set = set()

with open(f'{td.name}/Amzon_GoogleProducts_perfectMapping.csv', newline='') as f:
    for row in csv.DictReader(f):
        id_left = id_enumerator[row['idAmazon']]
        id_right = id_enumerator[row['idGoogleBase']]
        pos_pair_set.add((id_left, id_right))

len(pos_pair_set)

1300

In [10]:
from entity_embed.data_utils import utils

cluster_mapping, cluster_dict = utils.id_pairs_to_cluster_mapping_and_dict(pos_pair_set, record_dict)
len(cluster_dict)

3290

In [11]:
cluster_dict[4]

[262, 2485, 2488]

In [12]:
[cluster_mapping[id_] for id_ in cluster_dict[4]]

[4, 4, 4]

In [13]:
cluster_field = 'cluster'
utils.assign_clusters(record_dict, cluster_field, cluster_mapping)

for id_ in cluster_dict[4]:
    display(record_dict[id_])

{'id': 262,
 'description': 'sp linux we 50 lic/cd 3.0c',
 'manufacturer': 'hewlett packard (consumables)',
 'price': '0',
 'name': 'hp sp linux we 50 lic/cd 3.0c ( t3586a )',
 '__source': 'amazon',
 'cluster': 4}

{'id': 2485,
 'name': 'sp linux we 50 lic/cd 3.0c',
 'description': '',
 'manufacturer': '',
 'price': '69216.95',
 '__source': 'google',
 'cluster': 4}

{'id': 2488,
 'name': 'sp linux we 50 lic/cd 3.0c',
 'description': '',
 'manufacturer': '',
 'price': '69216.95',
 '__source': 'google',
 'cluster': 4}

In [14]:
from entity_embed.data_utils.utils import cluster_dict_to_id_pairs

len(cluster_dict_to_id_pairs(cluster_dict) - pos_pair_set)

253

In [15]:
from entity_embed.data_utils import utils

train_record_dict, valid_record_dict, test_record_dict = utils.split_record_dict_on_clusters(
    record_dict=record_dict,
    cluster_field=cluster_field,
    train_proportion=0.2,
    valid_proportion=0.2,
    random_seed=random_seed)

10:05:26 INFO:Singleton cluster sizes (train, valid, test):(437, 437, 1311)
10:05:26 INFO:Plural cluster sizes (train, valid, test):(221, 221, 663)


In [16]:
len(train_record_dict), len(valid_record_dict), len(test_record_dict)

(926, 912, 2751)

In [17]:
import os

td.cleanup()
os.remove(tf.name)

## Preprocess

In [18]:
field_list = ['name', 'description', 'manufacturer', 'price']

In [19]:
import unidecode
import itertools
from entity_embed import default_tokenizer

def clean_str(s):
    max_tokens = 100
    max_chars = 1000
    s = unidecode.unidecode(s).lower().strip()
    s_tokens = default_tokenizer(s)[:max_tokens]
    return ' '.join(s_tokens)[:max_chars]

for record in record_dict.values():
    for field in field_list:
        record[field] = clean_str(record[field])

## True positive pair sets

In [20]:
left_id_set, right_id_set = utils.record_dict_to_left_right_id_set(
    record_dict=record_dict,
    source_field=source_field,
    left_source=left_source,
)
train_cluster_dict = utils.record_dict_to_cluster_dict(
    train_record_dict, cluster_field
)
valid_cluster_dict = utils.record_dict_to_cluster_dict(
    valid_record_dict, cluster_field
)
test_cluster_dict = utils.record_dict_to_cluster_dict(
    test_record_dict, cluster_field
)

train_pos_pair_set = utils.cluster_dict_to_id_pairs(
    train_cluster_dict,
    left_id_set=left_id_set,
    right_id_set=right_id_set
)
valid_pos_pair_set = utils.cluster_dict_to_id_pairs(
    valid_cluster_dict,
    left_id_set=left_id_set,
    right_id_set=right_id_set
)
test_pos_pair_set = utils.cluster_dict_to_id_pairs(
    test_cluster_dict,
    left_id_set=left_id_set,
    right_id_set=right_id_set
)

In [21]:
len(left_id_set), len(right_id_set)

(1363, 3226)

In [22]:
len(train_pos_pair_set), len(valid_pos_pair_set), len(test_pos_pair_set)

(268, 254, 778)

## TF-IDF

In [23]:
train_name_dict = {id_: " ".join(record[f] for f in field_list) for id_, record in train_record_dict.items()}
valid_name_dict = {id_: " ".join(record[f] for f in field_list) for id_, record in valid_record_dict.items()}
test_name_dict = {id_: " ".join(record[f] for f in field_list) for id_, record in test_record_dict.items()}

train_name_dict[262]

'hp sp linux we 50 lic / cd 3 . 0c ( t3586a ) sp linux we 50 lic / cd 3 . 0c hewlett packard ( consumables ) 0'

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2,4),
    min_df=2
)

In [25]:
train_tfidf_matrix = tfidf_vectorizer.fit_transform(
    itertools.chain(train_name_dict.values(), valid_name_dict.values()))
train_tfidf_matrix

<1838x29779 sparse matrix of type '<class 'numpy.float64'>'
	with 1094687 stored elements in Compressed Sparse Row format>

In [26]:
test_tfidf_matrix = tfidf_vectorizer.transform(test_name_dict.values())
test_tfidf_matrix

<2751x29779 sparse matrix of type '<class 'numpy.float64'>'
	with 1609033 stored elements in Compressed Sparse Row format>

In [27]:
%%time

import pynndescent

index = pynndescent.NNDescent(test_tfidf_matrix, random_state=random_seed, metric="cosine")

CPU times: user 2min 40s, sys: 49.5 s, total: 3min 30s
Wall time: 1min


In [28]:
index.neighbor_graph

(array([[   0,  829, 1418, ..., 1729, 2529, 1782],
        [   1,   21,  125, ...,   34, 2747, 1569],
        [   2,  875, 1467, ...,  183,  576, 1473],
        ...,
        [2748, 1766, 2747, ..., 1581, 1527, 2485],
        [2749, 1131, 2750, ..., 1621, 2295, 2337],
        [2750, 2228, 2268, ..., 1292, 1817, 2432]]),
 array([[0.00000000e+00, 3.00228744e-01, 3.51883112e-01, ...,
         8.80558527e-01, 8.81367141e-01, 8.83136569e-01],
        [3.57627751e-07, 6.52536948e-01, 6.64479614e-01, ...,
         7.41805508e-01, 7.42226777e-01, 7.43839719e-01],
        [1.19209271e-07, 6.36853172e-01, 7.35978994e-01, ...,
         8.84838734e-01, 8.85170582e-01, 8.85556467e-01],
        ...,
        [2.38418528e-07, 5.37369222e-02, 2.16594446e-01, ...,
         7.66124577e-01, 7.67505854e-01, 7.68192396e-01],
        [0.00000000e+00, 6.06169811e-01, 6.34850939e-01, ...,
         8.81643174e-01, 8.82741866e-01, 8.83194879e-01],
        [0.00000000e+00, 2.94851373e-01, 4.22057558e-01, ...,
    

In [29]:
sim_threshold = 0.3

found_neighbors, found_distances = index.neighbor_graph
found_sims = 1 - found_distances
found_pair_set = set()
test_ids = list(test_name_dict.keys())

for left_id, neighbors, sims in zip(test_ids, found_neighbors.tolist(), found_sims.tolist()):
    for right_i, sim in zip(neighbors, sims):
        right_id = test_ids[right_i]
        
        if left_id == right_id:
            continue
        if sim >= sim_threshold:
            if left_id in left_id_set:
                found_pair = (left_id, right_id)
            else:
                found_pair = (right_id, left_id)
            found_pair_set.add(found_pair)
            
len(found_pair_set)

26644

In [30]:
from entity_embed.evaluation import pair_entity_ratio

pair_entity_ratio(len(found_pair_set), len(test_record_dict))

9.685205379861868

In [31]:
from entity_embed.evaluation import precision_and_recall

precision_and_recall(found_pair_set, test_pos_pair_set)

(0.023044587899714756, 0.7892030848329049)