# Deduplication Baseline with TF-IDF

## Boilerplate

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from importlib import reload
import logging
# Reload the logging module before configuring it. Presumably this is done
# because the kernel may already have handlers on the root logger, in which
# case basicConfig would have no effect — confirm this is still needed.
reload(logging)
# INFO-level output with an HH:MM:SS timestamp, e.g. "10:05:22 INFO:...".
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%H:%M:%S')

In [3]:
import sys

# Put the directory two levels up at the front of the import path so that
# `import entity_embed` resolves to the local checkout (presumably the
# repository root — confirm relative to where this notebook lives).
sys.path.insert(0, '../..')

In [4]:
import entity_embed

In [5]:
import torch
import numpy as np

# One seed for the whole notebook: it seeds torch and numpy here and is also
# passed explicitly to the cluster split and to the NNDescent index.
random_seed = 42
torch.manual_seed(random_seed)
np.random.seed(random_seed)

## Load Dataset

In [6]:
# `import urllib` alone does not guarantee that the `request` submodule is
# loaded; import it explicitly so `urllib.request.urlretrieve` always resolves.
import urllib.request
import tempfile

dataset_url = 'https://www.informatik.uni-leipzig.de/~saeedi/musicbrainz-20-A01.csv.dapo'
# Reserve a temp-file path that survives close() (delete=False). The file is
# removed explicitly with os.remove once the records are loaded and split.
tf = tempfile.NamedTemporaryFile(mode='r', delete=False)
tf.close()

# Download the CSV to the reserved path; the trailing `;` hides the cell output.
urllib.request.urlretrieve(dataset_url, tf.name);

In [7]:
import csv

cluster_field = 'CID'
record_dict = {}

# Load every CSV row into record_dict, keyed by its zero-based row position.
with open(tf.name, newline='') as f:
    reader = csv.DictReader(f)
    for current_record_id, record in enumerate(reader):
        # Set record['id'] to the row position (replacing a same-named CSV
        # column, if any) and convert cluster_field to int.
        record.update({
            'id': current_record_id,
            cluster_field: int(record[cluster_field]),
        })
        record_dict[current_record_id] = record

In [8]:
# Spot-check a single record as loaded, before any text cleaning is applied.
record_dict[83]

{'TID': '84',
 'CID': 9369,
 'CTID': '4',
 'SourceID': '4',
 'id': 83,
 'number': '1',
 'title': '001-Berimbou',
 'length': '2m 23sec',
 'artist': 'Astrud Gilberto',
 'album': 'Look to the Rainbow (2008)',
 'year': 'null',
 'language': ' Eng.'}

In [9]:
# Count the distinct cluster ids present across all loaded records.
cluster_total = len({record[cluster_field] for record in record_dict.values()})
cluster_total

10000

In [10]:
from entity_embed.data_utils import utils

# Split into train/valid/test record dicts. Going by the helper's name and the
# logged cluster counts (1000, 1000, 3000), the proportions apply to clusters
# rather than to individual records, and the remaining 60% presumably becomes
# the test split — confirm against the helper's implementation.
train_record_dict, valid_record_dict, test_record_dict = utils.split_record_dict_on_clusters(
    record_dict=record_dict,
    cluster_field=cluster_field,
    train_proportion=0.2,
    valid_proportion=0.2,
    random_seed=random_seed)

10:05:22 INFO:Singleton cluster sizes (train, valid, test):(1000, 1000, 3000)
10:05:22 INFO:Plural cluster sizes (train, valid, test):(1000, 1000, 3000)


In [11]:
# Number of records (not clusters) in each split: (train, valid, test).
len(train_record_dict), len(valid_record_dict), len(test_record_dict)

(3845, 3876, 11654)

In [12]:
import os

# The temp file was created with delete=False, so it must be removed by hand
# now that the records are held in memory.
os.remove(tf.name)

## Preprocessing

In [13]:
# Text fields that get cleaned and are later joined, in this order, into the
# single string per record that is fed to TF-IDF.
field_list = ['number', 'title', 'artist', 'album', 'year', 'language']

In [14]:
import unidecode

def clean_str(s):
    """Return `s` passed through unidecode, lower-cased, with surrounding whitespace stripped."""
    transliterated = unidecode.unidecode(s)
    return transliterated.lower().strip()

# Clean the text fields in place: the records are mutated rather than copied,
# so every dict that holds these same record objects sees the cleaned values.
for record in record_dict.values():
    record.update({field: clean_str(record[field]) for field in field_list})

In [15]:
# Same record as inspected earlier, restricted to field_list, to verify the cleaning.
utils.subdict(record_dict[83], field_list)

{'number': '1',
 'title': '001-berimbou',
 'artist': 'astrud gilberto',
 'album': 'look to the rainbow (2008)',
 'year': 'null',
 'language': 'eng.'}

## True positive pair sets

In [16]:
# Group each split's records by cluster (train, valid, test — in that order).
train_cluster_dict, valid_cluster_dict, test_cluster_dict = (
    utils.record_dict_to_cluster_dict(split_record_dict, cluster_field)
    for split_record_dict in (train_record_dict, valid_record_dict, test_record_dict)
)

# Derive the id pairs from each split's cluster dict; these serve as the
# ground-truth positive pairs for evaluation.
train_pos_pair_set, valid_pos_pair_set, test_pos_pair_set = (
    utils.cluster_dict_to_id_pairs(split_cluster_dict)
    for split_cluster_dict in (train_cluster_dict, valid_cluster_dict, test_cluster_dict)
)

In [17]:
# Number of true positive pairs in each split: (train, valid, test).
len(train_pos_pair_set), len(valid_pos_pair_set), len(test_pos_pair_set)

(3193, 3256, 9801)

## TF-IDF


In [18]:
def _records_to_text(split_record_dict):
    """Map each record id to its field_list values joined by single spaces."""
    return {
        id_: " ".join(record[f] for f in field_list)
        for id_, record in split_record_dict.items()
    }


train_name_dict = _records_to_text(train_record_dict)
valid_name_dict = _records_to_text(valid_record_dict)
test_name_dict = _records_to_text(test_record_dict)

test_name_dict[83]

'1 001-berimbou astrud gilberto look to the rainbow (2008) null eng.'

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Character-level TF-IDF over n-grams of length 2 to 4; n-grams that occur in
# fewer than 2 documents of the fitting corpus are dropped (min_df=2).
tfidf_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2,4),
    min_df=2
)

In [20]:
import itertools

# Fit the vectorizer on train AND valid strings together (no tuning step uses
# the valid split here). Despite its name, the resulting matrix therefore has
# one row per train record followed by one row per valid record.
train_tfidf_matrix = tfidf_vectorizer.fit_transform(
    itertools.chain(train_name_dict.values(), valid_name_dict.values()))
train_tfidf_matrix

<7721x52124 sparse matrix of type '<class 'numpy.float64'>'
	with 1450315 stored elements in Compressed Sparse Row format>

In [21]:
# Vectorize the test strings with the already-fitted vectorizer (transform
# only, no refit). Rows follow the iteration order of test_name_dict.
test_tfidf_matrix = tfidf_vectorizer.transform(test_name_dict.values())
test_tfidf_matrix

<11654x52124 sparse matrix of type '<class 'numpy.float64'>'
	with 2121492 stored elements in Compressed Sparse Row format>

In [22]:
%%time

import pynndescent

# Build a nearest-neighbour index over the test TF-IDF rows using cosine
# distance; random_state is fixed so that the index construction is seeded.
index = pynndescent.NNDescent(test_tfidf_matrix, random_state=random_seed, metric="cosine")

CPU times: user 3min 15s, sys: 22.2 s, total: 3min 37s
Wall time: 51.4 s


In [23]:
# Inspect the index result: a pair of arrays (neighbor row indices, distances),
# one row per test record.
index.neighbor_graph

(array([[    0,  7339,  7340, ...,   712,  1834,  4256],
        [    1,  6845,  9429, ...,  6308,  9878, 10920],
        [    2,  6869,  4952, ...,  3900,  3748, 11034],
        ...,
        [11651, 11650,  5900, ...,  8553,  5467,  4001],
        [11652, 11653,   866, ...,  2270,  5665,  9133],
        [11653, 11652,  5381, ...,   264,  9131,  1672]]),
 array([[1.19209271e-07, 7.43295037e-01, 7.46758094e-01, ...,
         8.85854316e-01, 8.86087947e-01, 8.86470205e-01],
        [0.00000000e+00, 7.20315974e-01, 7.38866140e-01, ...,
         8.41928307e-01, 8.42788403e-01, 8.45785791e-01],
        [0.00000000e+00, 7.57447585e-01, 7.59738244e-01, ...,
         8.60484660e-01, 8.67965105e-01, 8.69241992e-01],
        ...,
        [0.00000000e+00, 2.28383144e-01, 7.96069282e-01, ...,
         8.81466243e-01, 8.82256843e-01, 8.84302362e-01],
        [1.19209271e-07, 5.64501771e-02, 7.45982821e-01, ...,
         8.80752954e-01, 8.81273372e-01, 8.82306334e-01],
        [0.00000000e+00, 5.645

In [24]:
# Minimum cosine similarity for a neighbor to be kept as a candidate pair.
sim_threshold = 0.3

found_neighbors, found_distances = index.neighbor_graph
# The index reports cosine distances; convert them to similarities.
found_sims = 1 - found_distances
# Row position in the TF-IDF matrix -> record id (same order as test_name_dict).
test_ids = list(test_name_dict.keys())

found_pair_set = set()
for left_id, neighbor_row, sim_row in zip(test_ids, found_neighbors.tolist(), found_sims.tolist()):
    for right_i, sim in zip(neighbor_row, sim_row):
        right_id = test_ids[right_i]
        # Skip self-matches and neighbors below the similarity threshold.
        if left_id != right_id and sim >= sim_threshold:
            # Store each pair with the smaller id first so (a, b) and (b, a) collapse.
            pair = (left_id, right_id) if left_id < right_id else (right_id, left_id)
            found_pair_set.add(pair)

len(found_pair_set)

17787

In [25]:
from entity_embed.evaluation import pair_entity_ratio

# Candidate pairs per test record; the value shown matches 17787 / 11654.
pair_entity_ratio(len(found_pair_set), len(test_record_dict))

1.5262570791144672

In [26]:
from entity_embed.evaluation import precision_and_recall

# Compare the candidate pairs against the ground-truth positive pairs of the
# test split. The result is presumably (precision, recall), in that order, as
# the function name suggests — confirm against the helper.
precision_and_recall(found_pair_set, test_pos_pair_set)

(0.5414628661381908, 0.9826548311396797)