## Load Dataset

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from importlib import reload
import logging
reload(logging)
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%H:%M:%S')

In [3]:
# libgomp issue, must import n2 before torch
from n2 import HnswIndex

In [4]:
import sys

sys.path.insert(0, '..')

In [5]:
import os
home_dir = os.getenv('HOME')

https://dbs.uni-leipzig.de/research/projects/object_matching/benchmark_datasets_for_entity_resolution

In [6]:
from collections import defaultdict
import itertools

def Enumerator(start=0, initial=()):
    return defaultdict(itertools.count(start).__next__, initial)

In [7]:
import glob
import csv
from tqdm.auto import tqdm

id_enumerator = Enumerator()
row_dict = {}
left_id_set = set()
right_id_set = set()
rows_total = 1363 + 3226
clusters_total = 1300

with tqdm(total=rows_total) as pbar:
    with open(f'{home_dir}/Downloads//Amazon-GoogleProducts/Amazon.csv', encoding="latin1") as f:
        for row in csv.DictReader(f):
            row['id'] = id_enumerator[row["id"]]
            row['name'] = row.pop('title')
            row['source'] = 'google'
            row_dict[row['id']] = row
            left_id_set.add(row['id'])
            pbar.update(1)
    
    with open(f'{home_dir}/Downloads/Amazon-GoogleProducts/GoogleProducts.csv', encoding="latin1") as f:
        for row in csv.DictReader(f):
            row['id'] = id_enumerator[row["id"]]
            row['source'] = 'amazon'
            row_dict[row['id']] = row
            right_id_set.add(row['id'])
            pbar.update(1)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4589.0), HTML(value='')))




In [8]:
true_pair_set = set()

with open(f'{home_dir}/Downloads/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv') as f:
    for row in csv.DictReader(f):
        id_left = id_enumerator[row['idAmazon']]
        id_right = id_enumerator[row['idGoogleBase']]
        true_pair_set.add(tuple(sorted([id_left, id_right])))

len(true_pair_set)

1300

In [9]:
from entity_embed.data_utils.utils import id_pairs_to_cluster_mapping_and_dict

cluster_mapping, cluster_dict = id_pairs_to_cluster_mapping_and_dict(true_pair_set)
len(cluster_mapping)

2404

In [10]:
len(cluster_dict)

1105

In [11]:
# TODO: deal with this difference
# from entity_embed.data_utils.utils import cluster_dict_to_id_pairs

# assert len(true_pair_set - cluster_dict_to_id_pairs(cluster_dict)) == 0

In [12]:
cluster_attr = 'cluster_id'
max_cluster_id = max(cluster_mapping.values())

for row_id, row in tqdm(row_dict.items()):
    try:
        row[cluster_attr] = cluster_mapping[row_id]
    except KeyError:
        row[cluster_attr] = max_cluster_id
        max_cluster_id += 1

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4589.0), HTML(value='')))




In [13]:
[row_dict[row_id] for row_id in next(iter(true_pair_set))]

[{'id': 938,
  'description': 'improve your typing skills today! typing instructor deluxe has a progressive design that has been developed for over 19 years. typing instructor deluxe can provide the right lessons tests strengthening exercises practice material and typing games for your skill level. you can even build your own personal typing plan to focus on specific areas you would like to improve. if you think learning has to be all hard work and no fun think again! for beginning to advanced typists kids to adults typing instructor deluxe will motivate you to improve your typing speed and accuracy using a travel theme and exciting typing challenges.educates entertains and motivates: choose from many typing plans or build your ownnavigate easily and choose your typing materialnew! dynamic learning methodsave your results and reports to track progresslearn voice-touch typing (dictation)3 unique travel themescolorful photos and musicten exciting games300+ magazine articles',
  'manufact

## Preprocess

In [14]:
attr_list = ['name', 'description', 'manufacturer', 'price']

In [15]:
import unidecode
from entity_embed.data_utils.one_hot_encoders import default_tokenizer

def clean_str(s):
    s = unidecode.unidecode(s).lower().strip()
    s_tokens = itertools.islice((s_part[:30] for s_part in default_tokenizer(s)), 0, 30)
    return ' '.join(s_tokens)[:300]

for row in tqdm(row_dict.values()):
    for attr in attr_list:
        row[attr] = clean_str(row[attr])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4589.0), HTML(value='')))




## Init Data Module

In [16]:
import torch
import numpy as np

random_seed = 42
torch.manual_seed(random_seed)
np.random.seed(random_seed)

In [17]:
alphabet = list('0123456789abcdefghijklmnopqrstuvwxyz!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ ')

In [18]:
attr_info_dict = {
    'name': {
        'is_multitoken': True,
        'tokenizer': default_tokenizer,
        'alphabet': alphabet,
        'max_str_len': None,  # compute
    },
    'description': {
        'is_multitoken': True,
        'tokenizer': default_tokenizer,
        'alphabet': alphabet,
        'max_str_len': None,  # compute
    },
    'manufacturer': {
        'is_multitoken': True,
        'tokenizer': default_tokenizer,
        'alphabet': alphabet,
        'max_str_len': None,  # compute
    },
    'price': {
        'is_multitoken': False,
        'tokenizer': None,
        'alphabet': alphabet,  # compute
        'max_str_len': None,  # compute
    }
}

In [19]:
from entity_embed import build_row_encoder

row_encoder = build_row_encoder(attr_info_dict, row_dict=row_dict)
row_encoder.attr_info_dict

20:17:00 INFO:For attr='name', computing actual alphabet and max_str_len
20:17:00 INFO:For attr='name', using actual_max_str_len=26
20:17:00 INFO:For attr='description', computing actual alphabet and max_str_len
20:17:00 INFO:actual_max_str_len=29 must be pair to enable NN pooling. Updating to 30
20:17:00 INFO:For attr='description', using actual_max_str_len=30
20:17:00 INFO:For attr='manufacturer', computing actual alphabet and max_str_len
20:17:00 INFO:actual_max_str_len=15 must be pair to enable NN pooling. Updating to 16
20:17:00 INFO:For attr='manufacturer', using actual_max_str_len=16
20:17:00 INFO:For attr='price', computing actual alphabet and max_str_len
20:17:00 INFO:For attr='price', using actual_max_str_len=14


{'name': OneHotEncodingInfo(is_multitoken=True, tokenizer=<function default_tokenizer at 0x7f78c9555d30>, alphabet=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', ' '], max_str_len=26),
 'description': OneHotEncodingInfo(is_multitoken=True, tokenizer=<function default_tokenizer at 0x7f78c9555d30>, alphabet=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', ' '], max_str_len=30),
 'manufacturer': OneHotEncodingIn

In [20]:
from entity_embed import LinkageDataModule

train_cluster_len = 200
valid_cluster_len = 200
datamodule = LinkageDataModule(
    row_dict=row_dict,
    cluster_attr=cluster_attr,
    row_encoder=row_encoder,
    pos_pair_batch_size=45,
    neg_pair_batch_size=1225,
    row_batch_size=16,
    train_cluster_len=train_cluster_len,
    valid_cluster_len=valid_cluster_len,
    test_cluster_len=clusters_total - valid_cluster_len - train_cluster_len,
    only_plural_clusters=True,
    left_id_set=left_id_set,
    right_id_set=right_id_set,
    log_empty_vals=False,
    random_seed=random_seed
)

## Training

In [21]:
from entity_embed import LinkageEmbed

ann_k = 100
sim_threshold = 0.3
model = LinkageEmbed(
    datamodule,
    ann_k=ann_k,
    sim_threshold=sim_threshold,
    use_mask=True
)

In [22]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

max_epochs = 50
early_stop_callback = EarlyStopping(
   monitor='valid_recall',
   min_delta=0.00,
   patience=10,
   verbose=True,
   mode='max'
)
tb_log_dir = 'tb_logs'
tb_name = 'amzn-googl'
trainer = pl.Trainer(
    gpus=1,
    max_epochs=max_epochs,
    check_val_every_n_epoch=1,
    callbacks=[early_stop_callback],
    logger=TensorBoardLogger(tb_log_dir, name=tb_name)
)

20:17:00 INFO:GPU available: True, used: True
20:17:00 INFO:TPU available: None, using: 0 TPU cores
20:17:00 INFO:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


In [23]:
trainer.fit(model, datamodule)

20:17:00 INFO:Train pair count: 292
20:17:00 INFO:Valid pair count: 265
20:17:00 INFO:Test pair count: 998
20:17:02 INFO:
  | Name        | Type           | Params
-----------------------------------------------
0 | blocker_net | BlockerNet     | 3.3 M 
1 | losser      | NTXentLoss     | 0     
2 | miner       | BatchHardMiner | 0     
-----------------------------------------------
3.3 M     Trainable params
0         Non-trainable params
3.3 M     Total params


HBox(children=(HTML(value='Validation sanity check'), FloatProgress(value=1.0, bar_style='info', layout=Layout…

HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…




1

In [24]:
model.blocker_net.get_signature_weights()

{'name': 0.293094664812088,
 'description': 0.24212001264095306,
 'manufacturer': 0.22635327279567719,
 'price': 0.23843204975128174}

## Testing

In [25]:
trainer.test(ckpt_path='best')

20:17:40 INFO:Train pair count: 292
20:17:40 INFO:Valid pair count: 265
20:17:40 INFO:Test pair count: 998


HBox(children=(HTML(value='Testing'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_f1': 0.04090143958998065,
 'test_pair_entity_ratio': 25.33745123537061,
 'test_precision': 0.02088839847057918,
 'test_recall': 0.9760191846522782}
--------------------------------------------------------------------------------


[{'test_precision': 0.02088839847057918,
  'test_recall': 0.9760191846522782,
  'test_f1': 0.04090143958998065,
  'test_pair_entity_ratio': 25.33745123537061}]

## Testing manually 

In [26]:
# Only call this if test above wasn't run
# datamodule.setup(stage='test')

In [27]:
test_row_dict = datamodule.test_row_dict
test_vector_dict = model.predict(
    row_dict=test_row_dict,
    batch_size=16
)

HBox(children=(HTML(value='# batch embedding'), FloatProgress(value=0.0, max=97.0), HTML(value='')))




In [28]:
embedding_size = model.blocker_net.embedding_size
test_true_pair_set = datamodule.test_true_pair_set

In [29]:
assert len(test_vector_dict) == len(test_row_dict)

In [30]:
%%time

from entity_embed import ANNLinkageIndex

ann_index = ANNLinkageIndex(embedding_size=embedding_size)
test_left_vector_dict, test_right_vector_dict = datamodule.separate_dict_left_right(test_vector_dict)
ann_index.insert_vector_dict(left_vector_dict=test_left_vector_dict, right_vector_dict=test_right_vector_dict)
ann_index.build()

CPU times: user 329 ms, sys: 5.52 ms, total: 335 ms
Wall time: 51.1 ms


In [31]:
%%time

found_pair_set = ann_index.search_pairs(
    k=ann_k,
    sim_threshold=sim_threshold,
    left_vector_dict=test_left_vector_dict,
    right_vector_dict=test_right_vector_dict,
)

CPU times: user 638 ms, sys: 1.23 ms, total: 639 ms
Wall time: 110 ms


In [32]:
from entity_embed.evaluation import pair_entity_ratio

pair_entity_ratio(len(found_pair_set), len(test_row_dict))

25.33745123537061

In [33]:
from entity_embed.evaluation import precision_and_recall

precision_and_recall(found_pair_set, test_true_pair_set)

(0.02088839847057918, 0.9760191846522782)

In [34]:
false_positives = list(found_pair_set - test_true_pair_set)
len(false_positives)

38155

In [35]:
false_negatives = list(test_true_pair_set - found_pair_set)
len(false_negatives)

20

In [36]:
cos_similarity = lambda a, b: np.dot(a, b)

In [37]:
for (id_left, id_right) in false_negatives[:10]:
    display(
        (
            cos_similarity(test_vector_dict[id_left], test_vector_dict[id_right]),
            row_dict[id_left], row_dict[id_right]
        )
    )

(0.2991227,
 {'id': 1183,
  'description': 'create the perfect pet for your sims to train play with and love throughout a lifetime . pick everything from their paws to their personality . choose from dozens of',
  'manufacturer': 'aspyr media',
  'price': '34 . 99',
  'name': 'sims 2 pets expansion pack',
  'source': 'google',
  'cluster_id': 1183},
 {'id': 2255,
  'name': 'sims 2 pets for mac',
  'description': 'system requirements : requires the full version of the sims 2 for mac os x to play operating system : mac os x 10 . 3 . 9 or later',
  'manufacturer': '',
  'price': '34 . 99',
  'source': 'amazon',
  'cluster_id': 1183})

(0.10590295,
 {'id': 125,
  'description': '',
  'manufacturer': 'compaq computer',
  'price': '0',
  'name': 'compaq comp . rapid deployment pk - flexible lic kit ( 302127 - b21 )',
  'source': 'google',
  'cluster_id': 125},
 {'id': 3231,
  'name': 'hewlett packard 302127 - b21 prol essentials rdp v1 . x 1u flex lic min qty 5',
  'description': 'prol essentials rdp v1 . x 1u flex lic min qty 5',
  'manufacturer': '',
  'price': '119 . 73',
  'source': 'amazon',
  'cluster_id': 125})

(0.28858703,
 {'id': 248,
  'description': 'sbs cal 03 20 clt adpk devi',
  'manufacturer': 'microsoft software',
  'price': '0',
  'name': 'microsoft windows small business server cal 2003 license pack 20 client addpack device',
  'source': 'google',
  'cluster_id': 248},
 {'id': 2996,
  'name': 'windows sbs cal 2003 20 - clt addpak device cal - microsoft - t74 - 00003',
  'description': "small businesses are doing more with less in today ' s business environment . information technology ( it ) professionals can help small businesses do more by deploying windows small",
  'manufacturer': '',
  'price': '1413 . 42 gbp',
  'source': 'amazon',
  'cluster_id': 248})

(0.22127046,
 {'id': 1208,
  'description': "tune tools for ipod lets you ove your songs from your ipod to another machine . you love your ipod but have you discovered that you can ' t move",
  'manufacturer': 'valusoft',
  'price': '19 . 99',
  'name': 'tune tools for ipod ( win / mac )',
  'source': 'google',
  'cluster_id': 1208},
 {'id': 1903,
  'name': 'valusoft tune tools for ipod',
  'description': 'windows : windows 2000 / xp pentium 500 mhz 128 mb ram 100 mb free hard - disk space cd - rom drive working usb 2 . 0 or firewire',
  'manufacturer': '',
  'price': '21 . 99',
  'source': 'amazon',
  'cluster_id': 1208})

(-0.0022251802,
 {'id': 845,
  'description': "iplaymusic ' s beginner guitar lessons is the first guitar learning program optimized for the mac and video ipod . the software takes advantage of ilife applications resident on today",
  'manufacturer': 'iplaymusic',
  'price': '49 . 99',
  'name': 'iplaymusic beginner guitar lessons for the mac and ipod',
  'source': 'google',
  'cluster_id': 845},
 {'id': 2324,
  'name': "wingnuts 2 : raina ' s revenge",
  'description': 'system requirements : mac os x 10 . 4 + g4 / g5 / intel 800 + mhz cpu 512 mb ram 32 mb video card 950 mb hard drive',
  'manufacturer': '',
  'price': '28 . 99',
  'source': 'amazon',
  'cluster_id': 845})

(0.2885472,
 {'id': 870,
  'description': '- marketing information : tinyterm provides accurate emulation and total flexibility . now you can use your desktop pc to access all your legacy data and applications . with tinyterm',
  'manufacturer': 'century software',
  'price': '219 . 63',
  'name': 'tiny term emulator v4 . 3x',
  'source': 'google',
  'cluster_id': 870},
 {'id': 3851,
  'name': 'century software tt - 1 - century tinyterm v . 4 . 3x - emulation - 1 user ( s ) - english french german spanish italian polish portuguese -',
  'description': 'century software tt - 1 : tinyterm provides accurate emulation and total flexibility . now you can use your desktop pc to access all your legacy data and applications .',
  'manufacturer': '',
  'price': '120 . 97',
  'source': 'amazon',
  'cluster_id': 870})

(0.23250811,
 {'id': 66,
  'description': 'children - ages 3 to 8 everyone knows theres no such thing as ghosts .. so who is haunting the schoolhouse taking the toys and practically scaring the scales off',
  'manufacturer': 'humongous entertainment',
  'price': '29 . 95',
  'name': 'freddi fish 2 : the haunted schoolhouse',
  'source': 'google',
  'cluster_id': 66},
 {'id': 2937,
  'name': 'freddi fish 2 - case of haunted schl hse',
  'description': "children - ages 3 to 8 everyone knows there ' s no such thing as ghosts ... so who is haunting the schoolhouse taking the toys & practically scaring the",
  'manufacturer': '',
  'price': '7 . 5',
  'source': 'amazon',
  'cluster_id': 66})

(0.0718785,
 {'id': 529,
  'description': "zoo tycoon 2 : marine mania is a great new expansion where you ' ll help your zoo make a big splash ! add killer whales manta rays and other",
  'manufacturer': 'microsoft',
  'price': '19 . 99',
  'name': 'zoo tycoon 2 : marine mania expansion',
  'source': 'google',
  'cluster_id': 529},
 {'id': 2963,
  'name': 'zoo tycoon for windows',
  'description': 'everybody likes the zoo . and why not casual walks exotic animals and fried foods are good fun . managing a zoo of your very own is even better .',
  'manufacturer': '',
  'price': '25 . 99',
  'source': 'amazon',
  'cluster_id': 529})

(0.24428658,
 {'id': 799,
  'description': '- marketing information : hp digital sending software 4 . 0 improves core business processes . digital sending streamlines critical business document handling and integrates with existing it infrastructures to',
  'manufacturer': 'hewlett packard ( consumables )',
  'price': '630 . 36',
  'name': 'hp dss software - ( v . 4 . 0 ) - complete package ( t1936aa ua0 )',
  'source': 'google',
  'cluster_id': 799},
 {'id': 3955,
  'name': 'hp t1936aa uao digital sending software 4 . 0 ( 10 device license )',
  'description': 'hp dss 4 . 0 is the entry - level member of a family of server - based software products that enables paper documents to be incorporated into electronic business',
  'manufacturer': 'hp',
  'price': '369 . 99',
  'source': 'amazon',
  'cluster_id': 799})

(0.29537284,
 {'id': 475,
  'description': '- marketing information : tinyterm is a powerful terminal emulation software applications designed for the mixed - host enterprise . they connect windows users to host applications running on ibm',
  'manufacturer': 'century software',
  'price': '705 . 21',
  'name': 'tinyterm v4 . x 5u windows terminal emulation ( tt - 5 )',
  'source': 'google',
  'cluster_id': 475},
 {'id': 3825,
  'name': 'century software tt - 5 - century tinyterm v . 4 . 2x - complete product - emulation - 5 user ( s ) - complete product - standard -',
  'description': 'century software tt - 5 : tinyterm is a powerful terminal emulation software applications designed for the mixed - host enterprise . they connect windows users to host applications running',
  'manufacturer': '',
  'price': '395 . 97',
  'source': 'amazon',
  'cluster_id': 475})