## Load Dataset

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from importlib import reload
import logging
reload(logging)
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%H:%M:%S')

In [3]:
# libgomp issue, must import n2 before torch
from n2 import HnswIndex

In [4]:
import sys

sys.path.insert(0, '..')

In [5]:
import os
home_dir = os.getenv('HOME')

https://dbs.uni-leipzig.de/research/projects/object_matching/benchmark_datasets_for_entity_resolution

In [6]:
from collections import defaultdict
import itertools

def Enumerator(start=0, initial=()):
    return defaultdict(itertools.count(start).__next__, initial)

In [7]:
import glob
import csv
from tqdm.auto import tqdm

id_enumerator = Enumerator()
row_dict = {}
left_id_set = set()
right_id_set = set()
rows_total = 1363 + 3226
clusters_total = 1300

with tqdm(total=rows_total) as pbar:
    with open(f'{home_dir}/Downloads//Amazon-GoogleProducts/Amazon.csv', encoding="latin1") as f:
        for row in csv.DictReader(f):
            row['id'] = id_enumerator[row["id"]]
            row['name'] = row.pop('title')
            row['source'] = 'google'
            row_dict[row['id']] = row
            left_id_set.add(row['id'])
            pbar.update(1)
    
    with open(f'{home_dir}/Downloads/Amazon-GoogleProducts/GoogleProducts.csv', encoding="latin1") as f:
        for row in csv.DictReader(f):
            row['id'] = id_enumerator[row["id"]]
            row['source'] = 'amazon'
            row_dict[row['id']] = row
            right_id_set.add(row['id'])
            pbar.update(1)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4589.0), HTML(value='')))




In [8]:
true_pair_set = set()

with open(f'{home_dir}/Downloads/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv') as f:
    for row in csv.DictReader(f):
        id_left = id_enumerator[row['idAmazon']]
        id_right = id_enumerator[row['idGoogleBase']]
        true_pair_set.add(tuple(sorted([id_left, id_right])))

len(true_pair_set)

1300

In [9]:
from entity_embed.data_utils.utils import id_pairs_to_cluster_mapping_and_dict

cluster_mapping, cluster_dict = id_pairs_to_cluster_mapping_and_dict(true_pair_set)
len(cluster_mapping)

2404

In [10]:
len(cluster_dict)

1105

In [11]:
# TODO: deal with this difference
# from entity_embed.data_utils.utils import cluster_dict_to_id_pairs

# assert len(true_pair_set - cluster_dict_to_id_pairs(cluster_dict)) == 0

In [12]:
cluster_attr = 'cluster_id'
max_cluster_id = max(cluster_mapping.values())

for row_id, row in tqdm(row_dict.items()):
    try:
        row[cluster_attr] = cluster_mapping[row_id]
    except KeyError:
        row[cluster_attr] = max_cluster_id
        max_cluster_id += 1

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4589.0), HTML(value='')))




In [13]:
[row_dict[row_id] for row_id in next(iter(true_pair_set))]

[{'id': 938,
  'description': 'improve your typing skills today! typing instructor deluxe has a progressive design that has been developed for over 19 years. typing instructor deluxe can provide the right lessons tests strengthening exercises practice material and typing games for your skill level. you can even build your own personal typing plan to focus on specific areas you would like to improve. if you think learning has to be all hard work and no fun think again! for beginning to advanced typists kids to adults typing instructor deluxe will motivate you to improve your typing speed and accuracy using a travel theme and exciting typing challenges.educates entertains and motivates: choose from many typing plans or build your ownnavigate easily and choose your typing materialnew! dynamic learning methodsave your results and reports to track progresslearn voice-touch typing (dictation)3 unique travel themescolorful photos and musicten exciting games300+ magazine articles',
  'manufact

## Preprocess

In [14]:
attr_list = ['name', 'description', 'manufacturer', 'price']

In [15]:
import unidecode
from entity_embed.data_utils.one_hot_encoders import default_tokenizer

def clean_str(s):
    s = unidecode.unidecode(s).lower().strip()
    s_tokens = itertools.islice((s_part[:30] for s_part in default_tokenizer(s)), 0, 30)
    return ' '.join(s_tokens)[:300]

for row in tqdm(row_dict.values()):
    for attr in attr_list:
        row[attr] = clean_str(row[attr])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4589.0), HTML(value='')))




## Init Data Module

In [16]:
import torch
import numpy as np

random_seed = 42
torch.manual_seed(random_seed)
np.random.seed(random_seed)

In [17]:
alphabet = list('0123456789abcdefghijklmnopqrstuvwxyz!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ ')

In [18]:
attr_info_dict = {
    'name': {
        'is_multitoken': True,
        'tokenizer': default_tokenizer,
        'alphabet': alphabet,
        'max_str_len': None,  # compute
    },
    'description': {
        'is_multitoken': True,
        'tokenizer': default_tokenizer,
        'alphabet': alphabet,
        'max_str_len': None,  # compute
    },
    'manufacturer': {
        'is_multitoken': True,
        'tokenizer': default_tokenizer,
        'alphabet': alphabet,
        'max_str_len': None,  # compute
    },
    'price': {
        'is_multitoken': False,
        'tokenizer': None,
        'alphabet': alphabet,  # compute
        'max_str_len': None,  # compute
    }
}

In [19]:
from entity_embed import build_row_encoder

row_encoder = build_row_encoder(attr_info_dict, row_dict=row_dict)
row_encoder.attr_info_dict

19:31:04 INFO:For attr='name', computing actual alphabet and max_str_len
19:31:04 INFO:For attr='name', using actual_max_str_len=26
19:31:04 INFO:For attr='description', computing actual alphabet and max_str_len
19:31:04 INFO:actual_max_str_len=29 must be pair to enable NN pooling. Updating to 30
19:31:04 INFO:For attr='description', using actual_max_str_len=30
19:31:04 INFO:For attr='manufacturer', computing actual alphabet and max_str_len
19:31:04 INFO:actual_max_str_len=15 must be pair to enable NN pooling. Updating to 16
19:31:04 INFO:For attr='manufacturer', using actual_max_str_len=16
19:31:04 INFO:For attr='price', computing actual alphabet and max_str_len
19:31:04 INFO:For attr='price', using actual_max_str_len=14


{'name': OneHotEncodingInfo(is_multitoken=True, tokenizer=<function default_tokenizer at 0x7f94565e7ca0>, alphabet=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', ' '], max_str_len=26),
 'description': OneHotEncodingInfo(is_multitoken=True, tokenizer=<function default_tokenizer at 0x7f94565e7ca0>, alphabet=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', ' '], max_str_len=30),
 'manufacturer': OneHotEncodingIn

In [20]:
from entity_embed import LinkageDataModule

train_cluster_len = 500
valid_cluster_len = 500
datamodule = LinkageDataModule(
    row_dict=row_dict,
    cluster_attr=cluster_attr,
    row_encoder=row_encoder,
    pos_pair_batch_size=45,
    neg_pair_batch_size=1225,
    row_batch_size=16,
    train_cluster_len=train_cluster_len,
    valid_cluster_len=valid_cluster_len,
    test_cluster_len=clusters_total - valid_cluster_len - train_cluster_len,
    only_plural_clusters=True,
    left_id_set=left_id_set,
    right_id_set=right_id_set,
    log_empty_vals=False,
    random_seed=random_seed
)

## Training

In [21]:
from entity_embed import LinkageEmbed

ann_k = 100
sim_threshold = 0.3
model = LinkageEmbed(
    datamodule,
    ann_k=ann_k,
    sim_threshold=sim_threshold,
    use_mask=True
)

In [22]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

max_epochs = 50
early_stop_callback = EarlyStopping(
   monitor='valid_recall',
   min_delta=0.00,
   patience=10,
   verbose=True,
   mode='max'
)
tb_log_dir = 'tb_logs'
tb_name = 'amzn-googl'
trainer = pl.Trainer(
    gpus=1,
    max_epochs=max_epochs,
    check_val_every_n_epoch=1,
    callbacks=[early_stop_callback],
    logger=TensorBoardLogger(tb_log_dir, name=tb_name)
)

19:31:04 INFO:GPU available: True, used: True
19:31:04 INFO:TPU available: None, using: 0 TPU cores
19:31:04 INFO:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


In [23]:
trainer.fit(model, datamodule)

19:31:04 INFO:Train pair count: 735
19:31:04 INFO:Valid pair count: 687
19:31:04 INFO:Test pair count: 133
19:31:06 INFO:
  | Name        | Type           | Params
-----------------------------------------------
0 | blocker_net | BlockerNet     | 3.3 M 
1 | losser      | NTXentLoss     | 0     
2 | miner       | BatchHardMiner | 0     
-----------------------------------------------
3.3 M     Trainable params
0         Non-trainable params
3.3 M     Total params


HBox(children=(HTML(value='Validation sanity check'), FloatProgress(value=1.0, bar_style='info', layout=Layout…

HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…




1

In [24]:
model.blocker_net.get_signature_weights()

{'name': 0.31021249294281006,
 'description': 0.24755840003490448,
 'manufacturer': 0.2097455859184265,
 'price': 0.23248352110385895}

## Testing

In [25]:
trainer.test(ckpt_path='best')

19:32:14 INFO:Train pair count: 735
19:32:14 INFO:Valid pair count: 687
19:32:14 INFO:Test pair count: 133


HBox(children=(HTML(value='Testing'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_f1': 0.2885085574572127,
 'test_pair_entity_ratio': 3.071748878923767,
 'test_precision': 0.17226277372262774,
 'test_recall': 0.8872180451127819}
--------------------------------------------------------------------------------


[{'test_precision': 0.17226277372262774,
  'test_recall': 0.8872180451127819,
  'test_f1': 0.2885085574572127,
  'test_pair_entity_ratio': 3.071748878923767}]

## Testing manually 

In [26]:
# Only call this if test above wasn't run
# datamodule.setup(stage='test')

In [27]:
test_row_dict = datamodule.test_row_dict
test_vector_dict = model.predict(
    row_dict=test_row_dict,
    batch_size=16
)

HBox(children=(HTML(value='# batch embedding'), FloatProgress(value=0.0, max=14.0), HTML(value='')))




In [28]:
embedding_size = model.blocker_net.embedding_size
test_true_pair_set = datamodule.test_true_pair_set

In [29]:
assert len(test_vector_dict) == len(test_row_dict)

In [30]:
%%time

from entity_embed import ANNLinkageIndex

ann_index = ANNLinkageIndex(embedding_size=embedding_size)
test_left_vector_dict, test_right_vector_dict = datamodule.separate_dict_left_right(test_vector_dict)
ann_index.insert_vector_dict(left_vector_dict=test_left_vector_dict, right_vector_dict=test_right_vector_dict)
ann_index.build()

CPU times: user 164 ms, sys: 0 ns, total: 164 ms
Wall time: 21.8 ms


In [31]:
%%time

found_pair_set = ann_index.search_pairs(
    k=ann_k,
    sim_threshold=sim_threshold,
    left_vector_dict=test_left_vector_dict,
    right_vector_dict=test_right_vector_dict,
)

CPU times: user 73.6 ms, sys: 0 ns, total: 73.6 ms
Wall time: 10.8 ms


In [32]:
from entity_embed.evaluation import pair_entity_ratio

pair_entity_ratio(len(found_pair_set), len(test_row_dict))

3.071748878923767

In [33]:
from entity_embed.evaluation import precision_and_recall

precision_and_recall(found_pair_set, test_true_pair_set)

(0.17226277372262774, 0.8872180451127819)

In [34]:
false_positives = list(found_pair_set - test_true_pair_set)
len(false_positives)

567

In [35]:
false_negatives = list(test_true_pair_set - found_pair_set)
len(false_negatives)

15

In [36]:
cos_similarity = lambda a, b: np.dot(a, b)

In [37]:
for (id_left, id_right) in false_negatives[:10]:
    display(
        (
            cos_similarity(test_vector_dict[id_left], test_vector_dict[id_right]),
            row_dict[id_left], row_dict[id_right]
        )
    )

(0.62557834,
 {'id': 1943,
  'name': 'adobe dreamweaver cs3 academic',
  'description': 'system requirements powerpc g4 or g5 or intel core processor mac os x v10 . 4 . 8 512mb ram ( 1gb recommended ) 1 . 4gb free hard -',
  'manufacturer': '',
  'price': '195 . 99',
  'source': 'amazon',
  'cluster_id': 1199},
 {'id': 3026,
  'name': 'adobe 38040450 - dreamweaver cs3 - complete product - web development - 1 user - complete product - academic - universal english - mac intel - based mac',
  'description': 'adobe 38040450 : quickly and easily design develop and maintain websites and web applications - from start to finish - with adobe dreamweaver cs3 software . built for both designers',
  'manufacturer': '',
  'price': '182 . 97',
  'source': 'amazon',
  'cluster_id': 1199})

(0.93099445,
 {'id': 1732,
  'name': 'adobe cs3 design premium upsell',
  'description': 'system requirements powerpc g4 or g5 or intel core processor mac os x v10 . 4 . 8 java runtime environment 1 . 5 1gb ram 6 . 3gb free',
  'manufacturer': '',
  'price': '1639 . 99',
  'source': 'amazon',
  'cluster_id': 597},
 {'id': 1884,
  'name': 'adobe cs3 design premium',
  'description': 'system requirements powerpc g4 or g5 or intel core processor mac os x v10 . 4 . 8 java runtime environment 1 . 5 1gb ram 6 . 3gb free',
  'manufacturer': '',
  'price': '1865 . 99',
  'source': 'amazon',
  'cluster_id': 597})

(0.6772689,
 {'id': 4063,
  'name': 'global software a1055 i love the usa',
  'description': "i love the usa get packed ! you ' re needed for a secret mission . and you ' ll track down clues from coast to coast to solve the",
  'manufacturer': 'global software',
  'price': '11 . 99',
  'source': 'amazon',
  'cluster_id': 836},
 {'id': 4132,
  'name': 'global software a1055 - i love the usa ( win 95 98 me nt 2000 xp / mac 8 . 6 - 9 . x ( classic ) x v10',
  'description': 'global software a1055 : get packed ! you re needed for a secret mission . and you ll track down clues from coast to coast to solve the mystery .',
  'manufacturer': '',
  'price': '8 . 79',
  'source': 'amazon',
  'cluster_id': 836})

(0.64615613,
 {'id': 3299,
  'name': "kutoka interactive 61208 mia ' s math adventure : just in time !",
  'description': "mia ' s math adventure tells a captivating story with educational activities . games focus on developing math skills such as fractions geometry logic and mental computation . oh no",
  'manufacturer': 'kutoka interactive',
  'price': '24 . 99',
  'source': 'amazon',
  'cluster_id': 8},
 {'id': 3691,
  'name': 'kutoka interactive 61208 - mias math adventure ( just in time ) ( win 95 98 me 2000 xp / mac 8 . 6 - 9 . x ( classic',
  'description': 'kutoka interactive 61208 : mia s math adventure proposes a captivating story including numerous educational activities all of which focus on developing math skills such as fractions geometry logic mental',
  'manufacturer': '',
  'price': '18 . 97',
  'source': 'amazon',
  'cluster_id': 8})

(0.8329443,
 {'id': 2322,
  'name': 'microsoft windows vista business retail no open box returns',
  'description': 'system requirements pc with pentium 1 ghz 32 - bit ( x86 ) or 64 - bit ( x64 ) processor 1 gb of ram or more recommended 40 gb',
  'manufacturer': '',
  'price': '299 . 99',
  'source': 'amazon',
  'cluster_id': 584},
 {'id': 4181,
  'name': 'microsoft windows vista business ( pc )',
  'description': 'key features : for business efficiency reliability and security easy navigation enhanced connectivity ...',
  'manufacturer': '',
  'price': '299 . 99',
  'source': 'amazon',
  'cluster_id': 584})

(0.5326197,
 {'id': 3347,
  'name': 'sonicwall gms 1000n - incremental lic upg',
  'description': 'today enterprises and service providers face increasing security challenges in their distributed networks from security and virus attacks to enforcing security policies . as a distributed network grows and branches',
  'manufacturer': '',
  'price': '62920 . 89',
  'source': 'amazon',
  'cluster_id': 17},
 {'id': 3348,
  'name': 'sonicwall gms 1000 upgrade',
  'description': 'sonicwall global management system ( sonicwall gms ) enables distributed enterprises and service providers to manage and monitor thousands of sonicwall internet security appliances from a central location with a',
  'manufacturer': '',
  'price': '63074 . 12',
  'source': 'amazon',
  'cluster_id': 17})

(0.48137528,
 {'id': 1917,
  'name': 'onone software plug - in suite - full',
  'description': 'the photoshop plug - in suite is a collection of software plug - ins that get you back to shooting . for image resizing color correction masking and border effects',
  'manufacturer': '',
  'price': '393 . 68',
  'source': 'amazon',
  'cluster_id': 958},
 {'id': 3808,
  'name': 'onone software pps - 30211 - on1 plug - in suite v . 3 . 0 for adobe photoshop - complete product - image collection / editing / archive -',
  'description': 'onone software pps - 30211 : plug - in suite 3 combines 4 essential tools that save you time and money so you can get back to shooting . plug',
  'manufacturer': '',
  'price': '365 . 97',
  'source': 'amazon',
  'cluster_id': 958})

(0.89121866,
 {'id': 1626,
  'name': 'adobe photoshop cs3 extended for mac',
  'description': 'system requirements powerpc g4 or g5 or intel core processor mac os x v10 . 4 . 8 512mb ram 64mb vram 2gb free hard - disk space ( additional',
  'manufacturer': '',
  'price': '935 . 99',
  'source': 'amazon',
  'cluster_id': 264},
 {'id': 3969,
  'name': 'adobe photoshop cs3 extended software full version for macintosh',
  'description': 'adorama camera : ideal for film video and multimedia professionals and graphic and web designers using 3d and motion as well as professionals in engineering and science adobe photoshop cs3',
  'manufacturer': '',
  'price': '969',
  'source': 'amazon',
  'cluster_id': 264})

(0.7179624,
 {'id': 2790,
  'name': 'punch software 85100 - punch ! master landscape pro v10 and home design',
  'description': 'punch software 85100 : the new ! master landscape pro and home design v10 includes a more robust interface for editing specifying precise dimensions displaying options and more . new',
  'manufacturer': '',
  'price': '59 . 97',
  'source': 'amazon',
  'cluster_id': 1115},
 {'id': 3318,
  'name': 'punch software - 85100 - master landscape professional & home design v 10 . 0',
  'description': 'the new ! master landscape pro and home design v10 includes a more robust interface for editing specifying precise dimensions displaying options and more . new enhancements for pool design',
  'manufacturer': '',
  'price': '61 . 8',
  'source': 'amazon',
  'cluster_id': 1115})

(0.6772689,
 {'id': 4132,
  'name': 'global software a1055 - i love the usa ( win 95 98 me nt 2000 xp / mac 8 . 6 - 9 . x ( classic ) x v10',
  'description': 'global software a1055 : get packed ! you re needed for a secret mission . and you ll track down clues from coast to coast to solve the mystery .',
  'manufacturer': '',
  'price': '8 . 79',
  'source': 'amazon',
  'cluster_id': 836},
 {'id': 4256,
  'name': 'global software a1055 i love the usa',
  'description': "i love the usa get packed ! you ' re needed for a secret mission . and you ' ll track down clues from coast to coast to solve the",
  'manufacturer': 'global software',
  'price': '11 . 99',
  'source': 'amazon',
  'cluster_id': 836})