## Load Dataset

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from importlib import reload
import logging
# Re-import the logging module before configuring it. Presumably this clears logging
# state already set up in the kernel so that basicConfig below takes effect — TODO confirm.
reload(logging)
# Emit INFO and above, formatted as "HH:MM:SS LEVEL:message".
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%H:%M:%S')

In [3]:
# libgomp issue, must import n2 before torch
from n2 import HnswIndex

In [4]:
import sys

# Put the parent directory first on the import path; presumably so the local
# entity_embed checkout is imported rather than an installed copy — verify.
sys.path.insert(0, '..')

In [5]:
import os

# Home directory of the current user; the dataset paths in this notebook are built from it.
# os.path.expanduser('~') is used rather than os.getenv('HOME') because getenv returns None
# when HOME is unset, which would silently produce paths such as 'None/Downloads/...'.
home_dir = os.path.expanduser('~')

Dataset source (DBLP-Scholar benchmark for entity resolution): https://dbs.uni-leipzig.de/research/projects/object_matching/benchmark_datasets_for_entity_resolution

In [6]:
from collections import defaultdict
import itertools

def Enumerator(start=0, initial=()):
    """Return a mapping that assigns consecutive integers to unseen keys.

    Looking up a key that is not yet present stores and returns the next
    integer of a counter beginning at ``start``; looking up a known key
    returns the value it already has. ``initial`` pre-populates the mapping
    (anything accepted by ``dict``) and does not advance the counter.
    """
    counter = itertools.count(start)
    # The counter's __next__ is the default factory, so every miss consumes one integer.
    return defaultdict(counter.__next__, initial)

In [7]:
import glob
import csv
from tqdm.auto import tqdm

id_enumerator = Enumerator()  # original (string) id -> dense integer id
row_dict = {}  # integer id -> row dict, for both sources
left_id_set = set()  # integer ids of the DBLP rows
right_id_set = set()  # integer ids of the Scholar rows
# Expected number of rows, used only as the progress-bar total
# (presumably 2616 DBLP rows + 64263 Scholar rows — verify against the files).
rows_total = 2616 + 64263
# NOTE(review): 5347 equals len(true_pair_set) shown further down (the number of
# matching pairs), whereas len(cluster_dict) is 2351 — confirm this is the intended value.
clusters_total = 5347

dataset_dir = f'{home_dir}/Downloads/DBLP-Scholar'
# (file name, file encoding, value stored under row['source'], id set to fill)
source_specs = [
    ('DBLP1.csv', 'latin1', 'dblp', left_id_set),
    ('Scholar.csv', 'utf_8_sig', 'scholar', right_id_set),
]

with tqdm(total=rows_total) as pbar:
    for file_name, file_encoding, source_name, source_id_set in source_specs:
        # newline='' is what the csv module asks for, so that newlines embedded in
        # quoted fields reach the parser untouched.
        with open(f'{dataset_dir}/{file_name}', encoding=file_encoding, newline='') as f:
            for row in csv.DictReader(f):
                # Replace the original id with its dense integer id.
                row['id'] = id_enumerator[row["id"]]
                row['source'] = source_name
                row_dict[row['id']] = row
                source_id_set.add(row['id'])
                pbar.update(1)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66879.0), HTML(value='')))




In [8]:
true_pair_set = set()

# Each row of the perfect-mapping file links one DBLP id to one Scholar id.
# newline='' follows the csv module's guidance for files handed to a reader.
with open(f'{home_dir}/Downloads/DBLP-Scholar/DBLP-Scholar_perfectMapping.csv', newline='') as f:
    for row in csv.DictReader(f):
        # Translate both original ids through the shared enumerator so the pair uses
        # the same integer ids as row_dict. NOTE(review): id_enumerator is a defaultdict,
        # so an id that was never loaded silently receives a fresh integer id.
        id_left = id_enumerator[row['idDBLP']]
        id_right = id_enumerator[row['idScholar']]
        true_pair_set.add((id_left, id_right))

len(true_pair_set)

5347

In [9]:
from entity_embed.data_utils.utils import id_pairs_to_cluster_mapping_and_dict

# Group the true pairs into clusters. Judging by the names and later use, cluster_mapping
# maps a row id to its cluster id and cluster_dict maps a cluster id to its members — TODO confirm.
cluster_mapping, cluster_dict = id_pairs_to_cluster_mapping_and_dict(true_pair_set)
len(cluster_mapping)

7626

In [10]:
len(cluster_dict)

2351

In [11]:
# TODO: deal with this difference
# from entity_embed.data_utils.utils import cluster_dict_to_id_pairs

# assert len(true_pair_set - cluster_dict_to_id_pairs(cluster_dict)) == 0

In [12]:
cluster_attr = 'cluster_id'
# Largest cluster id produced from the true pairs (-1 when there are no clusters at all).
max_cluster_id = max(cluster_mapping.values(), default=-1)
# Rows without any true match become singleton clusters. Their ids start strictly
# above max_cluster_id: starting at max_cluster_id itself would place the first
# unmatched row into the existing cluster that already owns that id.
next_cluster_id = max_cluster_id + 1

for row_id, row in tqdm(row_dict.items()):
    try:
        row[cluster_attr] = cluster_mapping[row_id]
    except KeyError:
        # Unmatched row: give it a fresh cluster id of its own.
        row[cluster_attr] = next_cluster_id
        next_cluster_id += 1

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66879.0), HTML(value='')))




In [13]:
[row_dict[row_id] for row_id in next(iter(true_pair_set))]

[{'id': 619,
  'title': "Report on the Second IEEE Metadata Conference (Metadata '97)",
  'authors': 'N/A',
  'venue': 'N/A',
  'year': '1998',
  'source': 'dblp',
  'cluster_id': 619},
 {'id': 39525,
  'title': "Report on the Second IEEE Metadata Conference (Metadata '97)",
  'authors': 'R Musick, C Miller',
  'venue': 'SIGMOD RECORD,',
  'year': '1998',
  'source': 'scholar',
  'cluster_id': 619}]

## Preprocess

In [14]:
attr_list = ['title', 'authors', 'venue', 'year']

In [15]:
import unidecode
from entity_embed import default_tokenizer

def clean_str(s):
    """Normalize ``s`` into a bounded, space-separated token string.

    The text is transliterated with ``unidecode``, lowercased and stripped,
    then split with ``default_tokenizer``. At most the first 30 tokens are
    kept, each cut to 30 characters; the joined result is cut to 300 characters.
    """
    normalized = unidecode.unidecode(s).lower().strip()
    kept_tokens = [
        token[:30]
        for token in itertools.islice(default_tokenizer(normalized), 30)
    ]
    joined = ' '.join(kept_tokens)
    return joined[:300]

# Clean every attribute of every row in place: row_dict is mutated and the raw
# values are overwritten by their cleaned form.
for row in tqdm(row_dict.values()):
    for attr in attr_list:
        row[attr] = clean_str(row[attr])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66879.0), HTML(value='')))




## Init Data Module

In [16]:
import torch
import numpy as np

# Seed torch and numpy; the same seed is later passed to the data module.
random_seed = 42
torch.manual_seed(random_seed)
np.random.seed(random_seed)

In [17]:
import string

# Alphabet handed to every attribute's numericalizer config: digits, lowercase ASCII
# letters, ASCII punctuation and the space character, in that order. Built from the
# string module's constants instead of a hand-typed literal to rule out typos.
alphabet = list(string.digits + string.ascii_lowercase + string.punctuation + ' ')

In [18]:
# Per-attribute numericalization config. All attributes share the same tokenizer and
# alphabet and differ only in field type; max_str_len is left as None to be computed.
attr_info_dict = {
    attr_name: {
        'field_type': field_type,
        'tokenizer': "entity_embed.default_tokenizer",
        'alphabet': alphabet,
        'max_str_len': None,  # compute
    }
    for attr_name, field_type in (
        ('title', "MULTITOKEN"),
        ('authors', "MULTITOKEN"),
        ('venue', "MULTITOKEN"),
        ('year', "STRING"),
    )
}

In [19]:
from entity_embed import build_row_numericalizer

# Build the numericalizer from the per-attribute config. The rows are passed in so that
# settings left as None (max_str_len) can be computed from the data, as the log output reports.
row_numericalizer = build_row_numericalizer(attr_info_dict, row_dict=row_dict)
row_numericalizer.attr_info_dict

10:19:39 INFO:For attr='title', computing actual alphabet and max_str_len
10:19:40 INFO:For attr='title', using actual_max_str_len=30
10:19:40 INFO:For attr='authors', computing actual alphabet and max_str_len
10:19:40 INFO:For attr='authors', using actual_max_str_len=30
10:19:40 INFO:For attr='venue', computing actual alphabet and max_str_len
10:19:40 INFO:For attr='venue', using actual_max_str_len=30
10:19:40 INFO:For attr='year', computing actual alphabet and max_str_len
10:19:40 INFO:For attr='year', using actual_max_str_len=4


{'title': NumericalizeInfo(field_type=<FieldType.MULTITOKEN: 'multitoken'>, tokenizer=<function default_tokenizer at 0x7fb533b23160>, alphabet=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', ' '], max_str_len=30, vocab=None),
 'authors': NumericalizeInfo(field_type=<FieldType.MULTITOKEN: 'multitoken'>, tokenizer=<function default_tokenizer at 0x7fb533b23160>, alphabet=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}'

In [20]:
from entity_embed import LinkageDataModule

train_cluster_len = 200
valid_cluster_len = 200
# Clusters left for testing once the train and validation clusters are set aside.
# len(cluster_dict) is the number of clusters built from the true pairs (2351 in the
# recorded run). The previously used clusters_total (5347) equals the number of true
# *pairs*, so it asked for more test clusters than exist.
# NOTE(review): assumes every cluster in cluster_dict has more than one member, which
# matches only_plural_clusters=True below — confirm against LinkageDataModule.
test_cluster_len = len(cluster_dict) - valid_cluster_len - train_cluster_len
datamodule = LinkageDataModule(
    row_dict=row_dict,
    cluster_attr=cluster_attr,
    row_numericalizer=row_numericalizer,
    batch_size=20,
    row_batch_size=16,
    train_cluster_len=train_cluster_len,
    valid_cluster_len=valid_cluster_len,
    test_cluster_len=test_cluster_len,
    only_plural_clusters=True,
    left_id_set=left_id_set,
    right_id_set=right_id_set,
    random_seed=random_seed
)

## Training

In [21]:
from entity_embed import LinkageEmbed

# k used for the ANN search; the same value is passed to search_pairs in the manual test below.
ann_k = 10
model = LinkageEmbed(
    datamodule,
    ann_k=ann_k,
    use_mask=True
)

In [22]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

max_epochs = 50
# Stop early once 'valid_recall_at_0.3' (higher is better) has gone 10 validation
# checks without improving.
early_stop_callback = EarlyStopping(
    monitor='valid_recall_at_0.3',
    min_delta=0.00,
    patience=10,
    verbose=True,
    mode='max'
)
tb_log_dir = 'tb_logs'
tb_name = 'dblp-scholar'
trainer = pl.Trainer(
    # Train on one GPU when CUDA is available; otherwise pass None (the Trainer's
    # default) so the notebook still runs on CPU-only machines instead of failing.
    gpus=1 if torch.cuda.is_available() else None,
    max_epochs=max_epochs,
    check_val_every_n_epoch=1,
    callbacks=[early_stop_callback],
    logger=TensorBoardLogger(tb_log_dir, name=tb_name),
)

10:19:41 INFO:GPU available: True, used: True
10:19:41 INFO:TPU available: None, using: 0 TPU cores
10:19:41 INFO:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


In [23]:
trainer.fit(model, datamodule)

10:19:41 INFO:Train pair count: 861
10:19:41 INFO:Valid pair count: 1449
10:19:41 INFO:Test pair count: 11456
10:19:43 INFO:
  | Name        | Type       | Params
-------------------------------------------
0 | blocker_net | BlockerNet | 3.5 M 
1 | losser      | SupConLoss | 0     
-------------------------------------------
3.5 M     Trainable params
0         Non-trainable params
3.5 M     Total params


HBox(children=(HTML(value='Validation sanity check'), FloatProgress(value=1.0, bar_style='info', layout=Layout…

HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…




1

In [24]:
model.blocker_net.get_signature_weights()

{'title': 0.28378745913505554,
 'authors': 0.27267441153526306,
 'venue': 0.2283594012260437,
 'year': 0.2151786983013153}

## Testing

In [25]:
trainer.test(ckpt_path='best')

10:20:24 INFO:Train pair count: 861
10:20:24 INFO:Valid pair count: 1449
10:20:24 INFO:Test pair count: 11456


HBox(children=(HTML(value='Testing'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_f1_at_0.3': 0.17474354439334983,
 'test_f1_at_0.5': 0.2542853833680797,
 'test_f1_at_0.7': 0.861747615393049,
 'test_f1_at_0.9': 0.6464855286473715,
 'test_pair_entity_ratio_at_0.3': 7.316695624703838,
 'test_pair_entity_ratio_at_0.5': 4.734165218764808,
 'test_pair_entity_ratio_at_0.7': 0.7197915021323645,
 'test_pair_entity_ratio_at_0.9': 0.3487600694992892,
 'test_precision_at_0.3': 0.09598031173092698,
 'test_precision_at_0.5': 0.14650340317629787,
 'test_precision_at_0.7': 0.8624094799210007,
 'test_precision_at_0.9': 0.9913949275362319,
 'test_recall_at_0.3': 0.9741454864154251,
 'test_recall_at_0.5': 0.9620946538124452,
 'test_recall_at_0.7': 0.8610867659947414,
 'test_recall_at_0.9': 0.4796231375985977}
--------------------------------------------------------------------------------


[{'test_precision_at_0.3': 0.09598031173092698,
  'test_recall_at_0.3': 0.9741454864154251,
  'test_f1_at_0.3': 0.17474354439334983,
  'test_pair_entity_ratio_at_0.3': 7.316695624703838,
  'test_precision_at_0.5': 0.14650340317629787,
  'test_recall_at_0.5': 0.9620946538124452,
  'test_f1_at_0.5': 0.2542853833680797,
  'test_pair_entity_ratio_at_0.5': 4.734165218764808,
  'test_precision_at_0.7': 0.8624094799210007,
  'test_recall_at_0.7': 0.8610867659947414,
  'test_f1_at_0.7': 0.861747615393049,
  'test_pair_entity_ratio_at_0.7': 0.7197915021323645,
  'test_precision_at_0.9': 0.9913949275362319,
  'test_recall_at_0.9': 0.4796231375985977,
  'test_f1_at_0.9': 0.6464855286473715,
  'test_pair_entity_ratio_at_0.9': 0.3487600694992892}]

## Testing manually 

In [26]:
# Only call this if test above wasn't run
# datamodule.setup(stage='test')

In [27]:
# Embed the test rows. predict returns two dicts, one per side; they are indexed
# by row id further down when inspecting false negatives.
test_row_dict = datamodule.test_row_dict
test_left_vector_dict, test_right_vector_dict = model.predict(
    row_dict=test_row_dict,
    left_id_set=left_id_set,
    right_id_set=right_id_set,
    batch_size=16
)

HBox(children=(HTML(value='# batch embedding'), FloatProgress(value=0.0, max=396.0), HTML(value='')))




In [28]:
# Values needed by the manual evaluation: the embedding size for the ANN index
# and the ground-truth pairs of the test split.
embedding_size = model.blocker_net.embedding_size
test_true_pair_set = datamodule.test_true_pair_set

In [29]:
# Sanity check: the left and right vector dicts together hold as many entries as there are test rows.
assert (len(test_left_vector_dict) + len(test_right_vector_dict)) == len(test_row_dict)

In [30]:
%%time

from entity_embed import ANNLinkageIndex

# Create the ANN index, insert the embeddings of both sides, then build it.
ann_index = ANNLinkageIndex(embedding_size=embedding_size)
ann_index.insert_vector_dict(left_vector_dict=test_left_vector_dict, right_vector_dict=test_right_vector_dict)
ann_index.build()

CPU times: user 1.36 s, sys: 0 ns, total: 1.36 s
Wall time: 205 ms


In [31]:
%%time

# Search the index for candidate pairs using k=ann_k and sim_threshold.
# Presumably only pairs whose similarity reaches sim_threshold are returned — TODO confirm.
sim_threshold = 0.3
found_pair_set = ann_index.search_pairs(
    k=ann_k,
    sim_threshold=sim_threshold,
    left_vector_dict=test_left_vector_dict,
    right_vector_dict=test_right_vector_dict,
)

CPU times: user 1.79 s, sys: 11.1 ms, total: 1.8 s
Wall time: 229 ms


In [32]:
from entity_embed.evaluation import pair_entity_ratio

# Called with the number of found pairs and the number of test rows.
pair_entity_ratio(len(found_pair_set), len(test_row_dict))

7.316537671773811

In [33]:
from entity_embed.evaluation import precision_and_recall

# Returns (precision, recall) of the found pairs against the true test pairs;
# the values line up with test_precision_at_0.3 / test_recall_at_0.3 reported above.
precision_and_recall(found_pair_set, test_true_pair_set)

(0.09598238380000432, 0.9741454864154251)

In [34]:
# Found pairs that are not true pairs.
false_positives = list(found_pair_set - test_true_pair_set)
len(false_positives)

41875

In [35]:
# True pairs that the search did not find.
false_negatives = list(test_true_pair_set - found_pair_set)
len(false_negatives)

118

In [36]:
def cos_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    The dot product is divided by the product of the two Euclidean norms, so
    the result is a true cosine even when the inputs are not unit-length
    (a bare dot product equals the cosine only for unit vectors).

    Returns 0.0 when either vector has zero norm, instead of dividing by zero.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

In [37]:
# Show the first ten missed true pairs: their similarity followed by both rows.
# NOTE(review): several printed similarities exceed sim_threshold (0.3), so a pair can be
# missed for another reason — presumably it is not among the k nearest neighbours; confirm.
for (id_left, id_right) in false_negatives[:10]:
    display(
        (
            cos_similarity(test_left_vector_dict[id_left], test_right_vector_dict[id_right]),
            row_dict[id_left], row_dict[id_right]
        )
    )

(0.32486036,
 {'id': 798,
  'title': 'metu interoperable database system',
  'authors': 'a dogac , c dengi , e kilic , g ozhan , f ozcan , s nural , c evrendilek , u halici , i arpinar , p koksal ,',
  'venue': 'sigmod record',
  'year': '1995',
  'source': 'dblp',
  'cluster_id': 798},
 {'id': 9007,
  'title': 'metu object - oriented database system , demo description',
  'authors': 'a dogac',
  'venue': 'proceedings of acm sigmod intl . conf . on management of data , & hellip ;,',
  'year': '',
  'source': 'scholar',
  'cluster_id': 798})

(0.37666428,
 {'id': 1421,
  'title': 'query caching and optimization in distributed mediator systems',
  'authors': 's adali , k candan , y papakonstantinou , v subrahmanian',
  'venue': 'sigmod conference',
  'year': '1996',
  'source': 'dblp',
  'cluster_id': 1421},
 {'id': 58897,
  'title': 'papakonstantinou , and vs subrahmanian . query caching and optimization in distributed mediator',
  'authors': 'sa acps , ks candan',
  'venue': 'proceedings of the acm sigmod international conference on',
  'year': '',
  'source': 'scholar',
  'cluster_id': 1421})

(0.04969258,
 {'id': 1852,
  'title': 'metu object - oriented dbms',
  'authors': 'a dogac , i arpinar , c evrendilek , c ozkan , i altintas , i durusoy , m altinel , t okay , y saygin',
  'venue': 'sigmod conference',
  'year': '1994',
  'source': 'dblp',
  'cluster_id': 798},
 {'id': 45849,
  'title': 'metu interoperable database system',
  'authors': 'al dogac aet',
  'venue': 'proceedings of the 1996 acm sigmod int . conf . on management',
  'year': '',
  'source': 'scholar',
  'cluster_id': 798})

(0.3656819,
 {'id': 623,
  'title': 'data - driven understanding and refinement of schema mappings',
  'authors': 'l yan , r miller , l haas , r fagin',
  'venue': 'sigmod conference',
  'year': '2001',
  'source': 'dblp',
  'cluster_id': 623},
 {'id': 11778,
  'title': 'data - drivenunderstandingand refinementof schema mappings',
  'authors': 'll yan , rj miller , lm haas , r fagin',
  'venue': 'proceedings of the acm sigmod international conference on & hellip ;,',
  'year': '',
  'source': 'scholar',
  'cluster_id': 623})

(0.056210835,
 {'id': 1486,
  'title': 'metu interoperable database system',
  'authors': 'a dogac , u halici , e kilic , g ozhan , f ozcan , s nural , c dengi , s mancuhan , i arpinar , p koksal ,',
  'venue': 'sigmod conference',
  'year': '1996',
  'source': 'dblp',
  'cluster_id': 798},
 {'id': 31474,
  'title': 'metu object - oriented dbms kernel',
  'authors': 'a dogac , a altinel , c ozkan',
  'venue': 'proc . of intl . conf on database and expert systems & hellip ;,',
  'year': '1995',
  'source': 'scholar',
  'cluster_id': 798})

(0.3459232,
 {'id': 876,
  'title': 'tutorial : designing an ultra highly available dbms',
  'authors': 'n / a',
  'venue': 'n / a',
  'year': '2000',
  'source': 'dblp',
  'cluster_id': 876},
 {'id': 13200,
  'title': 'designing an ultra highly available dbms',
  'authors': 'o torbjornsen , s bratsberg',
  'venue': 'proceedings of acmsigmod conference , dallas , tx , may ,',
  'year': '',
  'source': 'scholar',
  'cluster_id': 876})

(0.40271276,
 {'id': 1588,
  'title': 'towards an effective calculus for object query languages',
  'authors': 'l fegaras , d maier',
  'venue': 'sigmod conference',
  'year': '1995',
  'source': 'dblp',
  'cluster_id': 1588},
 {'id': 12063,
  'title': 'towards an eectivecalculusfor objectquery languages',
  'authors': 'l fegaras , d maier',
  'venue': 'proc . of the acm sigmod int . conference on management of & hellip ;,',
  'year': '',
  'source': 'scholar',
  'cluster_id': 1588})

(0.31105143,
 {'id': 1851,
  'title': 'quest : a project on database mining',
  'authors': 'r agrawal , m carey , c faloutsos , s ghosh , m houtsma , t imielinski , b iyer , a mahboob , h miranda , r srikant ,',
  'venue': 'sigmod conference',
  'year': '1994',
  'source': 'dblp',
  'cluster_id': 1851},
 {'id': 31651,
  'title': 'quest : a project on database mining',
  'authors': 'r agrawal',
  'venue': '',
  'year': '',
  'source': 'scholar',
  'cluster_id': 1851})

(0.43007416,
 {'id': 1382,
  'title': 'data warehousing and olap for decision support ( tutorial )',
  'authors': 'n / a',
  'venue': 'n / a',
  'year': '1997',
  'source': 'dblp',
  'cluster_id': 565},
 {'id': 52428,
  'title': 'an overview of data warehousing and olap technology',
  'authors': 'sc cd971 , u dayal',
  'venue': 'sigmod record , vol26 ( 1 ),',
  'year': '',
  'source': 'scholar',
  'cluster_id': 565})

(0.2055665,
 {'id': 1403,
  'title': 'lessons from wall street : case studies in configuration , tuning , and distribution ( tutorial )',
  'authors': 'n / a',
  'venue': 'n / a',
  'year': '1997',
  'source': 'dblp',
  'cluster_id': 1403},
 {'id': 25600,
  'title': 'lessons from wall street : case studies in database tuning , configuration , and replication',
  'authors': 'd shasha',
  'venue': 'proc . 1997 acm sigmod',
  'year': '',
  'source': 'scholar',
  'cluster_id': 1403})