In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from importlib import reload
import logging
# Reload `logging` before configuring it -- presumably so that handlers left
# over from an earlier run in this kernel do not turn basicConfig into a
# no-op (TODO confirm this is still needed).
reload(logging)

# Root logger at INFO level; each line is rendered as "HH:MM:SS LEVEL:message".
logging.basicConfig(
    level=logging.INFO,
    datefmt='%H:%M:%S',
    format='%(asctime)s %(levelname)s:%(message)s',
)

In [3]:
import sys

# Put the directory two levels above the notebook at the front of the import
# path -- presumably so the `entity_embed` imports below resolve to the local
# checkout rather than an installed copy (verify against the repo layout).
# The membership check keeps this cell idempotent: re-running it does not
# stack duplicate entries onto sys.path.
if '../..' not in sys.path:
    sys.path.insert(0, '../..')

## Load Dataset

In [4]:
from entity_embed.benchmarks import DBLP_ACM_StructuredBenchmark

# Build the DBLP-ACM structured benchmark with ../data/ as its data directory.
# Per the log output of this cell, construction extracts the dataset and reads
# the row dict plus the train/valid/test CSV files.
benchmark = DBLP_ACM_StructuredBenchmark(data_dir_path="../data/")
# Bare expression: shows the benchmark's repr as the cell output.
benchmark

14:19:16 INFO:Extracting DBLP-ACM-Structured...
14:19:16 INFO:Reading DBLP-ACM-Structured row_dict...
14:19:16 INFO:Reading DBLP-ACM-Structured train.csv...
14:19:17 INFO:Reading DBLP-ACM-Structured valid.csv...
14:19:17 INFO:Reading DBLP-ACM-Structured test.csv...


<DBLP_ACM_StructuredBenchmark> from http://pages.cs.wisc.edu/~anhai/data1/deepmatcher_data/Structured/DBLP-ACM/dblp_acm_exp_data.zip

## Preprocess

In [5]:
# Record attributes that get cleaned during preprocessing and are then handed
# to the PairNumericalizer.
field_list = [
    'title',
    'authors',
    'venue',
    'year',
]

In [6]:
import unidecode

def clean_str(s):
    """Return a normalized copy of ``s``.

    The text is passed through ``unidecode.unidecode``, then lower-cased, and
    finally stripped of leading and trailing whitespace.
    """
    transliterated = unidecode.unidecode(s)
    lowered = transliterated.lower()
    return lowered.strip()

# Apply clean_str to every listed field of every record in the train,
# validation and test splits. Each record is overwritten in place.
for record_dict in (
    benchmark.train_record_dict,
    benchmark.valid_record_dict,
    benchmark.test_record_dict,
):
    for record in record_dict.values():
        for field in field_list:
            record[field] = clean_str(record[field])

  0%|          | 0/4910 [00:00<?, ?it/s]

## Init Data Module

In [7]:
import torch
import numpy as np

# One seed value reused for PyTorch, NumPy's global RNG and (later) the data
# module, so that repeated runs draw the same random numbers.
random_seed = 42

torch.manual_seed(seed=random_seed)
# Kept as the last statement: it returns None, so the cell displays nothing.
np.random.seed(seed=random_seed)

In [10]:
from entity_embed import PairNumericalizer

# Build the numericalizer over the cleaned fields. Per the log output of this
# cell, this computes a max_str_len for each attribute and loads word vectors
# from .vector_cache/wiki.en.vec.pt.
pair_numericalizer = PairNumericalizer(field_list)

14:19:17 INFO:For attr=title, computing actual max_str_len
14:19:17 INFO:For attr=title, using actual_max_str_len=18
14:19:17 INFO:Loading vectors from .vector_cache/wiki.en.vec.pt
14:19:21 INFO:For attr=authors, computing actual max_str_len
14:19:21 INFO:For attr=authors, using actual_max_str_len=18
14:19:21 INFO:For attr=venue, computing actual max_str_len
14:19:21 INFO:actual_max_str_len=13 must be even to enable NN pooling. Updating to 14
14:19:21 INFO:For attr=venue, using actual_max_str_len=14
14:19:21 INFO:Loading vectors from .vector_cache/wiki.en.vec.pt
14:19:24 INFO:For attr=year, computing actual max_str_len
14:19:24 INFO:For attr=year, using actual_max_str_len=4


{'title': AttrConfig(source_attr='title', field_type=<FieldType.MULTITOKEN: 'multitoken'>, tokenizer='entity_embed.data_utils.numericalizer.default_tokenizer', alphabet=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', ' '], max_str_len=18, vocab=None, n_channels=8, embed_dropout_p=0.2, use_attention=True),
 'semantic_title': AttrConfig(source_attr='title', field_type=<FieldType.SEMANTIC: 'semantic_multitoken'>, tokenizer='entity_embed.data_utils.numericalizer.default_tokenizer', alphabet=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '!', '"', '#', '$', '%', '&', "'"

In [11]:
# Batch sizes handed to the data module; presumably `batch_size` applies to
# training and `eval_batch_size` to validation/test -- confirm against
# build_matcher_datamodule.
batch_size = 32
eval_batch_size = 256
# Build the matcher data module from the benchmark, reusing the numericalizer
# and the notebook-wide random seed.
datamodule = benchmark.build_matcher_datamodule(
    pair_numericalizer=pair_numericalizer,
    batch_size=batch_size,
    eval_batch_size=eval_batch_size,
    random_seed=random_seed
)

## Training

In [12]:
from entity_embed import Matcher

# Instantiate the Matcher model with the same numericalizer that the data
# module was built with.
model = Matcher(
    pair_numericalizer=pair_numericalizer
)

In [13]:
# Fit the model on the data module and keep the returned trainer.
# Arguments as passed here: between 5 and 100 epochs, validation checked every
# epoch, and early stopping monitoring "valid_f1_at_0.5" (presumably validation
# F1 at a 0.5 decision threshold -- confirm). The tb_* arguments point at
# ../tb_logs with a run name derived from the benchmark's dataset_name
# (presumably TensorBoard logging -- confirm).
trainer = model.fit(
    datamodule,
    min_epochs=5,
    max_epochs=100,
    check_val_every_n_epoch=1,
    early_stop_monitor="valid_f1_at_0.5",
    tb_save_dir='../tb_logs',
    tb_name=f'matcher-{benchmark.dataset_name}'
)

GPU available: True, used: True
14:19:24 INFO:GPU available: True, used: True
TPU available: None, using: 0 TPU cores
14:19:24 INFO:TPU available: None, using: 0 TPU cores


In [None]:
# Run the model's validate step on the data module; the result is shown as the
# cell output. NOTE(review): this cell has no execution count or output in the
# saved notebook, so it appears not to have been run.
model.validate(datamodule)

## Testing

In [15]:
# Run the model's test step on the data module. Per the cell output, the result
# is a list with one dict of test F1/precision/recall at thresholds 0.3, 0.4
# and 0.5.
model.test(datamodule)

14:51:23 INFO:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

[{'test_f1_at_0.3': 0.944444477558136,
  'test_f1_at_0.4': 0.9525862336158752,
  'test_f1_at_0.5': 0.953613817691803,
  'test_precision_at_0.3': 0.8983739614486694,
  'test_precision_at_0.4': 0.913223147392273,
  'test_precision_at_0.5': 0.9151138663291931,
  'test_recall_at_0.3': 0.9954954981803894,
  'test_recall_at_0.4': 0.9954954981803894,
  'test_recall_at_0.5': 0.9954954981803894}]