# CLI Record Linkage CSV Generation

## Boilerplate

In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from importlib import reload
import logging

# Re-execute the logging module before configuring it; presumably so that
# basicConfig below takes effect even if logging was already configured in
# this kernel (confirm).
reload(logging)
logging.basicConfig(
    level=logging.INFO,
    datefmt='%H:%M:%S',
    format='%(asctime)s %(levelname)s:%(message)s',
)

In [3]:
import sys

# Put the directory two levels above the working directory first on the import
# path. Assumed to be the repository root so that `entity_embed` resolves to
# the local checkout -- TODO confirm.
sys.path.insert(0, '../..')

In [4]:
# Single seed reused by both record splits and by the random.Random instance
# created in the CSV generation section.
random_seed = 42

## Load Dataset

In [5]:
# Import the submodule explicitly: a bare `import urllib` does not load
# `urllib.request`, so `urllib.request.urlretrieve` would only work if some
# other import had already pulled the submodule in.
import urllib.request
import tempfile

dataset_url = 'https://dbs.uni-leipzig.de/file/Amazon-GoogleProducts.zip'

# Reserve a named temporary file for the archive. delete=False keeps the file
# on disk after close(); it is removed explicitly with os.remove once the
# records have been loaded.
tf = tempfile.NamedTemporaryFile(mode='r', delete=False)
tf.close()

# Trailing semicolon suppresses the cell output of urlretrieve's return value.
urllib.request.urlretrieve(dataset_url, tf.name);

In [6]:
import zipfile
import os

# Extraction target. Kept in `td` because later cells read files under
# td.name and finally call td.cleanup().
td = tempfile.TemporaryDirectory()

with zipfile.ZipFile(tf.name, mode="r") as archive:
    archive.extractall(path=td.name)

# Last expression of the cell: display the extracted file names.
os.listdir(td.name)

['Amazon.csv', 'GoogleProducts.csv', 'Amzon_GoogleProducts_perfectMapping.csv']

In [7]:
import csv

from entity_embed.data_utils.utils import Enumerator

id_enumerator = Enumerator()
record_dict = {}
source_field = '__source'
left_source = 'amazon'
right_source = 'google'


def _load_source_records(csv_path, source, rename_fields=None):
    """Read one product CSV into the module-level ``record_dict``.

    Both product files get the same treatment, so the logic lives here once
    instead of being copy-pasted per source.

    Args:
        csv_path: Path of the CSV file to read (decoded as latin1).
        source: Value stored under ``source_field`` on every record.
        rename_fields: Optional ``{old_name: new_name}`` mapping of columns to
            rename so that both sources end up with the same field names.

    Side effects:
        Replaces each record's raw ``id`` with ``id_enumerator[raw_id]``
        (presumably a compact unique id per distinct raw id -- confirm against
        Enumerator) and stores the record in ``record_dict`` under that id.
    """
    with open(csv_path, newline='', encoding="latin1") as f:
        for record in csv.DictReader(f):
            record['id'] = id_enumerator[record["id"]]
            for old_name, new_name in (rename_fields or {}).items():
                record[new_name] = record.pop(old_name)
            record[source_field] = source
            del record['description']  # drop description, for benchmarking
            record_dict[record['id']] = record


_load_source_records(f'{td.name}/Amazon.csv', left_source)
# in Google, title is called name
_load_source_records(f'{td.name}/GoogleProducts.csv', right_source, rename_fields={'name': 'title'})

In [8]:
# Ground-truth matches as (amazon id, google id) pairs, expressed in the ids
# handed out by id_enumerator.
# The encoding is given explicitly and matches the one used for the two
# product files: without it open() falls back to the platform default, and the
# raw id strings must decode identically to be found in id_enumerator.
with open(f'{td.name}/Amzon_GoogleProducts_perfectMapping.csv', newline='', encoding="latin1") as f:
    pos_pair_set = {
        (id_enumerator[record['idAmazon']], id_enumerator[record['idGoogleBase']])
        for record in csv.DictReader(f)
    }

len(pos_pair_set)

1300

In [9]:
from entity_embed.data_utils import utils

# Turn the positive id pairs into cluster structures.
# NOTE(review): judging by the names, cluster_mapping presumably maps
# record id -> cluster id and cluster_dict maps cluster id -> member ids;
# confirm against utils.id_pairs_to_cluster_mapping_and_dict.
cluster_mapping, cluster_dict = utils.id_pairs_to_cluster_mapping_and_dict(pos_pair_set, record_dict)
len(cluster_dict)

3290

In [10]:
# Record key used for the cluster label; it is also the label column that
# write_csv prepends when include_labels is true.
cluster_field = 'cluster'

# NOTE(review): presumably stores each record's cluster id under
# record[cluster_field], in place -- confirm in utils.assign_clusters.
utils.assign_clusters(record_dict=record_dict, cluster_field=cluster_field, cluster_mapping=cluster_mapping)

In [11]:
from entity_embed.data_utils import utils

# Split the records into train/valid/test on cluster_field. Only the train and
# valid proportions are passed; the logged sizes show the remainder going to
# the third (test) split.
train_record_dict, valid_record_dict, test_record_dict = utils.split_record_dict_on_clusters(
    record_dict=record_dict,
    cluster_field=cluster_field,
    train_proportion=0.5,
    valid_proportion=0.2,
    random_seed=random_seed)

12:43:09 INFO:Singleton cluster sizes (train, valid, test):(1092, 437, 656)
12:43:09 INFO:Plural cluster sizes (train, valid, test):(552, 221, 332)


In [12]:
from entity_embed.data_utils import utils

# Split the held-out test records once more, 50/50, into the final test set
# and an "unlabeled" set. The helper's (train, valid, test) outputs are bound
# to (test, unlabeled, rest) here.
# NOTE(review): test_record_dict is rebound from its own previous value, so
# re-running this cell without re-running the previous split shrinks the test
# set again. Use a distinct name for the intermediate split to make the cell
# idempotent.
test_record_dict, unlabeled_record_dict, rest_dict = utils.split_record_dict_on_clusters(
    record_dict=test_record_dict,
    cluster_field=cluster_field,
    train_proportion=0.5,
    valid_proportion=0.5,
    random_seed=random_seed)

# With proportions 0.5 + 0.5 the third split is logged as empty; merging it
# anyway keeps any leftover records in the unlabeled set.
unlabeled_record_dict.update(rest_dict)

12:43:09 INFO:Singleton cluster sizes (train, valid, test):(328, 328, 0)
12:43:09 INFO:Plural cluster sizes (train, valid, test):(166, 166, 0)


In [13]:
# Drop the name of the unsplit dict; the remaining cells only use the splits.
del record_dict

In [14]:
import os

# Remove the extracted files, then the downloaded archive. The archive's temp
# file was created with delete=False, so it must be removed by hand.
td.cleanup()
os.remove(tf.name)

## Preprocess

In [15]:
# All four splits, gathered so the cleaning loop can process them in one pass.
all_record_dicts = [
    train_record_dict,
    valid_record_dict,
    test_record_dict,
    unlabeled_record_dict
]

In [16]:
# Record fields that are cleaned and then written to the CSVs, between the
# id column and the source column.
field_list = ['title', 'manufacturer', 'price']

In [17]:
import unidecode

def clean_str(s):
    """Return ``s`` passed through unidecode, lowercased and stripped.

    Args:
        s: The string to normalise.

    Returns:
        The transliterated, lowercase string without leading or trailing
        whitespace.
    """
    transliterated = unidecode.unidecode(s)
    lowered = transliterated.lower()
    return lowered.strip()

# Normalise every configured field of every record, in place, in all splits.
for split_records in all_record_dicts:
    for record in split_records.values():
        record.update((field, clean_str(record[field])) for field in field_list)

## CSV Generation

In [18]:
import random

# NOTE(review): `rnd` is not used by any later cell in this notebook; kept so
# nothing that may rely on the name breaks.
rnd = random.Random(random_seed)

# Column order of the generated CSVs. The source column comes from the shared
# `source_field` constant rather than a repeated '__source' literal, so it
# cannot drift from the key that was set on every record at load time.
fieldnames = ['id', *field_list, source_field]

def write_csv(filepath, record_dict_, fieldnames, include_labels):
    """Write the records of one split to a CSV file.

    Args:
        filepath: Destination path; opened for writing as UTF-8.
        record_dict_: Mapping whose values are the record dicts to write.
        fieldnames: Columns to emit, in order. Record keys that are not
            listed are dropped; listed keys missing from a record are
            written as empty cells.
        include_labels: When true, the module-level ``cluster_field`` column
            is written first, ahead of ``fieldnames``.
    """
    columns = [cluster_field] + fieldnames if include_labels else fieldnames

    with open(filepath, 'w', encoding='utf-8', newline='') as out_file:
        # extrasaction='ignore' makes DictWriter skip keys outside `columns`
        # instead of raising, so records need no manual filtering.
        csv_writer = csv.DictWriter(out_file, fieldnames=columns, extrasaction='ignore')
        csv_writer.writeheader()
        csv_writer.writerows(record_dict_.values())
                
# One CSV per split; only the unlabeled split is written without the cluster
# label column.
for csv_path, split_records, with_labels in [
    ('../../example-data/rl-train.csv', train_record_dict, True),
    ('../../example-data/rl-valid.csv', valid_record_dict, True),
    ('../../example-data/rl-test.csv', test_record_dict, True),
    ('../../example-data/rl-unlabeled.csv', unlabeled_record_dict, False),
]:
    write_csv(csv_path, split_records, fieldnames, include_labels=with_labels)

## JSON Generation

In [19]:
# Ground-truth pairs restricted to the unlabeled split: regroup its records by
# cluster, then ask for the id pairs between the left and right sources.
unlabeled_cluster_dict = utils.record_dict_to_cluster_dict(unlabeled_record_dict, cluster_field)
unlabeled_left_ids = {
    record['id']
    for record in unlabeled_record_dict.values()
    if record[source_field] == left_source
}
unlabeled_right_ids = {
    record['id']
    for record in unlabeled_record_dict.values()
    if record[source_field] == right_source
}

unlabeled_pos_pairs = utils.cluster_dict_to_id_pairs(
    cluster_dict=unlabeled_cluster_dict,
    left_id_set=unlabeled_left_ids,
    right_id_set=unlabeled_right_ids,
)
len(unlabeled_pos_pairs)

187

In [20]:
import json

# Serialise the positive pairs first, then write the text out in one go.
pos_pairs_json = json.dumps(list(unlabeled_pos_pairs), indent=4)
with open('../../example-data/rl-unlabeled-pos-pairs.json', 'w', encoding='utf-8') as out_file:
    out_file.write(pos_pairs_json)

In [21]:
from entity_embed.data_utils.field_config_parser import DEFAULT_ALPHABET

alphabet = DEFAULT_ALPHABET


def _char_field_config(field_type):
    """Build the config shared by the fields that are encoded over `alphabet`.

    Args:
        field_type: Value stored under 'field_type' (e.g. "MULTITOKEN").

    Returns:
        A new dict with the field type, the default tokenizer, the shared
        alphabet and an unset 'max_str_len'.
    """
    return {
        'field_type': field_type,
        'tokenizer': "entity_embed.default_tokenizer",
        'alphabet': alphabet,
        'max_str_len': None,  # compute
    }


# Key order is kept as title, title_semantic, manufacturer, price because it
# is the order in which the config is serialised to JSON.
field_config_dict = {
    'title': _char_field_config("MULTITOKEN"),
    # Second view of the same 'title' column, configured with a vocab
    # instead of an alphabet.
    'title_semantic': {
        'key': 'title',
        'field_type': "SEMANTIC",
        'tokenizer': "entity_embed.default_tokenizer",
        'vocab': "fasttext.en.300d",
    },
    'manufacturer': _char_field_config("MULTITOKEN"),
    'price': _char_field_config("STRING"),
}

In [22]:
# Serialise the field configuration first, then write the text out in one go.
field_config_json = json.dumps(field_config_dict, indent=4)
with open('../../example-data/rl-field-config.json', 'w', encoding='utf-8') as out_file:
    out_file.write(field_config_json)

```bash
entity_embed_train \
    --field_config_json example-data/rl-field-config.json \
    --train_csv example-data/rl-train.csv \
    --valid_csv example-data/rl-valid.csv \
    --test_csv example-data/rl-test.csv \
    --unlabeled_csv example-data/rl-unlabeled.csv \
    --csv_encoding utf-8 \
    --cluster_field cluster \
    --source_field __source \
    --left_source amazon \
    --embedding_size 300 \
    --lr 0.001 \
    --min_epochs 5 \
    --max_epochs 100 \
    --early_stop_monitor valid_recall_at_0.3 \
    --early_stop_min_delta 0 \
    --early_stop_patience 20 \
    --early_stop_mode max \
    --tb_save_dir tb_logs \
    --tb_name rl-example \
    --check_val_every_n_epoch 1 \
    --batch_size 32 \
    --eval_batch_size 64 \
    --num_workers -1 \
    --multiprocessing_context fork \
    --sim_threshold 0.3 \
    --sim_threshold 0.5 \
    --sim_threshold 0.7 \
    --ann_k 100 \
    --m 64 \
    --max_m0 64 \
    --ef_construction 150 \
    --ef_search -1 \
    --random_seed 42 \
    --model_save_dir trained-models/rl/ \
    --use_gpu 1
```

```bash
entity_embed_predict \
    --model_save_filepath "trained-models/rl/...fill-here..." \
    --unlabeled_csv example-data/rl-unlabeled.csv \
    --csv_encoding utf-8 \
    --source_field __source \
    --left_source amazon \
    --eval_batch_size 50 \
    --num_workers -1 \
    --multiprocessing_context fork \
    --sim_threshold 0.3 \
    --ann_k 100 \
    --m 64 \
    --max_m0 64 \
    --ef_construction 150 \
    --ef_search -1 \
    --random_seed 42 \
    --output_json example-data/rl-prediction.json \
    --use_gpu 1
```

In [24]:
from entity_embed.evaluation import evaluate_output_json

# Score the pairs predicted by the CLI against the ground-truth pairs of the
# unlabeled split.
precision, recall, f1, pe_ratio = evaluate_output_json(
    unlabeled_csv_filepath='../../example-data/rl-unlabeled.csv',
    output_json_filepath='../../example-data/rl-prediction.json',
    pos_pair_json_filepath='../../example-data/rl-unlabeled-pos-pairs.json',
)

for metric_name, metric_value in (
    ("precision", precision),
    ("recall", recall),
    ("f1", f1),
    ("pe_ratio", pe_ratio),
):
    print(metric_name, metric_value)

precision 0.14876690533015116
recall 1.0
f1 0.25900277008310246
pe_ratio 1.8458149779735682
