# CLI Record Linkage CSV Generation

## Boilerplate

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from importlib import reload
import logging

# Re-execute the logging module before configuring it.
# NOTE(review): presumably done so basicConfig takes effect even when logging
# was already configured earlier in this kernel — confirm.
reload(logging)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s:%(message)s',
    datefmt='%H:%M:%S',
)

In [3]:
import sys

# Put the parent directory at the front of the module search path.
# NOTE(review): presumably so the local entity_embed checkout is the one imported — confirm.
sys.path.insert(0, '..')

In [4]:
# Seed passed to the dataset splits and to the random.Random instance created further down.
random_seed = 42

## Load Dataset

In [5]:
import tempfile
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` is loaded, so the call below could raise AttributeError
# on a fresh kernel.
import urllib.request

dataset_url = 'https://dbs.uni-leipzig.de/file/Amazon-GoogleProducts.zip'
# Reserve a named file on disk and close the handle immediately; only its path
# is needed as the download target. delete=False keeps the file after close().
tf = tempfile.NamedTemporaryFile(mode='r', delete=False)
tf.close()

# Download the zip archive to the reserved path (trailing `;` hides the cell output).
urllib.request.urlretrieve(dataset_url, tf.name);

In [6]:
import zipfile
import os

# Scratch directory that receives the extracted archive contents.
td = tempfile.TemporaryDirectory()

# Unpack the zip file stored at tf.name into the scratch directory.
with zipfile.ZipFile(tf.name, "r") as zf:
    zf.extractall(td.name)

# Display the names of the extracted files.
os.listdir(td.name)

['Amazon.csv', 'GoogleProducts.csv', 'Amzon_GoogleProducts_perfectMapping.csv']

In [7]:
import csv

from entity_embed.data_utils.utils import Enumerator

# Translates each original record id into the id stored on the row.
# NOTE(review): presumably Enumerator hands out sequential integers on first lookup — confirm.
id_enumerator = Enumerator()
# All rows from both sources, keyed by their translated id.
row_dict = {}
# Row key that records which dataset a row came from, and its two possible values.
source_field = '__source'
left_source = 'amazon'
right_source = 'google'

# Amazon rows: translate the id, rename `title` to `name`, and tag the source.
with open(f'{td.name}/Amazon.csv', newline='', encoding="latin1") as f:
    for row in csv.DictReader(f):
        row['id'] = id_enumerator[row["id"]]
        row['name'] = row.pop('title')  # in Amazon, name is called title
        row[source_field] = left_source
        row_dict[row['id']] = row

# Google rows: translate the id and tag the source (no column rename is applied here).
with open(f'{td.name}/GoogleProducts.csv', newline='', encoding="latin1") as f:
    for row in csv.DictReader(f):
        row['id'] = id_enumerator[row["id"]]
        row[source_field] = right_source
        row_dict[row['id']] = row

In [8]:
# Turn every row of the perfect-mapping file into an (Amazon id, Google id)
# pair of translated ids; collecting into a set drops repeated pairs.
with open(f'{td.name}/Amzon_GoogleProducts_perfectMapping.csv', newline='') as f:
    pos_pair_set = {
        (id_enumerator[row['idAmazon']], id_enumerator[row['idGoogleBase']])
        for row in csv.DictReader(f)
    }

len(pos_pair_set)

1300

In [9]:
from entity_embed.data_utils import utils

# Derive cluster information from the matching pairs and the full row dict.
# NOTE(review): presumably cluster_mapping maps row id -> cluster id and
# cluster_dict maps cluster id -> member ids — confirm against utils.
cluster_mapping, cluster_dict = utils.id_pairs_to_cluster_mapping_and_dict(pos_pair_set, row_dict)
len(cluster_dict)

3290

In [10]:
# Row key under which the cluster label is stored.
cluster_field = 'cluster'

# NOTE(review): presumably writes each row's cluster id into row[cluster_field] in place — confirm.
utils.assign_clusters(row_dict=row_dict, cluster_field=cluster_field, cluster_mapping=cluster_mapping)

In [11]:
from entity_embed.data_utils import utils

# Split the rows into three partitions using the cluster labels: 20% train and
# 20% valid are requested; per the logged sizes the remainder lands in test.
train_row_dict, valid_row_dict, test_row_dict = utils.split_row_dict_on_clusters(
    row_dict=row_dict,
    cluster_field=cluster_field,
    train_proportion=0.2,
    valid_proportion=0.2,
    random_seed=random_seed)

13:35:55 INFO:Singleton cluster sizes (train, valid, test):(437, 437, 1311)
13:35:55 INFO:Plural cluster sizes (train, valid, test):(221, 221, 663)


In [12]:
from entity_embed.data_utils import utils

# Split the previous test partition again, 50/50: the first half stays the
# labeled test set and the second half becomes the unlabeled set.
test_row_dict, unlabeled_row_dict, rest_dict = utils.split_row_dict_on_clusters(
    row_dict=test_row_dict,
    cluster_field=cluster_field,
    train_proportion=0.5,
    valid_proportion=0.5,
    random_seed=random_seed)

# The split still returns a third partition (one singleton and one plural
# cluster in the logged run); fold it into the unlabeled set so no row is lost.
unlabeled_row_dict.update(rest_dict)

13:35:55 INFO:Singleton cluster sizes (train, valid, test):(655, 655, 1)
13:35:55 INFO:Plural cluster sizes (train, valid, test):(331, 331, 1)


In [13]:
import os

# Remove the extracted files and the downloaded archive.
td.cleanup()
os.remove(tf.name)

## Preprocess

In [14]:
# Every split, gathered so the preprocessing below can be applied to all of them.
all_row_dicts = [
    train_row_dict,
    valid_row_dict,
    test_row_dict,
    unlabeled_row_dict
]

In [15]:
# Text columns that are cleaned and then exported to CSV (id and source columns are added separately).
field_list = ['name', 'description', 'manufacturer', 'price']

In [16]:
import unidecode
import itertools
from entity_embed import default_tokenizer

def clean_str(s, max_tokens=30, max_chars=1000):
    """Normalize a raw field value into a bounded, lowercase string.

    The value is transliterated with ``unidecode``, lowercased and stripped,
    then split with ``default_tokenizer``. Only the first ``max_tokens`` tokens
    are kept; they are re-joined with single spaces and the result is cut to at
    most ``max_chars`` characters.

    Args:
        s: Text to clean.
        max_tokens: Maximum number of tokens kept (default 30).
        max_chars: Maximum length of the returned string (default 1000).

    Returns:
        The cleaned string.
    """
    normalized = unidecode.unidecode(s).lower().strip()
    kept_tokens = default_tokenizer(normalized)[:max_tokens]
    # Truncate after joining so the character limit applies to the final text.
    return ' '.join(kept_tokens)[:max_chars]

# Clean every configured text field of every row, in place, across all splits.
for split_rows in all_row_dicts:
    for record in split_rows.values():
        record.update({name: clean_str(record[name]) for name in field_list})

## CSV Generation

In [17]:
import random

# NOTE(review): `rnd` is not read by any cell visible here; kept in case other code relies on it.
rnd = random.Random(random_seed)

# Columns written to every CSV. The source column is named via `source_field`
# (the same variable used to tag the rows) instead of repeating the literal,
# so the two cannot drift apart.
fieldnames = ['id', *field_list, source_field]

def write_csv(filepath, row_dict_, fieldnames, include_labels):
    """Write the rows of ``row_dict_`` to ``filepath`` as a UTF-8 CSV file.

    Args:
        filepath: Destination path; the file is created or overwritten.
        row_dict_: Mapping whose values are the row dicts to write.
        fieldnames: Columns to write, in order. Row keys that are not listed
            are left out of the file.
        include_labels: When true, the column named by the notebook-level
            ``cluster_field`` variable is written as the first column.
    """
    columns = [cluster_field, *fieldnames] if include_labels else fieldnames

    with open(filepath, 'w', encoding='utf-8', newline='') as out_file:
        # extrasaction='ignore' makes the writer drop keys outside `columns`.
        csv_writer = csv.DictWriter(out_file, fieldnames=columns, extrasaction='ignore')
        csv_writer.writeheader()
        csv_writer.writerows(row_dict_.values())
                
# One entry per output file: (destination path, rows, include the cluster label column?).
# Only the unlabeled split is written without labels.
csv_outputs = [
    ('../../example-data/rl-train.csv', train_row_dict, True),
    ('../../example-data/rl-valid.csv', valid_row_dict, True),
    ('../../example-data/rl-test.csv', test_row_dict, True),
    ('../../example-data/rl-unlabeled.csv', unlabeled_row_dict, False),
]
for csv_path, split_rows, with_labels in csv_outputs:
    write_csv(csv_path, split_rows, fieldnames, include_labels=with_labels)

## JSON Generation

In [18]:
# Matching id pairs among the unlabeled rows, derived from their cluster labels.
# The left/right id sets are built from all rows (row_dict), not only the
# unlabeled ones, split by the source tag on each row.
unlabeled_pos_pairs = utils.cluster_dict_to_id_pairs(
    cluster_dict=utils.row_dict_to_cluster_dict(unlabeled_row_dict, cluster_field),
    left_id_set={row['id'] for row in row_dict.values() if row[source_field] == left_source},
    right_id_set={row['id'] for row in row_dict.values() if row[source_field] == right_source}
)
len(unlabeled_pos_pairs)

381

In [19]:
import json

# Persist the unlabeled matching pairs as a JSON list.
# NOTE(review): if unlabeled_pos_pairs is a set, the order of the written list
# is arbitrary — confirm whether a sorted list is preferable for stable diffs.
with open('../../example-data/rl-unlabeled-pos-pairs.json', 'w', encoding='utf-8') as f:
    json.dump(list(unlabeled_pos_pairs), f, indent=4)

In [20]:
from entity_embed.data_utils.field_config_parser import DEFAULT_ALPHABET

# Per-field configuration, serialized to JSON in the next cell.
# The 'semantic_*' entries carry a 'key' naming the column they read, so
# 'name' and 'description' are each configured twice: once with an alphabet
# and once with the "fasttext.en.300d" vocab.
# NOTE(review): the `# compute` markers presumably mean a None max_str_len is
# computed from the data by the consumer — confirm.
alphabet = DEFAULT_ALPHABET
field_config_dict = {
    'name': {
        'field_type': "MULTITOKEN",
        'tokenizer': "entity_embed.default_tokenizer",
        'alphabet': alphabet,
        'max_str_len': None,  # compute
    },
    'semantic_name': {
        'key': 'name',
        'field_type': "SEMANTIC_MULTITOKEN",
        'tokenizer': "entity_embed.default_tokenizer",
        'vocab': "fasttext.en.300d",
    },
    'description': {
        'field_type': "MULTITOKEN",
        'tokenizer': "entity_embed.default_tokenizer",
        'alphabet': alphabet,
        'max_str_len': None,  # compute
    },
    'semantic_description': {
        'key': 'description',
        'field_type': "SEMANTIC_MULTITOKEN",
        'tokenizer': "entity_embed.default_tokenizer",
        'vocab': "fasttext.en.300d",
    },
    'manufacturer': {
        'field_type': "MULTITOKEN",
        'tokenizer': "entity_embed.default_tokenizer",
        'alphabet': alphabet,
        'max_str_len': None,  # compute
    },
    'price': {
        'field_type': "STRING",
        'tokenizer': "entity_embed.default_tokenizer",
        'alphabet': alphabet,
        'max_str_len': None,  # compute
    }
}

In [21]:
# Write the field configuration to the JSON file that the train command below
# passes as --field_config_json.
with open('../../example-data/rl-field-config.json', 'w', encoding='utf-8') as f:
    json.dump(field_config_dict, f, indent=4)

```bash
entity_embed_train \
    --field_config_json example-data/rl-field-config.json \
    --train_csv example-data/rl-train.csv \
    --valid_csv example-data/rl-valid.csv \
    --test_csv example-data/rl-test.csv \
    --unlabeled_csv example-data/rl-unlabeled.csv \
    --csv_encoding utf-8 \
    --cluster_field cluster \
    --source_field __source \
    --left_source amazon \
    --embedding_size 300 \
    --lr 0.001 \
    --min_epochs 5 \
    --max_epochs 100 \
    --early_stop_monitor valid_recall_at_0.3 \
    --early_stop_min_delta 0 \
    --early_stop_patience 20 \
    --early_stop_mode max \
    --tb_save_dir tb_logs \
    --tb_name rl-example \
    --check_val_every_n_epoch 1 \
    --batch_size 32 \
    --eval_batch_size 64 \
    --num_workers -1 \
    --multiprocessing_context fork \
    --sim_threshold 0.3 \
    --sim_threshold 0.5 \
    --sim_threshold 0.7 \
    --ann_k 100 \
    --m 64 \
    --max_m0 64 \
    --ef_construction 150 \
    --ef_search -1 \
    --random_seed 42 \
    --model_save_dir trained-models/rl/
```

```bash
entity_embed_predict \
    --model_save_filepath "trained-models/rl/...fill-here..." \
    --field_config_json example-data/rl-field-config.json \
    --ann_k 100 \
    --ef_search -1 \
    --ef_construction 150 \
    --max_m0 64 \
    --m 64 \
    --multiprocessing_context fork \
    --num_workers -1 \
    --sim_threshold 0.3 \
    --random_seed 42 \
    --eval_batch_size 50 \
    --csv_encoding utf-8 \
    --unlabeled_csv example-data/rl-unlabeled.csv \
    --output_json example-data/rl-prediction.json
```