# CLI Deduplication CSV Generation

## Boilerplate

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Configure INFO-level logging with an HH:MM:SS timestamp prefix on every message.
from importlib import reload
import logging
# The logging module is reloaded before it is configured — presumably so that
# basicConfig takes effect even if the kernel already installed handlers (TODO confirm).
reload(logging)
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%H:%M:%S')

In [3]:
import sys

# Put the directory two levels above the working directory first on the import path —
# presumably so the local entity_embed checkout is the one imported (TODO confirm).
sys.path.insert(0, '../..')

In [4]:
# Seed passed to both record splits below and to random.Random in the CSV section.
random_seed = 42

## Load Dataset

In [5]:
import tempfile
# Import the submodule explicitly: a bare `import urllib` does not load
# `urllib.request`, so `urllib.request.urlretrieve` would only work when some
# other module happened to import it first.
import urllib.request

dataset_url = 'https://www.informatik.uni-leipzig.de/~saeedi/musicbrainz-20-A01.csv.dapo'

# Reserve a named temporary file and close the handle right away: only its path is
# needed as the download target. delete=False keeps the file on disk after closing.
tf = tempfile.NamedTemporaryFile(mode='r', delete=False)
tf.close()

# Download the dataset into the temporary file (trailing `;` hides the cell output).
urllib.request.urlretrieve(dataset_url, tf.name);

In [6]:
import csv

# Maps record id -> record (one dict per CSV row).
record_dict = {}

# Decode as UTF-8 explicitly instead of relying on the platform default encoding,
# matching the explicit UTF-8 used for every file this notebook writes.
with open(tf.name, encoding='utf-8', newline="") as f:
    for current_record_id, record in enumerate(csv.DictReader(f)):
        record["id"] = current_record_id  # the zero-based row position becomes the record id
        record["cluster"] = int(record.pop("CID"))  # rename CID to "cluster" and convert to int
        record_dict[record["id"]] = record

In [7]:
from entity_embed.data_utils import utils

cluster_field = 'cluster'
cluster_dict = utils.record_dict_to_cluster_dict(record_dict, cluster_field)

# Invert cluster_dict into a flat lookup: record id -> id of the cluster containing it.
cluster_mapping = {}
for cluster_id, cluster in cluster_dict.items():
    for id_ in cluster:
        cluster_mapping[id_] = cluster_id

# Displayed as the cell output: number of clusters in the dataset.
len(cluster_dict)

10000

In [8]:
from entity_embed.data_utils import utils

# First split: 20% of the clusters go to train, 20% to valid, and the third returned
# dict receives the remainder (the logged sizes below show 1000 / 1000 / 3000).
train_record_dict, valid_record_dict, test_record_dict = utils.split_record_dict_on_clusters(
    record_dict=record_dict,
    cluster_field=cluster_field,
    train_proportion=0.2,
    valid_proportion=0.2,
    random_seed=random_seed)

12:30:01 INFO:Singleton cluster sizes (train, valid, test):(1000, 1000, 3000)
12:30:01 INFO:Plural cluster sizes (train, valid, test):(1000, 1000, 3000)


In [9]:
from entity_embed.data_utils import utils

# Second split: halve the held-out records from the first split. One half stays as
# the test set, the other half becomes the "unlabeled" set.
# NOTE(review): this cell both reads and reassigns test_record_dict, so re-running it
# without re-running the first split would halve the test set again.
test_record_dict, unlabeled_record_dict, rest_dict = utils.split_record_dict_on_clusters(
    record_dict=test_record_dict,
    cluster_field=cluster_field,
    train_proportion=0.5,
    valid_proportion=0.5,
    random_seed=random_seed)

# With proportions 0.5 + 0.5 the third dict is expected to be empty (the logged sizes
# show 0); merging it into the unlabeled set means no record is dropped if it is not.
unlabeled_record_dict.update(rest_dict)

12:30:01 INFO:Singleton cluster sizes (train, valid, test):(1500, 1500, 0)
12:30:01 INFO:Plural cluster sizes (train, valid, test):(1500, 1500, 0)


In [10]:
# Drop the reference to the full dataset; the cells below only use the per-split dicts.
del record_dict

In [11]:
import os

# tf was already closed right after it was created, so this close() is redundant.
tf.close()
# Remove the downloaded file explicitly, since it was created with delete=False.
os.remove(tf.name)

## Preprocess

In [12]:
# Every split, collected so the cleaning step can be applied to all of them in one loop.
all_record_dicts = [
    train_record_dict,
    valid_record_dict,
    test_record_dict,
    unlabeled_record_dict
]

In [13]:
# Record fields that are cleaned and then exported as CSV columns (alongside 'id').
field_list = ['number', 'title', 'artist', 'album', 'year', 'language']

In [14]:
import unidecode


def clean_str(s):
    """Return `s` passed through unidecode, lower-cased, with surrounding whitespace removed."""
    ascii_text = unidecode.unidecode(s)
    lowered = ascii_text.lower()
    return lowered.strip()


# Clean every exported field of every record, in place, across all splits.
for record_dict_ in all_record_dicts:
    for record in record_dict_.values():
        for field in field_list:
            raw_value = record[field]
            record[field] = clean_str(raw_value)

## CSV Generation

In [15]:
import random

# NOTE(review): rnd is not used by any cell visible in this notebook — confirm
# whether it is still needed.
rnd = random.Random(random_seed)

# CSV columns shared by all exports: the record id followed by the cleaned fields.
fieldnames = ['id', *field_list]

def write_csv(filepath, record_dict_, fieldnames, include_labels):
    """Write the records of `record_dict_` to a UTF-8 CSV file at `filepath`.

    Only the columns named in `fieldnames` are written; any other record keys are
    left out. When `include_labels` is true, the column named by the module-level
    `cluster_field` is written as the first column. The caller's `fieldnames`
    list is never modified.
    """
    columns = [cluster_field, *fieldnames] if include_labels else fieldnames

    with open(filepath, 'w', encoding='utf-8', newline='') as csv_file:
        # extrasaction='ignore' makes the writer skip record keys that are not columns.
        writer = csv.DictWriter(csv_file, fieldnames=columns, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(record_dict_.values())
                
# One entry per exported split: (output path, records, whether to write the cluster column).
# Only the unlabeled split is written without labels.
csv_exports = [
    ('../../example-data/er-train.csv', train_record_dict, True),
    ('../../example-data/er-valid.csv', valid_record_dict, True),
    ('../../example-data/er-test.csv', test_record_dict, True),
    ('../../example-data/er-unlabeled.csv', unlabeled_record_dict, False),
]
for csv_path, split_record_dict, has_labels in csv_exports:
    write_csv(csv_path, split_record_dict, fieldnames, include_labels=has_labels)

## JSON Generation

In [16]:
# Group the unlabeled records by their cluster label, then turn those clusters
# into id pairs with the library helper.
unlabeled_cluster_dict = utils.record_dict_to_cluster_dict(unlabeled_record_dict, cluster_field)
unlabeled_pos_pairs = utils.cluster_dict_to_id_pairs(cluster_dict=unlabeled_cluster_dict)

# Displayed as the cell output: number of id pairs.
len(unlabeled_pos_pairs)

4932

In [17]:
import json

# Save the pairs so the evaluation cell at the end of the notebook can load them.
# list(...) is presumably needed because the pairs come back as a set, which json
# cannot serialize (TODO confirm).
with open('../../example-data/er-unlabeled-pos-pairs.json', 'w', encoding='utf-8') as f:
    json.dump(list(unlabeled_pos_pairs), f, indent=4)

In [18]:
from entity_embed.data_utils.field_config_parser import DEFAULT_ALPHABET

alphabet = DEFAULT_ALPHABET


def _string_field():
    """Build a fresh config dict for a STRING field."""
    return {
        'field_type': "STRING",
        'alphabet': alphabet,
        'max_str_len': None,  # compute
    }


def _multitoken_field():
    """Build a fresh config dict for a MULTITOKEN field using the default tokenizer."""
    return {
        'field_type': "MULTITOKEN",
        'tokenizer': "entity_embed.default_tokenizer",
        'alphabet': alphabet,
        'max_str_len': None,  # compute
    }


def _semantic_multitoken_field(key):
    """Build a fresh config dict for a SEMANTIC_MULTITOKEN field that reads record field `key`."""
    return {
        'key': key,
        'field_type': "SEMANTIC_MULTITOKEN",
        'tokenizer': "entity_embed.default_tokenizer",
        'vocab': "fasttext.en.300d",
    }


# The insertion order below is the key order of the JSON file written from this dict.
field_config_dict = {
    'number': _string_field(),
    'title': _multitoken_field(),
    'title_semantic': _semantic_multitoken_field('title'),
    'artist': _multitoken_field(),
    'album': _multitoken_field(),
    'album_semantic': _semantic_multitoken_field('album'),
    'year': _string_field(),
    'language': _string_field(),
}

In [19]:
# Write the field config to the path the training command below passes as --field_config_json.
with open('../../example-data/er-field-config.json', 'w', encoding='utf-8') as f:
    json.dump(field_config_dict, f, indent=4)

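Train the model with the CLI. The paths below are relative to the directory that contains `example-data/`, i.e. two levels above this notebook's working directory:

```bash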
entity_embed_train \
    --field_config_json example-data/er-field-config.json \
    --train_csv example-data/er-train.csv \
    --valid_csv example-data/er-valid.csv \
    --test_csv example-data/er-test.csv \
    --unlabeled_csv example-data/er-unlabeled.csv \
    --csv_encoding utf-8 \
    --cluster_field cluster \
    --embedding_size 300 \
    --lr 0.001 \
    --min_epochs 5 \
    --max_epochs 100 \
    --early_stop_monitor valid_recall_at_0.3 \
    --early_stop_min_delta 0 \
    --early_stop_patience 20 \
    --early_stop_mode max \
    --tb_save_dir tb_logs \
    --tb_name er-example \
    --check_val_every_n_epoch 1 \
    --batch_size 32 \
    --eval_batch_size 64 \
    --num_workers -1 \
    --multiprocessing_context fork \
    --sim_threshold 0.3 \
    --sim_threshold 0.5 \
    --sim_threshold 0.7 \
    --ann_k 100 \
    --m 64 \
    --max_m0 64 \
    --ef_construction 150 \
    --ef_search -1 \
    --random_seed 42 \
    --model_save_dir trained-models/er/ \
    --use_gpu 1
```

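After training, replace the `...fill-here...` placeholder in `--model_save_filepath` with the model file saved under `trained-models/er/`, then run prediction on the unlabeled CSV:

```bash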
entity_embed_predict \
    --model_save_filepath "trained-models/er/...fill-here..." \
    --unlabeled_csv example-data/er-unlabeled.csv \
    --csv_encoding utf-8 \
    --eval_batch_size 50 \
    --num_workers -1 \
    --multiprocessing_context fork \
    --sim_threshold 0.3 \
    --ann_k 100 \
    --m 64 \
    --max_m0 64 \
    --ef_construction 150 \
    --ef_search -1 \
    --random_seed 42 \
    --output_json example-data/er-prediction.json \
    --use_gpu 1
```

In [21]:
from entity_embed.evaluation import evaluate_output_json

# Inputs: the unlabeled CSV and the positive-pair JSON written by this notebook,
# plus the JSON produced by the entity_embed_predict command (--output_json).
metrics = evaluate_output_json(
    unlabeled_csv_filepath='../../example-data/er-unlabeled.csv',
    output_json_filepath='../../example-data/er-prediction.json',
    pos_pair_json_filepath='../../example-data/er-unlabeled-pos-pairs.json'
)
precision, recall, f1, pe_ratio = metrics

# Print one "name value" line per metric.
named_metrics = (
    ("precision", precision),
    ("recall", recall),
    ("f1", f1),
    ("pe_ratio", pe_ratio),
)
for metric_name, metric_value in named_metrics:
    print(metric_name, metric_value)

precision 0.027437097685239557
recall 0.994728304947283
f1 0.05340125502745712
pe_ratio 30.638965044551064
