# CLI Record Linkage CSV Generation

## Boilerplate

In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to imported source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from importlib import reload
import logging

# Re-execute the `logging` module so basicConfig() below configures a fresh
# root logger; basicConfig() is a no-op when the root logger already has
# handlers attached.
reload(logging)
logging.basicConfig(
    level=logging.INFO,
    datefmt='%H:%M:%S',
    format='%(asctime)s %(levelname)s:%(message)s',
)

In [3]:
import sys

# Give the parent directory top priority on the module search path; presumably
# that is where the local `entity_embed` package imported by later cells lives.
parent_dir = '..'
sys.path.insert(0, parent_dir)

In [4]:
# Seed for the random.Random instance that shuffles the clusters before they
# are split into the labeled and unlabeled example files.
random_seed = 42

## Load Dataset

In [5]:
# A plain `import urllib` does not load the `request` submodule, so import it
# explicitly instead of relying on some other package having done so already.
import urllib.request
import tempfile

dataset_url = 'https://dbs.uni-leipzig.de/file/Amazon-GoogleProducts.zip'

# Reserve a named temporary file for the archive. It is closed immediately so
# that urlretrieve can write to the path (reopening a still-open temporary file
# by name is not possible on Windows), and delete=False keeps it on disk after
# closing so the next cell can read it.
tf = tempfile.NamedTemporaryFile(mode='r', delete=False)
tf.close()

# The trailing semicolon suppresses the (filename, headers) tuple in the output.
urllib.request.urlretrieve(dataset_url, tf.name);

In [6]:
import zipfile
import os

# Unpack the downloaded archive into a temporary directory; the following
# cells read the CSV files from `td.name`.
td = tempfile.TemporaryDirectory()

with zipfile.ZipFile(tf.name, mode="r") as archive:
    archive.extractall(path=td.name)

# Show the extracted file names as the cell output.
os.listdir(td.name)

['Amazon.csv', 'GoogleProducts.csv', 'Amzon_GoogleProducts_perfectMapping.csv']

In [7]:
import csv

from entity_embed.data_utils.utils import Enumerator

# `id_enumerator` maps every source id to the id used as the key of `row_dict`;
# the left/right sets record which of those ids came from which source.
id_enumerator = Enumerator()
row_dict = {}
left_id_set = set()
right_id_set = set()

# One entry per source file:
# (file name, value stored in '__source', column renames, id set to fill).
source_specs = [
    ('Amazon.csv', 'amazon', {'title': 'name'}, left_id_set),  # in Amazon, name is called title
    ('GoogleProducts.csv', 'google', {}, right_id_set),
]

for file_name, source_tag, renames, side_id_set in source_specs:
    with open(f'{td.name}/{file_name}', newline='', encoding="latin1") as f:
        for row in csv.DictReader(f):
            source_id = row['id']
            row['id_at_source'] = source_id  # keep the original id for output files
            row['id'] = id_enumerator[source_id]
            for old_column, new_column in renames.items():
                row[new_column] = row.pop(old_column)
            row['__source'] = source_tag  # for easy debugging, not used by entity-embed
            row_dict[row['id']] = row
            side_id_set.add(row['id'])

In [8]:
# Ground-truth matches as (amazon id, google id) pairs, expressed with the ids
# handed out by `id_enumerator`.
true_pair_set = set()

# Decode with the same encoding as the product files so that a source id always
# yields the same enumerator key, independent of the platform default encoding.
# ("Amzon" is the file name as it appears in the archive listing.)
with open(f'{td.name}/Amzon_GoogleProducts_perfectMapping.csv', newline='', encoding="latin1") as f:
    for row in csv.DictReader(f):
        id_left = id_enumerator[row['idAmazon']]
        id_right = id_enumerator[row['idGoogleBase']]
        true_pair_set.add((id_left, id_right))

# Number of distinct ground-truth pairs, shown as the cell output.
len(true_pair_set)

1300

In [9]:
from entity_embed.data_utils.utils import id_pairs_to_cluster_mapping_and_dict

# Group the ground-truth pairs into clusters. Later cells iterate
# `cluster_dict.items()` as (cluster id, collection of record ids);
# `cluster_mapping` is presumably the inverse lookup (record id -> cluster id)
# — TODO confirm against entity_embed.
cluster_mapping, cluster_dict = id_pairs_to_cluster_mapping_and_dict(true_pair_set)
# Number of clusters, shown as the cell output.
len(cluster_dict)

1105

In [10]:
import os

# Everything needed from the download is now held in memory (row_dict,
# true_pair_set) and no later cell reads from disk, so remove the extracted
# files and the archive instead of leaving them behind in the temp directory.
td.cleanup()
if os.path.exists(tf.name):  # keeps the cell safe to run more than once
    os.remove(tf.name)

## Preprocess

In [11]:
# Record attributes that the preprocessing cell below normalizes with clean_str.
attr_list = ['name', 'description', 'manufacturer', 'price']

In [12]:
import unidecode
import itertools
from entity_embed import default_tokenizer

def clean_str(s, max_tokens=100, max_chars=1000):
    """Normalize a raw attribute value into a bounded, lower-case token string.

    The value is transliterated with ``unidecode``, lower-cased and stripped,
    split with ``default_tokenizer``, cut to at most ``max_tokens`` tokens,
    re-joined with single spaces and finally cut to at most ``max_chars``
    characters (the character cut may fall in the middle of a token).

    Args:
        s: The raw string, or ``None`` for a missing value.
        max_tokens: Maximum number of tokens kept.
        max_chars: Maximum length of the returned string.

    Returns:
        The cleaned string; ``''`` when ``s`` is ``None``.
    """
    if s is None:
        # csv.DictReader fills fields missing from a short row with None.
        return ''
    normalized = unidecode.unidecode(s).lower().strip()
    tokens = default_tokenizer(normalized)[:max_tokens]
    return ' '.join(tokens)[:max_chars]

# Overwrite every attribute listed in attr_list with its cleaned form, in place
# on the records held by row_dict.
for record in row_dict.values():
    cleaned_values = {attr: clean_str(record[attr]) for attr in attr_list}
    record.update(cleaned_values)

## CSV Generation

In [13]:
import random

# Dedicated generator so the shuffle depends only on random_seed.
rnd = random.Random(random_seed)

# Columns, in output order, of the CSV files produced by write_csv.
fieldnames = [
    'cluster',
    'name',
    'description',
    'manufacturer',
    'price',
    'id_at_source',
    '__source',
]

# Shuffle the (cluster id, record ids) pairs, then cut the list in two: the
# first n_train_clusters clusters go to the labeled file, the rest to the
# unlabeled one.
n_train_clusters = 600
cluster_dict_list_all = list(cluster_dict.items())
rnd.shuffle(cluster_dict_list_all)
cluster_dict_list_train = cluster_dict_list_all[:n_train_clusters]
cluster_dict_list_eval = cluster_dict_list_all[n_train_clusters:]

def write_csv(filepath, cluster_dict_list, include_labels):
    """Write the records of the given clusters to a CSV file.

    The columns are the module-level ``fieldnames``; record keys that are not
    listed there are dropped. The records stored in ``row_dict`` are left
    untouched: the label is added to a per-row copy, so a label written by one
    call can never show up in the output of a later unlabeled call.

    Args:
        filepath: Destination path; created or overwritten, encoded as UTF-8.
        cluster_dict_list: Iterable of ``(cluster_id, record_ids)`` pairs; each
            record id is looked up in ``row_dict``.
        include_labels: When true, the 'cluster' column holds ``cluster_id``;
            otherwise the column is left empty.

    Raises:
        KeyError: If a record id is not present in ``row_dict``.
    """
    with open(filepath, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for cluster_id, cluster_list in cluster_dict_list:
            for id_ in cluster_list:
                # Copy only the output columns; 'cluster' is excluded here so it
                # is emitted solely when labels were requested (DictWriter
                # writes an empty string for a missing key).
                out_row = {
                    k: v
                    for k, v in row_dict[id_].items()
                    if k in fieldnames and k != 'cluster'
                }
                if include_labels:
                    out_row['cluster'] = cluster_id
                writer.writerow(out_row)
                
# The labeled example keeps the cluster id of every record.
write_csv('../example-data/rl-example-labeled.csv', cluster_dict_list_train, include_labels=True)
# The unlabeled example must not reveal its clusters: the ground truth for these
# records is written separately to rl-example-labels-for-unlabeled.csv.
write_csv('../example-data/rl-example-unlabeled.csv', cluster_dict_list_eval, include_labels=False)

In [14]:
# Ground truth for the evaluation clusters: one line per record giving its
# cluster id together with the record's original id and source.
with open('../example-data/rl-example-labels-for-unlabeled.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['cluster', 'id_at_source', '__source'])
    writer.writeheader()
    writer.writerows(
        {
            'cluster': cluster_id,
            'id_at_source': row_dict[id_]['id_at_source'],
            '__source': row_dict[id_]['__source'],
        }
        for cluster_id, cluster_list in cluster_dict_list_eval
        for id_ in cluster_list
    )

## JSON Generation

In [18]:
from entity_embed.data_utils.helpers import DEFAULT_ALPHABET

alphabet = DEFAULT_ALPHABET


def _char_field_info(field_type):
    """Build the settings shared by the alphabet-based fields for `field_type`."""
    return {
        'field_type': field_type,
        'tokenizer': "entity_embed.default_tokenizer",
        'alphabet': alphabet,
        'max_str_len': None,  # compute
        'use_mask': True,
    }


def _semantic_field_info(source_attr):
    """Build the settings of a semantic field that reads from `source_attr`."""
    return {
        'source_attr': source_attr,
        'field_type': "SEMANTIC_MULTITOKEN",
        'tokenizer': "entity_embed.default_tokenizer",
        'vocab': "fasttext.en.300d",
        'use_mask': True,
    }


# Field configuration written to the attr-info JSON file. Insertion order is
# kept because it determines the key order of the serialized JSON.
attr_info_dict = {
    'name': _char_field_info("MULTITOKEN"),
    'semantic_name': _semantic_field_info('name'),
    'description': _char_field_info("MULTITOKEN"),
    'semantic_description': _semantic_field_info('description'),
    'manufacturer': _char_field_info("MULTITOKEN"),
    'price': _char_field_info("STRING"),
}

In [19]:
import json

# Serialize the field configuration as 4-space indented JSON and store it next
# to the example CSV files.
attr_info_json = json.dumps(attr_info_dict, indent=4)
with open('../example-data/rl-example-attr-info.json', 'w', encoding='utf-8') as f:
    f.write(attr_info_json)