# CLI Deduplication CSV Generation

## Boilerplate

In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell execution so edits to source files on disk are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from importlib import reload
import logging

# Re-execute the logging module first so the root logger starts without
# handlers; basicConfig() does nothing when handlers are already attached.
reload(logging)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s:%(message)s',
    datefmt='%H:%M:%S',
)

In [3]:
import sys

# Put the parent directory first on the module search path so imports in the
# cells below are resolved there before any other location.
# NOTE(review): presumably this makes the local entity_embed checkout importable — confirm.
sys.path.insert(0, '..')

In [4]:
# Seed for the random.Random instance that shuffles the cluster list before it
# is split into labeled and unlabeled parts.
random_seed = 42

## Load Dataset

In [5]:
# `import urllib` alone does not load the `request` submodule; import it
# explicitly so `urllib.request.urlretrieve` works on a fresh kernel.
import urllib.request
import tempfile

dataset_url = 'https://www.informatik.uni-leipzig.de/~saeedi/musicbrainz-20-A01.csv.dapo'

# Reserve a named temporary file and close the handle right away: only its
# path is needed as the download target, and delete=False keeps the file on
# disk after the handle is closed.
tf = tempfile.NamedTemporaryFile(mode='r', delete=False)
tf.close()

# The trailing semicolon suppresses the notebook echo of the return value.
urllib.request.urlretrieve(dataset_url, tf.name);

In [6]:
import csv

# Maps the sequential row id assigned below to the row's dict of columns.
row_dict = {}

# Read with an explicit encoding instead of the platform default, matching the
# utf-8 encoding used for every file this notebook writes.
# NOTE(review): assumes the downloaded dataset is UTF-8 encoded — confirm.
with open(tf.name, encoding="utf-8", newline="") as f:
    for current_row_id, row in enumerate(csv.DictReader(f)):
        row["id_at_source"] = row["id"]  # keep the id found in the file
        row["id"] = current_row_id  # replace it with a sequential 0-based id
        row["cluster"] = int(row.pop("CID"))  # rename CID to "cluster" and convert to int
        row_dict[current_row_id] = row

In [7]:
from entity_embed.data_utils import utils

cluster_attr = 'cluster'

# Group the rows by their value of `cluster_attr`.
cluster_dict = utils.row_dict_to_cluster_dict(row_dict, cluster_attr)

# Reverse lookup: member id -> id of the cluster that contains it.
cluster_mapping = dict(
    (member_id, owner_cluster_id)
    for owner_cluster_id, member_ids in cluster_dict.items()
    for member_id in member_ids
)

# Displayed as the cell output: the number of clusters.
len(cluster_dict)

10000

In [8]:
import os

# The temporary file's handle was closed right after it was created, so only
# the file on disk is left to clean up. Guard the removal so that re-running
# this cell does not raise FileNotFoundError.
if os.path.exists(tf.name):
    os.remove(tf.name)

## Preprocess

In [9]:
# Record attributes that are cleaned in the preprocessing step and written as
# columns of the generated CSV files.
attr_list = ['number', 'title', 'artist', 'album', 'year', 'language']

In [10]:
import unidecode
import itertools
from entity_embed import default_tokenizer

def clean_str(s, max_tokens=100, max_chars=1000):
    """Normalize one attribute value into a bounded, lowercase string.

    The value is passed through ``unidecode``, lowercased, stripped,
    split with ``default_tokenizer``, cut to the first ``max_tokens``
    tokens, re-joined with single spaces and finally cut to ``max_chars``
    characters.

    Args:
        s: the raw value, or None. ``csv.DictReader`` fills fields that are
            missing from a short row with None, so that case is accepted.
        max_tokens: maximum number of tokens kept.
        max_chars: maximum length of the returned string. The cut is made
            on characters, so the last token may be truncated.

    Returns:
        The cleaned string; ``''`` when ``s`` is None.
    """
    if s is None:
        return ''
    s = unidecode.unidecode(s).lower().strip()
    s_tokens = default_tokenizer(s)[:max_tokens]
    return ' '.join(s_tokens)[:max_chars]

# Clean every configured attribute of every row, in place.
for row in row_dict.values():
    row.update((attr, clean_str(row[attr])) for attr in attr_list)

## CSV Generation

In [11]:
import random

rnd = random.Random(random_seed)

# Number of clusters (not rows) assigned to the labeled split.
labeled_len = 8000

# Output columns: the cluster label first, then the attributes, then the original id.
fieldnames = [cluster_attr] + list(attr_list) + ['id_at_source']

# Shuffle the (cluster_id, member ids) pairs with the seeded generator, then
# cut the shuffled list at `labeled_len`.
cluster_dict_list_all = list(cluster_dict.items())
rnd.shuffle(cluster_dict_list_all)
cluster_dict_list_train, cluster_dict_list_eval = (
    cluster_dict_list_all[:labeled_len],
    cluster_dict_list_all[labeled_len:],
)

def write_csv(filepath, cluster_dict_list, fieldnames, include_labels):
    """Write the rows of the given clusters to a CSV file.

    Args:
        filepath: destination path; the file is created or overwritten and
            encoded as UTF-8.
        cluster_dict_list: iterable of ``(cluster_id, member_ids)`` pairs.
            Each member id is looked up in the module-level ``row_dict``.
        fieldnames: column names, in output order.
        include_labels: when false, the ``cluster_attr`` column is left out.
    """
    if include_labels:
        columns = fieldnames
    else:
        columns = [name for name in fieldnames if name != cluster_attr]

    with open(filepath, 'w', encoding='utf-8', newline='') as out_file:
        # extrasaction='ignore' makes the writer drop row keys that are not
        # among the output columns.
        writer = csv.DictWriter(out_file, fieldnames=columns, extrasaction='ignore')
        writer.writeheader()
        for _, member_ids in cluster_dict_list:
            writer.writerows(row_dict[member_id] for member_id in member_ids)
                
# Labeled split: keeps the cluster column.
write_csv(
    filepath='../example-data/er-example-labeled.csv',
    cluster_dict_list=cluster_dict_list_train,
    fieldnames=fieldnames,
    include_labels=True,
)
# Unlabeled split: same columns without the cluster column.
write_csv(
    filepath='../example-data/er-example-unlabeled.csv',
    cluster_dict_list=cluster_dict_list_eval,
    fieldnames=fieldnames,
    include_labels=False,
)

In [12]:
# Write the held-back cluster labels of the unlabeled split, one line per row,
# keyed by the row's original id.
with open('../example-data/er-example-labels-for-unlabeled.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['cluster', 'id_at_source'])
    writer.writeheader()
    writer.writerows(
        {'cluster': cluster_id, 'id_at_source': row_dict[id_]['id_at_source']}
        for cluster_id, cluster_list in cluster_dict_list_eval
        for id_ in cluster_list
    )

## JSON Generation

In [13]:
from entity_embed.data_utils.helpers import DEFAULT_ALPHABET

alphabet = DEFAULT_ALPHABET


def _string_field():
    """Build a fresh config dict for a "STRING" field."""
    return {
        'field_type': "STRING",
        'alphabet': alphabet,
        'max_str_len': None,  # compute
    }


def _multitoken_field():
    """Build a fresh config dict for a "MULTITOKEN" field."""
    return {
        'field_type': "MULTITOKEN",
        'tokenizer': "entity_embed.default_tokenizer",
        'alphabet': alphabet,
        'use_mask': True,
        'max_str_len': None,  # compute
    }


def _semantic_multitoken_field(source_attr):
    """Build a fresh config dict for a "SEMANTIC_MULTITOKEN" field that reads `source_attr`."""
    return {
        'source_attr': source_attr,
        'field_type': "SEMANTIC_MULTITOKEN",
        'tokenizer': "entity_embed.default_tokenizer",
        'vocab': "fasttext.en.300d",
        'use_mask': True,
    }


# Field name -> field configuration. The insertion order is the order in which
# the entries are serialized to JSON.
attr_info_dict = {
    'number': _string_field(),
    'title': _multitoken_field(),
    'title_semantic': _semantic_multitoken_field('title'),
    'artist': _multitoken_field(),
    'album': _multitoken_field(),
    'album_semantic': _semantic_multitoken_field('album'),
    'year': _string_field(),
    'language': _string_field(),
}

In [14]:
import json

# Serialize the field configuration with 4-space indentation and save it.
with open('../example-data/er-example-attr-info.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(attr_info_dict, indent=4))