# Data preparation for classifier training

This notebook shows how to prepare the data for classifier training.

    Input: 
        Text file with two TAB separated columns. The first column contains the label, the second - the sentence.
    Output: 
        .json files (with train and test split) with embeddings obtained from the different pre-trained embedding models:
            1) word-level fastText embeddings obtained from the model _wiki.en.bin_
            2) sentence-level BERT uncased embeddings obtained from the model _uncased_L-12_H-768_A-12_
            3) sentence-level BERT cased embeddings obtained from the model _cased_L-12_H-768_A-12_
            
    Before running this notebook, start the containers that serve the embeddings!
    The code of the BertVectorizer and FastTextVectorizer is located at '../../src/ClassicalNLP/DataVectorizer'

In [27]:
import sys
import os
import json
from sklearn.model_selection import train_test_split
from typing import List, Tuple, Dict
sys.path.append("../../src/ClassicalNLP/DataVectorizer/DatasetVectorizer/")
import argparse
import requests
from collections import defaultdict

In [28]:
def unpack_data(data: List[Tuple[str, str]]) -> List[Dict[str, str]]:
    """Convert ``(sentence, label)`` pairs into JSON-friendly records.

    Args:
        data: Sequence of two-element tuples; the first element is the
            sentence text, the second its class label.

    Returns:
        One dict per input pair, in the same order, with the sentence stored
        under ``"sentence"`` and the label under ``"class"``.
    """
    records: List[Dict[str, str]] = []
    for text, label in data:
        records.append({"sentence": text, "class": label})
    return records

In [29]:
class Vectorizer(object):
    """Minimal HTTP client for an embedding service running on localhost.

    Requests are sent to ``http://localhost:<port>/vectorize`` with the text
    passed as the ``q`` query parameter.
    """

    def __init__(self, port="", timeout=60.0):
        """Store the endpoint settings.

        Args:
            port: Port the embedding service listens on.
            timeout: Seconds to wait for the service before giving up.
                Pass ``None`` to wait indefinitely (the previous behaviour).
        """
        self.address = "http://localhost"
        self.port = port
        self.timeout = timeout
        self.urlPrefix = f"{self.address}:{self.port}/"

    def get(self, q):
        """Send ``q`` to the service and return the decoded JSON response.

        Args:
            q: Text to vectorize.

        Returns:
            Whatever JSON document the service answers with
            (presumably the embedding - confirm against the service code).

        Raises:
            requests.exceptions.Timeout: No answer within ``self.timeout`` seconds.
            requests.exceptions.HTTPError: The service answered with a 4xx/5xx status.
        """
        payload = {"q": q}
        # Without a timeout a stalled container would block the notebook forever.
        r = requests.get(self.urlPrefix + "vectorize", params=payload, timeout=self.timeout)
        # Fail loudly instead of storing an error body as if it were a vector.
        r.raise_for_status()
        return r.json()

    Specify the directory and the name of the _.tsv_ file with your data!
    The result _.json_ files will be saved in the same directory.

In [61]:
# Directory that holds the input .tsv file; the generated .json files are written here as well.
datadir = "C:/tmp/neasqc"
# Base name of the dataset: the input is read from "<datadir>/<dsName>.tsv".
dsName = "reviews"

In [62]:
# Full path of the TAB-separated input file, built from the two settings above.
input_file = f"{datadir}/{dsName}.tsv"

In [63]:
# Container for the (sentence, label) pairs that the next cell reads from the input file.
dataset: List[Tuple[str, str]] = []

In [64]:
# Start from an empty list on every run so that re-executing this cell does not
# append the file's rows to an already populated `dataset`.
dataset: List[Tuple[str, str]] = []

with open(input_file, "r", encoding="utf-8") as f:
    for line in f:
        cols = line.split('\t')
        # Keep only rows with exactly two TAB-separated columns: <label>\t<sentence>.
        if len(cols) == 2:
            sent = cols[1].rstrip()
            # Stored as (sentence, label), i.e. the reverse of the column order in the file.
            dataset.append((sent, cols[0].rstrip()))


In [65]:
# Class label of every example, in dataset order (used to stratify the split below).
classes = [label for _sentence, label in dataset]
classes

['4',
 '4',
 '5',
 '5',
 '5',
 '5',
 '5',
 '2',
 '5',
 '3',
 '5',
 '5',
 '5',
 '5',
 '3',
 '5',
 '5',
 '3',
 '5',
 '5',
 '5',
 '5',
 '3',
 '5',
 '4',
 '3',
 '3',
 '5',
 '5',
 '5',
 '5',
 '2',
 '5',
 '5',
 '5',
 '2',
 '1',
 '5',
 '5',
 '1',
 '5',
 '4',
 '5',
 '4',
 '1',
 '5',
 '3',
 '4',
 '5',
 '5',
 '4',
 '5',
 '5',
 '1',
 '4',
 '5',
 '3',
 '1',
 '4',
 '1',
 '5',
 '5',
 '3',
 '3',
 '5',
 '5',
 '5',
 '4',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '1',
 '4',
 '5',
 '5',
 '5',
 '5',
 '5',
 '4',
 '3',
 '5',
 '5',
 '4',
 '5',
 '1',
 '5',
 '1',
 '1',
 '4',
 '1',
 '2',
 '1',
 '1',
 '5',
 '5',
 '5',
 '5',
 '5',
 '3',
 '2',
 '1',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '1',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '1',
 '5',
 '5',
 '1',
 '5',
 '4',
 '1',
 '5',
 '3',
 '5',
 '2',
 '4',
 '2',
 '5',
 '5',
 '4',
 '5',
 '5',
 '5',
 '5',
 '2',
 '5',
 '5',
 '5',
 '3',
 '2',
 '1',
 '2',
 '1',
 '4',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '4',
 '2',
 '5',
 '4'

In [66]:
# 90 % of the examples go to training, the rest to testing. The split is stratified on
# the class labels and uses a fixed random_state so that it is reproducible.
train_data, test_data = train_test_split(dataset, train_size=0.9, random_state=1, stratify=classes)

In [67]:
# Group the test sentences by their class label.
my_result = defaultdict(list)
for element in test_data:
    my_result[element[1]].append(element[0])

my_result = dict(my_result)
result_dictionary = dict()

# Share of each class among ALL test examples. Every example is counted (not only
# distinct sentences) so that numerator and denominator are consistent and the
# proportions sum to 1 even when a sentence occurs more than once.
for key in my_result:
    result_dictionary[key] = len(my_result[key]) / len(test_data)
print(f"*** Proportion of classes in {len(test_data)} examples of test data ***")
print(json.dumps(result_dictionary, indent=4, sort_keys=True))

*** Proportion of classes in 5926 examples of test data ***
{
    "1": 0.11592980087748903,
    "2": 0.0661491731353358,
    "3": 0.08521768477894026,
    "4": 0.13297333783327708,
    "5": 0.5997300033749579
}


In [68]:
# Group the training sentences by their class label.
my_result = defaultdict(list)
for element in train_data:
    my_result[element[1]].append(element[0])

my_result = dict(my_result)
result_dictionary = dict()

# Share of each class among ALL training examples. Every example is counted (not only
# distinct sentences) so that numerator and denominator are consistent and the
# proportions sum to 1 even when a sentence occurs more than once.
for key in my_result:
    result_dictionary[key] = len(my_result[key]) / len(train_data)
print(f"*** Proportion of classes in {len(train_data)} examples of train data ***")
print(json.dumps(result_dictionary, indent=4, sort_keys=True))

*** Proportion of classes in 53329 examples of train data ***
{
    "1": 0.11599692475013595,
    "2": 0.0662491327420353,
    "3": 0.08518817153893754,
    "4": 0.13289204747885766,
    "5": 0.5996737234900336
}


In [69]:
# Turn the (sentence, label) tuples into {"sentence": ..., "class": ...} dicts.
# NOTE: this rebinds `test_data`, so the cell is not safe to re-run on its own -
# re-run the split cell first, otherwise unpack_data receives dicts instead of tuples.
test_data = unpack_data(test_data)

In [70]:
# Turn the (sentence, label) tuples into {"sentence": ..., "class": ...} dicts.
# NOTE: this rebinds `train_data`, so the cell is not safe to re-run on its own -
# re-run the split cell first, otherwise unpack_data receives dicts instead of tuples.
train_data = unpack_data(train_data)

    Make sure that the containers serving the embeddings are available from localhost through their respective ports!

In [None]:
# One client per embedding service, keyed by the suffix used in the output file name.
vcdict = {
    "FASTTEXT": Vectorizer(port="11111"),      # wiki.en.bin
    "BERT_UNCASED": Vectorizer(port="12345"),  # uncased_L-12_H-768_A-12
    "BERT_CASED": Vectorizer(port="22222"),    # cased_L-12_H-768_A-12
}

In [None]:
# For every embedding service: vectorize all sentences of both splits and write the
# result to "<datadir>/<dsName>_<key>.json".
for key, vectorizer in vcdict.items():
    ds = {"train_data": train_data, "test_data": test_data}
    # The records are shared between passes: each service overwrites the
    # "sentence_vectorized" entry in place before its own file is dumped.
    for split in (train_data, test_data):
        for item in split:
            item["sentence_vectorized"] = vectorizer.get(item["sentence"])
    with open(f"{datadir}/{dsName}_{key}.json", "w", encoding="utf-8") as f:
        json.dump(ds, f, indent=2, ensure_ascii=False)