# Data preparation for classifier training

This notebook shows how to prepare the data for classifier training.

    Input: 
        Text file with two TAB separated columns. The first column contains the label, the second - the sentence.
    Output: 
        .json files (with train and test split) with embeddings obtained from the different pre-trained embedding models:
            1) word-level fastText embeddings: model cc.en.300.bin
                (https://fasttext.cc/docs/en/crawl-vectors.html)
            2) sentence-level transformer embeddings: model all-mpnet-base-v2
                (https://www.sbert.net/docs/pretrained_models.html#model-overview)
            3) sentence-level transformer embeddings: model all-distilroberta-v1
                (https://www.sbert.net/docs/pretrained_models.html#model-overview)
            4) sentence-level BERT cased embeddings: model BERT-Base, Cased
                (https://github.com/google-research/bert#pre-trained-models)
            5) sentence-level BERT uncased embeddings: model BERT-Base, Uncased
                (https://github.com/google-research/bert#pre-trained-models)
                
        (.._traintest.json files contain train/test split without embedding vectors)
            
  

In [1]:
import sys
import os
import json
from sklearn.model_selection import train_test_split
from typing import List, Tuple, Dict
sys.path.append("./data_vectorisation/")
from Embeddings import Embeddings
import argparse
import requests
from collections import defaultdict

In [2]:
def unpack_data(data: List[Tuple[str, str]]) -> List[Dict[str, str]]:
    """Turn (sentence, label) pairs into JSON-ready records.

    Args:
        data: Sequence of two-element items; the first element is the sentence
            text and the second is its class label.

    Returns:
        A new list with one dict per input item, holding the text under
        ``"sentence"`` and the label under ``"class"``. Input order is kept.
    """
    records: List[Dict[str, str]] = []
    for text, label in data:
        records.append({"sentence": text, "class": label})
    return records

    Specify the directory and the name of the _.tsv_ file with your data!
    The result _.json_ files will be saved in the same directory.

In [3]:
# Directory that holds the input .tsv file; the generated .json files are written here too.
datadir = "../datasets"
# Base name of the dataset: the input is read from {datadir}/{dsName}.tsv and the
# outputs are named {dsName}_<suffix>.json.
dsName = "reviews"
# Alternative dataset (currently disabled):
#dsName = "labelled_newscatcher_dataset"

In [4]:
# Path of the TAB-separated input file, built from datadir and dsName.
input_file = f"{datadir}/{dsName}.tsv"

In [5]:
# Holds the loaded examples as (sentence, label) tuples; filled while reading the input file.
dataset: List[Tuple[str, str]] = []

In [6]:
# Start from an empty list every time this cell runs, so that re-running it
# re-reads the file instead of appending a second copy of every example.
dataset = []
with open(input_file, "r", encoding="utf-8") as f:
    for line in f:
        cols = line.split('\t')
        # Keep only well-formed rows: exactly "<label>\t<sentence>".
        # Rows with any other number of columns are skipped.
        if len(cols) == 2:
            sent = cols[1].rstrip()
            # Stored as (sentence, label), i.e. in the reverse of the file's column order.
            dataset.append((sent, cols[0].rstrip()))


In [7]:
# Class labels in dataset order (the second element of every (sentence, label) pair).
classes = [label for _sentence, label in dataset]
classes

['4',
 '4',
 '5',
 '5',
 '5',
 '5',
 '5',
 '2',
 '5',
 '3',
 '5',
 '5',
 '5',
 '5',
 '3',
 '5',
 '5',
 '3',
 '5',
 '5',
 '5',
 '5',
 '3',
 '5',
 '4',
 '3',
 '3',
 '5',
 '5',
 '5',
 '5',
 '2',
 '5',
 '5',
 '5',
 '2',
 '1',
 '5',
 '5',
 '1',
 '5',
 '4',
 '5',
 '4',
 '1',
 '5',
 '3',
 '4',
 '5',
 '5',
 '4',
 '5',
 '5',
 '1',
 '4',
 '5',
 '3',
 '1',
 '4',
 '1',
 '5',
 '5',
 '3',
 '3',
 '5',
 '5',
 '5',
 '4',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '1',
 '4',
 '5',
 '5',
 '5',
 '5',
 '5',
 '4',
 '3',
 '5',
 '5',
 '4',
 '5',
 '1',
 '5',
 '1',
 '1',
 '4',
 '1',
 '2',
 '1',
 '1',
 '5',
 '5',
 '5',
 '5',
 '5',
 '3',
 '2',
 '1',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '1',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '1',
 '5',
 '5',
 '1',
 '5',
 '4',
 '1',
 '5',
 '3',
 '5',
 '2',
 '4',
 '2',
 '5',
 '5',
 '4',
 '5',
 '5',
 '5',
 '5',
 '2',
 '5',
 '5',
 '5',
 '3',
 '2',
 '1',
 '2',
 '1',
 '4',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '4',
 '2',
 '5',
 '4'

In [8]:
# 90% train / 10% test. Stratifying on the labels keeps the class proportions the same
# in both splits; the fixed random_state makes the split reproducible.
train_data, test_data = train_test_split(dataset, train_size=0.9, random_state=1, stratify=classes)

In [9]:
# Group the test sentences by their class label.
my_result = defaultdict(list)
for element in test_data:
    my_result[element[1]].append(element[0])

my_result = dict(my_result)

# Share of each class among ALL test examples. Every example is counted: sentences
# are not de-duplicated, because a repeated sentence is still part of len(test_data)
# and dropping it from the numerator would make the shares sum to less than 1.
result_dictionary = {
    label: len(sentences) / len(test_data)
    for label, sentences in my_result.items()
}
print(f"*** Proportion of classes in {len(test_data)} examples of test data ***")
print(json.dumps(result_dictionary, indent=4, sort_keys=True))

*** Proportion of classes in 5926 examples of test data ***
{
    "1": 0.11592980087748903,
    "2": 0.0661491731353358,
    "3": 0.08521768477894026,
    "4": 0.13297333783327708,
    "5": 0.5997300033749579
}


In [10]:
# Group the train sentences by their class label.
my_result = defaultdict(list)
for element in train_data:
    my_result[element[1]].append(element[0])

my_result = dict(my_result)

# Share of each class among ALL train examples. Every example is counted: sentences
# are not de-duplicated, because a repeated sentence is still part of len(train_data)
# and dropping it from the numerator would make the shares sum to less than 1.
result_dictionary = {
    label: len(sentences) / len(train_data)
    for label, sentences in my_result.items()
}
print(f"*** Proportion of classes in {len(train_data)} examples of train data ***")
print(json.dumps(result_dictionary, indent=4, sort_keys=True))

*** Proportion of classes in 53329 examples of train data ***
{
    "1": 0.11599692475013595,
    "2": 0.0662491327420353,
    "3": 0.08518817153893754,
    "4": 0.13289204747885766,
    "5": 0.5996737234900336
}


In [11]:
# Convert the test split from (sentence, label) tuples into {"sentence", "class"} dicts.
# NOTE: this rebinds test_data, so run this cell only once per kernel session; the
# proportion cells earlier in the notebook index the items positionally and expect tuples.
test_data = unpack_data(test_data)

In [12]:
# Convert the train split from (sentence, label) tuples into {"sentence", "class"} dicts.
# NOTE: this rebinds train_data, so run this cell only once per kernel session; the
# proportion cells earlier in the notebook index the items positionally and expect tuples.
train_data = unpack_data(train_data)

In [13]:
# Save the plain train/test split (no embedding vectors) next to the input file.
split_path = f"{datadir}/{dsName}_traintest.json"
with open(split_path, "w", encoding="utf-8") as out_file:
    json.dump(
        {"train_data": train_data, "test_data": test_data},
        out_file,
        indent=1,
        ensure_ascii=False,
    )

In [14]:
def ObtainEmbeddings(train_data, test_data, key, path, embtype):
    """Embed every sentence of both splits and save the result as JSON.

    An ``Embeddings`` vectorizer is created from ``path`` and ``embtype``. Each
    item of ``train_data`` and ``test_data`` then receives a
    ``"sentence_vectorized"`` entry holding
    ``vectorizer.getEmbeddingVector(item["sentence"])``. The items are modified
    in place, so the caller's lists carry the vectors after the call (and a
    later call overwrites them).

    Both splits are finally written to ``{datadir}/{dsName}_{key}.json``, where
    ``datadir`` and ``dsName`` are the notebook-level settings.

    Args:
        train_data: List of dicts with a ``"sentence"`` entry (train split).
        test_data: List of dicts with a ``"sentence"`` entry (test split).
        key: Suffix used in the output file name.
        path: Model identifier passed to ``Embeddings`` (this notebook uses
            names such as ``'bert-base-cased'``).
        embtype: Embedding type passed to ``Embeddings`` (this notebook uses
            ``'fasttext'``, ``'transformer'`` and ``'bert'``).

    Returns:
        None. Progress is printed to stdout.
    """
    vectorizer = Embeddings(path=path, embtype=embtype)

    def _embed_all(items, banner):
        # Announce the split, then vectorize its items one by one.
        print(banner, end='\n')
        for position, entry in enumerate(items, start=1):
            entry["sentence_vectorized"] = vectorizer.getEmbeddingVector(entry["sentence"])
            # Lightweight progress indicator: print the running count every 50 items.
            if position % 50 == 0:
                print(str(position), end=' ')

    _embed_all(train_data, f"\n*** Getting vectors for {len(train_data)} examples of train data ***")
    _embed_all(test_data, f"\n*** Getting vectors for {len(test_data)} examples of test data ***")

    # NOTE(review): json.dump needs the vectors to be JSON-serializable (e.g. plain
    # lists of floats) - presumably getEmbeddingVector returns that; confirm.
    output_path = f"{datadir}/{dsName}_{key}.json"
    with open(output_path, "w", encoding="utf-8") as out_file:
        json.dump({"train_data": train_data, "test_data": test_data}, out_file, indent=2, ensure_ascii=False)

In [15]:
#ObtainEmbeddings(train_data, test_data, 'FASTTEXT', 'cc.en.300.bin', 'fasttext')

In [16]:
#ObtainEmbeddings(train_data, test_data, 'all-mpnet-base', 'all-mpnet-base-v2', 'transformer')

In [17]:
#ObtainEmbeddings(train_data, test_data, 'all-distilroberta', 'all-distilroberta-v1', 'transformer')

In [18]:
#ObtainEmbeddings(train_data, test_data, 'BERT_UNCASED', 'bert-base-uncased', 'bert')

In [19]:
# Embed both splits with the cased BERT model; the result is written to
# {datadir}/{dsName}_BERT_CASED.json. The calls for the other models are commented out.
ObtainEmbeddings(train_data, test_data, 'BERT_CASED', 'bert-base-cased', 'bert')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-cased loaded!

*** Getting vectors for 53329 examples of train data ***
50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000 1050 1100 1150 1200 1250 1300 1350 1400 1450 1500 1550 1600 1650 1700 1750 1800 1850 1900 1950 2000 2050 2100 2150 2200 2250 2300 2350 2400 2450 2500 2550 2600 2650 2700 2750 2800 2850 2900 2950 3000 3050 3100 3150 3200 3250 3300 3350 3400 3450 3500 3550 3600 3650 3700 3750 3800 3850 3900 3950 4000 4050 4100 4150 4200 4250 4300 4350 4400 4450 4500 4550 4600 4650 4700 4750 4800 4850 4900 4950 5000 5050 5100 5150 5200 5250 5300 5350 5400 5450 5500 5550 5600 5650 5700 5750 5800 5850 5900 5950 6000 6050 6100 6150 6200 6250 6300 6350 6400 6450 6500 6550 6600 6650 6700 6750 6800 6850 6900 6950 7000 7050 7100 7150 7200 7250 7300 7350 7400 7450 7500 7550 7600 7650 7700 7750 7800 7850 7900 7950 8000 8050 8100 8150 8200 8250 8300 8350 8400 8450 8500 8550 8600 8650 8700 8750 8800 8850 8900 8950 9000 9050 9100 9150 9200 9250 9300 9350 940