<a href="https://colab.research.google.com/github/undacmic/dataset_split/blob/main/Dataset_split.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install packages - Load data in memory

In [None]:
!pip install pydantic

Collecting pydantic
  Downloading pydantic-1.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)
[K     |████████████████████████████████| 10.9 MB 5.2 MB/s 
Installing collected packages: pydantic
Successfully installed pydantic-1.9.0


## Import packages

In [None]:
from typing import List, Tuple, Optional, Dict
import random
import pickle
import json
import math
import pydantic
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import pprint

## Load Dataset Dict

In [None]:
# Download the pickled dataset and store it as ./dataset.pickle.
# NOTE(review): --no-check-certificate turns off TLS certificate verification - confirm it is really required.
!wget -q --show-progress --no-check-certificate 'https://docs.google.com/uc?export=download&id=1IV_nodlm-dw-EWl1DtngkATgAldEdAGO' -O dataset.pickle



In [None]:
# Load the raw dataset dictionary written by the download cell.
# WARNING: pickle.load can execute arbitrary code - only unpickle files from a trusted source.
# The context manager closes the file handle, which the bare open() call never did.
with open("dataset.pickle", "rb") as in_file:
  dict_dataset = pickle.load(in_file)

# Create pydantic structures that match dataset json


In [None]:
class Data(pydantic.BaseModel):
    """One annotated sentence of a literal; every field is a required string."""
    user_id: str  # presumably identifies the annotator (the JSON example below shows a person's name) - confirm
    literal: str
    synsets: str  # space-separated synset ids; other cells split this string on " "
    correct_synset_id: str  # the value "-1" is treated as "invalid sentence" by the cells below
    sentence: str
    text_prefix: str  # NOTE(review): in the JSON example text_prefix + text + text_postfix equals sentence - confirm
    text: str
    text_postfix: str

In [None]:
class Literal(pydantic.BaseModel):
  """A literal (word) together with the list of Data records collected for it."""
  literal: str
  data: List[Data] = []  # optional; defaults to an empty list

In [None]:
class Dataset(pydantic.BaseModel):
  """A named collection of Literal entries."""
  dataset: str # train, test, validation (the loading cell also uses "total" for the full set)
  literals: List[Literal] = []  # optional; defaults to an empty list

# Load Dataset in memory (pydantic format)

Migrează datele din format dicționar în format pydantic (creează un obiect dataset pe care îl populează cu literale, care sunt populate cu datele specifice fiecărui literal)

In [None]:
# Wrap every raw record in a Data model and group the records under their literal.
literals = [
  Literal(literal=name, data=[Data(**record) for record in records])
  for name, records in dict_dataset.items()
]
dataset = Dataset(dataset="total", literals=literals)

# Explore Dataset

Dataset Length

In [None]:
# Number of literals held by the full dataset.
print(len(dataset.literals))

5186


Average number of sentences per literal

In [None]:
# Total number of Data records across all literals, then the mean per literal.
all_sentences: int = sum(len(entry.data) for entry in dataset.literals)

print(all_sentences/len(dataset.literals))

34.463941380640186


Average number of synsets per literal

In [None]:
# The synsets string splits into ["X-id", "Y-id", "-1", ""]; the last two tokens are
# not real synsets, so subtract 2 - the same correction the min/max and pie cells apply.
all_synsets: int = sum(
  len(entry.data[0].synsets.split(" ")) - 2
  for entry in dataset.literals
)

print(all_synsets/len(dataset.literals))

5.744118781334362


Plot synsets number over literal name

In [None]:
# One bar per literal. The synsets string splits into ["X-id", "Y-id", "-1", ""], so
# subtract the two trailing non-synset tokens, consistent with the min/max and pie cells.
literals_name = [entry.literal for entry in dataset.literals]
synsets_number = [len(entry.data[0].synsets.split(" ")) - 2 for entry in dataset.literals]

fig = px.bar(x=literals_name, y=synsets_number)
fig.show()

Plot number of sentences over literal name

In [None]:
# One bar per literal: how many sentences (Data records) it has.
literals_name = [entry.literal for entry in dataset.literals]
sentences_number = [len(entry.data) for entry in dataset.literals]

fig = px.bar(x=literals_name, y=sentences_number)
fig.show()

View how many sentences are invalid per literal

In [None]:
# A sentence is invalid when its correct_synset_id is "-1".
literals_name = [entry.literal for entry in dataset.literals]
invalid_sentences_number = [
  sum(1 for sentence in entry.data if sentence.correct_synset_id == "-1")
  for entry in dataset.literals
]
# Grand total, reused by the next cell to compute the per-literal average.
total_number_invalid_sentences: int = sum(invalid_sentences_number)

fig = px.bar(x=literals_name, y=invalid_sentences_number)
fig.show()

Average number of invalid sentences per literal

In [None]:
# Mean number of invalid sentences per literal (the total comes from the previous cell).
print(total_number_invalid_sentences/len(dataset.literals))

5.125915927497108


Min and Max sentences number

In [None]:
# Find the literals with the fewest and the most sentences (first match wins on ties).
literals_name = [entry.literal for entry in dataset.literals]
sentences_number = [len(entry.data) for entry in dataset.literals]

fewest_sentences = min(sentences_number)
most_sentences = max(sentences_number)

print(f"Min number of sentences: {fewest_sentences}")
print(f"Word with minimum number of sentences: {literals_name[sentences_number.index(fewest_sentences)]}")

print(f"Max number of sentences: {most_sentences}")
print(f"Word with maximum number of sentences: {literals_name[sentences_number.index(most_sentences)]}")

Min number of sentences: 1
Word with minimum number of sentences: exuberanță
Max number of sentences: 646
Word with maximum number of sentences: apariție


Min and Max synsets number

In [None]:
# Find the literals with the fewest and the most synsets (first match wins on ties).
literals_name = [entry.literal for entry in dataset.literals]
# -2 because the split list is ["X-id", "Y-id", "-1", ""] and the last two tokens are not synsets
synsets_number = [len(entry.data[0].synsets.split(" ")) - 2 for entry in dataset.literals]

fewest_synsets = min(synsets_number)
most_synsets = max(synsets_number)

print(f"Min number of synsets: {fewest_synsets}")
print(f"Word with minimum number of synsets: {literals_name[synsets_number.index(fewest_synsets)]}")

print(f"Max number of synsets: {most_synsets}")
print(f"Word with maximum number of synsets: {literals_name[synsets_number.index(most_synsets)]}")

Min number of synsets: 2
Word with minimum number of synsets: secol
Max number of synsets: 34
Word with maximum number of synsets: drept


% of literals that have x number of synsets

In [None]:
# Histogram: number of synsets -> how many literals have that many synsets.
# Keys are kept as integers so that sorting is numeric (string keys sorted "10" before "2").
# This cell no longer appends to literals_name, which belonged to an earlier cell and
# grew on every re-run.
dict_number_synsets = {}
for literal in dataset.literals:
  # the split list looks like ["X-id", "Y-id", "-1", ""]; "-1" and "" are not synsets
  synset_count = len(literal.data[0].synsets.split(" ")) - 2
  dict_number_synsets[synset_count] = dict_number_synsets.get(synset_count, 0) + 1

ordered_counts = sorted(dict_number_synsets)
values = [ (dict_number_synsets[count] * 100)/len(dataset.literals) for count in ordered_counts ]
labels = [ f"{count} synsets" for count in ordered_counts ]

fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
fig.update_traces(textposition='inside')
fig.show()

% of literals that have x number of sentences

In [None]:
# Histogram: number of sentences -> how many literals have that many sentences.
# Keys are kept as integers so that sorting is numeric (string keys sorted "10" before "2").
# This cell no longer appends to literals_name, which belonged to an earlier cell and
# grew on every re-run.
dict_number_sentences = {}
for literal in dataset.literals:
  sentence_count = len(literal.data)
  dict_number_sentences[sentence_count] = dict_number_sentences.get(sentence_count, 0) + 1

ordered_counts = sorted(dict_number_sentences)
values = [ (dict_number_sentences[count] * 100)/len(dataset.literals) for count in ordered_counts ]
labels = [ f"{count} sentences" for count in ordered_counts ]

fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
fig.update_traces(textposition='inside')
fig.show()

# Create a naive split function

In [None]:
def calculate_proportions(length: int, x:int, y:int, z:int) -> Tuple[int, int, int]:
  """
  Turn percentage shares into absolute sizes for a collection of `length` items.

  Sizes are assigned with the priority TEST > VALIDATION > TRAIN: the test share
  is rounded up first, validation is rounded up but capped by what is left, and
  train receives the remainder (so `x` itself never enters the computation).

  OBS: checked against the table at https://wiki.mta.ro/c/4/ia/hw/2021/t2

  length - number of items to split
  x      - train percent, in [0, 100]
  y      - validation percent, in [0, 100]
  z      - test percent, in [0, 100]

  Returns (train_size, validation_size, test_size); the three sizes add up to `length`.
  """

  def rounded_up_share(percent: int) -> int:
    # Size of `percent` % of the collection, rounded up to a whole item.
    return math.ceil(length*percent/100)

  test_size = rounded_up_share(z)
  # Validation may only take what the test split has left over.
  validation_size = min(length-test_size, rounded_up_share(y))
  train_size = length - test_size - validation_size

  return train_size, validation_size, test_size





def split_function(dataset: Dataset, x:int, y:int, z:int) -> Tuple[Dataset, Dataset, Dataset]:
  """
  Naive positional split: each literal's records are cut, in their stored order,
  into a leading train slice, a middle validation slice and a trailing test slice.

  dataset - input dataset (Dataset datatype)
  x       - training %, in [0, 100]
  y       - validation %, in [0, 100]
  z       - testing %, in [0, 100]

  Returns (train_dataset, validation_dataset, test_dataset); every input literal
  gets an entry in each of the three, possibly with an empty data list.
  """

  train_dataset = Dataset(dataset="train")
  validation_dataset = Dataset(dataset="validation")
  test_dataset = Dataset(dataset="test")

  for entry in dataset.literals:
    # Slice sizes come from calculate_proportions; the test slice is simply the tail.
    train_size, validation_size, _ = calculate_proportions(length=len(entry.data), x=x, y=y, z=z)
    validation_end = train_size + validation_size

    slices = (
      (train_dataset, entry.data[:train_size]),
      (validation_dataset, entry.data[train_size:validation_end]),
      (test_dataset, entry.data[validation_end:]),
    )
    for target, records in slices:
      target.literals.append(Literal(literal=entry.literal, data=records))

  return train_dataset, validation_dataset, test_dataset

# Split Dataset

In [None]:
def split_function2(dataset: Dataset, x:int, y:int, z:int, activate_graph: bool = True) -> Tuple[Dataset, Dataset, Dataset]:
  """
  Split every literal into train/validation/test, stratified by correct synset.

  Literals are emitted in descending order of synset-token count (taken from the
  first record's `synsets` string), ties broken alphabetically by literal name.
  For each literal:
    * records are grouped by `correct_synset_id`; only ids that appear in the
      literal's `synsets` string are kept (when that string lists "-1", the
      invalid sentences form a group of their own);
    * a sentence text that was already kept for this literal is dropped;
    * each non-empty group is shuffled with `random.shuffle` and cut with
      `calculate_proportions`.
  Because duplicates and unlisted ids are dropped, the three outputs together
  may hold fewer records than the input.

  The shuffle uses the global `random` state; call `random.seed(...)` beforehand
  for a reproducible split.

  dataset        - input dataset (Dataset datatype)
  x              - training %, in [0, 100]
  y              - validation %, in [0, 100]
  z              - testing %, in [0, 100]
  activate_graph - when exactly True, show bar charts of the synset counts
                   before and after ordering

  Returns (train_dataset, validation_dataset, test_dataset). A literal without
  any record yields an empty entry in all three datasets.
  """

  # initialize datasets
  train_dataset = Dataset(dataset="train")
  validation_dataset = Dataset(dataset="validation")
  test_dataset = Dataset(dataset="test")

  # Raw token count of the synsets string (it still includes the trailing
  # non-synset tokens); it is only used for ordering and plotting.
  synset_counts = [
    len(entry.data[0].synsets.split(' ')) if entry.data else 0
    for entry in dataset.literals
  ]

  if activate_graph is True:
    fig = px.bar(x=[entry.literal for entry in dataset.literals], y=synset_counts, title="Initial distribution")
    fig.show()

  # One sort replaces the former double sort + regrouping pass, and sorting the
  # Literal objects themselves removes the per-literal rescan of the dataset.
  ordered = sorted(
    zip(synset_counts, dataset.literals),
    key=lambda pair: (-pair[0], pair[1].literal),
  )

  if activate_graph is True:
    fig = px.bar(
      x=[entry.literal for _, entry in ordered],
      y=[count for count, _ in ordered],
      title="Sorted descending by no synsets, alphabetically ascending for each synset value",
    )
    fig.show()

  for _, entry in ordered:
    synsets_values = entry.data[0].synsets.strip().split(' ') if entry.data else []

    # Map every synset id to the first position it occupies in the synsets string.
    first_position = {}
    for position, synset_id in enumerate(synsets_values):
      first_position.setdefault(synset_id, position)

    # Group records by correct synset, keeping only the first record per sentence text.
    groups = [[] for _ in synsets_values]
    seen_sentences = set()
    for record in entry.data:
      position = first_position.get(record.correct_synset_id)
      if position is None or record.sentence in seen_sentences:
        continue
      seen_sentences.add(record.sentence)
      groups[position].append(record)

    train_literal = Literal(literal=entry.literal)
    validation_literal = Literal(literal=entry.literal)
    test_literal = Literal(literal=entry.literal)
    for group in groups:
      if len(group) != 0:
        random.shuffle(group)
        x_len, y_len, z_len = calculate_proportions(length=len(group), x=x, y=y, z=z)
        train_literal.data.extend(group[:x_len])
        validation_literal.data.extend(group[x_len:x_len+y_len])
        test_literal.data.extend(group[x_len+y_len:])

    train_dataset.literals.append(train_literal)
    validation_dataset.literals.append(validation_literal)
    test_dataset.literals.append(test_literal)

  return train_dataset, validation_dataset, test_dataset

In [None]:
# Requested train / validation / test percentages.
x, y, z = 70, 15, 15
train_dataset, validation_dataset, test_dataset = split_function2(dataset, x, y, z)

In [None]:
# Count the records that ended up in each split and compare with the requested
# percentages; everything is measured against the size of the original dataset.
x_ = sum(len(entry.data) for entry in train_dataset.literals)
y_ = sum(len(entry.data) for entry in validation_dataset.literals)
z_ = sum(len(entry.data) for entry in test_dataset.literals)
total = sum(len(entry.data) for entry in dataset.literals)

print(f"Train      proportion should be {x} and is {x_ * 100 / total}")
print(f"Validation proportion should be {y} and is {y_ * 100 / total}")
print(f"Test       proportion should be {z} and is {z_ * 100 / total}")

print(f"Sum should be 100: {x_ * 100 / total + y_ * 100 / total + z_ * 100 / total}")

Train      proportion should be 70 and is 60.29541766910983
Validation proportion should be 15 and is 19.036535556425893
Test       proportion should be 15 and is 20.604822917249482
Sum should be 100: 99.9367761427852


# Clean dataset

In [None]:
def pydantic_to_dict(dataset: Dataset) -> Dict[str, List[Dict[str,str]]]:
  """
  Flatten a Dataset into a plain dictionary.

  Each literal name maps to the list of its records, every record converted
  with `.dict()`. If a name occurs more than once, the last occurrence wins.
  """
  return {
    entry.literal: [record.dict() for record in entry.data]
    for entry in dataset.literals
  }

In [None]:
def clean_dataset(dataset: Dataset) -> Dataset:
  """
  Description:
  Remove every sentence whose correct_synset_id is "-1".
  Literals left without any sentence (all of them were invalid) are dropped too.

  The input is not modified; records are rebuilt from their dictionary form.

  Input: Dataset
  Output: Dataset (keeps the name of the input dataset)
  """
  records_by_literal = pydantic_to_dict(dataset)

  kept_literals = []
  for name, records in records_by_literal.items():
    rebuilt = [Data(**record) for record in records]
    valid_records = [item for item in rebuilt if item.correct_synset_id != "-1"]
    if valid_records:
      kept_literals.append(Literal(literal=name, data=valid_records))

  return Dataset(dataset=dataset.dataset, literals=kept_literals)


In [None]:
# Cleaned copies (no "-1" sentences, no emptied literals) of the three splits and of the full dataset.
train_dataset_clean = clean_dataset(train_dataset)
test_dataset_clean = clean_dataset(test_dataset)
validation_dataset_clean = clean_dataset(validation_dataset)
dataset_clean = clean_dataset(dataset)

In [None]:
# dataset_clean only keeps literals that still have at least one valid sentence,
# so its size relative to the full dataset is the VALID share. The previous code
# stored that share as "invalid", which swapped the two pie slices.
valid_percent = len(dataset_clean.literals)*100/len(dataset.literals)
invalid_percent = 100-valid_percent
values = [ invalid_percent, valid_percent ]
labels = [ "Invalid Literals %", "Valid Literals %" ]

fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
fig.update_traces(textposition='inside')
fig.show()

# Save datasets in JSON format

## Write Dataset to json - out format

Json Structure
```
{
  "dataset": "name", # test, train, validation
  "literals" : [
    {
      "literal": "name",
      "data" :[
        {
          "correct_synset_id": "ENG30-08191230-n",
          "sentence": "În calitate de membri ai Forței Speciale de Intervenție a Armatei Macedonene din Irak, aceștia au participat la o operațiune de salvare a soldaților americani.",
          "synsets": "ENG30-08183290-n ENG30-08198398-n ENG30-08199025-n ENG30-08191230-n -1 ",
          "text": "Armatei",
          "text_prefix": "În calitate de membri ai Forței Speciale de Intervenție a ",
          "text_postfix": " Macedonene din Irak, aceștia au participat la o operațiune de salvare a soldaților americani.",
          "user_id": "Ceaus Alexandru"
        },
        {
          ...
        }
      ]
    },
    {
      ...
    }
  ]
}
```

To .json file

In [None]:
# Serialize each split (full Dataset structure) as indented UTF-8 JSON.
for target_path, split in (
  ("train_dataset_our.json", train_dataset),
  ("test_dataset_our.json", test_dataset),
  ("validation_dataset_our.json", validation_dataset),
):
  with open(target_path, "wb") as out:
    out.write(json.dumps(split.dict(), indent=4, ensure_ascii=False).encode('utf8'))

To .pickle file

In [None]:
# Pickle each split. What gets pickled is the UTF-8 encoded JSON text of the
# split (a bytes object), not the dictionary itself.
# NOTE(review): confirm that consumers expect pickled JSON bytes rather than a pickled dict.
for target_path, split in (
  ("train_dataset_our.pickle", train_dataset),
  ("test_dataset_our.pickle", test_dataset),
  ("validation_dataset_our.pickle", validation_dataset),
):
  with open(target_path, "wb") as out:
    pickle.dump(json.dumps(split.dict(), indent=4, ensure_ascii=False).encode('utf8'), out)

## Write Dataset to json - asked format

Json Structure
```
{
  "literal" : [
        {
          "correct_synset_id": "ENG30-08191230-n",
          "sentence": "În calitate de membri ai Forței Speciale de Intervenție a Armatei Macedonene din Irak, aceștia au participat la o operațiune de salvare a soldaților americani.",
          "synsets": "ENG30-08183290-n ENG30-08198398-n ENG30-08199025-n ENG30-08191230-n -1 ",
          "text": "Armatei",
          "text_prefix": "În calitate de membri ai Forței Speciale de Intervenție a ",
          "text_postfix": " Macedonene din Irak, aceștia au participat la o operațiune de salvare a soldaților americani.",
          "user_id": "Ceaus Alexandru"
        },
        {
          ...
        }
  ],
  "literal" : [
      {
        ...
      }
  ]
}
```

To .json file

In [None]:
# Flatten each split to the "asked" layout: {literal: [record, ...]}.
train_dict      = pydantic_to_dict(train_dataset)
test_dict       = pydantic_to_dict(test_dataset)
validation_dict = pydantic_to_dict(validation_dataset)

# Write the JSON text itself. The previous code passed it through pickle.dump,
# so the *.json files contained a pickle stream instead of JSON.
with open("train_dataset_asked.json", "wb") as out:
  out.write(json.dumps(train_dict, indent=4, ensure_ascii=False).encode('utf8'))
with open("validation_dataset_asked.json", "wb") as out:
  out.write(json.dumps(validation_dict, indent=4, ensure_ascii=False).encode('utf8'))
with open("test_dataset_asked.json", "wb") as out:
  out.write(json.dumps(test_dict, indent=4, ensure_ascii=False).encode('utf8'))

To .pickle file

In [None]:
# Flatten each split to {literal: [record, ...]} and write it as UTF-8 JSON.
# NOTE(review): the heading above this cell says ".pickle", but the code writes
# JSON text to *.json files - confirm which of the two is intended.
train_dict      = pydantic_to_dict(train_dataset)
test_dict       = pydantic_to_dict(test_dataset)
validation_dict = pydantic_to_dict(validation_dataset)

for target_path, payload in (
  ("train_dataset.json", train_dict),
  ("validation_dataset.json", validation_dict),
  ("test_dataset.json", test_dict),
):
  with open(target_path, "wb") as out:
    out.write(json.dumps(payload, indent=4, ensure_ascii=False).encode('utf8'))

# Find some distribution metrics that fit our dataset

In [None]:
def metric(train_dataset_clean: Dataset, test_dataset_clean: Dataset, validation_dataset_clean: Dataset):
  """
  Print the average synset coverage of the three splits.

  For every literal of a split, coverage is the number of distinct
  correct_synset_id values among its records divided by the number of synsets
  listed for it (token count of the first record's `synsets` string minus 2).
  The scores of the three splits are averaged per literal, then over all literals.

  NOTE: the set of literals is read from the module-level `dict_dataset`; a
  literal that is missing from a split scores 0 for that split.

  Returns nothing; the result is printed.
  """
  def coverage_per_literal(dataset: Dataset) -> Dict[str, int]:
    # Every known literal starts at 0, then the ones present in `dataset` get their ratio.
    coverage = dict.fromkeys(dict_dataset.keys(), 0)
    for entry in dataset.literals:
      listed_synsets = len(entry.data[0].synsets.split(" ")) - 2
      used_synsets = {record.correct_synset_id for record in entry.data}
      coverage[entry.literal] = len(used_synsets)/listed_synsets
    return coverage

  train_coverage = coverage_per_literal(train_dataset_clean)
  test_coverage = coverage_per_literal(test_dataset_clean)
  validation_coverage = coverage_per_literal(validation_dataset_clean)

  # Accumulate the per-literal mean of the three splits, one literal at a time.
  total_sum = 0
  for name in train_coverage.keys():
    total_sum += (train_coverage[name] + test_coverage[name] + validation_coverage[name])/3

  print(f"Total distribution metric is: {total_sum/len(test_coverage.keys())}")

# Proof distribution over train, test and validation dataset

## This metric shows the proportion of literals

In [None]:
def test_proportions(x:int, y:int, z:int):
  """
  Split the module-level `dataset` with split_function2 (graphs disabled) and
  print the share of records each split received next to the requested share.

  x - training %
  y - validation %
  z - testing %

  Shares are percentages of the record count of the original dataset. The final
  "Metric" line is the mean of the three obtained/requested ratios.
  """
  train_dataset, validation_dataset, test_dataset = split_function2(dataset=dataset, x=x, y=y, z=z, activate_graph=False)

  def record_count(split: Dataset) -> int:
    # Total number of records over all literals of a dataset.
    return sum(len(entry.data) for entry in split.literals)

  train_count = record_count(train_dataset)
  validation_count = record_count(validation_dataset)
  test_count = record_count(test_dataset)
  total = record_count(dataset)

  train_share = train_count * 100 / total
  validation_share = validation_count * 100 / total
  test_share = test_count * 100 / total

  print(f"Train      proportion should be {x} and is {train_share}")
  print(f"Validation proportion should be {y} and is {validation_share}")
  print(f"Test       proportion should be {z} and is {test_share}")

  print(f"Sum should be 100: {train_share + validation_share + test_share}")

  print(f"Metric - {(train_share/x + validation_share/y + test_share/z)/3}")


# Try several (train, validation, test) percentage combinations.
for x, y, z in [
  (70, 15, 15),
  (80, 10, 10),
  (70, 20, 10),
  (90, 5, 5),
  (34, 33, 33),
  (98, 1, 1),
]:
  test_proportions(x, y, z)



Train      proportion should be 70 and is 60.29541766910983
Validation proportion should be 15 and is 19.036535556425893
Test       proportion should be 15 and is 20.604822917249482
Sum should be 100: 99.9367761427852
Metric - 1.1680401137123566
Train      proportion should be 80 and is 69.60666927768142
Validation proportion should be 10 and is 14.3809097521401
Test       proportion should be 10 and is 15.949197112963688
Sum should be 100: 99.9367761427852
Metric - 1.3010313508271323
Train      proportion should be 70 and is 61.0188552565322
Validation proportion should be 20 and is 22.96872377328932
Test       proportion should be 10 and is 15.949197112963688
Sum should be 100: 99.9367761427852
Metric - 1.2050179440656696
Train      proportion should be 90 and is 77.08722654282997
Validation proportion should be 5 and is 10.640631119565825
Test       proportion should be 5 and is 12.208918480389414
Sum should be 100: 99.9367761427852
Metric - 1.8088115531186084
Train      proportion 

## This metric shows the proportion of synsets

In [None]:
def test_proportions_synsets(x:int, y:int, z:int):
  """
  Split the module-level `dataset` with split_function2 (graphs disabled),
  clean the three splits and print the synset-coverage metric for them.

  x - training %
  y - validation %
  z - testing %
  """
  train_split, validation_split, test_split = split_function2(dataset=dataset, x=x, y=y, z=z, activate_graph=False)

  # Clean in the order train, test, validation.
  cleaned_train, cleaned_test, cleaned_validation = [
    clean_dataset(split) for split in (train_split, test_split, validation_split)
  ]

  print(f"x={x}, y={y}, z={z}")
  metric(
    train_dataset_clean=cleaned_train,
    test_dataset_clean=cleaned_test,
    validation_dataset_clean=cleaned_validation,
  )



# Try several (train, validation, test) percentage combinations.
for x, y, z in [
  (70, 15, 15),
  (80, 10, 10),
  (70, 20, 10),
  (90, 5, 5),
  (34, 33, 33),
  (98, 1, 1),
]:
  test_proportions_synsets(x, y, z)

x=70, y=15, z=15
Total distribution metric is: 0.7526571257789066
x=80, y=10, z=10
Total distribution metric is: 0.7526571257789066
x=70, y=20, z=10
Total distribution metric is: 0.7526571257789066
x=90, y=5, z=5
Total distribution metric is: 0.7526571257789066
x=34, y=33, z=33
Total distribution metric is: 0.7346321485704785
x=98, y=1, z=1
Total distribution metric is: 0.7526571257789066


# Other statistics

## Literals that appear in test but not in train and/or in validation dataset

In [None]:
# Compare the literal names of the cleaned splits: which test literals are
# missing from train, and which are missing from validation.
train_dict      = pydantic_to_dict(train_dataset_clean)
test_dict       = pydantic_to_dict(test_dataset_clean)
validation_dict = pydantic_to_dict(validation_dataset_clean)

literal_not_in_train = [name for name in test_dict.keys() if name not in train_dict.keys()]
literal_not_in_validation = [name for name in test_dict.keys() if name not in validation_dict.keys()]

pp = pprint.PrettyPrinter(indent=4)

print("Percentage of literals that are in test but not in train:")
print(len(literal_not_in_train)*100/len(test_dict.keys()))

print("Percentage of literals that are in test but not in validation:")
print(len(literal_not_in_validation)*100/len(test_dict.keys()))

print(f"Literals that are not in validation but in test: {literal_not_in_validation}")

print(f"Literals that are not in train but in test: {literal_not_in_train}")

# Test literals missing from train that are nevertheless present in validation.
print(f"Literals that are in validation but not in training dataset: {list(set(literal_not_in_train) - set(literal_not_in_validation))}")


Percentage of literals that are in test but not in train:
9.121029039173651
Percentage of literals that are in test but not in validation:
5.262132137984798
Literals that are not in validation but in test: ['șir', 'contrar', 'cutezanță', 'fixitate', 'noutate', 'băgăreț', 'cicoare', 'circumspecție', 'impasibilitate', 'impertinență', 'impolitețe', 'ireverență', 'neîncredere', 'tezaur', 'admonestare', 'amestecătură', 'aviditate', 'bleg', 'bocitoare', 'brambureală', 'chior', 'ciur', 'clamă', 'contramandare', 'crampon', 'destăinuire', 'dulcegărie', 'fățărnicie', 'gentilețe', 'ghimber', 'ieșitură', 'irosire', 'lamelă', 'legato', 'lingușitor', 'Leon', 'acvatintă', 'adagio', 'adresant', 'aglutinare', 'asin', 'aurolac', 'bezea', 'bifurcare', 'brahman', 'bujor', 'bătător', 'bășică', 'calandru', 'castană', 'cedru', 'chefliu', 'chibzuială', 'cinteză', 'cleveteală', 'coborâș', 'corijare', 'cremene', 'crinolină', 'cuirasă', 'cuișor', 'cătină', 'deferență', 'dezmierdare', 'dirham', 'disociație', 'dom

# Project perspective: 
- add support for incremental literal append
