<a href="https://colab.research.google.com/github/vittot/CLinkaRT-2023-Polimi/blob/main/Umberto_FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMPORTS

In [None]:
!pip install transformers==4.27.0

In [None]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from datasets import Dataset
from datasets import concatenate_datasets

In [None]:
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [1]:
import pandas as pd
import os
import pickle

# LOAD DATA

In [None]:
# Work from the shared project folder so the relative CSV paths used when loading data resolve.
# NOTE(review): the path lives under /content/drive, so Google Drive must already be mounted;
# no mount call is visible in this notebook - confirm how the mount is performed.
os.chdir('/content/drive/MyDrive/_Polimi/EVALITA')

In [None]:
# Read the five per-language corpora (paths are relative to the current working directory).
# The following cells read the `text_translated` column from each of these frames.
df_it, df_fr, df_en, df_es, df_bq = (
    pd.read_csv(csv_name)
    for csv_name in (
        'df_ita_translated.csv',
        'res_fr.csv',
        'res_en.csv',
        'df_es_translated.csv',
        'df_basque_translated.csv',
    )
)

In [None]:
# Seed for the per-language row shuffle. Without it the concatenated row order changes on
# every run, so the seeded train/test split performed later would still select different rows.
SHUFFLE_SEED = 1234


def _shuffled_text_dataset(frame, seed=SHUFFLE_SEED):
    """Build a `Dataset` from the shuffled, non-null `text_translated` column of `frame`.

    Args:
        frame: pandas DataFrame that contains a `text_translated` column.
        seed: value passed to `DataFrame.sample(random_state=...)` so the shuffle is repeatable.

    Returns:
        A `datasets.Dataset` with the `text_translated` column. The pandas index is kept on
        purpose: it shows up as the `__index_level_0__` column that the tokenization cell
        removes by name.
    """
    texts = frame.sample(frac=1, random_state=seed)['text_translated'].dropna()
    return Dataset.from_pandas(pd.DataFrame(texts))


dataset_it = _shuffled_text_dataset(df_it)
dataset_en = _shuffled_text_dataset(df_en)
dataset_fr = _shuffled_text_dataset(df_fr)
dataset_es = _shuffled_text_dataset(df_es)
dataset_bq = _shuffled_text_dataset(df_bq)

In [None]:
# Stack the per-language datasets into a single corpus, in the order it, en, fr, es, bq.
dataset = concatenate_datasets([dataset_it, dataset_en, dataset_fr, dataset_es, dataset_bq])

In [None]:
# Hold out 10% of the corpus as the `test` split, using a fixed seed for the split itself.
# NOTE(review): this rebinds `dataset` from the concatenated dataset to the split result
# (a DatasetDict, per the cell output below), so re-running only this cell is expected to
# fail - presumably why the `dataset['train']` variant below was left commented out.
# Re-run the concatenation cell first.
#dataset = dataset['train'].train_test_split(test_size=0.1, seed=1234)
dataset = dataset.train_test_split(test_size=0.1, seed=1234)

In [None]:
# Display the resulting splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text_translated', '__index_level_0__'],
        num_rows: 45038
    })
    test: Dataset({
        features: ['text_translated', '__index_level_0__'],
        num_rows: 5005
    })
})

# PREPARE DATA

In [None]:
# Checkpoint identifier shared by the tokenizer and the masked-LM model loaded below.
model_checkpoint = "Musixmatch/umberto-commoncrawl-cased-v1"

In [None]:
# Load the tokenizer that belongs to the checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [None]:
def get_training_corpus():
    """Return a lazy generator over the training texts in batches of up to 1000.

    Reads the module-level `dataset`. Each yielded item is the `text_translated`
    value of one slice of 1000 consecutive rows of `dataset["train"]` (the last
    slice may be shorter).
    """
    train_split_size = len(dataset["train"])
    batch_starts = range(0, train_split_size, 1000)
    return (
        dataset["train"][start : start + 1000]["text_translated"]
        for start in batch_starts
    )

In [None]:
# Create the lazy batch generator over the training texts.
# NOTE(review): `training_corpus` is not consumed by any later cell visible in this notebook.
training_corpus = get_training_corpus()

In [None]:
def tokenize_function(e):
    """Run the module-level `tokenizer` on the `text_translated` field of `e`.

    Args:
        e: a mapping with a `text_translated` entry (a batch of texts when used
            with `map(..., batched=True)`).

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = e['text_translated']
    encoded = tokenizer(texts)
    return encoded

In [None]:
# Tokenize every split in batches across 4 worker processes. The raw text and the
# pandas index column are removed so only the tokenizer outputs remain.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text_translated", "__index_level_0__"],
)

Map (num_proc=4):   0%|          | 0/45038 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5005 [00:00<?, ? examples/s]

In [None]:
# Display the tokenized splits with their columns and row counts.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 45038
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 5005
    })
})

In [None]:
# Serialize the tokenized splits to Drive with pickle.
tokenized_path = '/content/drive/MyDrive/_Polimi/tokenized_dataset_e3c.bin'
with open(tokenized_path, 'wb') as out_file:
    pickle.dump(tokenized_datasets, out_file)

In [None]:
# Number of tokens per chunk produced by `group_texts`.
block_size = 128

In [None]:
def group_texts(examples):
    """Concatenate a batch of tokenized rows and re-split it into `block_size`-item chunks.

    Args:
        examples: mapping from column name (e.g. `input_ids`, `attention_mask`) to a
            list of per-row sequences, as handed over by `map(..., batched=True)`.

    Returns:
        A dict with the same columns, each a list of chunks cut every `block_size`
        items, plus a `labels` column that is a copy of the `input_ids` chunk list.
        When the batch holds at least `block_size` items, the trailing remainder is
        dropped. A batch shorter than `block_size` comes back as one short chunk.

    Raises:
        TypeError: if a column is not a list of sequences. The column is named in the
            message; previously the error was printed and every remaining column was
            silently skipped.
        KeyError: if `examples` has no `input_ids` column.
    """
    concatenated_examples = {}
    for column in examples.keys():
        try:
            # Flatten in linear time; `sum(rows, [])` re-copies the accumulator for every row.
            concatenated_examples[column] = [
                token for row in examples[column] for token in row
            ]
        except TypeError as exc:
            raise TypeError(
                f"group_texts: column {column!r} is not a list of token sequences"
            ) from exc

    # Columns are assumed to be token-aligned, so the first one gives the batch length.
    total_length = len(next(iter(concatenated_examples.values()), []))
    # Drop the small remainder so every chunk is exactly block_size long. A batch shorter
    # than block_size is left untouched and yields a single short chunk.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size

    # Split by chunks of block_size.
    result = {
        column: [
            tokens[start : start + block_size]
            for start in range(0, total_length, block_size)
        ]
        for column, tokens in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Re-chunk every split with group_texts. Batched mode passes many rows per call so they
# can be concatenated before being cut into blocks; 4 worker processes are used.
lm_datasets = tokenized_datasets.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/45038 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5005 [00:00<?, ? examples/s]

In [None]:
# Serialize the chunked language-modelling splits to Drive with pickle.
lm_dataset_path = '/content/drive/MyDrive/_Polimi/lm_dataset_EVALITA.bin'
with open(lm_dataset_path, 'wb') as out_file:
    pickle.dump(lm_datasets, out_file)

# TRAIN

In [None]:
# Load the pretrained checkpoint with a masked-language-modelling head.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [None]:
# Short checkpoint name: the part after the last "/" of the checkpoint id.
model_name = model_checkpoint.rsplit("/", 1)[-1]

training_args = TrainingArguments(
    # Output location on Drive; nothing is pushed to the Hub.
    output_dir='/content/drive/MyDrive/_Polimi/umberto-finetuned-EVALITA',
    overwrite_output_dir=True,
    push_to_hub=False,
    # Optimisation settings.
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Evaluate and save a checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

In [None]:
# Masked-LM training needs dynamic masking. Without this collator the Trainer falls back to
# its default one, the inputs are never masked and `labels` equals `input_ids`, so the model
# is only asked to copy tokens it can already see. The collator masks 15% of the tokens in
# each batch and builds the labels from the masked positions.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning loop with the arguments configured above.
trainer.train()