## **Setup**
- import provided data as `main-dataset`
- copy the script that sets the random seed for multiple packages into the working directory
- set random seed, load datasets, and split the training dataset into a smaller training subset and a test subset

In [None]:
from shutil import copyfile
# Copy the seed helper shipped with the dataset next to the notebook so that it can be imported below
# (assumes "../working" is the notebook's working directory -- TODO confirm).
seed_script_source = "../input/main-dataset/random_seed_setter.py"
seed_script_target = "../working/random_seed_setter.py"
copyfile(src=seed_script_source, dst=seed_script_target)

import random_seed_setter

# Seed the random number generators handled by the helper with a fixed value.
random_seed_setter.set_random_seeds(42)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# `train.csv` is the dataset used for fine-tuning; `test.csv` is the dataset predictions are generated for.
train_csv_path = "/kaggle/input/main-dataset/train.csv"
prediction_csv_path = "/kaggle/input/main-dataset/test.csv"
train_dataset = pd.read_csv(train_csv_path)
prediction_dataset = pd.read_csv(prediction_csv_path)

# Keep 80% of the rows for training; the remaining rows form the local test subset.
split_frames = train_test_split(train_dataset, train_size=0.8)
train_data, test_data = split_frames

## **Stemming**
(not actually used when fine-tuning the LLM, but given to the Random Forest)

In [None]:
from nltk.stem.snowball import SnowballStemmer

# Snowball stemmer configured for Romanian.
stemmer = SnowballStemmer("romanian")

def stem(text):
    """Return `text` with every whitespace-separated word replaced by its stem.

    The input is converted with `str()` first, so non-string values are stemmed
    through their string representation. Words are re-joined with single spaces.
    """
    stemmed_words = (stemmer.stem(token) for token in str(text).split())
    return " ".join(stemmed_words)

In [None]:
# Stem a copy instead of overwriting `train_dataset`: the raw frame stays untouched, so this cell can be
# re-run without stemming already-stemmed text, and re-running the train/test split never picks up stemmed text.
train_stemmed = train_dataset.assign(
    title=train_dataset["title"].apply(stem),
    content=train_dataset["content"].apply(stem),
)
# Same file name and contents as before; only the in-memory frame is no longer mutated.
train_stemmed.to_csv("train_stemmed.csv")

In [None]:
# Stem a copy instead of overwriting `prediction_dataset`: the same frame is later fed to the fine-tuned model,
# which was trained on unstemmed text, so it must keep the original titles and contents.
prediction_stemmed = prediction_dataset.assign(
    title=prediction_dataset["title"].apply(stem),
    content=prediction_dataset["content"].apply(stem),
)
# Same file name and contents as before; only the in-memory frame is no longer mutated.
prediction_stemmed.to_csv("test_stemmed.csv")

## **Creating a Hugging Face dataset**

In [None]:
# An article's title is very unlikely to belong to a different class than its content,
# so both columns are merged into a single `text` column (missing content counts as an empty string).
for frame in (train_data, test_data):
    frame["text"] = frame["title"] + " " + frame["content"].fillna("")

In [None]:
# The columns merged into `text`, together with the article id, are removed from both subsets.
for frame in (train_data, test_data):
    frame.drop(columns=["title", "content", "id"], inplace=True)

In [None]:
# Most Hugging Face models expect the target column to be called `label`.
label_column_mapping = {"class": "label"}
for frame in (train_data, test_data):
    frame.rename(columns=label_column_mapping, inplace=True)

In [None]:
# The `class` column was renamed to `label` in the previous cell, so the integer cast has to target `label`
# (indexing `class` here raises a KeyError).
train_data["label"] = train_data["label"].astype(int)
test_data["label"] = test_data["label"].astype(int)

In [None]:
# Some numerical data is mixed into the dataset, so every `text` entry is converted to a string.
for frame in (train_data, test_data):
    frame["text"] = frame["text"].astype(str)

In [None]:
# Column-oriented dictionaries: each column name maps to the list of its values.
train_dict, test_dict = (
    frame.to_dict(orient="list") for frame in (train_data, test_data)
)

In [None]:
from datasets import Dataset

# Wrap the column dictionaries into Hugging Face datasets.
ds_train, ds_test = map(Dataset.from_dict, (train_dict, test_dict))

## **Fine-tuning an LLM**
The model used, `readerbench/RoBERT-small`, is trained on data written in Romanian, and is small enough to be fine-tuned with our limited computing resources.

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("readerbench/RoBERT-small")

def preprocess(data):
    """Tokenize the `text` field of a batch of examples.

    Texts are truncated to the tokenizer's maximum length. No padding is applied here:
    this function is used with `Dataset.map(..., batched=True)`, where `padding=True` would pad
    every example to the longest text of the whole map batch. Padding is left to the
    `DataCollatorWithPadding` passed to the `Trainer`, which pads each training batch only to
    its own longest example.
    """
    return tokenizer(data["text"], truncation=True)

In [None]:
# Tokenize both subsets; `batched=True` hands `preprocess` several examples per call.
tokenized_ds_train = ds_train.map(function=preprocess, batched=True)
tokenized_ds_test = ds_test.map(function=preprocess, batched=True)

In [None]:
from transformers import DataCollatorWithPadding
# Collator that pads the examples of a batch using the model's tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# not installed in Kaggle environment
!pip install evaluate

In [None]:
import evaluate
accuracy = evaluate.load("accuracy")

def compute_metrics(generated_predictions):
    """Compute the accuracy of a set of predictions.

    `generated_predictions` unpacks into `(scores, labels)`. `scores` is a two-dimensional
    matrix with one row per article and one column per class, where the value at (i, j) is
    the score of class j for entry i. The predicted class of an entry is the column holding
    its highest score.

    Returns the result of the `accuracy` metric for the predicted classes against `labels`.
    """
    class_scores, true_labels = generated_predictions
    predicted_classes = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=true_labels)

In [None]:
# Class index -> human readable name; the reverse mapping is derived from it so the two cannot drift apart.
id2label = {0: "NON-SATIRE", 1: "SATIRE"}
label2id = {name: index for index, name in id2label.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Pretrained checkpoint loaded with a two-class sequence classification head and the label mappings defined above.
base_checkpoint = "readerbench/RoBERT-small"
model = AutoModelForSequenceClassification.from_pretrained(
    base_checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
import wandb

# Start wandb in disabled mode so that no API token prompt interrupts the run.
wandb.init(mode="disabled")

In [None]:
# Intermediate checkpoints go to `checkpoint_dir`; the final model is saved to `final_model_dir`.
checkpoint_dir = "robert-small-satire-classification-intermediary-weights"
final_model_dir = "robert-small-satire-classification"
batch_size = 16

training_args = TrainingArguments(
    output_dir=checkpoint_dir,
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluate and save a checkpoint once per epoch, then reload the best checkpoint when training ends
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_ds_train,
    # the held-out 20% subset is used for the per-epoch evaluation
    eval_dataset=tokenized_ds_test,
)

trainer.train()
trainer.save_model(final_model_dir)

In [None]:
from shutil import make_archive
# Zip each output directory; the archive is written next to the directory, under the same name plus ".zip".
for output_directory in (
    "/kaggle/working/robert-small-satire-classification-intermediary-weights",
    "/kaggle/working/robert-small-satire-classification",
):
    make_archive(output_directory, "zip", output_directory)

## **Predictions**

In [None]:
# The archive with the final version of the fine-tuned model has to be imported manually
# as a dataset of the Kaggle notebook before this cell can run.
fine_tuned_tokenizer_path = "/kaggle/input/robert-small-satire-classification"
tokenizer_fine_tuned = AutoTokenizer.from_pretrained(fine_tuned_tokenizer_path)

In [None]:
import torch

# Make sure both the prediction data and the weights of the model end up on the same device.
# `device` is always a `torch.device` now (previously it was a plain string when falling back to the CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_fine_tuned = AutoModelForSequenceClassification.from_pretrained(
    "/kaggle/input/robert-small-satire-classification",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
).to(device)

In [None]:
# Build the model input the same way as for the training data: title and content merged into `text`,
# then cast to `str`. The cast mirrors the training preprocessing; without it, rows whose merged text is
# not a string (e.g. NaN from a missing title) reach the tokenizer as non-string values.
prediction_dataset["text"] = (
    prediction_dataset["title"] + " " + prediction_dataset["content"].fillna("")
).astype(str)
prediction_dataset.drop(["title", "content", "id"], axis=1, inplace=True)
prediction_data = prediction_dataset["text"].tolist()

In [None]:
predictions = []

# Only complete batches of 16 texts are handled here; a possibly incomplete last batch is handled separately.
full_batch_count = len(prediction_data) // 16
for batch_index in range(full_batch_count):
    # release memory cached while processing the previous batch
    torch.cuda.empty_cache()

    batch_start = batch_index * 16
    batch_texts = prediction_data[batch_start:batch_start + 16]

    # tokenize into PyTorch tensors and move them to the device holding the model
    encoded_batch = tokenizer_fine_tuned(
        batch_texts, return_tensors="pt", truncation=True, padding=True
    ).to(device)

    # inference only: no gradients are tracked
    with torch.no_grad():
        batch_logits = model_fine_tuned(**encoded_batch).logits
    predictions.append(batch_logits)
        

In [None]:
# The loop above only covers complete batches of 16; whatever is left over is processed here.
torch.cuda.empty_cache()

remaining_texts = prediction_data[len(prediction_data) // 16 * 16:]

# When the number of texts is an exact multiple of 16 nothing is left over: skip the forward pass
# instead of calling the tokenizer and the model on an empty batch.
if remaining_texts:
    tokenized_prediction_data = tokenizer_fine_tuned(remaining_texts, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        logits = model_fine_tuned(**tokenized_prediction_data).logits
        predictions.append(logits)

In [None]:
# Every entry of `predictions` holds the logits of one batch (rows: articles, columns: classes);
# the predicted class of an article is the column with the highest logit.
predicted_labels = [
    label
    for batch_logits in predictions
    for label in batch_logits.argmax(axis=1).tolist()
]

prediction_dataframe = pd.DataFrame(predicted_labels, columns=["class"])
prediction_dataframe.to_csv("prediction.csv")