In [22]:
import random
import time

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from numba import cuda
from sklearn.metrics import accuracy_score, recall_score, f1_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments


class DatasetLoader:
    """Split a labelled pandas DataFrame into Hugging Face train/validation datasets."""

    def __init__(self):
        """Create a stateless loader; all inputs are passed to ``load_dataset``."""

    def load_dataset(self, dataframe, text_col="text", label_col="label"):
        """Build the training and validation datasets from ``dataframe``.

        Args:
            dataframe: pandas DataFrame holding the text column, the label column
                and a ``split`` column whose values are ``'train'`` or
                ``'validation'``.
            text_col: name of the column holding the input text.
            label_col: name of the column holding the class labels.

        Returns:
            A ``(train_dataset, val_dataset, num_labels)`` tuple, where the two
            datasets are built with ``Dataset.from_pandas`` and ``num_labels`` is
            the number of distinct labels found in the training rows.

        Raises:
            ValueError: if ``split``, ``text_col`` or ``label_col`` is not a
                column of ``dataframe``.
        """
        # Fail early with a readable message instead of a KeyError deep in pandas.
        missing = [col for col in ("split", text_col, label_col) if col not in dataframe.columns]
        if missing:
            raise ValueError(f"dataframe is missing required column(s): {missing}")

        # Split the DataFrame into training and validation sets
        train_df = dataframe[dataframe['split'] == 'train']
        val_df = dataframe[dataframe['split'] == 'validation']
        # The label count is taken from the training rows only.
        num_labels = len(train_df[label_col].unique())
        train_dataset = Dataset.from_pandas(train_df)
        val_dataset = Dataset.from_pandas(val_df)
        print(f"\nDataset loaded. The dataset has {num_labels} labels, {len(train_df)} training items, {len(val_df)} validation items. \n{dataframe.head(3)}")
        return train_dataset, val_dataset, num_labels

class Classifier:
    """Fine-tune a pretrained sequence-classification checkpoint and keep its latest metrics."""

    def __init__(self, model_name = "camembert/camembert-base-ccnet-4gb", nb_labels = 2, text_col = "text"):
        """Load the tokenizer, pick a device and measure the model size.

        Args:
            model_name: checkpoint identifier passed to ``from_pretrained``.
            nb_labels: number of output classes of the classification head.
            text_col: name of the dataset column that ``tokenize_function`` reads.
        """
        self.model_name = model_name
        self.nb_labels = nb_labels
        self.text_col = text_col
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.device = self._select_device()
        # Metrics of the most recent evaluation; filled in by compute_metrics().
        self.f1 = None
        self.accuracy = None
        self.recall = None
        # The model is loaded once only to measure its size, then released;
        # train() loads a fresh copy of the checkpoint.
        probe = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=self.nb_labels)
        self.model_size = self.get_model_size(probe)[1]
        del probe
        self.model = None
        print(f"\nModel loaded. We will finetune {self.model_name} with {self.nb_labels} labels.")

    @staticmethod
    def _select_device():
        """Return 'cuda' if available, else 'mps' if available, else 'cpu'."""
        if torch.cuda.is_available():
            return 'cuda'
        # torch.backends.mps is absent from older torch builds, hence getattr.
        mps_backend = getattr(torch.backends, 'mps', None)
        if mps_backend is not None and mps_backend.is_available():
            return 'mps'
        return 'cpu'

    def get_model_size(self, model):
        """Return ``(parameter_count, size)`` for ``model``.

        ``size`` is the combined byte size of all parameters and buffers,
        divided by 1024**2 and formatted as a string with two decimals.
        """
        # from https://camembert-model.fr/posts/tutorial/
        param_count = 0
        param_size = 0
        for param in model.parameters():
            param_count += param.nelement()
            param_size += param.nelement() * param.element_size()
        buffer_size = sum(buffer.nelement() * buffer.element_size() for buffer in model.buffers())
        size_all_mb = (param_size + buffer_size) / 1024**2
        return param_count, f'{size_all_mb:.2f}'

    def tokenize_function(self, examples):
        """Tokenize ``examples[self.text_col]`` with padding to max length and truncation."""
        tokenizer_kwargs = {"padding": "max_length", "truncation": True}
        if "camembert" in self.model_name:
            # Explicit cap for CamemBERT checkpoints only; presumably their
            # tokenizers do not advertise a usable maximum length - TODO confirm.
            tokenizer_kwargs["max_length"] = 512
        return self.tokenizer(examples[self.text_col], **tokenizer_kwargs)

    def _tokenize_dataset(self, dataset):
        """Return ``dataset`` with tokenizer outputs added, unless it already has ``input_ids``."""
        if "input_ids" in dataset.column_names:
            return dataset
        return dataset.map(self.tokenize_function, batched=True)

    def compute_metrics(self, pred):
        """Compute accuracy, weighted recall and weighted F1 for one evaluation pass.

        ``pred`` must expose ``label_ids`` and ``predictions``; the predicted
        class is the argmax over the last axis of ``predictions``. The three
        scores are also stored on the instance, so after training they hold the
        values of the last evaluation that ran.
        """
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        accuracy = accuracy_score(labels, preds)
        recall = recall_score(labels, preds, average='weighted')
        f1 = f1_score(labels, preds, average='weighted')
        self.f1 = f1
        self.accuracy = accuracy
        self.recall = recall
        return {'accuracy': accuracy, 'recall': recall, 'f1': f1}

    def train(self, train_dataset, validation_dataset, epochs=2, batch_size=10, learning_rate=2e-5):
        """Fine-tune a fresh copy of the checkpoint and evaluate it after every epoch.

        Args:
            train_dataset: dataset used for training. It is tokenized with
                ``tokenize_function`` unless it already has an ``input_ids`` column.
            validation_dataset: dataset used for the per-epoch evaluation;
                tokenized under the same rule.
            epochs: number of training epochs.
            batch_size: per-device batch size for both training and evaluation.
            learning_rate: learning rate passed to ``TrainingArguments``.

        Checkpoints are written to ``./results`` at the end of each epoch. The
        trained model is not kept on the instance; only the metrics are.
        """
        # NOTE(review): the Trainer presumably needs the labels in a column
        # named "label"/"labels" - confirm against the datasets passed in.
        train_dataset = self._tokenize_dataset(train_dataset)
        validation_dataset = self._tokenize_dataset(validation_dataset)

        model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=self.nb_labels)
        model.to(self.device)
        training_args = TrainingArguments(
            output_dir="./results",
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=epochs,
            learning_rate=learning_rate,
            weight_decay=0.01,
            logging_strategy="epoch",
            evaluation_strategy="epoch"
        )
        training_args.set_save(strategy="epoch")

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=validation_dataset,
            compute_metrics=self.compute_metrics,
            tokenizer=self.tokenizer
        )
        trainer.train()


In [29]:
# Toy data: a tiny seeded frame for smoke-testing the pipeline without the CSV files.
TOY_SEED = 42
toy_rng = random.Random(TOY_SEED)  # local generator, so the labels are reproducible
toy_data = {
    "review": [f"This is sentence {i}" for i in range(30)],
    "label": [toy_rng.randint(0, 1) for _ in range(30)],
    "split": ["train" if i < 25 else "validation" for i in range(30)]
}
toy_df = pd.DataFrame(toy_data)

# Real data
# Flip to True to run the rest of the notebook on the toy frame instead.
USE_TOY_DATA = False

if USE_TOY_DATA:
    df = toy_df
else:
    df_train = pd.read_csv("data/train-fr-sampled.txt", sep=",")
    df_val = pd.read_csv("data/validation-fr-sampled.txt", sep=",")
    # Tag each row with its split before stacking the two files into one frame.
    df_train["split"] = "train"
    df_val["split"] = "validation"
    df = pd.concat([df_train, df_val], ignore_index=True)

# Last expression of the cell, so the first rows are displayed.
df.head()

In [30]:
# create a pandas dataframe to log model name, f1, accuracy, recall, training time, data_size, batch size, learning rate, epochs

def log_model():
    """Return a new, empty results table with one column per logged field."""
    fields = [
        "model_name",
        "f1",
        "accuracy",
        "recall",
        "training_time",
        "data_size",
        "batch_size",
        "learning_rate",
        "epochs",
    ]
    return pd.DataFrame(columns=fields)

# create a function to add data to the log
def add_to_log(log, model_name, f1, accuracy, recall, training_time, data_size, batch_size, learning_rate, epochs):
    """Append one training run to ``log`` in place and return ``log``.

    The values are written positionally, so ``log`` must have its columns in
    the order: model_name, f1, accuracy, recall, training_time, data_size,
    batch_size, learning_rate, epochs.

    The new row label is one more than the larger of the row count and the
    highest integer label already present. For a log that was only ever filled
    through this function this is ``len(log) + 1``; when rows have been
    dropped it still yields an unused label instead of overwriting a row.
    """
    int_labels = [label for label in log.index if isinstance(label, (int, np.integer))]
    next_label = max(max(int_labels, default=0), len(log)) + 1
    log.loc[next_label] = [model_name, f1, accuracy, recall, training_time, data_size, batch_size, learning_rate, epochs]
    return log

log = log_model()

# Checkpoints to compare; every one is fine-tuned with the same settings below.
MODEL_NAMES = [
    "camembert-base",
    "camembert/camembert-large",
    "camembert/camembert-base-ccnet",
    "camembert/camembert-base-ccnet-4gb",
    "camembert/camembert-base-oscar-4gb",
    "camembert/camembert-base-wikipedia-4gb",
    "flaubert/flaubert_small_cased",
    "flaubert/flaubert_base_uncased",
    "flaubert/flaubert_base_cased",
    "flaubert/flaubert_large_cased",
]
TEXT_COL = "review"
LABEL_COL = "label"

# Hyperparameters shared by every run; the same values are written to the log.
epoch = 10
batch_size = 10
learning_rate = 2e-5

# The split does not depend on the model, so build the datasets once.
train_dataset, val_dataset, num_labels = DatasetLoader().load_dataset(df, text_col=TEXT_COL, label_col=LABEL_COL)

for m in MODEL_NAMES:
    # Initialize the classifier
    classifier = Classifier(model_name=m, nb_labels=num_labels)
    # tokenize_function reads this attribute to find the text column.
    classifier.text_col = TEXT_COL
    start_time = time.perf_counter()
    classifier.train(train_dataset, val_dataset, epochs=epoch, batch_size=batch_size, learning_rate=learning_rate)
    training_time = time.perf_counter() - start_time
    # Log the hyperparameters that were actually used for this run.
    add_to_log(log, m, classifier.f1, classifier.accuracy, classifier.recall, training_time, len(df), batch_size, learning_rate, epoch)

# Last expression of the cell, so the results table is displayed.
log


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]