In [6]:
# Table for the pilot study on French models
import pandas as pd
import torch
import gc
import time

from nlpbaselines.classifier import Classifier, DatasetLoader
from nlpbaselines.utils.data import data_sentiment
from nlpbaselines.utils.log import log_model, add_to_log
from nlpbaselines.utils.file import fn_datetime, get_current_fn
from nlpbaselines.utils.report import report_gpu
from nlpbaselines.utils.file import get_current_fn
from nlpbaselines.variables import model_list


# Report which accelerator is available before doing any work.
report_gpu()
filename = "class-one-line"

# Real data: read the sampled French train/validation CSVs and tag every row
# with the split it came from, so both can live in a single frame.
df_train = pd.read_csv("data/train-fr-sampled.txt", sep=",").assign(split="train")
df_val = pd.read_csv("data/validation-fr-sampled.txt", sep=",").assign(
    split="validation"
)
df = pd.concat([df_train, df_val], ignore_index=True)

# Build the train/validation datasets from the combined frame; the loader is
# told which columns hold the text and the label.
loader = DatasetLoader(text_col="text", label_col="label")
ds_train, ds_val = loader.load_dataset(df)

print("data ok")


Mac gpu is available
class-one-line

Dataset loaded. The dataset has 2 labels, 1600 training items, 200 validation items. 
                                                text  label  split
0  J’étais assez enthousiaste en entrant dans la ...      0  train
1  Un slasher qui a plutôt mal vieilli, dommage c...      0  train
2  J'attendais mieux de cette fameuse idylle entr...      0  train
data ok


In [7]:
# Fine-tune every model in `model_list` on `ds_train`, evaluate on `ds_val`,
# and append one row of metrics per model to `{filename}.csv`.
log = log_model()
formatted_datetime = fn_datetime()

# NOTE: this rebinds the `model_list` name imported from
# nlpbaselines.variables, restricting the run to a single model.
model_list = ["cmarkea/distilcamembert-base"]

# Hyperparameters are the same for every model, so they are defined once here
# and reused both for training and for the log row (single source of truth).
epoch = 3
batch_size = 10
learning_rate = 2e-5

for m in model_list:
    print(f"model {m} start")
    classifier = Classifier(model_name=m, num_labels=2, use_multi_gpu=False)

    start_time = time.time()
    classifier.train(
        train_dataset=ds_train,
        validation_dataset=ds_val,
        epochs=epoch,
        batch_size=batch_size,
        learning_rate=learning_rate,
        output_dir=f"./results/{filename}-{formatted_datetime}/{m}/",
    )
    # Stop the clock right after training so the reported time excludes the
    # metric printing below.
    end_time = time.time()

    print("F1 Score", classifier.f1)
    print("Accuracy Score", classifier.accuracy)
    print("Recall Score", classifier.recall)

    training_time = f"{end_time - start_time:.2f}"
    # Pass the actual hyperparameter variables rather than repeating their
    # literal values, so the log cannot drift from what was really used.
    # NOTE(review): `len(df)` counts train + validation rows — confirm this is
    # the size `add_to_log` is meant to record.
    log = add_to_log(
        log,
        m,
        classifier.f1,
        classifier.accuracy,
        classifier.recall,
        training_time,
        len(df),
        batch_size,
        learning_rate,
        epoch,
        classifier.model_param,
        classifier.model_size,
    )
    # Write first, announce second: the message is only true once the file
    # has actually been written.
    log.to_csv(f"{filename}.csv", index=False)
    print(f"model {m} done, results saved to {filename}.csv")

model cmarkea/distilcamembert-base start


Downloading (…)okenizer_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/273M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at cmarkea/distilcamembert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model loaded. We will finetune cmarkea/distilcamembert-base with 2 labels.


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at cmarkea/distilcamembert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

data encoded


  0%|          | 0/480 [00:00<?, ?it/s]

You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 