# Imports

!pip install transformers torch datasets optuna scikit-learn

In [None]:
!pip install transformers[torch] datasets optuna scikit-learn accelerate

In [None]:
import optuna
from transformers import Trainer, TrainingArguments, CamembertTokenizer, CamembertForSequenceClassification
import torch
from datasets import load_dataset
from transformers import DataCollatorWithPadding, TrainerCallback, TrainerState, TrainerControl
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import pipeline
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments, CamembertTokenizer, CamembertForSequenceClassification
import torch
import os

# Datasets

In [None]:
# Load the labelled training sentences and key them by their 'id' column
train = pd.read_csv(
    'https://github.com/tcastrom/CEFR-French-/raw/main/Data/training_data.csv'
).set_index('id')
display(train.head())

# Load the unlabelled sentences (the ones to predict), keyed the same way
unlabel = pd.read_csv(
    'https://github.com/tcastrom/CEFR-French-/raw/main/Data/unlabelled_test_data.csv'
).set_index('id')
display(unlabel.head())

In [None]:
# Build the LabelEncoder and learn the set of difficulty classes
diffuculty_encoder = LabelEncoder().fit(train['difficulty'])

# Replace the textual difficulty labels by their integer codes
train['difficulty'] = diffuculty_encoder.transform(train['difficulty'])

# Show which integer code was assigned to each class
for index, label in enumerate(diffuculty_encoder.classes_):
    print(f'{label}: {index}')


# Preview the encoded training frame
display(train.head())

# Tokenisation

In [None]:
# Load the CamemBERT tokenizer and model
# Both come from the pretrained 'camembert-base' checkpoint.
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
# num_labels=6 sizes the classification head for six output classes
# (the prediction section below lists six label names, A1 to C2).
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=6)

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of sentences and attach their labels when present.

    Sequences are truncated to the tokenizer's maximum length but are NOT
    padded here: padding is left to the ``DataCollatorWithPadding`` used by
    the Trainer, which pads each batch only to its own longest sequence.
    Padding every example to the model maximum up front would make each
    batch far larger than needed for short sentences.

    Args:
        examples: A batch mapping with a 'sentence' entry (the texts) and,
            for labelled data, a 'difficulty' entry (the encoded labels).

    Returns:
        The tokenizer output, with an extra 'labels' entry copied from
        'difficulty' when that entry exists in ``examples``.
    """
    # Tokenize the text; dynamic padding happens later in the data collator
    result = tokenizer(examples['sentence'], truncation=True)
    # Ensure labels are included if they exist in the examples
    if 'difficulty' in examples:
        result['labels'] = examples['difficulty']
    return result

In [None]:
# Split the data into train and validation sets.
# random_state pins the split so that re-running the notebook evaluates every
# experiment on the same validation rows; stratify keeps the proportion of
# each difficulty class identical in both parts.
train_df, val_df = train_test_split(
    train,
    test_size=0.1,
    random_state=42,
    stratify=train['difficulty'],
)

display(train_df.head(1))
display(val_df.head(1))

In [None]:
# Turn each pandas frame into a Hugging Face dataset and tokenize it in
# batches, handling the train and validation parts the same way
train_dataset, val_dataset = (
    Dataset.from_pandas(frame).map(tokenize_function, batched=True)
    for frame in (train_df, val_df)
)

# Optimization with optuna

In [None]:
# Check if GPU is available
# The flag is reused below to switch fp16 on and no_cuda off in TrainingArguments.
use_gpu = torch.cuda.is_available()

# Define the RDStopping callback
class RDStoppingCallback(TrainerCallback):
    """Stop training once the validation loss has stopped moving.

    After each evaluation the relative divergence
    ``abs(best_loss - eval_loss) / abs(best_loss)`` is computed against the
    lowest validation loss seen so far in the current training run. When it
    stays below ``threshold`` for ``patience`` consecutive evaluations,
    ``control.should_training_stop`` is set.

    The very first evaluation of a run only records the baseline loss: it is
    not compared with itself, so it never counts towards ``patience``. All
    state is cleared in ``on_train_begin`` so one instance can safely be
    reused for several training runs.

    Args:
        threshold: Relative change of the validation loss below which an
            evaluation is considered a plateau.
        patience: Number of consecutive plateau evaluations that triggers
            the stop.
    """

    def __init__(self, threshold=0.01, patience=3):
        self.threshold = threshold
        self.patience = patience
        self._reset()

    def _reset(self):
        """Forget the best loss and the plateau counter."""
        self.best_loss = None
        self.consecutive_below_threshold = 0

    def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """Start every training run from a clean state."""
        self._reset()

    def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """Update the plateau counter from the latest 'eval_loss' metric."""
        # Get the current validation loss
        logs = kwargs.get("metrics") or {}
        val_loss = logs.get("eval_loss", None)
        if val_loss is None:
            return

        # First evaluation of the run: nothing to compare against yet
        if self.best_loss is None:
            self.best_loss = val_loss
            return

        # Calculate relative divergence, guarding against a zero best loss
        if self.best_loss == 0:
            relative_divergence = 0.0 if val_loss == 0 else float("inf")
        else:
            relative_divergence = abs((self.best_loss - val_loss) / self.best_loss)

        if relative_divergence < self.threshold:
            self.consecutive_below_threshold += 1
            print(f"Relative divergence below threshold: {relative_divergence:.4f} for {self.consecutive_below_threshold} consecutive evaluations.")
        else:
            self.consecutive_below_threshold = 0

        # Update the best loss
        if val_loss < self.best_loss:
            self.best_loss = val_loss

        # Stop training once the loss has stayed within the threshold for `patience` evaluations
        if self.consecutive_below_threshold >= self.patience:
            control.should_training_stop = True
            print(f"Stopping training as the relative divergence has been below the threshold for {self.patience} consecutive evaluations.")

In [None]:
# Use DataCollatorWithPadding to pad the sequences in a batch
data_collator = DataCollatorWithPadding(tokenizer)

# Define RDStopping callback
# NOTE: this list (and the single callback instance inside it) is created once
# and handed to every Trainer built in objective().
rd_stopping_callback = [RDStoppingCallback(threshold=0.01, patience=3)]

# Filled by objective() with (eval_loss, hyperparameters, model_path) tuples,
# kept sorted by ascending loss.
best_models = []

In [None]:
# Define the objective function
def objective(trial):
    """Run one Optuna trial and return its validation loss.

    Each trial fine-tunes its own copy of the module-level ``model`` so that
    weights learned in one trial never leak into the next, writes everything
    to its own directory ``./results/<trial.number>``, and saves the final
    model there. The module-level ``best_models`` list is updated to hold the
    five lowest-loss trials as ``(eval_loss, params, model_dir)`` tuples,
    sorted by ascending loss; the directory of any trial that does not make
    (or falls out of) that top five is deleted.

    Relies on the module-level names ``model``, ``tokenizer``,
    ``train_dataset``, ``val_dataset``, ``data_collator``,
    ``rd_stopping_callback``, ``best_models`` and ``use_gpu``.

    Args:
        trial: The ``optuna`` trial used to sample the hyperparameters.

    Returns:
        The 'eval_loss' reported by ``trainer.evaluate()`` after training.
    """
    # Local imports keep this cell independent from the notebook's import cell
    import copy
    import shutil
    from pathlib import Path

    # Suggest hyperparameters
    # (suggest_float(..., log=True) is the replacement for the deprecated suggest_loguniform)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-4, log=True)
    batch_size = trial.suggest_categorical('batch_size', [8, 16, 32])
    num_train_epochs = trial.suggest_int('num_train_epochs', 3, 10)
    weight_decay = trial.suggest_float('weight_decay', 1e-6, 0.1, log=True)
    gradient_accumulation_steps = trial.suggest_categorical('gradient_accumulation_steps', [1, 2, 4])
    warmup_steps = trial.suggest_categorical('warmup_steps', [0, 100, 200, 500])
    lr_scheduler_type = trial.suggest_categorical('lr_scheduler_type', ['linear', 'cosine', 'cosine_with_restarts'])

    # One directory per trial, matching the path recorded in best_models
    trial_dir = f'./results/{trial.number}'

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=trial_dir,
        overwrite_output_dir=True,
        evaluation_strategy="steps",
        eval_steps=100,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        weight_decay=weight_decay,
        num_train_epochs=num_train_epochs,
        warmup_steps=warmup_steps,
        lr_scheduler_type=lr_scheduler_type,
        logging_dir='./logs',
        logging_steps=100,
        save_steps=500,
        save_total_limit=3,
        seed=42,
        fp16=use_gpu,
        no_cuda=not use_gpu
    )

    # Train a private copy: the shared `model` stays untouched, so every
    # trial starts from exactly the same initial weights.
    trial_model = copy.deepcopy(model)

    # Copy the callbacks as well so their plateau counters start fresh
    trial_callbacks = [copy.deepcopy(callback) for callback in rd_stopping_callback]

    # Trainer
    trainer = Trainer(
        model=trial_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        callbacks=trial_callbacks
    )

    # Train and evaluate
    trainer.train()
    eval_results = trainer.evaluate()
    eval_loss = eval_results['eval_loss']

    # Persist the final model so that the recorded path can be reloaded later,
    # then drop the intermediate checkpoints to keep disk usage bounded
    trainer.save_model(trial_dir)
    for checkpoint_dir in Path(trial_dir).glob('checkpoint-*'):
        shutil.rmtree(checkpoint_dir, ignore_errors=True)

    # Release the references first, otherwise the cache cannot actually be freed
    del trainer, trial_model
    torch.cuda.empty_cache()

    # Keep only the five best trials; delete the directories of the others
    best_models.append((eval_loss, trial.params, trial_dir))
    best_models.sort(key=lambda entry: entry[0])
    for _, _, discarded_dir in best_models[5:]:
        shutil.rmtree(discarded_dir, ignore_errors=True)
    del best_models[5:]

    # Return the evaluation loss for optimization
    return eval_loss

In [None]:
# Create a study and optimize the objective function
# direction='minimize' because objective() returns the validation loss.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)  # Adjust the number of trials as needed


In [None]:

# Report the best hyperparameter set found by the study
print(f"Best Hyperparameters: {study.best_params}")
# Then list every retained model with its loss, hyperparameters and location
for rank, (loss, params, model_path) in enumerate(best_models, start=1):
    print(f"Model {rank} - Loss: {loss}, Hyperparameters: {params}, Model Path: {model_path}")

# Predictions

In [None]:
# Load the top 5 models and their tokenizers
# NOTE(review): these are fixed './saved_models/...' folders, not the
# './results/<trial number>' paths recorded in best_models — presumably the
# selected models were copied/renamed by hand; confirm before running.
model_paths = [
    './saved_models/CamemBERT_V1',
    './saved_models/CamemBERT_V2',
    './saved_models/CamemBERT_V3',
    './saved_models/CamemBERT_V4',
    './saved_models/CamemBERT_V5'
]

# Define the label names in the order of their corresponding indices (0 to 5)
label_names = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']

# Preview the sentences that will be classified
display(unlabel.head())

In [None]:
# Function to predict the difficulty of a single sentence using a specific model and tokenizer
def predict_difficulty(nlp, sentence):
    """Return the name of the highest-scoring difficulty class for a sentence.

    Args:
        nlp: A callable (text-classification pipeline) that, given the
            sentence, returns the per-class scores as dicts with 'label' and
            'score' entries, either as a flat list or wrapped in one outer
            list.
        sentence: The text to classify.

    Returns:
        The entry of the module-level ``label_names`` for the best class. The
        class is resolved from the winning entry's own 'label' (a name already
        in ``label_names``, or a trailing numeric id such as 'LABEL_3'), so
        the result stays correct when the scores are not ordered by class
        index. The position in the list is only used as a last resort.

    Raises:
        ValueError: If the pipeline returns no scores at all.
    """
    results = nlp(sentence)
    # Some pipeline configurations wrap the scores of a single input in an outer list
    scores = results[0] if results and isinstance(results[0], list) else results
    if not scores:
        raise ValueError("The classification pipeline returned no scores.")

    # First maximum wins on ties, like a plain max() over the class scores
    position, best = max(enumerate(scores), key=lambda pair: pair[1]['score'])

    label = best['label']
    if label in label_names:
        return label
    # Default labels look like 'LABEL_<class id>'
    class_id = label.rpartition('_')[2]
    if class_id.isdecimal() and int(class_id) < len(label_names):
        return label_names[int(class_id)]
    # Unknown label format: fall back to the position in the score list
    return label_names[position]



In [None]:
# Iterate over the models and tokenizers, writing one prediction file per model.
# The unlabelled frame is never modified, so it does not have to be reloaded
# between models, and the 'id' index is written to every file.
predictions_dir = '/mnt/data'
os.makedirs(predictions_dir, exist_ok=True)  # make sure the export folder exists

for model_path in model_paths:
    model_name = model_path.split('/')[-1]  # Extract model name from path
    # Dedicated names: the training-time `model` and `tokenizer` are left untouched
    trained_model = CamembertForSequenceClassification.from_pretrained(model_path)
    trained_tokenizer = CamembertTokenizer.from_pretrained(model_path)

    # Create a pipeline for text classification
    nlp = pipeline("text-classification", model=trained_model, tokenizer=trained_tokenizer, return_all_scores=True)

    # Predict the difficulty of every sentence, keeping the original 'id' index
    submission = pd.DataFrame(
        {'difficulty': [predict_difficulty(nlp, sentence) for sentence in unlabel['sentence']]},
        index=unlabel.index,
    )

    # Export the predictions (index + 'difficulty' column) to a CSV file
    output_file = f'{predictions_dir}/{model_name}.csv'
    submission.to_csv(output_file)