# Imports

!pip install pandas scikit-learn transformers datasets

!pip uninstall transformers accelerate
!pip install transformers[torch] accelerate


!pip install accelerate -U

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import CamembertTokenizer, CamembertForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import pipeline

## Datasets

In [None]:
# Labelled training data, indexed by its 'id' column
train = pd.read_csv(
    'https://github.com/tcastrom/CEFR-French-/raw/main/Data/training_data.csv'
).set_index('id')
display(train.head())

# Unlabelled data, indexed the same way
unlabel = pd.read_csv(
    'https://github.com/tcastrom/CEFR-French-/raw/main/Data/unlabelled_test_data.csv'
).set_index('id')
display(unlabel.head())

In [None]:
# Replace the 'difficulty' labels with integer class ids using a LabelEncoder
diffuculty_encoder = LabelEncoder()
train['difficulty'] = diffuculty_encoder.fit_transform(train['difficulty'])

# Show which integer each original label was mapped to
for code, level in enumerate(diffuculty_encoder.classes_):
    print(f'{level}: {code}')


# Preview the training data with the encoded labels
display(train.head())

# CamemBERT Model

In [None]:
# The tokenizer and the classifier both start from the same pretrained checkpoint
checkpoint = 'camembert-base'
tokenizer = CamembertTokenizer.from_pretrained(checkpoint)
# The classification head is created with six output labels
model = CamembertForSequenceClassification.from_pretrained(checkpoint, num_labels=6)

Old tokenization of the data (superseded by the `tokenize_function` defined in the next cell):
```py

# Tokenize the data (no labels are attached in this version)
def tokenize_function(examples):
    """Tokenize ``examples['sentence']`` with max-length padding and truncation."""
    encoded = tokenizer(examples['sentence'], truncation=True, padding="max_length")
    return encoded

In [None]:
def tokenize_function(examples):
    """Tokenize the sentences in ``examples`` and attach labels when available.

    ``examples['sentence']`` is passed to the module-level ``tokenizer`` with
    max-length padding and truncation. If ``examples`` also has a
    ``'difficulty'`` entry, it is copied into the result under ``'labels'``.

    Returns the tokenizer output, with ``'labels'`` added when present.
    """
    encoded = tokenizer(examples['sentence'], truncation=True, padding="max_length")
    # Inputs without a 'difficulty' entry are returned without labels
    if 'difficulty' not in examples:
        return encoded
    encoded['labels'] = examples['difficulty']
    return encoded

In [None]:
# Split the data into train (90%) and validation (10%) sets.
# A fixed random_state makes the split, and therefore the validation metrics,
# reproducible from one run of the notebook to the next.
train_df, val_df = train_test_split(train, test_size=0.1, random_state=42)

# Show the first row of each split as a quick sanity check
display(train_df.head(1))
display(val_df.head(1))

In [None]:
# Wrap each pandas split in a Hugging Face Dataset, then tokenize it in batches
train_dataset = Dataset.from_pandas(train_df).map(tokenize_function, batched=True)
val_dataset = Dataset.from_pandas(val_df).map(tokenize_function, batched=True)

In [None]:
# Set up the training arguments:
training_args = TrainingArguments(
    output_dir='./results',          # output directory for model and logs
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log every 10 steps
    # `evaluation_strategy` was renamed to `eval_strategy`; current transformers
    # releases (which this notebook installs unpinned) reject the old keyword.
    eval_strategy="epoch",           # evaluate after each epoch
)


In [None]:
# Bundle the model, its training configuration and both datasets into a Trainer
trainer = Trainer(
    args=training_args,           # training arguments defined in this notebook
    model=model,                  # the 🤗 Transformers model to be trained
    eval_dataset=val_dataset,     # dataset used for evaluation
    train_dataset=train_dataset,  # dataset used for training
)

In [None]:
# Train the model on the Trainer's train_dataset; as the last expression in
# the cell, the value returned by train() is shown as the cell output
trainer.train()

In [None]:
# Evaluate the model on the validation set (the Trainer's eval_dataset)
trainer.evaluate()

In [None]:
# Persist the model and its tokenizer side by side in one directory
save_dir = './saved_models/CamemBERT_V1'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# Predictions on the unlabelled data

# Reload the model and tokenizer that were saved to disk
model = CamembertForSequenceClassification.from_pretrained('./saved_models/CamemBERT_V1')
tokenizer = CamembertTokenizer.from_pretrained('./saved_models/CamemBERT_V1')

# Human-readable label for each class index, 0 through 5
label_names = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']

# Text-classification pipeline configured to return a score for every class
nlp = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
)


# Score a sample sentence
outputs = nlp("Je mange une pomme.")

# Key each score entry by its label name.
# NOTE(review): this positional mapping assumes the pipeline returns the
# scores in class-index order; re-check it if the pipeline arguments change.
predictions = [
    {label_names[idx]: entry for idx, entry in enumerate(scores)}
    for scores in outputs
]
print(predictions)

In [None]:
# Pick the label whose entry has the highest 'score' for the first (only) input
first_scores = predictions[0]
best_prediction = max(first_scores, key=lambda name: first_scores[name]['score'])

print("Key with the highest score:", best_prediction)

In [None]:
# Predict the difficulty label of a single sentence
def predict_difficulty(sentence):
    """Return the label name with the highest score for ``sentence``.

    Runs the module-level ``nlp`` pipeline on ``sentence``, keys each returned
    score entry by the matching name in ``label_names`` (by position), and
    returns the name whose entry has the largest ``'score'``.
    """
    scored = [
        {label_names[idx]: entry for idx, entry in enumerate(per_class)}
        for per_class in nlp(sentence)
    ]
    # Only the first result is used to pick the label
    first = scored[0]
    return max(first, key=lambda name: first[name]['score'])


# Run predict_difficulty on every entry of the 'sentence' column, one sentence
# at a time, and store the returned label in a new 'difficulty' column
unlabel['difficulty'] = unlabel['sentence'].apply(predict_difficulty)


In [None]:
# Preview the first rows of unlabel, including the 'difficulty' column
display(unlabel.head())

In [None]:
# Drop the 'sentence' column in place before exporting
unlabel.drop(columns='sentence', inplace=True)

# Write the remaining columns (with the index) to a CSV file
unlabel.to_csv('CamemBERT_V1.csv')