<a href="https://colab.research.google.com/github/vgentile98/predict_text_difficulty/blob/main/models/camembert_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries & Data

In [None]:
!pip install sentencepiece==0.2.0

Collecting sentencepiece==0.2.0
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/1.3 MB[0m [31m11.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
  Attempting uninstall: sentencepiece
    Found existing installation: sentencepiece 0.1.99
    Uninstalling sentencepiece-0.1.99:
      Successfully uninstalled sentencepiece-0.1.99
Successfully installed sentencepiece-0.2.0


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import classification_report
import joblib
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup, CamembertTokenizer, CamembertForSequenceClassification, AdamW, MarianMTModel, MarianTokenizer
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler, TensorDataset
from tqdm import tqdm
import torch
import torch.nn as nn
import huggingface_hub
from torch.utils.data import Dataset
from torch.nn import CrossEntropyLoss
from torch.nn.functional import cross_entropy
from sklearn.utils.class_weight import compute_class_weight
from torch.optim import AdamW
from time import time
import datetime
from google.colab import files, drive
from torch.optim.lr_scheduler import StepLR
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import ReduceLROnPlateau
import random
from nltk.corpus import wordnet
from sklearn.utils import shuffle
from random import choice
import os
import huggingface_hub

In [None]:
# Fetch the NLTK resources used by the augmentation helpers further down:
#   - 'stopwords': read via stopwords.words('french') in synonym_replacement
#   - 'wordnet':   queried via wordnet.synsets(..., lang='fra') in get_synonyms
#   - 'omw-1.4':   presumably supplies the non-English (French) lemmas for WordNet — TODO confirm
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

# CamemBERT Model

In [None]:
class FrenchTextDataset(Dataset):
    """Torch dataset that tokenizes sentences lazily, one item at a time.

    Each item is a dict holding ``input_ids`` and ``attention_mask`` tensors
    with the batch axis removed (one entry per token position) and, when
    labels were supplied, a scalar ``labels`` tensor of dtype ``torch.long``.

    Args:
        texts: Indexable collection of sentences; every entry is coerced
            with ``str`` before tokenization.
        labels: Optional indexable collection of integer class ids aligned
            with ``texts``. Leave as ``None`` for unlabelled data.
        tokenizer: Object exposing ``encode_plus`` with the Hugging Face
            tokenizer signature. Must be set before items are accessed.
        max_token_len: Length passed to the tokenizer as ``max_length``;
            sequences are padded / truncated to it.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_token_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the sentence at ``idx`` and return its model inputs.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (batch axis
            removed) plus ``labels`` when the dataset was built with labels.
        """
        text = str(self.texts[idx])  # Ensure text is string
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        item = {
            # squeeze(0) removes only the leading batch axis added by
            # return_tensors='pt'. A bare squeeze() would also collapse the
            # sequence axis when max_token_len == 1, producing 0-d tensors
            # that the default collate function stacks into the wrong shape.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [None]:
def prepare_class_weights(y_train, device):
    """Build a float tensor of 'balanced' class weights for a weighted loss.

    Args:
        y_train: Array-like of training labels; the classes are taken from
            its unique values.
        device: Torch device (or device string) the weights are moved to.

    Returns:
        1-D ``torch.float`` tensor with one weight per unique label, in the
        order returned by ``np.unique(y_train)``.
    """
    present_classes = np.unique(y_train)
    balanced = compute_class_weight('balanced', classes=present_classes, y=y_train)
    weights = torch.tensor(balanced, dtype=torch.float)
    return weights.to(device)

In [None]:
def train_model(model, data_loader, optimizer, loss_fn, device):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module whose forward accepts ``input_ids`` and
            ``attention_mask`` keyword arguments and returns an object with a
            ``logits`` attribute.
        data_loader: Iterable of batches; each batch is a dict with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        optimizer: Optimizer over ``model``'s parameters.
        loss_fn: Callable ``(logits, labels) -> scalar loss tensor``.
        device: Device (or device string) the batch tensors are moved to.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches,
        or ``0.0`` when ``data_loader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # Labels are intentionally not forwarded: the optimized loss comes
        # from loss_fn, so any loss the model would compute from labels
        # internally is redundant work whose result is never read.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1
    # Count batches while iterating so an empty loader does not divide by zero.
    return total_loss / num_batches if num_batches else 0.0

In [None]:
def evaluate_model(model, data_loader, device, loss_fn=None):
    """Evaluate ``model`` without gradient tracking.

    Args:
        model: Module whose forward accepts ``input_ids`` and
            ``attention_mask`` keyword arguments and returns an object with a
            ``logits`` attribute.
        data_loader: Iterable of batches; each batch is a dict with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        device: Device (or device string) the batch tensors are moved to.
        loss_fn: Optional callable ``(logits, labels) -> scalar loss tensor``.
            Defaults to an unweighted ``CrossEntropyLoss``.

    Returns:
        tuple ``(mean_batch_loss, accuracy)``. The loss is the average of the
        per-batch losses; accuracy is correct predictions divided by the
        number of labels actually evaluated. ``(0.0, 0.0)`` is returned when
        the loader yields no batches.
    """
    model.eval()
    # Build the criterion once instead of once per batch.
    criterion = loss_fn if loss_fn is not None else CrossEntropyLoss()
    total_loss = 0.0
    total_correct = 0
    total_examples = 0
    num_batches = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Labels are not forwarded: the loss is computed by ``criterion``,
            # so a model-internal loss would be computed and thrown away.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            total_loss += criterion(outputs.logits, labels).item()
            preds = outputs.logits.argmax(dim=1)
            total_correct += (preds == labels).sum().item()
            # Count the examples actually seen rather than trusting
            # len(data_loader.dataset), which is wrong when batches are
            # dropped or sampled and absent on plain iterables of batches.
            total_examples += labels.numel()
            num_batches += 1
    if num_batches == 0:
        return 0.0, 0.0
    accuracy = total_correct / total_examples if total_examples else 0.0
    return total_loss / num_batches, accuracy

In [None]:
def predict(model, data_loader, device):
    """Return the arg-max class id for every example served by ``data_loader``.

    The model is switched to eval mode and run without gradient tracking.
    Batches only need ``input_ids`` and ``attention_mask``; no labels are read.

    Returns:
        list of NumPy integer scalars, one per example, in loader order.
    """
    model.eval()
    collected = []
    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            logits = model(input_ids=ids, attention_mask=mask).logits
            batch_preds = logits.argmax(dim=1).cpu().numpy()
            collected += list(batch_preds)
    return collected

In [None]:
def synonym_replacement(sentence, n):
    """Replace up to ``n`` distinct non-stopword words with a random synonym.

    The sentence is split on whitespace. Candidate words are those not found
    (by exact, case-sensitive match) in NLTK's French stopword list; they are
    visited in random order. For each candidate that ``get_synonyms`` has
    synonyms for, every occurrence of that word is replaced by one randomly
    chosen synonym. Candidates without synonyms do not count towards ``n``.

    Args:
        sentence: Input text.
        n: Maximum number of distinct words to replace. Values ``<= 0``
            leave the words unchanged.

    Returns:
        The words re-joined with single spaces (so whitespace is normalized
        even when nothing was replaced).
    """
    words = sentence.split()
    new_words = words.copy()
    # Materialize the stopword list once as a set; evaluating
    # stopwords.words('french') inside the comprehension would reload it
    # for every token.
    french_stopwords = set(stopwords.words('french'))
    candidates = list({word for word in words if word not in french_stopwords})
    random.shuffle(candidates)
    num_replaced = 0
    for candidate in candidates:
        # Check the budget before replacing so n <= 0 never replaces a word.
        if num_replaced >= n:
            break
        synonyms = get_synonyms(candidate)
        if synonyms:
            synonym = random.choice(list(synonyms))
            new_words = [synonym if word == candidate else word for word in new_words]
            num_replaced += 1

    return ' '.join(new_words)

In [None]:
def get_synonyms(word):
    """Return the distinct French WordNet synonyms of ``word``.

    Lemma names from every French synset of ``word`` are lower-cased,
    underscores and hyphens become spaces, and any character outside the
    French alphabet (plus space) is dropped. Synonyms that end up blank
    after that filtering are skipped, and ``word`` itself (compared
    case-insensitively) is excluded.

    Args:
        word: The word to look up.

    Returns:
        list of unique synonym strings, in no particular order; empty when
        WordNet yields nothing usable.
    """
    # Includes ù, œ, æ and ÿ so words such as "cœur" or "où" are not mangled.
    allowed_chars = set('abcdefghijklmnopqrstuvwxyzéèêëàâäôöûüùïîçœæÿ ')
    synonyms = set()
    for syn in wordnet.synsets(word, lang='fra'):
        for lemma in syn.lemmas(lang='fra'):
            synonym = lemma.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join(char for char in synonym if char in allowed_chars)
            # A lemma made only of filtered characters (digits, punctuation)
            # collapses to an empty/blank string; it must not be offered as a
            # replacement.
            if synonym.strip():
                synonyms.add(synonym)
    # Synonyms are lower-cased, so compare against the lower-cased word;
    # otherwise "Chat" would keep "chat" as its own synonym.
    synonyms.discard(word.lower())
    return list(synonyms)

In [None]:
def main(seed=42):
    """Fine-tune CamemBERT on the French difficulty data and export predictions.

    Steps: download the labelled and unlabelled CSVs, encode the difficulty
    labels, split off 10% for validation, augment ~10% of the *training*
    sentences by synonym replacement, fine-tune ``camembert-base`` for six
    epochs with a class-weighted cross-entropy loss, write test predictions to
    ``submission_v3_8.csv`` and save the model and tokenizer to Google Drive.

    Relies on ``google.colab`` (``files.download`` / ``drive.mount``), so it is
    meant to run inside Colab.

    Args:
        seed: Seed applied to ``random``, NumPy and torch, and used for the
            train/validation split, so a run can be repeated.
    """
    # Seed every RNG in play: augmentation draws from random / np.random,
    # and the shuffling DataLoader draws from torch.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load and prepare data
    training_data = pd.read_csv("https://raw.githubusercontent.com/vgentile98/text_difficulty_prediction/main/data/training_data.csv")
    test_data = pd.read_csv("https://raw.githubusercontent.com/vgentile98/text_difficulty_prediction/main/data/unlabelled_test_data.csv")

    label_encoder = LabelEncoder()
    training_data['difficulty_encoded'] = label_encoder.fit_transform(training_data['difficulty'])

    # Split BEFORE augmenting. If augmentation ran first, a near-copy of a
    # training sentence could land in the validation set (or vice versa),
    # leaking training data into the validation metrics.
    train_df, val_df = train_test_split(training_data, test_size=0.1, random_state=seed)

    # Data augmentation on the training split only
    augmented_sentences = []
    augmented_labels = []
    for text, label in zip(train_df['sentence'], train_df['difficulty_encoded']):
        if np.random.rand() < 0.1:  # augment roughly 10% of the training sentences
            # Replaces every occurrence of (at most) one word with a synonym
            augmented_sentences.append(synonym_replacement(text, 1))
            augmented_labels.append(label)

    augmented_data = pd.DataFrame({'sentence': augmented_sentences, 'difficulty_encoded': augmented_labels})
    train_df = pd.concat([train_df, augmented_data]).reset_index(drop=True)

    tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
    train_dataset = FrenchTextDataset(train_df['sentence'].values, train_df['difficulty_encoded'].values, tokenizer)
    val_dataset = FrenchTextDataset(val_df['sentence'].values, val_df['difficulty_encoded'].values, tokenizer)
    test_dataset = FrenchTextDataset(test_data['sentence'].values, tokenizer=tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)
    test_loader = DataLoader(test_dataset, batch_size=16)

    # Load and prepare model
    model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=len(label_encoder.classes_)).to(device)
    optimizer = AdamW(model.parameters(), lr=5e-5)
    # Class weights are derived from the (augmented) training labels only
    class_weights = prepare_class_weights(train_df['difficulty_encoded'].values, device)
    loss_fn = CrossEntropyLoss(weight=class_weights)

    # Training and evaluation
    for epoch in range(6):
        train_loss = train_model(model, train_loader, optimizer, loss_fn, device)
        val_loss, val_accuracy = evaluate_model(model, val_loader, device)
        print(f'Epoch {epoch+1}: Train Loss: {train_loss:.3f}, Validation Loss: {val_loss:.3f}, Validation Accuracy: {val_accuracy:.2f}')

    # Final prediction
    predictions = predict(model, test_loader, device)
    predicted_labels = label_encoder.inverse_transform(predictions)

    # Prepare submission
    submission_v3_8 = pd.DataFrame({
        'id': test_data['id'],
        'difficulty': predicted_labels
    })

    # Save the DataFrame to a CSV file
    submission_v3_8.to_csv('submission_v3_8.csv', index=False)
    files.download('submission_v3_8.csv')
    print("Submission file has been downloaded as 'submission_v3_8.csv'.")

    # Save model & tokenizer in Google Drive
    drive.mount('/content/drive')
    model_save_path = '/content/drive/My Drive/text_difficulty_model/camembert_model'
    tokenizer_save_path = '/content/drive/My Drive/text_difficulty_model/camembert_tokenizer'
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(tokenizer_save_path)
    print("Model and tokenizer saved to Google Drive.")

In [None]:
# Entry point: runs the whole train / predict / export pipeline defined in main().
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Train Loss: 1.274, Validation Loss: 1.118, Validation Accuracy: 0.52
Epoch 2: Train Loss: 0.906, Validation Loss: 0.989, Validation Accuracy: 0.60
Epoch 3: Train Loss: 0.663, Validation Loss: 1.012, Validation Accuracy: 0.63
Epoch 4: Train Loss: 0.416, Validation Loss: 1.157, Validation Accuracy: 0.61
Epoch 5: Train Loss: 0.268, Validation Loss: 1.283, Validation Accuracy: 0.63
Epoch 6: Train Loss: 0.211, Validation Loss: 1.341, Validation Accuracy: 0.62


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Submission file has been downloaded as 'submission_v3_8.csv'.
Mounted at /content/drive
Model and tokenizer saved to Google Drive.
