# Imports

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import pipeline
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments, CamembertTokenizer, CamembertForSequenceClassification
import torch
from torch.nn import CrossEntropyLoss
import wandb
from sklearn.preprocessing import StandardScaler
import os

## Datasets

In [None]:
# Training sentences (with a 'difficulty' label) read from the project's
# "Data with Readability Metrics" folder, indexed by sentence id
train = pd.read_csv(
    'https://github.com/tcastrom/CEFR-French-/raw/main/Data/Data%20with%20Readability%20Metrics/train_metrics.csv'
).set_index('id')
display(train.head())

# Unlabelled sentences from the same folder, indexed the same way
unlabel = pd.read_csv(
    'https://github.com/tcastrom/CEFR-French-/raw/main/Data/Data%20with%20Readability%20Metrics/unlabel_metrics.csv'
).set_index('id')
display(unlabel.head())

In [None]:
# Create the LabelEncoder used for the 'difficulty' column
diffuculty_encoder = LabelEncoder()

# Learn the label set, then overwrite the column with its integer codes
train['difficulty'] = diffuculty_encoder.fit(train['difficulty']).transform(train['difficulty'])

# Show which integer code was assigned to each original label
for class_id, class_name in enumerate(diffuculty_encoder.classes_):
    print(f'{class_name}: {class_id}')


# Preview the encoded training frame
display(train.head())

In [None]:
# Keep only the numeric feature columns (everything except the text and the label).
# The original 'id' index is preserved on purpose: resetting it here would make the
# concat below align rows by position-vs-id and silently produce NaN-filled rows
# whenever the ids are not exactly 0..n-1.
train_non_text = train.drop(columns=['sentence', 'difficulty'])


# Standardize the numeric features, keeping the same column names and row index
scaler = StandardScaler()
train_scaled = pd.DataFrame(
    scaler.fit_transform(train_non_text),
    columns=train_non_text.columns,
    index=train_non_text.index,
)

# Re-attach the label and the sentence text to the scaled features (rows align on 'id')
X_train = pd.concat([train[['difficulty', 'sentence']], train_scaled], axis=1)

# Display the updated DataFrame
display(X_train.head())
display(X_train.shape)
display(X_train.columns)


# Wandb

In [None]:
import wandb
from transformers import Trainer, TrainingArguments, CamembertTokenizer
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split


# Load the CamemBERT tokenizer for the 'camembert-base' checkpoint
# (only the tokenizer is loaded here; the model is built separately below)
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Custom model class to include additional features
class CustomCamembertForSequenceClassification(torch.nn.Module):
    """CamemBERT encoder combined with a vector of extra numeric features.

    The extra features are linearly projected to the encoder's hidden width,
    added to the first-token (CLS) hidden state, and the sum is classified by
    a linear layer.

    Args:
        base_model_name: Checkpoint name or path passed to ``from_pretrained``.
        num_labels: Number of target classes.
        additional_features_dim: Length of the extra feature vector per example.
    """

    def __init__(self, base_model_name, num_labels, additional_features_dim):
        super().__init__()
        self.camembert = CamembertForSequenceClassification.from_pretrained(base_model_name, num_labels=num_labels)
        # Take the encoder width from the checkpoint config rather than assuming
        # 768, so checkpoints with a different hidden size also work.
        hidden_size = self.camembert.config.hidden_size
        self.additional_features_proj = torch.nn.Linear(additional_features_dim, hidden_size)
        self.classifier = torch.nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None, additional_features=None):
        """Compute logits (and the cross-entropy loss when ``labels`` is given).

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Optional attention mask passed to the encoder.
            labels: Optional class indices; when given, the loss is returned too.
            additional_features: Float tensor with one feature vector per example.
                Required, despite the ``None`` default kept for keyword-style calls.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, otherwise ``logits`` alone.

        Raises:
            ValueError: If ``additional_features`` is not provided.
        """
        if additional_features is None:
            # Fail early with a clear message instead of an opaque error inside Linear.
            raise ValueError(
                "additional_features is required: this model adds the projected "
                "features to the CLS representation before classification."
            )

        # Only the bare encoder is used; the wrapped model's own head is bypassed.
        outputs = self.camembert.roberta(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[0][:, 0]  # hidden state of the first (CLS) token
        projected_features = self.additional_features_proj(additional_features)
        logits = self.classifier(pooled_output + projected_features)

        if labels is None:
            return logits

        # Reshape using the logits' own class dimension rather than reaching into
        # the wrapped model for num_labels.
        loss = CrossEntropyLoss()(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, logits)

# Build the classifier: 6 target classes, 33 numeric features per sentence
num_labels = 6
additional_features_dim = 33  # must match the length of the feature vector built at tokenization time
model = CustomCamembertForSequenceClassification(
    base_model_name='camembert-base',
    num_labels=num_labels,
    additional_features_dim=additional_features_dim,
)

# Columns that make up the additional feature vector, in the exact order the
# model receives them: three readability scores, then six summary statistics
# for each of five per-word measures (3 + 5 * 6 = 33 values).
ADDITIONAL_FEATURE_COLUMNS = [
    'flesch_reading_ease',
    'polysyllable_count',
    'coleman_liau_index',
    *(
        f'{measure}_{stat}'
        for measure in ('freqlemfilms2', 'freqlemlivres', 'freqfilms2', 'nblettres', 'nbsyll')
        for stat in ('mean', 'median', 'max', 'min', '25%', '75%')
    ),
]


# Tokenization function including additional features
def tokenize_function(examples):
    """Tokenize a batch of sentences and attach their numeric feature vectors.

    Intended for ``Dataset.map(..., batched=True)``: ``examples`` maps each
    column name to a list of values, one per row of the batch.

    Args:
        examples: Batch dict containing 'sentence', every column listed in
            ``ADDITIONAL_FEATURE_COLUMNS`` and, optionally, 'difficulty'.

    Returns:
        The tokenizer output extended with 'additional_features' (one list of
        floats per row) and, when 'difficulty' is present, 'labels'.
    """
    # No padding here: the data collator pads each batch via tokenizer.pad, so
    # padding every sentence to the model maximum would only waste compute.
    result = tokenizer(examples['sentence'], truncation=True)

    # Transpose column-wise lists into one feature row per example.
    feature_columns = [examples[name] for name in ADDITIONAL_FEATURE_COLUMNS]
    result['additional_features'] = [list(row) for row in zip(*feature_columns)]

    if 'difficulty' in examples:
        result['labels'] = examples['difficulty']
    return result

# Split the data into train and validation sets.
# X_train (label + sentence + standardized features) is used instead of the raw
# `train` frame: otherwise the scaling step has no effect and the model receives
# unscaled features of very different magnitudes.
# NOTE(review): the scaler was fitted on all rows before this split, so the
# validation rows influence the scaling statistics — confirm this is acceptable.
# random_state makes the split reproducible across runs.
train_df, val_df = train_test_split(X_train, test_size=0.1, random_state=42)

# Convert pandas dataframe to Hugging Face dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Tokenize the data and attach the additional feature vectors
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Collator that pads the tokenized inputs and batches the numeric features
class DataCollatorWithPaddingAndFeatures:
    """Turn a list of dataset rows into one batch of tensors.

    The tokenizer's ``pad`` method builds the batch; the per-row
    'additional_features' lists are then stacked into a single float tensor
    that replaces whatever ``pad`` produced under that key.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, features):
        padded = self.tokenizer.pad(features, return_tensors="pt")
        # One float tensor per example, stacked along a new leading batch axis
        per_example = [torch.FloatTensor(example['additional_features']) for example in features]
        padded['additional_features'] = torch.stack(per_example)
        return padded

# Collator instance handed to the Trainer; it pads with the CamemBERT tokenizer
data_collator = DataCollatorWithPaddingAndFeatures(tokenizer)

# Log in to Weights & Biases before any run or sweep is created
wandb.login()

def train():
    """Run one sweep trial: fine-tune, evaluate, log ``eval_loss`` and save.

    Hyperparameters (batch size, gradient accumulation, learning rate, weight
    decay, epochs) are read from the W&B run config. Checkpoints go to
    ``./results/run_<run id>`` and the final model to
    ``./saved_models/run_<run id>``.

    NOTE: defining this function rebinds the module-level name ``train``,
    which previously referred to the training DataFrame; any cell that needs
    the DataFrame must run before this definition.
    """
    with wandb.init() as run:
        config = run.config

        # Build a fresh model for every trial. Reusing one shared instance
        # would start each trial from the previous trial's fine-tuned weights,
        # making the trials of a sweep incomparable.
        trial_model = CustomCamembertForSequenceClassification(
            'camembert-base',
            num_labels=num_labels,
            additional_features_dim=additional_features_dim,
        )

        training_args = TrainingArguments(
            output_dir=f'./results/run_{run.id}',  # Unique directory for each run
            evaluation_strategy="steps",
            eval_steps=100,
            per_device_train_batch_size=config.per_device_train_batch_size,
            per_device_eval_batch_size=16,
            gradient_accumulation_steps=config.gradient_accumulation_steps,
            learning_rate=config.learning_rate,
            weight_decay=config.weight_decay,
            num_train_epochs=config.num_train_epochs,
            warmup_steps=500,
            logging_dir='./logs',
            logging_steps=100,
            save_steps=500,
            save_total_limit=3,
            seed=42,
            # Mixed precision only when a GPU is present; otherwise force CPU
            fp16=torch.cuda.is_available(),
            no_cuda=not torch.cuda.is_available()
        )

        trainer = Trainer(
            model=trial_model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=data_collator,
            tokenizer=tokenizer
        )

        trainer.train()
        eval_result = trainer.evaluate()
        # Logged under the exact key the sweep's metric definition looks for
        wandb.log({"eval_loss": eval_result['eval_loss']})

        # Save the model with a unique directory
        model_save_path = f'./saved_models/run_{run.id}'
        os.makedirs(model_save_path, exist_ok=True)
        trainer.save_model(output_dir=model_save_path)

# Hyperparameter search space: continuous ranges for the optimizer settings,
# discrete choices for batch size, epochs and gradient accumulation
search_space = {
    'learning_rate': {'min': 1e-6, 'max': 1e-4},
    'weight_decay': {'min': 1e-5, 'max': 0.1},
    'per_device_train_batch_size': {'values': [8, 16, 32]},
    'num_train_epochs': {'values': [2, 3, 4, 5]},
    'gradient_accumulation_steps': {'values': [1, 2, 4, 8]},
}

# Bayesian search that minimizes the 'eval_loss' value logged by each trial
sweep_configuration = {
    'method': 'bayes',
    'metric': {'name': 'eval_loss', 'goal': 'minimize'},
    'parameters': search_space,
}

# Register the sweep, then let a local agent execute a single trial
sweep_id = wandb.sweep(sweep=sweep_configuration, project='hyperparameter-sweep')
wandb.agent(sweep_id, function=train, count=1)
