# Import et chargement des csv

In [1]:
import os
import pandas as pd
import numpy as np
import torch

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline
)
from peft import LoraConfig, get_peft_model, TaskType
import torch.nn.functional as F
from transformers import AutoConfig
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Daily sample CSV, located one directory above the notebook's working directory.
# The first column of the file is used as the DataFrame index.
path_data = os.path.join(os.pardir, "Data", "daily_Sample.csv")
df = pd.read_csv(filepath_or_buffer=path_data, index_col=0)

In [3]:
# Second CSV ("Daily_Biais_Sample"), loaded the same way: first column becomes the index.
path_biais = os.path.join(os.pardir, "Data", "Daily_Biais_Sample.csv")
df_biais = pd.read_csv(filepath_or_buffer=path_biais, index_col=0)

In [4]:
# Target columns: one "intervention_*" column per intervention to predict.
# The order of this list fixes the order of the label vector built later.
col_pred = [
    f"intervention_{kind}"
    for kind in (
        "diet_coaching",
        "exercise_plan",
        "meditation",
        "sick_leave",
        "therapy",
        "vacation",
        "workload_cap",
    )
]

In [5]:
# Hold out 20% of the rows of `df` as the test split; the fixed random_state
# makes the shuffle (and therefore the split) reproducible across runs.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Chargement du modèle pré-entraîné

In [6]:
# Checkpoint used by the models built in this notebook, and the number of
# target columns (one output logit per entry of `col_pred`).
model_name = "distilgpt2"
N_LABELS = len(col_pred)

# Tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Important for GPT-2: when the tokenizer defines no padding token, reuse the
# end-of-sequence token so that batches can be padded.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Baseline model: the checkpoint with a classification head, BEFORE any fine-tuning.
print("\nüîµ Chargement du mod√®le BASELINE (non fine-tun√©)...")
model_baseline = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_label_classification",
    num_labels=N_LABELS,
)
# Tell the model which token id is padding (kept in sync with the tokenizer).
model_baseline.config.pad_token_id = tokenizer.pad_token_id

print("‚úÖ Mod√®le baseline charg√© avec succ√®s")
print(f" ¬† Nombre de param√®tres : {model_baseline.num_parameters():,}")


üîµ Chargement du mod√®le BASELINE (non fine-tun√©)...


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Mod√®le baseline charg√© avec succ√®s
 ¬† Nombre de param√®tres : 81,917,952


# Mise en forme des datasets

In [7]:
# Conversion helper: serialise one row into a single text string.
def format_row_to_text(row):
    """Render a row as ``"col: value; col: value."``.

    Args:
        row: Any mapping-like object exposing ``.items()`` that yields
            ``(column_name, value)`` pairs (e.g. a pandas Series or a dict).

    Returns:
        str: The pairs formatted as ``"name: value"``, joined with ``"; "``
        and terminated by a single ``"."``. An empty row yields ``"."``.
    """
    fragments = []
    for name, val in row.items():
        fragments.append(f"{name}: {val}")
    return "; ".join(fragments) + "."

# Build the 'text' (model input) and 'labels' (targets) columns on the working frames.

# Work on explicit copies so that adding columns below does not trigger pandas'
# SettingWithCopyWarning on frames that were derived from `df`.
df_train = df_train.copy()
df_test = df_test.copy()

# The prompt must contain the feature columns only. The target columns are
# dropped strictly (a missing target still raises KeyError). The derived
# 'text'/'labels' columns are dropped leniently: they do not exist on the first
# run, but they do when this cell is re-run, and without this step the previous
# text and the label vector itself would be serialised into the new 'text'.
df_train['text'] = (
    df_train
    .drop(columns=col_pred)
    .drop(columns=['text', 'labels'], errors='ignore')
    .apply(format_row_to_text, axis=1)
)
df_test['text'] = (
    df_test
    .drop(columns=col_pred)
    .drop(columns=['text', 'labels'], errors='ignore')
    .apply(format_row_to_text, axis=1)
)

# One float32 vector per row holding the 7 intervention targets, in `col_pred`
# order, stored in the 'labels' column that the Trainer reads.
df_train['labels'] = df_train[col_pred].values.astype(np.float32).tolist()
df_test['labels'] = df_test[col_pred].values.astype(np.float32).tolist()

In [8]:
# Number of rows in the training split.
df_train.shape[0]

28616

# Fine Tuning complet

In [None]:
# 1. Tokenization helper used to prepare the Hugging Face datasets
def tokenize_function(examples):
    """Tokenize the ``'text'`` field of a batch with the notebook's tokenizer.

    Args:
        examples: Batch mapping that contains a ``'text'`` entry.

    Returns:
        The tokenizer output for those texts, truncated to at most 128 tokens
        and padded up to that same length.
    """
    texts = examples['text']
    return tokenizer(
        texts,
        padding="max_length",
        max_length=128,
        truncation=True,
    )

# 2. Convert the working frames to Hugging Face datasets and tokenize them in batches
train_dataset = Dataset.from_pandas(df_train).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(df_test).map(tokenize_function, batched=True)

# Expose only the columns listed here, formatted as PyTorch tensors
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

print("‚úÖ Datasets pr√©par√©s pour le fine-tuning")

# 3. Fresh model instance dedicated to the full fine-tuning run
N_LABELS = len(col_pred)  # one output logit per target column

print("\nüü¢ Initialisation d'un nouveau mod√®le pour le fine-tuning...")
model_finetuned = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_label_classification",
    num_labels=N_LABELS,
)
# Keep the model's padding id in sync with the tokenizer.
model_finetuned.config.pad_token_id = tokenizer.pad_token_id

# 4. Training configuration
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir='./logs',
    # Optimisation schedule
    num_train_epochs=3,
    warmup_steps=50,
    weight_decay=0.01,
    # Batch sizes per device
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Logging every 10 steps; evaluation and checkpointing once per epoch
    logging_steps=10,
    eval_strategy="epoch",  # previously named `evaluation_strategy`
    save_strategy="epoch",
    load_best_model_at_end=True,
    # No upload to the Hub and no external experiment tracker
    push_to_hub=False,
    report_to="none",
)


# 5. Metric computation
def compute_metrics_multilabel(eval_pred):
    """Compute multi-label metrics from raw logits.

    Args:
        eval_pred: A pair ``(logits, labels)``; it is unpacked as exactly two
            items and ``logits`` must support NumPy arithmetic.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall`` and ``f1``. Precision,
        recall and F1 use ``average='samples'`` with ``zero_division=0``.
        NOTE(review): with multi-label indicator input, scikit-learn's
        ``accuracy_score`` is the exact-match (subset) accuracy - confirm this
        is the intended metric.
    """
    raw_scores, targets = eval_pred

    # Element-wise sigmoid turns each logit into an independent probability,
    # which is then thresholded at 0.5 to get a 0/1 decision per label.
    probs = 1 / (1 + np.exp(-raw_scores))
    hard_preds = (probs >= 0.5).astype(int)

    prec, rec, f1_value, _support = precision_recall_fscore_support(
        targets, hard_preds, average='samples', zero_division=0
    )
    return {
        'accuracy': accuracy_score(targets, hard_preds),
        'precision': prec,
        'recall': rec,
        'f1': f1_value,
    }

# 6. Trainer for the full fine-tuning run (evaluated on the held-out split)
trainer = Trainer(
    args=training_args,
    model=model_finetuned,
    compute_metrics=compute_metrics_multilabel,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# 7. Run the fine-tuning
print(
    "\nüöÄ D√©but du fine-tuning...",
    "   (Cela peut prendre quelques minutes selon votre machine)",
    "-" * 60,
    sep="\n",
)

train_result = trainer.train()

print(
    "\n‚úÖ Fine-tuning termin√© !",
    f"   Loss finale : {train_result.training_loss:.4f}",
    sep="\n",
)

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28616/28616 [00:05<00:00, 4904.30 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7154/7154 [00:01<00:00, 5333.82 examples/s]


‚úÖ Datasets pr√©par√©s pour le fine-tuning

üü¢ Initialisation d'un nouveau mod√®le pour le fine-tuning...


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üöÄ D√©but du fine-tuning...
   (Cela peut prendre quelques minutes selon votre machine)
------------------------------------------------------------


Epoch,Training Loss,Validation Loss


In [None]:
# Show the object returned by trainer.train() for the full fine-tuning run
# (it carries, among other things, the `training_loss` printed above).
print(train_result)

# LoRA

In [None]:
# 1. Load a fresh model for the LoRA run
# (the banner uses "\n", not "\\n", so that a real blank line is printed
# instead of a literal backslash-n)
print("\nüîµ Initialisation d'un nouveau mod√®le pour le fine-tuning LoRA...")
model_lora = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(col_pred),
    problem_type="multi_label_classification"
)
model_lora.config.pad_token_id = tokenizer.pad_token_id

# 2. LoRA configuration
#    r: rank of the low-rank update (smaller rank = fewer trainable parameters)
#    lora_alpha: scaling factor applied to the LoRA weights
#    target_modules: layers that receive a LoRA adapter; for GPT-2, 'c_attn' is
#        the key attention projection
#    fan_in_fan_out: GPT-2 implements 'c_attn' as a Conv1D layer that stores its
#        weight as (fan_in, fan_out); PEFT documents that this flag must be True
#        for such layers, so it is set explicitly rather than left to PEFT's
#        warn-and-override fallback
#    task_type: tells PEFT which task the wrapped model is configured for
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn"],
    fan_in_fan_out=True,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

# 3. Wrap the model with the LoRA adapters
print("   Application de la configuration LoRA au mod√®le...")
model_lora = get_peft_model(model_lora, lora_config)

# Show the number of trainable parameters to make the difference visible
print("\nüìä Comparaison des param√®tres entra√Ænables :")
def print_trainable_parameters(model, model_name="model"):
    """Print how many of a model's parameters are trainable.

    Args:
        model: Object exposing ``named_parameters()``, whose parameters provide
            ``numel()`` and ``requires_grad``.
        model_name: Label shown at the start of the printed line.

    Returns:
        None. One line is printed with the trainable-parameter count and its
        percentage of all parameters.

    Raises:
        ZeroDivisionError: If the model has no parameters at all.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    share = 100 * trainable / total
    print(f"   - {model_name}: {trainable:,} param√®tres entra√Ænables ({share:.4f}%)")

# Compare the fully fine-tuned model with the LoRA-wrapped one
print_trainable_parameters(model_finetuned, "Mod√®le Complet")
print_trainable_parameters(model_lora, "Mod√®le LoRA")


# 4. Train with the regular Trainer (it accepts the PEFT-wrapped model as-is)
# NOTE(review): `training_args` is the same object used for the full
# fine-tuning run, so the LoRA checkpoints are written to the same
# output_dir ("./results") - consider a dedicated directory per experiment.
trainer_lora = Trainer(
    model=model_lora,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics_multilabel,
)

# 5. Run the LoRA fine-tuning
# (the banners use "\n", not "\\n", so that a real blank line is printed
# instead of a literal backslash-n)
print("\nüöÄ D√©but du fine-tuning LoRA...")
print("-" * 60)
train_result_lora = trainer_lora.train()

print("\n‚úÖ Fine-tuning LoRA termin√© !")
print(f"   Loss finale : {train_result_lora.training_loss:.4f}")

# Distillation

In [None]:
# 1. The teacher is the fully fine-tuned model.
#    The untouched baseline is not a usable teacher: its classification head
#    ('score.weight') is newly initialised when the checkpoint is loaded, so its
#    logits carry no task knowledge for the student to imitate.
teacher_model = model_finetuned
teacher_model.eval()  # evaluation mode for the teacher (it is never trained here)

# 2. Build a smaller "student" model
#    Same GPT-2 architecture as the checkpoint, but with fewer layers, fewer
#    attention heads and a smaller hidden size.
print("\nüîµ Cr√©ation du mod√®le √©l√®ve (student)...")
student_config = AutoConfig.from_pretrained(
    model_name,
    # Size overrides (distilgpt2 itself has 6 layers, 12 heads, hidden size 768)
    n_layer=3,
    n_head=4,
    n_embd=256,
    # Same classification setup as the other models
    num_labels=len(col_pred),
    problem_type="multi_label_classification",
)
student_config.pad_token_id = tokenizer.pad_token_id

# Instantiated from the configuration only: no pretrained weights are loaded.
student_model = AutoModelForSequenceClassification.from_config(student_config)

print("\nüìä Comparaison des tailles de mod√®les :")
print(f"   - Professeur : {teacher_model.num_parameters():,} param√®tres")
print(f"   - √âl√®ve      : {student_model.num_parameters():,} param√®tres")


# 3. Custom Trainer that adds a knowledge-distillation term to the loss
class DistillationTrainer(Trainer):
    """Trainer whose loss mixes the supervised loss with a distillation loss.

    ``loss = alpha * supervised_loss + (1 - alpha) * distillation_loss``

    The distillation term pulls the student's temperature-softened predictions
    towards the teacher's. Its form follows the student's
    ``config.problem_type``:

    * ``"multi_label_classification"``: every label is an independent sigmoid
      output, so the term is a per-label binary cross-entropy against the
      teacher's sigmoid probabilities. A softmax over the label axis would make
      the labels compete for probability mass, which does not match a
      multi-label head.
    * anything else: KL divergence between the softmax distributions.

    Args:
        *args, **kwargs: Forwarded unchanged to ``Trainer``.
        teacher_model: Model providing the soft targets. Required.
        alpha: Weight of the supervised loss; ``1 - alpha`` weights the
            distillation loss.
        temperature: Softening temperature applied to both sets of logits.
            Must be strictly positive.

    Raises:
        ValueError: If ``teacher_model`` is None or ``temperature <= 0``.
    """

    def __init__(self, *args, teacher_model=None, alpha=0.5, temperature=2.0, **kwargs):
        # Fail early with a clear message instead of a late, opaque error
        # inside the first training step.
        if teacher_model is None:
            raise ValueError("DistillationTrainer requires a `teacher_model`.")
        if temperature <= 0:
            raise ValueError(f"`temperature` must be > 0, got {temperature!r}.")

        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model
        self.alpha = alpha
        self.temperature = temperature

        # Only the student is passed to Trainer as `model`; the teacher has to
        # be placed on the training device explicitly, otherwise its inputs
        # and weights can end up on different devices.
        self.teacher_model.to(self.args.device)
        # The teacher is never trained here: keep it in evaluation mode.
        self.teacher_model.eval()

    def _is_multi_label(self):
        """Return True when the student is configured for multi-label classification."""
        config = getattr(self.model, "config", None)
        return getattr(config, "problem_type", None) == "multi_label_classification"

    def _distillation_loss(self, logits_student, logits_teacher):
        """Return the temperature-scaled distillation loss between two logit tensors."""
        temperature = self.temperature
        if self._is_multi_label():
            # Independent Bernoulli per label: soft targets are the teacher's
            # sigmoid probabilities at temperature T.
            soft_targets = torch.sigmoid(logits_teacher / temperature)
            loss_kd = F.binary_cross_entropy_with_logits(
                logits_student / temperature, soft_targets
            )
        else:
            loss_kd = F.kl_div(
                F.log_softmax(logits_student / temperature, dim=-1),
                F.softmax(logits_teacher / temperature, dim=-1),
                reduction='batchmean'
            )
        # Rescale by T^2, as the original implementation did, to compensate
        # for the division of the logits by T.
        return loss_kd * (temperature ** 2)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the combined supervised + distillation loss for one batch.

        Args:
            model: The student model being trained.
            inputs: Batch mapping passed to the student as keyword arguments;
                it must contain the labels so the student returns a loss.
            return_outputs: When True, also return the student's outputs.
            **kwargs: Extra arguments passed by the Trainer; accepted and ignored.

        Returns:
            The loss tensor, or ``(loss, student_outputs)`` when
            ``return_outputs`` is True.
        """
        # Student forward pass: supervised loss against the true labels + logits
        outputs_student = model(**inputs)
        loss_supervised = outputs_student.loss
        logits_student = outputs_student.logits

        # Teacher forward pass. Only its logits are needed, so the labels are
        # removed to avoid computing a teacher loss that is never used.
        teacher_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        with torch.no_grad():
            logits_teacher = self.teacher_model(**teacher_inputs).logits

        loss_kd = self._distillation_loss(logits_student, logits_teacher)

        # Weighted combination of both terms
        loss = self.alpha * loss_supervised + (1 - self.alpha) * loss_kd

        return (loss, outputs_student) if return_outputs else loss

# 4. Instantiate the DistillationTrainer
#    alpha: weight of the supervised loss (0.5 = half supervised, half distillation)
#    temperature: softens the teacher's probabilities to give the student more signal
trainer_distill = DistillationTrainer(
    # Student / teacher pair
    model=student_model,
    teacher_model=teacher_model,
    # Distillation hyper-parameters
    alpha=0.5,
    temperature=2.0,
    # Same training setup, data and metrics as the other runs
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics_multilabel,
)

# 5. Run the distillation training
print(
    "\nüöÄ D√©but de l'entra√Ænement par distillation...",
    "-" * 60,
    sep="\n",
)
train_result_distill = trainer_distill.train()

print(
    "\n‚úÖ Distillation termin√©e !",
    f"   Loss finale : {train_result_distill.training_loss:.4f}",
    sep="\n",
)