In [1]:
!pip install transformers datasets accelerate
!pip uninstall -y trl
!pip install trl==0.8.6

[0mCollecting trl==0.8.6
  Downloading trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Collecting tyro>=0.5.11 (from trl==0.8.6)
  Downloading tyro-1.0.3-py3-none-any.whl.metadata (12 kB)
Downloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m245.2/245.2 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tyro-1.0.3-py3-none-any.whl (180 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m180.7/180.7 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tyro, trl
Successfully installed trl-0.8.6 tyro-1.0.3


In [None]:
import torch
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns

# IMPORTANT : On revient aux imports classiques (plus de "experimental")
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead

# =============================================================================
# 1. CONFIGURATION
# =============================================================================

class LengthSampler:
    def __init__(self, min_value, max_value):
        self.values = list(range(min_value, max_value))
    def __call__(self):
        return np.random.choice(self.values)

config = PPOConfig(
    model_name="lvwerra/gpt2-imdb",  # Retour au nom standard
    learning_rate=1.41e-5,
    batch_size=64,
    mini_batch_size=16,
    gradient_accumulation_steps=1,
)

# =============================================================================
# 2. DATASET ET COLLATOR
# =============================================================================

def build_dataset(config, dataset_name="imdb", input_min_text_length=2, input_max_text_length=8):
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token
    ds = load_dataset(dataset_name, split="train")
    ds = ds.rename_columns({"text": "review"})
    ds = ds.filter(lambda x: len(x["review"]) > 200, batched=False)

    input_size = LengthSampler(input_min_text_length, input_max_text_length)

    def tokenize(sample):
        sample["input_ids"] = tokenizer.encode(sample["review"])[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch") # Avec trl 0.8.6, set_format fonctionne mieux
    return ds

dataset = build_dataset(config)

def ppo_collator(data):
    # Collator manuel pour g√©rer query (str) et input_ids (tensor)
    return {
        "input_ids": [d["input_ids"] for d in data],
        "query": [d["query"] for d in data]
    }

# =============================================================================
# 3. MOD√àLES
# =============================================================================

# Dans trl 0.8.6, le mod√®le s'initialise simplement
model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
tokenizer.pad_token = tokenizer.eos_token

# Reward Model
sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=0 if torch.cuda.is_available() else -1)

# Optimiseur (Standard Adam)
optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

# =============================================================================
# 4. TRAINER
# =============================================================================

# Initialisation simplifi√©e de la version 0.8.6
ppo_trainer = PPOTrainer(
    config=config,           # C'est 'config' ici, pas 'args'
    model=model,
    ref_model=None,
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=ppo_collator,
    optimizer=optimizer
)

# =============================================================================
# 5. BOUCLE D'ENTRA√éNEMENT
# =============================================================================

generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": 32,
}

# Param√®tres du pipeline de reward
sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}

print("D√©marrage de l'entra√Ænement PPO (Version Stable)...")

history_rewards = []

for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):

    # 1. G√©n√©ration (Rollout)
    query_tensors = batch["input_ids"]


    response_tensors = ppo_trainer.generate(
        query_tensors,
        return_prompt=False,
        **generation_kwargs
    )

    batch["response"] = tokenizer.batch_decode(response_tensors)

    # 2. Reward
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]

    # 3. PPO Step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)

    # R√©cup√©ration de la reward moyenne du batch
    mean_reward = torch.stack(rewards).mean().item()
    history_rewards.append(mean_reward)

    # 4. Logging
    ppo_trainer.log_stats(stats, batch, rewards)

    if epoch % 5 == 0 and epoch > 0:
        print(f"Test r√©ussi au step {epoch}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/577 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(‚Ä¶):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/24895 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1168 > 1024). Running this sequence through the model will result in indexing errors


pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


D√©marrage de l'entra√Ænement PPO (Version Stable)...


0it [00:00, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
6it [01:15, 12.36s/it]

Test r√©ussi au step 5


10it [02:09, 13.83s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
11it [02:21, 13.35s/it]

Test r√©ussi au step 10


16it [03:23, 12.49s/it]

Test r√©ussi au step 15


21it [04:25, 12.39s/it]

Test r√©ussi au step 20


26it [05:27, 12.43s/it]

Test r√©ussi au step 25


31it [06:28, 12.22s/it]

Test r√©ussi au step 30


36it [07:29, 12.14s/it]

Test r√©ussi au step 35


41it [08:29, 12.07s/it]

Test r√©ussi au step 40


46it [09:29, 11.96s/it]

Test r√©ussi au step 45


51it [10:29, 11.93s/it]

Test r√©ussi au step 50


56it [11:28, 11.86s/it]

Test r√©ussi au step 55


61it [12:27, 11.85s/it]

Test r√©ussi au step 60


66it [13:27, 11.85s/it]

Test r√©ussi au step 65


71it [14:26, 11.77s/it]

Test r√©ussi au step 70


76it [15:25, 11.84s/it]

Test r√©ussi au step 75


81it [16:24, 11.79s/it]

Test r√©ussi au step 80


86it [17:23, 11.89s/it]

Test r√©ussi au step 85


91it [18:25, 12.28s/it]

Test r√©ussi au step 90


96it [19:27, 12.32s/it]

Test r√©ussi au step 95


101it [20:28, 12.32s/it]

Test r√©ussi au step 100


106it [21:28, 12.10s/it]

Test r√©ussi au step 105


111it [22:28, 11.88s/it]

Test r√©ussi au step 110


116it [23:27, 11.85s/it]

Test r√©ussi au step 115


121it [24:26, 11.88s/it]

Test r√©ussi au step 120


126it [25:26, 11.88s/it]

Test r√©ussi au step 125


131it [26:25, 11.94s/it]

Test r√©ussi au step 130


133it [26:50, 12.09s/it]

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(history_rewards, label="Mean Reward")
plt.xlabel("Steps")
plt.ylabel("Reward (Score Sentiment)")
plt.title("√âvolution de la performance PPO sur IMDB")
plt.legend()
plt.grid(True)
plt.savefig("courbe_apprentissage_ppo.png")
print("Courbe sauvegard√©e sous courbe_apprentissage_ppo.png")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLMWithValueHead

# 1. Configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "lvwerra/gpt2-imdb" # Le mod√®le de base (avant PPO)
trained_model_path = "ppo_model_step_50" # Ton dossier sauvegard√© (ajuste le nom)

# 2. Chargement des deux mod√®les
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

print("Chargement du mod√®le de base...")
model_ref = AutoModelForCausalLMWithValueHead.from_pretrained(model_id).to(device)

print("Chargement du mod√®le PPO...")
# Assure-toi d'avoir sauvegard√© le mod√®le dans ta boucle avec save_pretrained !
try:
    model_ppo = AutoModelForCausalLMWithValueHead.from_pretrained(trained_model_path).to(device)
except OSError:
    print(f"Erreur: Le dossier {trained_model_path} n'existe pas. Utilise le model_ref pour tester.")
    model_ppo = model_ref # Fallback pour tester le code

# 3. Le Test
prompts = [
    "The movie was really",
    "I went to the cinema and",
    "Honestly, this film is",
    "The acting was"
]

gen_kwargs = {
    "min_length": -1,
    "max_new_tokens": 20,
    "do_sample": True,
    "top_k": 0.0,
    "top_p": 1.0,
    "pad_token_id": tokenizer.eos_token_id
}

print("\n--- R√âSULTATS COMPARATIFS ---\n")

for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # G√©n√©ration Base
    out_ref = model_ref.generate(inputs["input_ids"], **gen_kwargs)
    text_ref = tokenizer.decode(out_ref[0], skip_special_tokens=True)

    # G√©n√©ration PPO
    out_ppo = model_ppo.generate(inputs["input_ids"], **gen_kwargs)
    text_ppo = tokenizer.decode(out_ppo[0], skip_special_tokens=True)

    print(f"PROMPT: {prompt}")
    print(f"üî¥ Base : {text_ref}")
    print(f"üü¢ PPO  : {text_ppo}")
    print("-" * 50)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns # pip install seaborn

def get_scores(model, tokenizer, reward_pipe, num_samples=50):
    scores = []
    # G√©n√©ration de prompts factices ou prise dans le dataset
    dummy_prompts = ["The movie", "I felt", "This film"] * 20

    for prompt in dummy_prompts[:num_samples]:
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        out = model.generate(inputs["input_ids"], max_new_tokens=20, do_sample=True)
        text = tokenizer.decode(out[0], skip_special_tokens=True)

        # Scoring
        pipe_out = reward_pipe(text, **{"return_all_scores": True})
        score = pipe_out[0][1]["score"] # Score positif
        scores.append(score)
    return scores

print("Calcul des scores...")
scores_base = get_scores(model_ref, tokenizer, sentiment_pipe)
scores_ppo = get_scores(model_ppo, tokenizer, sentiment_pipe)

plt.figure(figsize=(10, 6))
sns.kdeplot(scores_base, fill=True, label="Base Model (SFT)", color="red")
sns.kdeplot(scores_ppo, fill=True, label="PPO Model", color="green")
plt.title("Distribution des scores de sentiment")
plt.xlabel("Score (0=N√©gatif, 1=Positif)")
plt.legend()
plt.savefig("histogramme_ppo.png")

In [None]:
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import RLOOTrainer, RLOOConfig # On importe RLOO au lieu de PPO
from datasets import load_dataset
from trl.core import LengthSampler

# 1. Configuration RLOO
# DIFF√âRENCE MAJEURE 1 : On remplace PPOConfig par RLOOConfig
rloo_config = RLOOConfig(
    model_name="lvwerra/gpt2-imdb",
    learning_rate=1.41e-5,
    batch_size=64,
    mini_batch_size=16,
    gradient_accumulation_steps=1,
    optimize_cuda_cache=True,

    # PARAM√àTRE CL√â DU PAPIER : rloo_k
    # Le papier montre que k=2 suffit souvent √† battre PPO
    # Cela signifie qu'on g√©n√®re 2 r√©ponses par prompt pour calculer la baseline.
    rloo_k=2,
)

# 2. Dataset (Identique √† PPO)
# On garde exactement le m√™me code pour que la comparaison soit scientifique.
def build_dataset(config, dataset_name="imdb", input_min_text_length=2, input_max_text_length=8):
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token
    ds = load_dataset(dataset_name, split="train")
    ds = ds.rename_columns({"text": "review"})
    ds = ds.filter(lambda x: len(x["review"]) > 200, batched=False)
    input_size = LengthSampler(input_min_text_length, input_max_text_length)
    def tokenize(sample):
        sample["input_ids"] = tokenizer.encode(sample["review"])[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample
    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds

dataset = build_dataset(rloo_config)

# 3. Chargement du Mod√®le
# DIFF√âRENCE MAJEURE 2 : Pas de "WithValueHead"
# PPO n√©cessitait AutoModelForCausalLMWithValueHead (Actor + Critic).
# RLOO utilise simplement le mod√®le Causal classique (Actor seul).
# Cela all√®ge le chargement en m√©moire.
model = AutoModelForCausalLM.from_pretrained(rloo_config.model_name)
tokenizer = AutoTokenizer.from_pretrained(rloo_config.model_name)
tokenizer.pad_token = tokenizer.eos_token

# 4. Initialisation du RLOO Trainer
rloo_trainer = RLOOTrainer(
    config=rloo_config,
    model=model,
    ref_model=None, # Toujours besoin d'une ref pour la divergence KL
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=lambda data: dict((key, [d[key] for d in data]) for key in data[0]),
)

# 5. Reward Model (Identique √† PPO)
from transformers import pipeline
sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=0)
kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}

def get_reward(texts):
    pipe_outputs = sentiment_pipe(texts, **kwargs)
    rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]
    return rewards

# 6. La Boucle d'Entra√Ænement RLOO
# Note comment la logique de g√©n√©ration change l√©g√®rement pour accommoder k samples

print(f"D√©marrage de RLOO avec k={rloo_config.rloo_k}...")

generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": 32,
}

for epoch, batch in tqdm(enumerate(rloo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    # RLOOTrainer s'attend √† ce qu'on lui fournisse les rollouts.
    # Ici, pour respecter la logique RLOO, on g√©n√®re k r√©ponses par prompt.
    # (La librairie trl g√®re souvent cela en interne via `step` si on lui passe juste les queries,
    # mais pour √™tre explicite :)

    response_tensors = rloo_trainer.generate(
        query_tensors,
        **generation_kwargs
    )

    batch["response"] = tokenizer.batch_decode(response_tensors)

    # Calcul des rewards
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    rewards = get_reward(texts)

    # √âtape d'Optimisation
    # C'est ici que RLOO calcule la baseline (moyenne des autres √©chantillons)
    # au lieu d'utiliser un r√©seau de neurones Critic.
    stats = rloo_trainer.step(query_tensors, response_tensors, rewards)

    rloo_trainer.log_stats(stats, batch, rewards)

    if epoch % 10 == 0 and epoch > 0:
        break