In [None]:
!pip3 install transformers datasets torch accelerate trl rouge_score

In [None]:
import torch
from torch import nn
from datasets import load_dataset
from transformers import GPT2Config, GPT2Tokenizer
from transformers import GPT2PreTrainedModel, GPT2Model
from trl import PPOTrainer, PPOConfig

# ─── 1) GPT-2 + Value & Sentiment Head ────────────────────────────────────────
class GPT2ForSentiment(GPT2PreTrainedModel):
    """GPT-2 backbone with a scalar value head and a sentiment classifier.

    Both heads read the hidden state of the last *attended* token of each
    sequence, so padded batches are pooled at the final real token rather
    than at a padding position.

    Args:
        config: GPT-2 configuration used to build the backbone.
        num_labels: Number of sentiment classes produced by the classifier.
    """

    def __init__(self, config: GPT2Config, num_labels=2):
        super().__init__(config)
        self.transformer = GPT2Model(config)
        # Critic: one scalar per sequence.
        self.value_head = nn.Linear(config.hidden_size, 1, bias=False)
        self.dropout    = nn.Dropout(config.summary_first_dropout)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the backbone and both heads.

        Args:
            input_ids: Token ids, indexed as (batch, sequence).
            attention_mask: Optional mask with non-zero entries on real
                tokens. When given, pooling uses the last non-zero position
                of each row; otherwise the final position is used.
            labels: Optional class indices; when given, a cross-entropy
                loss over the classifier logits is returned as well.

        Returns:
            dict with keys ``"logits"`` (classifier outputs), ``"loss"``
            (cross-entropy loss, or ``None`` without labels) and
            ``"values"`` (one scalar per sequence from the value head).
        """
        hidden_states = self.transformer(input_ids, attention_mask=attention_mask)[0]

        if attention_mask is None:
            last_hidden = hidden_states[:, -1, :]         # (B, H)
        else:
            # mask * position peaks at the last attended token, which is
            # correct for both left- and right-padded batches.
            positions = torch.arange(hidden_states.size(1), device=hidden_states.device)
            mask      = attention_mask.to(device=hidden_states.device, dtype=positions.dtype)
            last_idx  = (mask * positions).argmax(dim=1)
            rows      = torch.arange(hidden_states.size(0), device=hidden_states.device)
            last_hidden = hidden_states[rows, last_idx]   # (B, H)

        # value (critic)
        values = self.value_head(last_hidden).squeeze(-1) # (B,)

        # classification logits
        pooled = self.dropout(last_hidden)
        logits = self.classifier(pooled)                  # (B, num_labels)

        # classification loss if labels given
        loss_cls = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss_cls = loss_fct(logits, labels)

        return {
            "logits": logits,
            "loss": loss_cls,
            "values": values
        }

# ─── 2) Data, Tokenizer, Model ────────────────────────────────────────────────
# SST-2 train / validation splits from the GLUE collection.
dataset  = load_dataset("glue", "sst2", split="train")
val_ds   = load_dataset("glue", "sst2", split="validation")

config    = GPT2Config.from_pretrained("gpt2")
# Load the pretrained "gpt2" checkpoint into the backbone; constructing the
# model from the config alone would leave every weight randomly initialised.
# NOTE(review): relies on GPT2ForSentiment exposing `from_pretrained`
# (i.e. being a transformers PreTrainedModel subclass) — confirm.
model     = GPT2ForSentiment.from_pretrained("gpt2", config=config)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse EOS as the padding token so `padding=True` works when batching.
tokenizer.pad_token = tokenizer.eos_token

# ─── 3) PPO Setup ─────────────────────────────────────────────────────────────
# PPO hyper-parameters. Adaptive KL control is configured through
# `adap_kl_ctrl` / `target`; PPOConfig has no `kl_ctl` field, so the former
# dict argument was rejected at construction time.
# NOTE(review): depending on the installed trl version, `mini_batch_size`
# may need to be set so it divides `batch_size`, and tensorboard logging may
# need a `logging_dir` passed via `project_kwargs` — confirm against the
# pinned trl release.
ppo_config = PPOConfig(
    model_name="gpt2",
    learning_rate=5e-6,
    batch_size=8,
    ppo_epochs=3,
    adap_kl_ctrl=True,
    target=0.05,
    log_with="tensorboard",
)
ppo_trainer = PPOTrainer(model=model, tokenizer=tokenizer, config=ppo_config)

# ─── 4) Reward fn (soft classification prob) ─────────────────────────────────
def sentiment_reward(prompts, responses, labels):
    """Score each prompt by the classifier's probability of its gold label.

    The generated ``responses`` are ignored; the reward comes from running
    the module-level ``model`` on the tokenized ``prompts``.

    Args:
        prompts: List of prompt strings.
        responses: Unused; kept for call-site compatibility.
        labels: Sequence of integer class indices, one per prompt.

    Returns:
        1-D tensor with one reward per prompt, standardised to zero mean and
        unit standard deviation across the batch. Batches with fewer than two
        prompts yield zeros, since a standard deviation is undefined there.
    """
    if len(prompts) == 0:
        return torch.zeros(0)

    batch  = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)
    device = model.device

    # Rewards are targets, not part of the optimised graph: score without
    # gradients and with dropout disabled, then restore the previous mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            out = model(
                batch.input_ids.to(device),
                attention_mask=batch.attention_mask.to(device),
            )
    finally:
        model.train(was_training)

    probs = torch.softmax(out["logits"], dim=-1)
    # Build both index tensors on the same device as the probabilities.
    rows  = torch.arange(len(labels), device=probs.device)
    cols  = torch.as_tensor(labels, dtype=torch.long, device=probs.device)
    r     = probs[rows, cols]

    if r.numel() < 2:
        # std() of a single element is NaN; a lone sample has no spread.
        return torch.zeros_like(r)
    return (r - r.mean()) / (r.std() + 1e-8)

# ─── 5) Training Loop ─────────────────────────────────────────────────────────
for epoch in range(3):
    for i in range(0, len(dataset), ppo_config.batch_size):
        batch  = dataset[i : i + ppo_config.batch_size]
        texts  = batch["sentence"]
        labs   = batch["label"]
        if len(texts) < ppo_config.batch_size:
            # The PPO step is configured for a fixed batch size; skip the
            # ragged final slice instead of failing on it.
            continue
        prompts = [f"Review: {t}\nSentiment:" for t in texts]

        # a) tokenize each prompt into its own 1-D tensor (the trainer works
        #    on lists of token tensors, not raw strings), then generate.
        #    `max_new_tokens` bounds only the continuation; `max_length=4`
        #    capped the total length, which is shorter than the prompt.
        query_toks = [
            tokenizer(p, return_tensors="pt").input_ids.squeeze(0).to(model.device)
            for p in prompts
        ]
        response_toks = ppo_trainer.generate(
            query_toks,
            return_prompt=False,
            max_new_tokens=4,
            pad_token_id=tokenizer.eos_token_id,
        )

        # b) reward via classification head (generated tokens are not scored)
        rewards = sentiment_reward(prompts, response_toks, labs)

        # c) PPO step — scores are passed as a list of scalar tensors
        stats = ppo_trainer.step(query_toks, response_toks, list(rewards))
        # NOTE(review): "objective/mean_reward" is kept as the first choice;
        # "ppo/mean_scores" is the fallback when that key is absent — confirm
        # the key names against the installed trl version.
        mean_reward = stats.get(
            "objective/mean_reward", stats.get("ppo/mean_scores", float("nan"))
        )
        print(f"[Epoch {epoch}] Sentiment reward: {float(mean_reward):.4f}")

# ─── 6) Save ────────────────────────────────────────────────────────────────────
# Write the trainer's model first, then the tokenizer, into the same directory.
for saver in (ppo_trainer, tokenizer):
    saver.save_pretrained("gpt2-ppo-sentiment")
