In [None]:
# ===============================
# ✅ 1. INSTALL DEPENDENCIES
# ===============================
!pip install transformers datasets evaluate xformers --quiet

# ===============================
# ✅ 2. MOUNT GOOGLE DRIVE
# ===============================
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# ===============================
# ✅ 3. CONFIGURATION
# ===============================
import inspect
import os

import evaluate
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed,
)

# --- Locations on the mounted Drive ---
DATA_DIR = "/content/drive/MyDrive/FactuAI/data/processed"
OUTPUT_DIR = "/content/drive/MyDrive/FactuAI/outputs/distilbert_run/checkpoints"
LOG_DIR = "/content/drive/MyDrive/FactuAI/outputs/distilbert_run/logs"

# One CSV per split, each stored directly under DATA_DIR.
train_path, val_path, test_path = (
    os.path.join(DATA_DIR, f"liar2_{split}.csv")
    for split in ("train", "val", "test")
)

# --- Model and optimisation settings ---
MODEL_NAME = "distilbert-base-uncased"  # checkpoint used for both tokenizer and model
NUM_LABELS = 6           # passed as num_labels when the model is created
MAX_LENGTH = 128         # token length every statement is padded/truncated to
BATCH_SIZE = 32          # per-device batch size for training and evaluation
EPOCHS = 5
LEARNING_RATE = 3e-5
USE_FP16 = True          # forwarded to TrainingArguments(fp16=...)

In [None]:
# ===============================
# ✅ 4. LOAD MODEL AND TOKENIZER
# ===============================
# Seed the Python / NumPy / PyTorch RNGs *before* the model is built: the
# classification head added by from_pretrained is randomly initialised, and
# that happens before the Trainer gets a chance to apply its own seed.
# 42 mirrors the default `seed` of TrainingArguments.
SEED = 42
set_seed(SEED)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

# ===============================
# ✅ 5. LOAD & TOKENIZE DATA
# ===============================
# Read the three CSV files into one dataset object keyed by split name.
dataset = load_dataset(
    "csv",
    data_files=dict(train=train_path, validation=val_path, test=test_path),
)

def tokenize_function(example):
    """Tokenize the ``statement`` field of a batch of rows.

    Each sequence is truncated and padded to exactly ``MAX_LENGTH`` tokens
    with the module-level ``tokenizer``; the tokenizer's output is returned
    as-is.
    """
    statements = example["statement"]
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LENGTH,
    }
    return tokenizer(statements, **encode_options)

# Run the tokenizer over every split, handing rows to it in batches.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)

In [None]:
# ===============================
# ✅ 6. METRICS
# ===============================
# Accuracy metric object from the `evaluate` library; consumed by compute_metrics.
accuracy = evaluate.load(path="accuracy")

def compute_metrics(eval_pred):
    """Turn raw evaluation output into an accuracy score.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class for
    each row is the index of its largest logit along the last axis; the
    result of ``accuracy.compute`` on those predictions and the labels is
    returned.
    """
    scores, references = eval_pred
    predicted_classes = scores.argmax(axis=-1)
    return accuracy.compute(references=references, predictions=predicted_classes)

# ===============================
# ✅ 7. TRAINING ARGUMENTS
# ===============================
# fp16 mixed precision generally needs a GPU, so only request it when a CUDA
# device is actually present. On a CPU-only runtime the notebook then falls
# back to full precision instead of failing during setup.
fp16_enabled = USE_FP16 and torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",           # evaluate on the validation split after each epoch
    save_strategy="epoch",           # kept equal to eval_strategy, as load_best_model_at_end requires
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_dir=LOG_DIR,
    # NOTE(review): no metric_for_best_model is given, so the "best" checkpoint
    # is selected by evaluation loss rather than accuracy — confirm intended.
    load_best_model_at_end=True,
    fp16=fp16_enabled,
)

# ===============================
# ✅ 8. TRAINER SETUP
# ===============================
# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the old `tokenizer` keyword; older releases only know
# `tokenizer`. The install step does not pin a version, so pick whichever
# keyword the installed Trainer actually accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

In [None]:
# ===============================
# ✅ 9. START TRAINING
# ===============================
trainer.train()

# ===============================
# ✅ 10. FINAL EVALUATION
# ===============================
# Report the held-out metrics under a "test_" prefix. With the default prefix
# they would be emitted as "eval_*" and be indistinguishable from the
# per-epoch validation metrics in the logs.
results = trainer.evaluate(tokenized_datasets["test"], metric_key_prefix="test")
print("Test Results:", results)