## Environment Setup

In [None]:
!pip install torch>=2.0.0 transformers>=4.35.0 datasets>=2.14.0 tokenizers>=0.14.0 pandas>=1.5.0 numpy>=1.24.0 tqdm>=4.65.0 scikit-learn>=1.3.0 tensorboard>=2.14.0 accelerate>=0.24.0 safetensors>=0.4.0

In [None]:
# Mount Google Drive at /content/drive (Colab only); the fine-tuned model is
# saved to and loaded from a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
# Switch off Weights & Biases integration through its environment variables.
os.environ.update(
    {
        "WANDB_DISABLED": "true",
        "WANDB_MODE": "disabled",
        "WANDB_SILENT": "true",
    }
)

## Training

In [None]:
def load_and_preprocess_data(
    csv_path: str,
    tokenizer,
    max_length: int = 512,
    test_size: float = 0.2,
    random_state: int = 42,
):
    """Read the story CSV, format every row as a prompt, split it and tokenize it.

    Each row becomes one text of the form::

        [GENRE]: ...
        [CHARACTERS]: ...
        [PROMPT]: ...
        [STORY]: ...

    Args:
        csv_path: Path to a CSV file containing the columns ``Genre``,
            ``Characters``, ``Prompt`` and ``Story``.
        tokenizer: Tokenizer called on batches of text.
        max_length: Maximum number of tokens per example; longer texts are truncated.
        test_size: Fraction of the rows held out for evaluation.
        random_state: Seed for the train/eval split.

    Returns:
        A ``(train_dataset, eval_dataset)`` tuple of tokenized ``Dataset`` objects
        with the raw ``text`` column removed.

    Raises:
        ValueError: If any of the required columns is missing from the CSV.
    """
    required_columns = ["Genre", "Characters", "Prompt", "Story"]
    df = pd.read_csv(csv_path)

    # Fail early with a clear message instead of a bare KeyError mid-loop.
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(
            f"{csv_path} is missing required column(s): {', '.join(missing_columns)}"
        )

    # Empty cells are read as NaN and would otherwise be formatted as the literal
    # text "nan" inside the training examples; use an empty field instead.
    fields = df[required_columns].fillna("")

    # Convert each row to training text
    formatted_texts = [
        (
            f"[GENRE]: {genre}\n"
            f"[CHARACTERS]: {characters}\n"
            f"[PROMPT]: {prompt}\n"
            f"[STORY]: {story}"
        )
        for genre, characters, prompt, story in fields.itertuples(index=False, name=None)
    ]

    train_texts, eval_texts = train_test_split(
        formatted_texts,
        test_size=test_size,
        random_state=random_state,
    )

    print(f"Train set: {len(train_texts)} examples")
    print(f"Eval set: {len(eval_texts)} examples\n\n")

    def tokenize_function(examples):
        """Tokenize a batch of texts with truncation and without padding."""
        return tokenizer(
            examples['text'],
            truncation=True,
            padding=False,
            max_length=max_length,
            return_tensors=None,
            add_special_tokens=True,
        )

    def tokenize_texts(texts):
        """Wrap the texts in a Dataset and replace the text column by token columns."""
        return Dataset.from_dict({'text': texts}).map(
            tokenize_function,
            batched=True,
            remove_columns=['text'],
        )

    print("Tokenizing datasets...")
    train_dataset = tokenize_texts(train_texts)
    eval_dataset = tokenize_texts(eval_texts)

    return train_dataset, eval_dataset


# ---- Notebook-wide configuration ----
MODEL_NAME = "distilgpt2"
OUTPUT_DIR = "/content/drive/MyDrive/distilgpt2_finetuned"
CSV_PATH = "dataset.csv"
MAX_LENGTH = 512

# Prefer the GPU when one is visible to PyTorch.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"🔧 Using device: {device}")

# ---- Tokenizer and model ----
print(f"📥 Loading tokenizer and model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    # No dedicated padding token: reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)
model.config.pad_token_id = tokenizer.pad_token_id

# ---- Tokenized train / eval splits ----
train_dataset, eval_dataset = load_and_preprocess_data(
    csv_path=CSV_PATH,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)

# Per-example token count, stored in a "length" column.
def add_length(example):
    """Store the number of ``input_ids`` under ``"length"`` and return the same example."""
    token_count = len(example["input_ids"])
    example["length"] = token_count
    return example

# Add the "length" column to both splits; TrainingArguments below groups batches by it.
print("📏 Adding length column for better batching...")
train_dataset, eval_dataset = (
    train_dataset.map(add_length),
    eval_dataset.map(add_length),
)

🔧 Using device: cuda
📥 Loading tokenizer and model: distilgpt2
Train set: 800 examples
Eval set: 200 examples


Tokenizing datasets...


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

📏 Adding length column for better batching...


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
def main():
    """Fine-tune the global ``model`` and report the final evaluation loss and perplexity.

    Uses the module-level ``model``, ``tokenizer``, ``train_dataset``,
    ``eval_dataset`` and ``OUTPUT_DIR``. The model and tokenizer are saved to
    ``OUTPUT_DIR`` after training, then the eval split is scored once more and
    the loss and ``exp(loss)`` are printed.
    """
    # Local import: only needed to probe the installed Trainer's signature.
    import inspect

    # this will automatically create labels from input_ids
    print(f"🛠️  Creating data collator...")
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # Training arguments
    print(f"⚙️  Setting up training arguments...")
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=3,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        learning_rate=5e-5,
        warmup_steps=100,
        logging_steps=50,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        # Reload the checkpoint with the lowest eval_loss once training ends.
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        fp16=torch.cuda.is_available(),
        dataloader_pin_memory=False,
        report_to=[],
        remove_unused_columns=False,
        # Batch examples of similar length, using the precomputed "length" column.
        group_by_length=True,
        dataloader_drop_last=False,
        length_column_name="length",
    )

    # `Trainer(tokenizer=...)` is deprecated in favour of `processing_class` and is
    # scheduled for removal; use the new name when the installed version has it
    # and fall back to the old one otherwise.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    # Create trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        **{tokenizer_kwarg: tokenizer},
    )

    print(f"\n🎯 Starting training...")

    trainer.train()

    # Save model
    print(f"💾 Saving model to {OUTPUT_DIR}")
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    # Perplexity is reported as exp(eval_loss).
    eval_results = trainer.evaluate()
    final_loss = eval_results['eval_loss']
    final_perplexity = torch.exp(torch.tensor(final_loss))

    print(f"Training completed successfully!")
    print(f"Model saved to: {OUTPUT_DIR}")
    print(f"Final evaluation loss: {final_loss:.4f}")
    print(f"Final perplexity: {final_perplexity:.4f}")


if __name__ == "__main__":
    main()

🔧 Using device: cuda
📥 Loading tokenizer and model: distilgpt2
Train set: 800 examples
Eval set: 200 examples


Tokenizing datasets...


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

📏 Adding length column for better batching...


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

🛠️  Creating data collator...
⚙️  Setting up training arguments...


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.



🎯 Starting training...


Epoch,Training Loss,Validation Loss
1,2.4472,2.395109
2,2.1966,2.269872
3,2.0207,2.245034


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


💾 Saving model to /content/drive/MyDrive/distilgpt2_finetuned


Training completed successfully!
Model saved to: /content/drive/MyDrive/distilgpt2_finetuned
Final evaluation loss: 2.2449
Final perplexity: 9.4398


## Generation of story

In [None]:
def load_fine_tuned_model(model_path: str):
    """Load the tokenizer and causal LM stored at ``model_path`` for inference.

    Args:
        model_path: Directory holding the saved tokenizer and model files.

    Returns:
        ``(tokenizer, model, device)`` with the model moved to ``device`` and
        switched to evaluation mode.

    Raises:
        FileNotFoundError: If ``model_path`` does not exist.
    """
    print(f"Loading fine-tuned model from: {model_path}")

    path_missing = not os.path.exists(model_path)
    if path_missing:
        raise FileNotFoundError(f"Model directory not found: {model_path}")

    loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
    loaded_model = AutoModelForCausalLM.from_pretrained(model_path)

    # Reuse EOS for padding when the tokenizer has no padding token.
    if loaded_tokenizer.pad_token is None:
        loaded_tokenizer.pad_token = loaded_tokenizer.eos_token

    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loaded_model.to(target_device)
    loaded_model.eval()

    return loaded_tokenizer, loaded_model, target_device


def generate_story(tokenizer, model, device, genre: str, prompt: str, max_length: int = 500,
                   characters: str = ""):
    """Sample one story from ``model`` for the given genre and prompt.

    The input is formatted with the same ``[GENRE]`` / ``[CHARACTERS]`` /
    ``[PROMPT]`` / ``[STORY]`` tags used for the training texts.

    Args:
        tokenizer: Tokenizer matching ``model``.
        model: Causal language model exposing ``generate``.
        device: Device the input tensors are moved to.
        genre: Text placed after the ``[GENRE]:`` tag.
        prompt: Text placed after the ``[PROMPT]:`` tag.
        max_length: Maximum total length in tokens (input plus generated text).
        characters: Optional text for the ``[CHARACTERS]:`` tag; empty by default.

    Returns:
        The generated text following the first ``[STORY]:`` tag, stripped of
        surrounding whitespace.
    """
    input_text = (
        f"[GENRE]: {genre}\n"
        f"[CHARACTERS]: {characters}\n"
        f"[PROMPT]: {prompt}\n"
        f"[STORY]:"
    )

    # Tokenize with the tokenizer's __call__ so an attention mask is produced.
    # Because pad and EOS share one id, generate() cannot infer the mask itself
    # and warns about unreliable results when it is not passed.
    encoded = tokenizer(input_text, return_tensors='pt')
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # Generate text
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            temperature=0.8,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1,
            no_repeat_ngram_size=2
        )

    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Keep everything after the first "[STORY]:" tag. If the tag is missing,
    # return the whole text instead of slicing at a bogus offset.
    _, marker, story_part = generated_text.partition("[STORY]:")
    if not marker:
        story_part = generated_text

    return story_part.strip()


def test():
    """Load the fine-tuned model and print one story generated for a fixed test prompt.

    Errors are reported with a printed message rather than raised.
    """
    model_path = "/content/drive/MyDrive/distilgpt2_finetuned"

    # Fixed test input
    genre = "Adventure & Exploration"
    prompt = "A young girl discovers a compass that doesn't point north, but instead points towards the nearest secret or mystery."

    wide_rule = "=" * 60
    narrow_rule = "=" * 30

    try:
        tokenizer, model, device = load_fine_tuned_model(model_path)

        print(wide_rule, "TESTING FINE-TUNED DISTILGPT-2 MODEL", wide_rule, sep="\n")

        print(
            "🎯 Generating story with test case:",
            f"   Genre: {genre}",
            f"   Prompt: {prompt}",
            "",
            sep="\n",
        )

        story = generate_story(tokenizer, model, device, genre, prompt, max_length=350)

        print("GENERATED STORY:", narrow_rule, story, narrow_rule, sep="\n")

    except FileNotFoundError as e:
        print(f"Error: {e}")
        print("Make sure to run above cells first to create the fine-tuned model.")
    except Exception as e:
        print(f"An error occurred: {e}")

In [None]:
# Run the generation smoke test on the saved fine-tuned model.
test()

Loading fine-tuned model from: /content/drive/MyDrive/distilgpt2_finetuned


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


TESTING FINE-TUNED DISTILGPT-2 MODEL
🎯 Generating story with test case:
   Genre: Adventure & Exploration
   Prompt: A young girl discovers a compass that doesn't point north, but instead points towards the nearest secret or mystery.

GENERATED STORY:
During an expedition to find and guide her sister's lost school map in Diyala district, they found it near their village campground. The next day, while trekking through forests, there was no one on board! One of the curious explorers came across this forgotten treasure: a gold-digger named Krantu who disguised himself as a birdkeeper with his magic wand. It would show how hidden history can reveal knowledge about nature. Inspired, she ventured into deep caves where secrets are often stolen by thieves alone. With its simple design, it turned any small object upside down and traced them back thousands of years. At first, curiosity seemed like waste; seeing what appeared before became awe inspiring again. But after more than ten centuries, 

## Perplexity Score

### Base Model

In [None]:
# Baseline: score the pretrained checkpoint (no fine-tuning) on the eval split.
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
base_model.to(device)
base_model.eval()

total_loss_base = 0
with torch.no_grad():
    for row_idx in tqdm(range(len(eval_dataset)), desc="Calculating Base Model Perplexity"):
        example = eval_dataset[row_idx]

        # One example at a time: add a leading batch dimension of size 1.
        token_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to(device)
        mask = torch.tensor(example["attention_mask"]).unsqueeze(0).to(device)

        # Passing the input ids as labels makes the model return the LM loss.
        result = base_model(input_ids=token_ids, attention_mask=mask, labels=token_ids)
        total_loss_base += result.loss.item()

# Perplexity = exp(mean of the per-example losses).
average_loss_base = total_loss_base / len(eval_dataset)
perplexity_base = torch.exp(torch.tensor(average_loss_base))

print("\n--- Perplexity Score (Base Model) ---")
print(f"{perplexity_base.item():.4f}")

Calculating perplexity for the base model...


Calculating Base Model Perplexity: 100%|██████████| 200/200 [00:03<00:00, 65.69it/s]


--- Perplexity Score (Base Model) ---
55.7791





### Fine Tuned Model

In [None]:
# Load the fine-tuned checkpoint written by the training cell.
# A missing or invalid checkpoint path surfaces from `from_pretrained` as an
# OSError (FileNotFoundError is only one subclass) or, for paths rejected as
# repo ids, a ValueError -- catching only FileNotFoundError let those escape.
try:
    tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)
    fine_tuned_model = AutoModelForCausalLM.from_pretrained(OUTPUT_DIR)
except (OSError, ValueError) as load_error:
    print("Fine-tuned model not found. Please run the training cells first.")
    print(f"Details: {load_error}")
    fine_tuned_model = None

if fine_tuned_model is not None:
    fine_tuned_model.eval()
    fine_tuned_model.to(device)

    total_loss_finetuned = 0
    with torch.no_grad():
        for i in tqdm(range(len(eval_dataset)), desc="Calculating Fine-Tuned Model Perplexity"):

            # One example at a time: add a leading batch dimension of size 1.
            input_ids = torch.tensor(eval_dataset[i]["input_ids"]).unsqueeze(0).to(device)
            attention_mask = torch.tensor(eval_dataset[i]["attention_mask"]).unsqueeze(0).to(device)

            inputs = {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "labels": input_ids  # Use input_ids as labels for language modeling
            }
            outputs = fine_tuned_model(**inputs)
            total_loss_finetuned += outputs.loss.item()

    # Perplexity = exp(mean of the per-example losses), same recipe as the base model.
    average_loss_finetuned = total_loss_finetuned / len(eval_dataset)
    perplexity_finetuned = torch.exp(torch.tensor(average_loss_finetuned))

    print("\n--- Perplexity Score (Fine-Tuned Model) ---")
    print(f"{perplexity_finetuned.item():.4f}")

Calculating Fine-Tuned Model Perplexity: 100%|██████████| 200/200 [00:03<00:00, 65.73it/s]


--- Perplexity Score (Fine-Tuned Model) ---
9.4383





## Testing

In [None]:
def _split_prompt_fields(example_text):
    """Return ``(genre, characters, prompt)`` parsed from a formatted example.

    Each field is the text between its own tag and the *start* of the next tag.
    Returns ``None`` when a tag is missing or the tags are out of order.
    """
    tags = ("[GENRE]:", "[CHARACTERS]:", "[PROMPT]:", "[STORY]:")
    starts = [example_text.find(tag) for tag in tags]
    if min(starts) < 0 or starts != sorted(starts):
        return None
    return tuple(
        example_text[start + len(tag):next_start].strip()
        for tag, start, next_start in zip(tags, starts, starts[1:])
    )


def _generate_story_text(story_model, story_tokenizer, prompt_text, max_length=512):
    """Sample one continuation of ``prompt_text`` and return the text after ``[STORY]:``."""
    # Tokenizer __call__ also yields the attention mask, which generate() cannot
    # infer on its own when the pad and EOS token ids coincide.
    encoded = story_tokenizer(prompt_text, return_tensors='pt').to(device)
    with torch.no_grad():
        generated = story_model.generate(
            **encoded,
            max_length=max_length,
            num_return_sequences=1,
            temperature=0.8,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            pad_token_id=story_tokenizer.eos_token_id,
            repetition_penalty=1.1,
            no_repeat_ngram_size=2
        )
    full_text = story_tokenizer.decode(generated[0], skip_special_tokens=True)
    # Extract only the story part; fall back to the whole text if the tag is absent.
    _, marker, story_part = full_text.partition("[STORY]:")
    return (story_part if marker else full_text).strip()


# Load the base and fine-tuned distilgpt2 models
print(f"Loading base model: {MODEL_NAME}")
base_model_judge = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
base_model_judge.to(device)
base_model_judge.eval()

print(f"Loading fine-tuned model from: {OUTPUT_DIR}")
try:
    tokenizer_judge = AutoTokenizer.from_pretrained(OUTPUT_DIR)
    fine_tuned_model_judge = AutoModelForCausalLM.from_pretrained(OUTPUT_DIR)
except (OSError, ValueError) as load_error:
    # A missing checkpoint is reported as OSError/ValueError, not only FileNotFoundError.
    print("Fine-tuned model not found. Please run the training cells first.")
    print(f"Details: {load_error}")
    fine_tuned_model_judge = None
    # The base model still needs a tokenizer, so fall back to the pretrained one.
    tokenizer_judge = AutoTokenizer.from_pretrained(MODEL_NAME)

if fine_tuned_model_judge is not None:
    fine_tuned_model_judge.to(device)
    fine_tuned_model_judge.eval()

# Use the eval_dataset that was loaded and preprocessed earlier
evaluation_data = []
N_STORIES_TO_EVALUATE = 10

print(f"Generating {N_STORIES_TO_EVALUATE} story pairs for evaluation using eval_dataset...")

# eval_dataset contains tokenized data, so each example is decoded back to text
# and its genre / characters / prompt fields are recovered from the tags.
# Never index past the end of a small eval set.
for i in tqdm(range(min(N_STORIES_TO_EVALUATE, len(eval_dataset)))):
    decoded_text = tokenizer_judge.decode(eval_dataset[i]["input_ids"], skip_special_tokens=True)

    fields = _split_prompt_fields(decoded_text)
    if fields is None:
        print(f"Skipping eval example {i}: could not locate all prompt sections.")
        continue
    genre, characters, prompt = fields

    prompt_text = (f"[GENRE]: {genre}\n"
                   f"[CHARACTERS]: {characters}\n"
                   f"[PROMPT]: {prompt}\n"
                   f"[STORY]:")

    # Generate story with Base Model
    base_story = _generate_story_text(base_model_judge, tokenizer_judge, prompt_text)

    # Generate story with Fine-Tuned Model
    if fine_tuned_model_judge is not None:
        final_story = _generate_story_text(fine_tuned_model_judge, tokenizer_judge, prompt_text)
    else:
        final_story = "Fine-tuned model not loaded."

    evaluation_data.append({
        "prompt": prompt_text,
        "base_model_story": base_story,
        "finetuned_model_story": final_story
    })

print(f"\nGenerated {len(evaluation_data)} story pairs for evaluation.")

Loading base model: distilgpt2
Loading fine-tuned model from: /content/drive/MyDrive/distilgpt2_finetuned
Generating 10 story pairs for evaluation using eval_dataset...


100%|██████████| 10/10 [00:46<00:00,  4.70s/it]

Generated 10 story pairs for evaluation.





In [None]:
import google.generativeai as genai
from google.colab import userdata
import json
import pandas as pd

def parse_judge_response(raw_text):
    """Return the JSON object embedded in a judge reply.

    The reply may wrap the object in a Markdown code fence (with or without a
    ``json`` tag) or surround it with extra prose, so the span from the first
    ``{`` to the last ``}`` is parsed.

    Raises:
        ValueError: If no JSON object is present or it cannot be decoded
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    text = raw_text.strip()
    start, end = text.find("{"), text.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in judge response")
    return json.loads(text[start:end + 1])


try:
    genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))
    judge_model = genai.GenerativeModel('gemini-1.5-flash')
    print("Gemini API configured successfully. Starting evaluation.")
except Exception as e:
    print(f"API Key configuration failed. Please ensure you have saved the key correctly in Colab Secrets. Error: {e}")
    judge_model = None


if judge_model:
    # NOTE(review): the template tells the judge which story comes from the base
    # model and which from the fine-tuned one, and always lists them in the same
    # order. Both can bias the scores -- consider anonymising the labels and
    # randomising the A/B order. Text left unchanged here so results stay comparable.
    judge_prompt_template = """
    You are an expert evaluator of children's stories. I will provide a prompt and two stories generated by two different AI models (Model A - Base Model and Model B - Fine-Tuned Model). Your task is to score each story on a scale of 1 to 10 for three criteria: Adherence (how well it follows the prompt), Style (writing quality and appropriateness for children), and Creativity (originality and imagination). Do not show any bias and evaluate based on the content provided.

    **Original Prompt:**
    {prompt}

    **Model A Story (Base Model):**
    {story_A}

    **Model B Story (Fine-Tuned Model):**
    {story_B}

    Please provide your evaluation as a single, valid JSON object, like this:
    {{
      "model_A_scores": {{ "adherence": <score>, "style": <score>, "creativity": <score> }},
      "model_B_scores": {{ "adherence": <score>, "style": <score>, "creativity": <score> }}
    }}
    """
    results = []
    print("Evaluating generated stories using LLM-as-a-Judge...")
    for item in tqdm(evaluation_data, desc="Evaluating stories"):
        prompt_for_judge = judge_prompt_template.format(
            prompt=item['prompt'],
            story_A=item['base_model_story'],
            story_B=item['finetuned_model_story']
        )

        # A failed API call, a blocked response (accessing `.text` raises) or an
        # unparsable reply affects only this story pair; the scores gathered so
        # far are kept instead of aborting the whole evaluation.
        try:
            response = judge_model.generate_content(prompt_for_judge)
            scores = parse_judge_response(response.text)
        except Exception as e:
            print(f"Skipping story pair: judge call or response parsing failed ({e})")
            continue

        results.append(scores)

    if results:
        base_scores = {'adherence': [], 'style': [], 'creativity': []}
        finetuned_scores = {'adherence': [], 'style': [], 'creativity': []}

        for res in results:
            # Ensure the expected keys exist and have the correct structure
            if 'model_A_scores' in res and isinstance(res['model_A_scores'], dict) and \
               'model_B_scores' in res and isinstance(res['model_B_scores'], dict):
                for key in base_scores.keys():
                    # A criterion missing from the reply counts as 0 in the average.
                    base_scores[key].append(res['model_A_scores'].get(key, 0))
                    finetuned_scores[key].append(res['model_B_scores'].get(key, 0))
            else:
                print(f"Skipping ill-formatted result: {res}")


        # Calculate averages only if there are valid scores
        avg_base = {key: sum(val)/len(val) for key, val in base_scores.items() if val} if any(base_scores.values()) else {}
        avg_finetuned = {key: sum(val)/len(val) for key, val in finetuned_scores.items() if val} if any(finetuned_scores.values()) else {}


        if avg_base or avg_finetuned:
            report = pd.DataFrame([avg_base, avg_finetuned], index=['Base Model', 'Fine-Tuned Model'])
            print("\n--- LLM-as-a-Judge Evaluation Report ---")
            print(report)
        else:
             print("\nNo valid scores were collected, so no report can be generated.")
    else:
        print("\nNo results were collected, so no report can be generated.")

Gemini API configured successfully. Starting evaluation.
Evaluating generated stories using LLM-as-a-Judge...


Evaluating stories: 100%|██████████| 10/10 [00:38<00:00,  3.86s/it]


--- LLM-as-a-Judge Evaluation Report ---
                  adherence  style  creativity
Base Model              1.0    1.2         1.0
Fine-Tuned Model        2.2    3.1         2.8



