## Environment Setup

In [None]:
!pip install -q -U torch transformers datasets peft accelerate bitsandbytes sentence-transformers evaluate
!pip install -q google-generativeai
!pip install -q pyarrow<20.0.0

In [None]:
# Mount Google Drive at /content/drive; the dataset, split files and adapter
# paths used by later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Authenticate with the Hugging Face Hub (interactive widget) before any
# model or tokenizer is downloaded by later cells.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import google.generativeai as genai
from google.colab import userdata

# Configure the Gemini client from the Colab secret named GOOGLE_API_KEY.
try:
    _gemini_api_key = userdata.get('GOOGLE_API_KEY')
    # Guard against an empty/None value: configuring with it would still print
    # "success" here and only fail later, at the first API call.
    if not _gemini_api_key:
        raise ValueError("GOOGLE_API_KEY secret is empty")
    genai.configure(api_key=_gemini_api_key)
    print("Gemini API configured successfully.")
except Exception as e:
    print("Gemini API key not found in Colab Secrets. Please add it to proceed with the LLM-as-a-Judge evaluation.")
    # Surface the real cause (missing secret, denied notebook access, ...)
    # instead of discarding the exception.
    print(f"Details: {type(e).__name__}: {e}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/bin/bash: line 1: 20.0.0: No such file or directory


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Gemini API configured successfully.


In [None]:
from datasets import load_dataset
import os
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from tqdm import tqdm
import pandas as pd

## Dataset Load and Split

In [None]:
# Locations on the mounted Drive: the source CSV and the folder that receives
# the JSONL split files.
BASE_DATASET_PATH = "/content/drive/MyDrive/MyStoryProject/dataset.csv"
OUTPUT_DIR = "/content/drive/MyDrive/MyStoryProject/split_data/"

print(f"Loading base dataset from: {BASE_DATASET_PATH}")
full_dataset = load_dataset("csv", data_files=BASE_DATASET_PATH, split="train")
print(f"Successfully loaded {len(full_dataset)} total examples.")

# Stage 1: hold out 10% of all rows as the test set (fixed seed).
train_val_dataset = full_dataset.train_test_split(test_size=0.1, seed=42)
test_dataset = train_val_dataset['test']

# Stage 2: split the remaining rows again, 10% of them becoming validation.
final_splits = train_val_dataset['train'].train_test_split(test_size=0.1, seed=42)
train_dataset, validation_dataset = final_splits['train'], final_splits['test']

print("\n--- Final Dataset Sizes ---")
for _size_label, _split in (
    ("Training set size:   ", train_dataset),
    ("Validation set size: ", validation_dataset),
    ("Testing set size:    ", test_dataset),
):
    print(f"{_size_label}{len(_split)} examples")

# Persist each split as JSONL so later cells can reload it from Drive.
os.makedirs(OUTPUT_DIR, exist_ok=True)
for _file_name, _split in (
    ("train_dataset.jsonl", train_dataset),
    ("validation_dataset.jsonl", validation_dataset),
    ("test_dataset.jsonl", test_dataset),
):
    _split.to_json(os.path.join(OUTPUT_DIR, _file_name), orient="records")

print(f"\nDatasets saved to {OUTPUT_DIR}")

Loading base dataset from: /content/drive/MyDrive/MyStoryProject/dataset.csv


Generating train split: 0 examples [00:00, ? examples/s]

Successfully loaded 1000 total examples.

--- Final Dataset Sizes ---
Training set size:   810 examples
Validation set size: 90 examples
Testing set size:    100 examples


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]


Datasets saved to /content/drive/MyDrive/MyStoryProject/split_data/


In [None]:
# Reload the three splits from the JSONL files stored under OUTPUT_DIR.
_split_paths = {
    split_name: os.path.join(OUTPUT_DIR, f"{split_name}_dataset.jsonl")
    for split_name in ("train", "validation", "test")
}
train_dataset = load_dataset("json", data_files=_split_paths["train"], split="train")
validation_dataset = load_dataset("json", data_files=_split_paths["validation"], split="train")
test_dataset = load_dataset("json", data_files=_split_paths["test"], split="train")


def format_for_finetuning(example):
    """Build the single training string for one dataset row.

    Reads the 'Genre', 'Prompt', 'Characters' and 'Story' fields of ``example``
    and returns ``{"text": ...}``: a one-line header with the three metadata
    fields, a blank line, then ``Story:`` followed by the story itself.
    """
    header = "".join([
        f"Genre: {example['Genre']}. ",
        f"Summary: {example['Prompt']}. ",
        f"Characters: {example['Characters']}. ",
    ])
    return {"text": f"{header}\n\nStory:\n{example['Story']}"}


# Add the combined "text" column to the two splits used for training.
train_dataset, validation_dataset = [
    split.map(format_for_finetuning)
    for split in (train_dataset, validation_dataset)
]

# Tokenizer of the model being fine-tuned; when it defines no padding token,
# the end-of-sequence token is reused for padding.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize the "text" column of a batch, truncating each entry to 1024 tokens.

    Uses the module-level ``tokenizer`` and returns its output unchanged.
    """
    # NOTE(review): no end-of-sequence token is appended to the text here —
    # confirm whether the model is expected to learn where a story ends.
    texts = examples["text"]
    return tokenizer(texts, max_length=1024, truncation=True)

# Tokenize both splits in batched mode; the originals keep their names and the
# tokenized copies are what the Trainer consumes.
tokenized_train_dataset, tokenized_validation_dataset = [
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, validation_dataset)
]

print("All datasets have been formatted and tokenized.")

All datasets have been formatted and tokenized.


## Model Training

In [None]:
# 4-bit NF4 quantization settings, with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# LoRA adapter settings: rank-16 adapters on the four attention projections.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Load the quantized model and prepare it for k-bit training in one step.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
    )
)

# Attach the adapters and report how many parameters are trainable.
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

trainable params: 13,631,488 || all params: 8,043,892,736 || trainable%: 0.1695


In [None]:
# Folder on Drive that receives both the per-epoch checkpoints and the final adapter.
ADAPTER_OUTPUT_DIR = "/content/drive/MyDrive/MyStoryProject/llama3_best_adapter"

training_args = TrainingArguments(
    output_dir=ADAPTER_OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # 1 x 4 = 4 sequences per optimizer step, per device
    num_train_epochs=5,
    learning_rate=2e-4,
    # NOTE(review): mixed precision here is fp16 while bnb_config uses
    # bfloat16 as compute dtype — confirm the mismatch is intended.
    fp16=True,
    optim="paged_adamw_8bit",
    # Log, evaluate and checkpoint once per epoch so that every checkpoint has
    # a matching validation loss for best-model selection.
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)


# Causal-LM collator (mlm=False): labels are derived from the input ids.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    # `processing_class` is the current name of the deprecated `tokenizer`
    # argument; passing `tokenizer=` triggers a deprecation warning.
    processing_class=tokenizer,
    data_collator=data_collator,
)


print("Starting fine-tuning with validation...")
trainer.train()


# With load_best_model_at_end=True the trainer holds the best checkpoint after
# train() returns, so this writes the best adapter rather than the last one.
trainer.model.save_pretrained(ADAPTER_OUTPUT_DIR)
print(f"Best model adapter saved to {ADAPTER_OUTPUT_DIR}")

  trainer = Trainer(


Starting fine-tuning with validation...


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,1.447,1.348303
2,1.0624,1.326564
3,0.7456,1.404104
4,0.4843,1.586631
5,0.3114,1.804542


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Best model adapter saved to /content/drive/MyDrive/MyStoryProject/llama3_best_adapter


## Perplexity Score

### Fine-Tuned Model

In [None]:
# `model` already had LoRA layers injected by get_peft_model(), so wrapping it
# again with PeftModel.from_pretrained() would adapt an already-adapted model.
# The trainer holds the best checkpoint (load_best_model_at_end=True) — the
# same weights that were saved to ADAPTER_OUTPUT_DIR — so evaluate it directly.
final_model = trainer.model
final_model.eval()

# Accumulate the summed negative log-likelihood and the number of predicted
# tokens so that perplexity is token-weighted; averaging per-story mean losses
# would give short stories the same weight as long ones.
total_nll = 0.0
total_tokens = 0
with torch.no_grad():
    for story_row in tqdm(test_dataset):
        story_text = story_row['Story']
        inputs = tokenizer(story_text, return_tensors="pt", truncation=True, max_length=1024).to(final_model.device)
        # With labels == input_ids the loss is the mean over the (length - 1)
        # next-token predictions of this sequence.
        n_predicted = inputs["input_ids"].shape[-1] - 1
        if n_predicted < 1:
            # Nothing to predict (e.g. empty story): the loss would be NaN.
            continue
        outputs = final_model(**inputs, labels=inputs["input_ids"])
        total_nll += outputs.loss.item() * n_predicted
        total_tokens += n_predicted

if total_tokens == 0:
    raise ValueError("No tokens were scored; the test dataset is empty or contains only empty stories.")

average_loss = total_nll / total_tokens
perplexity_finetuned = torch.exp(torch.tensor(average_loss))

print("\n--- Perplexity Score (Fine-Tuned Model) ---")
print(f"{perplexity_finetuned.item():.4f}")

### Base Model

In [None]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
TEST_DATA_PATH = "/content/drive/MyDrive/MyStoryProject/split_data/test_dataset.jsonl"


# Load a fresh, adapter-free copy of the quantized base model for the baseline.
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
base_model.eval()


test_dataset = load_dataset("json", data_files=TEST_DATA_PATH, split="train")


# Accumulate the summed negative log-likelihood and the number of predicted
# tokens so that perplexity is token-weighted; averaging per-story mean losses
# would give short stories the same weight as long ones.
total_nll_base = 0.0
total_tokens_base = 0
with torch.no_grad():
    for story_row in tqdm(test_dataset):
        story_text = story_row['Story']
        inputs = tokenizer(story_text, return_tensors="pt", truncation=True, max_length=1024).to(base_model.device)
        # With labels == input_ids the loss is the mean over the (length - 1)
        # next-token predictions of this sequence.
        n_predicted = inputs["input_ids"].shape[-1] - 1
        if n_predicted < 1:
            # Nothing to predict (e.g. empty story): the loss would be NaN.
            continue
        outputs = base_model(**inputs, labels=inputs["input_ids"])
        total_nll_base += outputs.loss.item() * n_predicted
        total_tokens_base += n_predicted

if total_tokens_base == 0:
    raise ValueError("No tokens were scored; the test dataset is empty or contains only empty stories.")

average_loss_base = total_nll_base / total_tokens_base
perplexity_base = torch.exp(torch.tensor(average_loss_base))

print("\n--- Perplexity Score (Base Model) ---")
print(f"{perplexity_base.item():.4f}")

## Testing the Fine-Tuned Model

In [None]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
ADAPTER_PATH = "/content/drive/MyDrive/MyStoryProject/llama3_best_adapter"
TEST_DATA_PATH = "/content/drive/MyDrive/MyStoryProject/split_data/test_dataset.jsonl"

bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# IMPORTANT: PeftModel.from_pretrained() injects the LoRA layers into
# `base_model` in place. From here on `base_model` and `final_model` share the
# same adapted modules, so calling `base_model.generate()` directly would NOT
# produce base-model output. Base generations below therefore run inside
# `final_model.disable_adapter()`.
final_model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
final_model.eval()


test_prompts_dataset = load_dataset("json", data_files=TEST_DATA_PATH, split="train")
evaluation_data = []
N_STORIES_TO_EVALUATE = 10
# Never index past the end of a small test set.
n_stories = min(N_STORIES_TO_EVALUATE, len(test_prompts_dataset))

# Shared sampling settings; pad_token_id is passed explicitly so generate()
# does not have to guess it.
generation_kwargs = dict(
    max_new_tokens=250,
    do_sample=True,
    temperature=0.7,
    pad_token_id=tokenizer.pad_token_id,
)

for i in range(n_stories):
    example = test_prompts_dataset[i]
    prompt_text = (f"Genre: {example['Genre']}. "
                   f"Summary: {example['Prompt']}. "
                   f"Characters: {example['Characters']}. "
                   f"\n\nStory:\n")
    # NOTE(review): the adapter was trained on plain "Genre: ... Story:" text
    # (format_for_finetuning), while generation wraps the prompt in the chat
    # template — confirm this train/inference format difference is intended.
    messages = [{"role": "user", "content": prompt_text}]
    # return_dict=True also yields the attention mask, which generate() needs
    # because the pad token equals the EOS token and cannot be inferred.
    encoded = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(final_model.device)
    prompt_length = encoded["input_ids"].shape[-1]

    with torch.no_grad():
        # Base model: same weights with the LoRA layers temporarily switched off.
        with final_model.disable_adapter():
            base_outputs = base_model.generate(**encoded, **generation_kwargs)
        base_story = tokenizer.decode(base_outputs[0][prompt_length:], skip_special_tokens=True)

        # Fine-tuned model: adapters active again once the context has exited.
        final_outputs = final_model.generate(**encoded, **generation_kwargs)
        final_story = tokenizer.decode(final_outputs[0][prompt_length:], skip_special_tokens=True)

    evaluation_data.append({
        "prompt": prompt_text,
        "base_model_story": base_story,
        "finetuned_model_story": final_story
    })
print(f"Generated {len(evaluation_data)} story pairs for evaluation.")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention m

Generated 10 story pairs for evaluation.


In [None]:
import google.generativeai as genai
from google.colab import userdata
import json
import random

try:
    genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))
    judge_model = genai.GenerativeModel('gemini-1.5-flash')
    print("Gemini API configured successfully. Starting evaluation.")
except Exception as e:
    print(f"API Key configuration failed. Please ensure you have saved the key correctly in Colab Secrets. Error: {e}")

    judge_model = None


# Criteria the judge is asked to score, in report column order.
SCORE_KEYS = ('adherence', 'style', 'creativity')


def _parse_judge_json(raw_text):
    """Return the JSON object embedded in a judge reply.

    Takes everything between the first '{' and the last '}', which tolerates
    Markdown code fences or stray text around the object. Raises ValueError
    when no object is present or it is not valid JSON.
    """
    start, end = raw_text.find("{"), raw_text.rfind("}")
    if start == -1 or end <= start:
        raise ValueError("no JSON object found in judge response")
    return json.loads(raw_text[start:end + 1])


if judge_model:
    # The judge sees only "Model A" / "Model B": revealing which story comes
    # from the fine-tuned model would bias the scores. The JSON example is a
    # single well-formed object with one entry per model.
    judge_prompt_template = """
    You are an expert evaluator of children's stories. I will provide a prompt and two stories generated by two different AI models (Model A and Model B). Your task is to score each story on a scale of 1 to 10 for three criteria: Adherence, Style, and Creativity. Do not show any bias.

    **Original Prompt:**
    {prompt}

    **Model A Story:**
    {story_A}

    **Model B Story:**
    {story_B}

    Please provide your evaluation as a single, valid JSON object, like this:
    {{
      "model_A_scores": {{ "adherence": <score>, "style": <score>, "creativity": <score> }},
      "model_B_scores": {{ "adherence": <score>, "style": <score>, "creativity": <score> }}
    }}
    """
    # Seeded so the A/B assignment is reproducible across runs.
    order_rng = random.Random(42)
    results = []
    for item in tqdm(evaluation_data):
        # Randomize which model is shown as "A" to cancel position bias.
        base_is_a = order_rng.random() < 0.5
        if base_is_a:
            story_a, story_b = item['base_model_story'], item['finetuned_model_story']
            base_key, finetuned_key = 'model_A_scores', 'model_B_scores'
        else:
            story_a, story_b = item['finetuned_model_story'], item['base_model_story']
            base_key, finetuned_key = 'model_B_scores', 'model_A_scores'

        prompt_for_judge = judge_prompt_template.format(
            prompt=item['prompt'],
            story_A=story_a,
            story_B=story_b
        )
        try:
            response = judge_model.generate_content(prompt_for_judge)
            scores = _parse_judge_json(response.text)
            base_result, finetuned_result = scores[base_key], scores[finetuned_key]
            if not (isinstance(base_result, dict) and isinstance(finetuned_result, dict)):
                raise ValueError("judge response does not contain per-model score objects")
            # Store the scores mapped back to the real model identities.
            results.append({'base': base_result, 'finetuned': finetuned_result})
        except Exception as e:
            print(f"Error evaluating item: {e}")


    if results:
        base_scores = {key: [] for key in SCORE_KEYS}
        finetuned_scores = {key: [] for key in SCORE_KEYS}

        for res in results:
            for key in SCORE_KEYS:
                base_value = res['base'].get(key)
                finetuned_value = res['finetuned'].get(key)
                # Count a criterion only when both models received a numeric
                # score: defaulting a missing score to 0 would drag the average
                # down, and skipping one side would unpair the comparison.
                if isinstance(base_value, (int, float)) and isinstance(finetuned_value, (int, float)):
                    base_scores[key].append(base_value)
                    finetuned_scores[key].append(finetuned_value)

        avg_base = {key: sum(val)/len(val) for key, val in base_scores.items() if val}
        avg_finetuned = {key: sum(val)/len(val) for key, val in finetuned_scores.items() if val}

        report = pd.DataFrame([avg_base, avg_finetuned], index=['Base Model', 'Fine-Tuned Model'])
        print("\n--- LLM-as-a-Judge Evaluation Report ---")
        print(report)
    else:
        print("\nNo results were collected, so no report can be generated.")

Gemini API configured successfully. Starting evaluation.


100%|██████████| 10/10 [00:17<00:00,  1.72s/it]


--- LLM-as-a-Judge Evaluation Report ---
                  adherence  style  creativity
Base Model              8.7    7.3         6.4
Fine-Tuned Model        8.7    7.1         6.4





## Testing with a Custom Prompt

In [None]:
fine_tuned_model_path = "/content/drive/MyDrive/MyStoryProject/llama3_best_adapter"

# Need to load the base model and tokenizer again as they might not be available in the current session
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", quantization_config=bnb_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the fine-tuned adapter on top of the base model
fine_tuned_model = PeftModel.from_pretrained(base_model, fine_tuned_model_path)
fine_tuned_model.eval()

# Define a test prompt
test_genre = "Fantasy"
test_summary = "A young wizard discovers a hidden power and must save his village from a dark force."
test_characters = ""

# Same field layout as the prompts used elsewhere in this notebook.
test_prompt_text = (f"Genre: {test_genre}. "
                    f"Summary: {test_summary}. "
                    f"Characters: {test_characters}. "
                    f"\n\nStory:\n")

# Generate a story using the fine-tuned model
messages = [{"role": "user", "content": test_prompt_text}]
# return_dict=True also yields the attention mask; without it generate() warns
# that the mask cannot be inferred because the pad token equals the EOS token.
encoded = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(fine_tuned_model.device)
input_ids = encoded["input_ids"]

with torch.no_grad():
    generated_outputs = fine_tuned_model.generate(
        **encoded,
        max_new_tokens=500,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        pad_token_id=tokenizer.pad_token_id,  # explicit, so generate() need not guess
    )

# Strip the prompt tokens so that only the newly generated story is decoded.
generated_story = tokenizer.decode(generated_outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)

print("--- Generated Story (Fine-Tuned Model) ---")
print(generated_story)