# In [None]:
# New notebook: GenerateNextGeneration.ipynb

import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from huggingface_hub import login


class RecursiveDataGenerator:
    """Generate the next generation of synthetic text from a fine-tuned model.

    The generator loads a base causal language model, attaches a PEFT adapter
    on top of it, samples continuations for a list of prompts, and writes the
    results as JSONL train/validation/test splits labelled as generation
    ``generation + 1``.
    """

    # Continuations with this many characters or fewer are discarded.
    MIN_GENERATED_CHARS = 50
    # A progress line is printed every this many prompts.
    PROGRESS_INTERVAL = 50
    # Cumulative split boundaries: 80% train, 10% validation, 10% test.
    TRAIN_FRACTION = 0.8
    TRAIN_PLUS_VAL_FRACTION = 0.9

    def __init__(self, base_model_name: str, trained_model_path: str, generation: int):
        """Load the tokenizer, the base model and the trained adapter.

        Args:
            base_model_name: Model identifier passed to
                ``AutoTokenizer.from_pretrained`` and
                ``AutoModelForCausalLM.from_pretrained``.
            trained_model_path: Location of the adapter passed to
                ``PeftModel.from_pretrained``.
            generation: Generation number of the model being loaded; data
                produced by this object is labelled ``generation + 1``.
        """
        # Hugging Face authentication is best-effort: outside Colab, or when
        # the HF_TOKEN secret cannot be read, loading proceeds without login.
        try:
            from google.colab import userdata
        except ImportError:
            userdata = None  # Not running inside Colab.

        if userdata is not None:
            try:
                login(token=userdata.get('HF_TOKEN'))
            except Exception as exc:
                # Keep going, but say why instead of failing silently.
                print(f"Warning: Hugging Face login skipped: {exc}")

        self.generation = generation
        print(f"Loading Generation {generation} model...")

        self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        if self.tokenizer.pad_token is None:
            # Fall back to EOS when the tokenizer defines no padding token.
            self.tokenizer.pad_token = self.tokenizer.eos_token

        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True
        )

        self.model = PeftModel.from_pretrained(
            base_model,
            trained_model_path,
            is_trainable=False
        )
        print("Model loaded\n")

    def generate_from_prompts(self, prompts: list, max_length: int = 200) -> list:
        """Sample one continuation per prompt and return the usable ones.

        Args:
            prompts: Prompt strings to continue.
            max_length: Forwarded to ``generate`` as ``max_length``; this
                limit counts the prompt tokens as well as the new ones.

        Returns:
            The stripped continuations (prompt excluded) that are longer than
            ``MIN_GENERATED_CHARS`` characters, in prompt order.
        """
        total = len(prompts)
        print(f"Generating Gen {self.generation + 1} data from {total} prompts...")
        generated = []

        for i, prompt in enumerate(prompts):
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
            prompt_token_count = inputs["input_ids"].shape[1]

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_length=max_length,
                    temperature=0.8,
                    do_sample=True,
                    top_p=0.9,
                    pad_token_id=self.tokenizer.eos_token_id
                )

            # The output sequence starts with the prompt tokens. Drop them by
            # token count rather than by len(prompt) characters, because the
            # decoded text is not guaranteed to reproduce the prompt string
            # character for character.
            continuation_ids = outputs[0][prompt_token_count:]
            text = self.tokenizer.decode(
                continuation_ids, skip_special_tokens=True
            ).strip()

            if len(text) > self.MIN_GENERATED_CHARS:
                generated.append(text)

            if (i + 1) % self.PROGRESS_INTERVAL == 0:
                print(f"  Generated {i + 1}/{total}")

        return generated

    def save_generation_data(self, texts: list, output_dir: str):
        """Shuffle ``texts`` and write them as JSONL train/validation/test files.

        Args:
            texts: Generated strings to save. The list itself is not modified.
            output_dir: Directory to write ``train.jsonl``,
                ``validation.jsonl`` and ``test.jsonl`` into; created if it
                does not exist.
        """
        import os
        import random

        os.makedirs(output_dir, exist_ok=True)

        # Shuffle a copy so the caller's list keeps its original order.
        shuffled = list(texts)
        random.shuffle(shuffled)

        n = len(shuffled)
        train_end = int(n * self.TRAIN_FRACTION)
        val_end = int(n * self.TRAIN_PLUS_VAL_FRACTION)

        splits = {
            'train': shuffled[:train_end],
            'validation': shuffled[train_end:val_end],
            'test': shuffled[val_end:]
        }

        next_generation = self.generation + 1
        for split_name, split_data in splits.items():
            filepath = os.path.join(output_dir, f"{split_name}.jsonl")
            with open(filepath, 'w', encoding='utf-8') as f:
                # One JSON object per line.
                f.writelines(
                    json.dumps({
                        "text": text,
                        "source": f"ai_generated_gen{next_generation}",
                        "model": "llama",
                        "generation": next_generation
                    }) + "\n"
                    for text in split_data
                )

        print(f"Saved Gen {next_generation} data to {output_dir}")


def main():
    """Build prompts from the human baseline and create the next generation.

    Mounts Google Drive, reads up to ``MAX_SOURCE_RECORDS`` records from the
    human baseline ``train.jsonl``, turns each sufficiently long text into a
    prompt made of its first words, generates continuations with the trained
    model, and saves them under the output directory for the next generation.
    """
    from itertools import islice

    from google.colab import drive
    drive.mount('/content/drive')

    project_root = "/content/drive/MyDrive/FinalProject"
    BASE_MODEL = "meta-llama/Llama-3.2-1B"

    # ============================================
    # CHANGE THESE FOR EACH GENERATION
    # ============================================
    CURRENT_GEN = 1  # Currently at Gen 1, will create Gen 2
    TRAINED_MODEL = f"{project_root}/trained_models_v2/ai_generated_data_gpt2_medium_llama"
    # Derived from CURRENT_GEN so the directory name cannot drift from the
    # generation number written into the records.
    OUTPUT_DIR = f"{project_root}/ai_generated_data_gen{CURRENT_GEN + 1}"

    MAX_SOURCE_RECORDS = 3000  # How many baseline records to consider.
    MIN_SOURCE_WORDS = 20      # Texts shorter than this yield no prompt.
    PROMPT_WORDS = 15          # Number of leading words used as the prompt.

    # Load prompts from human baseline. Blank lines are skipped, and parsing
    # stops once enough records have been read.
    baseline_path = f"{project_root}/human_baseline_data/train.jsonl"
    with open(baseline_path, 'r', encoding='utf-8') as f:
        records = (json.loads(line) for line in f if line.strip())
        human_data = list(islice(records, MAX_SOURCE_RECORDS))

    prompts = []
    for item in human_data:
        words = item['text'].split()
        if len(words) >= MIN_SOURCE_WORDS:
            prompts.append(' '.join(words[:PROMPT_WORDS]))

    print(f"Loaded {len(prompts)} prompts\n")

    # Generate next generation
    generator = RecursiveDataGenerator(BASE_MODEL, TRAINED_MODEL, CURRENT_GEN)
    generated = generator.generate_from_prompts(prompts)
    generator.save_generation_data(generated, OUTPUT_DIR)

    print(f"\n✓ Generation {CURRENT_GEN + 1} data created!")
    print(f"Next: Train on {OUTPUT_DIR}")


# Invoke main() only when this file is run directly, not when it is imported.
if __name__ == "__main__":
    main()