In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import random
import numpy as np
import os
import gc


class AIDatasetGenerator:
    """Generate an AI-written text dataset by continuing human-written prompts.

    The generator loads a causal language model and its tokenizer, builds
    prompts from the opening words of human-written samples, samples a
    continuation for every prompt, and can split, summarize and save the
    resulting texts as JSONL.
    """

    def __init__(self, model_name: str, hf_token=None):
        """Load the tokenizer and model for ``model_name``.

        Args:
            model_name: Model identifier passed to ``from_pretrained``.
            hf_token: Optional Hugging Face access token. When omitted, the
                ``HF_TOKEN`` environment variable is used if it is set. No
                login is attempted when neither is available.
        """
        # Never embed a token in source; take it from the caller or the
        # environment. Login stays best-effort, but a failure is reported
        # instead of being silently swallowed.
        token = hf_token or os.environ.get("HF_TOKEN")
        if token:
            try:
                login(token=token)
            except Exception as exc:
                print(f"Warning: Hugging Face login failed ({exc}); "
                      "continuing without authentication.")

        print(f"Loading model: {model_name}...")
        self.model_name = model_name
        # Last path component, lower-cased, with '-' replaced by '_'.
        self.model_short_name = model_name.split('/')[-1].lower().replace('-', '_')
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Fall back to the EOS token when the tokenizer defines no pad token.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # NOTE: recent transformers releases warn that `torch_dtype` is
        # deprecated in favour of `dtype`; the old keyword is kept so that
        # older releases keep working.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True
        )
        print("Model loaded successfully\n")

    def create_prompts(self, human_file: str, num_prompts: int) -> list:
        """Build prompts from the opening words of human-written samples.

        Args:
            human_file: Path to a UTF-8 JSONL file whose records have a
                ``"text"`` field. Blank lines are ignored.
            num_prompts: Maximum number of prompts to return.

        Returns:
            Up to ``num_prompts`` prompts. Each prompt is the first 10-30
            whitespace-separated words of a distinct sample that has at
            least 20 words.

        Raises:
            KeyError: If a record has no ``"text"`` field.
        """
        with open(human_file, 'r', encoding='utf-8') as f:
            samples = [json.loads(line) for line in f if line.strip()]

        # Filter before sampling so that short samples do not reduce the
        # number of prompts below what was requested.
        eligible = []
        for sample in samples:
            words = sample['text'].split()
            if len(words) >= 20:
                eligible.append(words)

        count = max(0, min(num_prompts, len(eligible)))

        prompts = []
        for words in random.sample(eligible, count):
            prompt_len = random.randint(10, min(30, len(words)))
            prompts.append(' '.join(words[:prompt_len]))

        return prompts

    def generate(self, prompts: list, max_length: int = 200, batch_size: int = 50) -> list:
        """Sample one continuation per prompt and return the usable ones.

        Args:
            prompts: Prompt strings to continue.
            max_length: ``max_length`` passed to ``model.generate``.
            batch_size: Progress is printed and cached memory is released
                every ``batch_size`` prompts. Values <= 0 disable this.

        Returns:
            The continuations (prompt removed) longer than 50 characters.
            Prompts that raise during generation are reported and skipped.
        """
        print(f"Generating {len(prompts)} AI samples...")
        generated = []

        for i, prompt in enumerate(prompts):
            try:
                inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
                prompt_tokens = inputs["input_ids"].shape[1]

                with torch.no_grad():  # Disable gradient computation
                    outputs = self.model.generate(
                        **inputs,
                        max_length=max_length,
                        temperature=0.8,
                        do_sample=True,
                        top_p=0.9,
                        pad_token_id=self.tokenizer.eos_token_id
                    )

                # Drop the prompt at token level. Slicing the decoded string
                # by len(prompt) is unreliable because decoding need not
                # reproduce the prompt character-for-character.
                continuation = outputs[0][prompt_tokens:]
                text = self.tokenizer.decode(continuation, skip_special_tokens=True).strip()

                if len(text) > 50:
                    generated.append(text)

                # Delete tensors to free memory
                del inputs, outputs

            except Exception as e:
                # Best effort: report the failing sample and keep going.
                print(f"Error at sample {i}: {e}")
                torch.cuda.empty_cache()
                gc.collect()

            # Report progress and clear memory every batch_size iterations
            if batch_size > 0 and (i + 1) % batch_size == 0:
                print(f"  Generated {i + 1}/{len(prompts)}")
                torch.cuda.empty_cache()
                gc.collect()

        print(f"Generated {len(generated)} samples")
        return generated

    def split_dataset(self, texts: list, train=0.8, val=0.1):
        """Shuffle ``texts`` and split them into train/validation/test.

        Args:
            texts: Samples to split. The caller's list is left untouched.
            train: Fraction of samples assigned to the training split.
            val: Fraction of samples assigned to the validation split; the
                remainder becomes the test split.

        Returns:
            Dict with the keys ``'train'``, ``'validation'`` and ``'test'``.
        """
        # Shuffle a copy so the caller's list is not reordered as a side effect.
        shuffled = list(texts)
        random.shuffle(shuffled)
        n = len(shuffled)
        train_end = int(n * train)
        val_end = int(n * (train + val))

        splits = {
            'train': shuffled[:train_end],
            'validation': shuffled[train_end:val_end],
            'test': shuffled[val_end:]
        }

        print(f"\nDataset splits:")
        print(f"  Train: {len(splits['train'])} samples")
        print(f"  Validation: {len(splits['validation'])} samples")
        print(f"  Test: {len(splits['test'])} samples")

        return splits

    def get_statistics(self, texts: list) -> dict:
        """Summarize a list of texts.

        Returns:
            Dict with ``total_samples``, ``avg_length`` (mean length in
            characters, truncated to int), ``vocabulary_size`` (distinct
            whitespace-separated tokens) and ``lexical_diversity`` (distinct
            tokens / total tokens, rounded to 3 places). Empty input, or
            input without any tokens, yields zeros instead of raising.
        """
        all_words = ' '.join(texts).split()
        vocabulary = set(all_words)
        lengths = [len(text) for text in texts]

        return {
            'total_samples': len(texts),
            'avg_length': int(np.mean(lengths)) if lengths else 0,
            'vocabulary_size': len(vocabulary),
            'lexical_diversity': (
                round(len(vocabulary) / len(all_words), 3) if all_words else 0.0
            )
        }

    def save_dataset(self, texts: list, filepath: str):
        """Write ``texts`` to ``filepath`` as JSONL, one record per text.

        Each record holds the text, the constant source tag
        ``"ai_generated"`` and the model name. Missing parent directories
        are created first.
        """
        # os.makedirs("") raises, so only create a directory when the path
        # actually has a directory component.
        directory = os.path.dirname(filepath)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            for text in texts:
                f.write(json.dumps({
                    "text": text,
                    "source": "ai_generated",
                    "model": self.model_name
                }) + "\n")


def main():
    """Run the full pipeline on Google Drive data and return the splits.

    Mounts Drive, loads the model, builds prompts from the human baseline
    file, generates continuations, prints summary statistics, and writes one
    JSONL file per split into a folder named after the model.
    """
    from google.colab import drive

    drive.mount('/content/drive')

    project_root = "/content/drive/MyDrive/FinalProject"

    # Alternative checkpoint: "meta-llama/Llama-3.2-1B"
    model_id = "gpt2-medium"
    generator = AIDatasetGenerator(model_id)

    human_data = f"{project_root}/human_baseline_data/train.jsonl"
    # Output goes into a folder specific to the loaded model.
    ai_data_path = f"{project_root}/ai_generated_data/{generator.model_short_name}"

    ai_texts = generator.generate(
        generator.create_prompts(human_data, num_prompts=4000),
        max_length=200,
        batch_size=50,
    )

    print(f"\nDataset statistics:")
    for key, value in generator.get_statistics(ai_texts).items():
        print(f"  {key}: {value}")

    splits = generator.split_dataset(ai_texts)
    for split_name in splits:
        target = os.path.join(ai_data_path, f"{split_name}.jsonl")
        generator.save_dataset(splits[split_name], target)

    print(f"\nSaved to: {ai_data_path}")
    return splits


# Entry point: run the pipeline only when this code executes as __main__, and
# bind the returned splits dict to a module-level name.
if __name__ == "__main__":
    dataset_splits = main()

Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading model: gpt2-medium...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model loaded successfully

Generating 3998 AI samples...
  Generated 50/3998
  Generated 100/3998
  Generated 150/3998
  Generated 200/3998
  Generated 250/3998
  Generated 300/3998
  Generated 350/3998
  Generated 400/3998
  Generated 450/3998
  Generated 500/3998
  Generated 550/3998
  Generated 600/3998
  Generated 650/3998
  Generated 700/3998
  Generated 750/3998
  Generated 800/3998
  Generated 850/3998
  Generated 900/3998
  Generated 950/3998
  Generated 1000/3998
  Generated 1050/3998
  Generated 1100/3998
  Generated 1150/3998
  Generated 1200/3998
  Generated 1250/3998
  Generated 1300/3998
  Generated 1350/3998
  Generated 1400/3998
  Generated 1450/3998
  Generated 1500/3998
  Generated 1550/3998
  Generated 1600/3998
  Generated 1650/3998
  Generated 1700/3998
  Generated 1750/3998
  Generated 1800/3998
  Generated 1850/3998
  Generated 1900/3998
  Generated 1950/3998
  Generated 2000/3998
  Generated 2050/3998
  Generated 2100/3998
  Generated 2150/3998
  Generated 2200/