In [1]:
import json
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from huggingface_hub import login
import os


class ModelEvaluator:
    """Load a causal LM (optionally with a PEFT adapter) and score its generations.

    The evaluator wraps a tokenizer/model pair and offers sampling-based text
    generation plus two families of metrics over generated texts: lexical
    diversity (whitespace-token based) and perplexity under the loaded model.
    """

    def __init__(self, base_model_name: str, trained_model_path: str = None):
        """Load the tokenizer and model.

        Args:
            base_model_name: Hub id or local path passed to ``from_pretrained``
                for both the tokenizer and the base model.
            trained_model_path: Optional path to a PEFT adapter to apply on
                top of the base model. When falsy, the base model is used.
        """
        # Login (best effort; public models still load without it)
        self._login_to_hub()

        print(f"Loading model: {base_model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)

        # Some tokenizers ship without a pad token; reuse EOS so padding works.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Load base model
        # NOTE(review): newer transformers releases warn that `torch_dtype` is
        # deprecated in favour of `dtype`; kept for compatibility with older
        # versions -- confirm the minimum supported version before renaming.
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True
        )

        # Load trained model if path provided
        if trained_model_path:
            print(f"Loading trained model from: {trained_model_path}")
            self.model = PeftModel.from_pretrained(
                base_model,
                trained_model_path,
                is_trainable=False
            )
        else:
            self.model = base_model

        print("Model loaded\n")

    @staticmethod
    def _login_to_hub() -> None:
        """Authenticate with the Hugging Face Hub if a token can be found.

        The token is read from the ``HF_TOKEN`` environment variable first and
        then from the Colab secret of the same name. This is deliberately best
        effort: a missing token or a failed login never raises, because public
        models can still be downloaded anonymously.
        """
        token = os.environ.get("HF_TOKEN")
        if not token:
            try:
                from google.colab import userdata
                token = userdata.get("HF_TOKEN")
            except Exception:
                # Not running in Colab, or the secret is missing/inaccessible.
                token = None

        if not token:
            return

        try:
            login(token=token)
        except Exception as exc:
            print(f"Warning: Hugging Face login failed: {exc}")

    def generate_text(self, prompt: str, max_length: int = 150) -> str:
        """Sample one continuation of ``prompt`` and return the decoded text.

        Args:
            prompt: Text to condition generation on.
            max_length: Forwarded to ``generate`` as ``max_length``.

        Returns:
            The first generated sequence decoded with special tokens removed.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_length=max_length,
                temperature=0.8,
                do_sample=True,
                top_p=0.9,
                pad_token_id=self.tokenizer.eos_token_id
            )

        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def generate_from_prompts(self, prompts: list) -> list:
        """Generate one text per prompt, printing progress every 10 prompts.

        Args:
            prompts: Prompt strings.

        Returns:
            Generated texts in the same order as ``prompts``.
        """
        print(f"Generating from {len(prompts)} prompts...")
        generated = []
        for i, prompt in enumerate(prompts):
            generated.append(self.generate_text(prompt))
            if (i + 1) % 10 == 0:
                print(f"  Generated {i + 1}/{len(prompts)}")
        return generated

    def calculate_lexical_diversity(self, texts: list) -> dict:
        """Compute corpus-level lexical diversity over lowercased whitespace tokens.

        Args:
            texts: Strings to analyse; an empty list yields all-zero metrics.

        Returns:
            A dict with ``type_token_ratio`` (unique tokens / total tokens),
            ``vocabulary_size`` (unique tokens), ``total_tokens`` and
            ``bigram_diversity`` (unique adjacent-word pairs / total pairs).
            Both ratios are rounded to 4 decimals. Bigrams never span two texts.
        """
        all_tokens = []
        bigrams = []
        for text in texts:
            words = text.lower().split()
            all_tokens.extend(words)
            # Tuples (not "a_b" strings) so words that themselves contain an
            # underscore cannot make two different bigrams look identical.
            bigrams.extend(zip(words, words[1:]))

        types = len(set(all_tokens))
        tokens_count = len(all_tokens)
        ttr = types / tokens_count if tokens_count > 0 else 0

        unique_bigrams = len(set(bigrams))
        total_bigrams = len(bigrams)
        bigram_diversity = unique_bigrams / total_bigrams if total_bigrams > 0 else 0

        return {
            'type_token_ratio': round(ttr, 4),
            'vocabulary_size': types,
            'total_tokens': tokens_count,
            'bigram_diversity': round(bigram_diversity, 4)
        }

    def calculate_perplexity(self, texts: list) -> float:
        """Return exp(mean per-text loss) of ``texts`` under the loaded model.

        Each text is truncated to 512 tokens and scored with its own input ids
        as labels; the per-text losses are averaged with equal weight per text
        (not per token) before exponentiation.

        Args:
            texts: Strings to score.

        Returns:
            The perplexity rounded to 2 decimals, or ``nan`` when no text could
            be scored (empty input, or every text too short to have a target).
        """
        total_loss = 0.0
        count = 0

        for text in texts:
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

            # A next-token loss needs at least one (input, target) pair.
            # NOTE(review): presumably the model returns a NaN loss for
            # single-token inputs, which would poison the average -- confirm.
            if inputs["input_ids"].shape[-1] < 2:
                continue

            with torch.no_grad():
                outputs = self.model(**inputs, labels=inputs["input_ids"])
            total_loss += outputs.loss.item()
            count += 1

        if count == 0:
            # Nothing to average; report "undefined" instead of dividing by zero.
            return float("nan")

        avg_loss = total_loss / count
        return round(float(np.exp(avg_loss)), 2)


def load_test_prompts(filepath: str, n: int = 50) -> list:
    """Load test prompts - default 50 for real evaluation

    Reads a JSON Lines file in which every record has a ``text`` field and
    turns each sufficiently long text into a prompt made of its first 15
    whitespace-separated words.

    Args:
        filepath: Path to the ``.jsonl`` file.
        n: Maximum number of prompts to return; ``n <= 0`` yields ``[]``.

    Returns:
        Up to ``n`` prompt strings in file order. Fewer are returned only when
        the file does not contain ``n`` texts of at least 20 words.

    Raises:
        json.JSONDecodeError: If a non-blank line read is not valid JSON.
        KeyError: If a record read has no ``text`` field.
    """
    prompts = []
    if n <= 0:
        return prompts

    # Stream the file and stop as soon as enough prompts are collected, so a
    # run of short records at the start cannot leave us with too few prompts.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue

            words = json.loads(line)['text'].split()
            if len(words) < 20:
                # Too short to leave a meaningful continuation after 15 words.
                continue

            prompts.append(' '.join(words[:15]))
            if len(prompts) >= n:
                break

    return prompts


def main():
    """Evaluate the base model and each fine-tuned variant, then save a summary.

    Mounts Google Drive, loads 50 test prompts, and for every configured model
    generates one text per prompt and computes lexical diversity and
    perplexity on the generations. A model that fails is reported and skipped.
    The collected metrics are printed as a table and written to
    ``evaluation_results_final.json`` under the project root.

    Returns:
        Dict mapping model display name to its metrics; models that failed
        are absent.
    """
    import gc
    import traceback

    from google.colab import drive
    drive.mount('/content/drive')

    project_root = "/content/drive/MyDrive/FinalProject"
    BASE_MODEL = "meta-llama/Llama-3.2-1B"

    # Load test prompts - 50 for real evaluation
    test_file = f"{project_root}/human_baseline_data/test.jsonl"
    prompts = load_test_prompts(test_file, n=50)
    print(f"Loaded {len(prompts)} test prompts\n")

    # Models to evaluate (None means: base model without an adapter)
    models_config = {
        'Base (Untrained)': None,
        'Human-trained': f"{project_root}/trained_models_v2/human_baseline_data_llama",
        'AI-trained': f"{project_root}/trained_models_v2/ai_generated_data_gpt2_medium_llama",
        'Mixed-trained': f"{project_root}/trained_models_v2/mixed_data_gpt2_medium_llama"
    }

    results = {}

    for model_name, model_path in models_config.items():
        print("="*60)
        print(f"EVALUATING: {model_name}")
        print("="*60)

        evaluator = None
        try:
            evaluator = ModelEvaluator(BASE_MODEL, model_path)
            generated = evaluator.generate_from_prompts(prompts)

            print("\nCalculating metrics...")

            lexical = evaluator.calculate_lexical_diversity(generated)
            perplexity = evaluator.calculate_perplexity(generated)

            results[model_name] = {
                'lexical_diversity': lexical,
                'perplexity': perplexity,
                'num_prompts': len(prompts)
            }

            print(f"\n{model_name} Results:")
            print(f"  Type-Token Ratio: {lexical['type_token_ratio']}")
            print(f"  Vocabulary Size: {lexical['vocabulary_size']}")
            print(f"  Bigram Diversity: {lexical['bigram_diversity']}")
            print(f"  Perplexity: {perplexity}")
            print()

        except Exception as e:
            # Top-level boundary: report the failure and move on to the next model.
            print(f"Error: {e}")
            traceback.print_exc()
            print("Skipping...\n")

        finally:
            # Release the model on failure as well as success; otherwise a
            # failed run keeps its weights on the GPU while the next model loads.
            evaluator = None
            gc.collect()
            torch.cuda.empty_cache()

    # Summary
    print("\n" + "="*60)
    print(f"FINAL EVALUATION RESULTS ({len(prompts)} prompts)")
    print("="*60)

    # Print as table
    print(f"\n{'Model':<20} {'TTR':<8} {'Vocab':<8} {'Bigram':<8} {'Perplexity':<12}")
    print("-" * 60)

    for model_name, metrics in results.items():
        ttr = metrics['lexical_diversity']['type_token_ratio']
        vocab = metrics['lexical_diversity']['vocabulary_size']
        bigram = metrics['lexical_diversity']['bigram_diversity']
        ppl = metrics['perplexity']
        print(f"{model_name:<20} {ttr:<8.4f} {vocab:<8} {bigram:<8.4f} {ppl:<12.2f}")

    # Save
    results_path = f"{project_root}/evaluation_results_final.json"
    with open(results_path, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"\n✓ Results saved to: {results_path}")
    print("\n" + "="*60)
    print("EVALUATION COMPLETE - READY FOR PHASE 3 ANALYSIS")
    print("="*60)

    return results


# Entry point: run the full evaluation only when this file is executed
# directly, keeping the returned metrics in `results` for later inspection.
if __name__ == "__main__":
    results = main()



Mounted at /content/drive
Loaded 50 test prompts

EVALUATING: Base (Untrained)
Loading model: meta-llama/Llama-3.2-1B


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Model loaded

Generating from 50 prompts...
  Generated 10/50
  Generated 20/50
  Generated 30/50
  Generated 40/50
  Generated 50/50

Calculating metrics...

Base (Untrained) Results:
  Type-Token Ratio: 0.3398
  Vocabulary Size: 1811
  Bigram Diversity: 0.7456
  Perplexity: 5.7

EVALUATING: Human-trained
Loading model: meta-llama/Llama-3.2-1B
Loading trained model from: /content/drive/MyDrive/FinalProject/trained_models_v2/human_baseline_data_llama
Model loaded

Generating from 50 prompts...
  Generated 10/50
  Generated 20/50
  Generated 30/50
  Generated 40/50
  Generated 50/50

Calculating metrics...

Human-trained Results:
  Type-Token Ratio: 0.3294
  Vocabulary Size: 1912
  Bigram Diversity: 0.7552
  Perplexity: 7.03

EVALUATING: AI-trained
Loading model: meta-llama/Llama-3.2-1B
Loading trained model from: /content/drive/MyDrive/FinalProject/trained_models_v2/ai_generated_data_gpt2_medium_llama
Model loaded

Generating from 50 prompts...
  Generated 10/50
  Generated 20/50
  Gen