In [2]:
import datasets
import numpy as np
from collections import Counter
import re
import hashlib
import json
import os

class HumanDatasetCollector:
    """Collect, filter, deduplicate, split and save WikiText passages.

    The class holds no instance state; every method works only on the
    arguments it receives.
    """

    # Compiled once at class-definition time; used by ``_clean_text``.
    _BLANK_LINES_RE = re.compile(r'\n\n+')
    _HEADING_RE = re.compile(r'= .* =')
    _WIKI_LINK_RE = re.compile(r'\[\[.*?\]\]')
    _TEMPLATE_RE = re.compile(r'{{.*?}}')

    def collect_wikitext(self, max_samples: int = 5000) -> list:
        """Return up to ``max_samples`` cleaned passages from WikiText-103.

        Loads the ``wikitext-103-v1`` configuration of the ``wikitext``
        dataset, walks its ``train`` split in order, keeps the entries that
        pass ``_is_quality_text`` and returns them after ``_clean_text``.

        Args:
            max_samples: Stop collecting once this many passages are kept.

        Returns:
            A list of cleaned passage strings (at most ``max_samples``).
        """
        print("Loading WikiText-103 dataset...")
        dataset = datasets.load_dataset('wikitext', 'wikitext-103-v1')
        train_data = dataset['train']

        texts = []
        for example in train_data:
            if len(texts) >= max_samples:
                break

            text = example['text'].strip()
            if self._is_quality_text(text):
                texts.append(self._clean_text(text))

        print(f"Collected {len(texts)} samples")
        return texts

    def _is_quality_text(self, text: str) -> bool:
        """Return True if ``text`` passes the length and sentence heuristics.

        A passage is accepted when it is 100-5000 characters long, has at
        least 20 whitespace-separated words, splits on ``'.'`` into at least
        two non-blank pieces, and those pieces average 5-50 words each.
        """
        if len(text) < 100 or len(text) > 5000:
            return False

        if len(text.split()) < 20:
            return False

        # Naive sentence segmentation: every '.' is treated as a boundary.
        sentences = [s for s in text.split('.') if s.strip()]
        if len(sentences) < 2:
            return False

        avg_length = np.mean([len(s.split()) for s in sentences])
        if avg_length < 5 or avg_length > 50:
            return False

        return True

    def _clean_text(self, text: str) -> str:
        """Strip markup remnants from ``text`` and return the trimmed result.

        Collapses runs of blank lines to a single blank line, then removes
        ``= ... =`` spans, ``[[...]]`` spans and ``{{...}}`` spans.
        """
        text = self._BLANK_LINES_RE.sub('\n\n', text)
        # NOTE: the heading pattern is greedy, so on a single line it removes
        # everything between the first "= " and the last " =".
        text = self._HEADING_RE.sub('', text)
        text = self._WIKI_LINK_RE.sub('', text)
        text = self._TEMPLATE_RE.sub('', text)
        return text.strip()

    def remove_duplicates(self, texts: list) -> list:
        """Return ``texts`` without exact duplicates, keeping first occurrences.

        Duplicates are detected by comparing MD5 digests of the encoded
        text. The input list is not modified. Prints how many entries were
        dropped when at least one was.
        """
        seen = set()
        unique = []
        for text in texts:
            text_hash = hashlib.md5(text.encode()).hexdigest()
            if text_hash not in seen:
                seen.add(text_hash)
                unique.append(text)

        if len(texts) > len(unique):
            print(f"Removed {len(texts) - len(unique)} duplicates")
        return unique

    def split_dataset(self, texts: list, train=0.8, val=0.1):
        """Shuffle ``texts`` and split them into train/validation/test lists.

        Args:
            texts: Passages to split. The list is NOT modified; a copy is
                shuffled instead.
            train: Fraction of the samples assigned to the training split.
            val: Fraction of the samples assigned to the validation split;
                whatever remains becomes the test split.

        Returns:
            A dict with keys ``'train'``, ``'validation'`` and ``'test'``.
        """
        # Shuffle a copy so the caller's list keeps its original order.
        shuffled = list(texts)
        np.random.shuffle(shuffled)

        n = len(shuffled)
        train_end = int(n * train)
        val_end = int(n * (train + val))

        splits = {
            'train': shuffled[:train_end],
            'validation': shuffled[train_end:val_end],
            'test': shuffled[val_end:]
        }

        print("\nDataset splits:")
        print(f"  Train: {len(splits['train'])} samples")
        print(f"  Validation: {len(splits['validation'])} samples")
        print(f"  Test: {len(splits['test'])} samples")

        return splits

    def get_statistics(self, texts: list) -> dict:
        """Return summary statistics for ``texts``.

        Returns:
            A dict with ``total_samples``, ``avg_length`` (mean character
            count, truncated to int), ``vocabulary_size`` (distinct
            whitespace-separated tokens) and ``lexical_diversity``
            (distinct tokens / total tokens, rounded to 3 decimals).
            For empty input the averages and ratios are reported as 0
            instead of raising.
        """
        all_words = [word for text in texts for word in text.split()]
        vocabulary = set(all_words)
        lengths = [len(text) for text in texts]

        # Guard the empty cases: np.mean([]) is NaN (int() would raise) and
        # the diversity ratio would divide by zero.
        avg_length = int(np.mean(lengths)) if lengths else 0
        if all_words:
            lexical_diversity = round(len(vocabulary) / len(all_words), 3)
        else:
            lexical_diversity = 0.0

        return {
            'total_samples': len(texts),
            'avg_length': avg_length,
            'vocabulary_size': len(vocabulary),
            'lexical_diversity': lexical_diversity
        }

    def save_dataset(self, texts: list, filepath: str):
        """Write ``texts`` to ``filepath`` as JSON Lines.

        Each line is a JSON object with the passage under ``"text"`` and the
        fixed value ``"wikitext"`` under ``"source"``. The parent directory
        is created when the path has one; an existing file is overwritten.
        """
        # dirname is '' for a bare filename, and os.makedirs('') raises.
        directory = os.path.dirname(filepath)
        if directory:
            os.makedirs(directory, exist_ok=True)

        with open(filepath, 'w', encoding='utf-8') as f:
            for text in texts:
                f.write(json.dumps({"text": text, "source": "wikitext"}) + "\n")


def main():
    """Run the collection pipeline and return the dataset splits.

    Mounts Google Drive, collects and deduplicates WikiText passages,
    prints their statistics, splits them, and writes one ``<split>.jsonl``
    file per split under the Drive output folder.
    """
    # Imported inside the function rather than at module top; presumably
    # because google.colab is only available in a Colab runtime.
    from google.colab import drive
    drive.mount('/content/drive')

    output_dir = "/content/drive/MyDrive/FinalProject/human_baseline_data"

    collector = HumanDatasetCollector()
    corpus = collector.remove_duplicates(
        collector.collect_wikitext(max_samples=5000)
    )

    summary = collector.get_statistics(corpus)
    print("\nDataset statistics:")
    for label in summary:
        print(f"  {label}: {summary[label]}")

    splits = collector.split_dataset(corpus)

    # One JSON Lines file per split, named after the split.
    for split_name in splits:
        target = os.path.join(output_dir, f"{split_name}.jsonl")
        collector.save_dataset(splits[split_name], target)

    print(f"\nSaved to: {output_dir}")
    return splits


# Entry point: run the pipeline only when this module is executed directly,
# and keep the returned splits dict in a module-level name.
if __name__ == "__main__":
    dataset_splits = main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading WikiText-103 dataset...
Collected 5000 samples
Removed 2 duplicates

Dataset statistics:
  total_samples: 4998
  avg_length: 730
  vocabulary_size: 38393
  lexical_diversity: 0.056

Dataset splits:
  Train: 3998 samples
  Validation: 500 samples
  Test: 500 samples

Saved to: /content/drive/MyDrive/FinalProject/human_baseline_data
