In [1]:
import json
import random
import os


class MixedDatasetCreator:
    """Build, split and persist a mixed human/AI dataset stored as JSON Lines.

    The class holds no state; its methods are grouped here for convenience.
    Sampling and shuffling use the module-level ``random`` generator, so call
    ``random.seed(...)`` beforehand if reproducible output is needed.
    """

    def __init__(self):
        pass

    def load_dataset(self, filepath: str) -> list:
        """Read a JSON Lines file and return its records in file order.

        Args:
            filepath: Path to a file containing one JSON document per line.

        Returns:
            A list with one decoded object per non-blank line.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        # UTF-8 is requested explicitly (matching save_dataset) rather than
        # relying on the platform's default encoding.
        with open(filepath, 'r', encoding='utf-8') as f:
            # Whitespace-only lines (e.g. a stray trailing newline) are
            # skipped instead of being handed to json.loads, which rejects them.
            return [json.loads(line) for line in f if line.strip()]

    def create_mixed_dataset(self, human_data: list, ai_data: list, total_samples: int) -> list:
        """Sample from both sources and return the shuffled combination.

        Each source contributes ``total_samples // 2`` randomly chosen items,
        capped at that source's length. The result is therefore an even split
        only when both sources hold at least ``total_samples // 2`` items;
        a shorter source contributes all of its items and the mix is
        unbalanced. An odd ``total_samples`` yields one item fewer in total.

        Args:
            human_data: Records from the human-written source.
            ai_data: Records from the AI-generated source.
            total_samples: Desired size of the combined dataset.

        Returns:
            A new shuffled list; the input lists are not modified.

        Raises:
            ValueError: If ``total_samples`` is negative enough that the
                per-source count is negative (raised by ``random.sample``).
        """
        # Same target count for both sources (half of the requested total).
        per_source = total_samples // 2

        # Sample without replacement, never asking for more than is available.
        human_sample = random.sample(human_data, min(per_source, len(human_data)))
        ai_sample = random.sample(ai_data, min(per_source, len(ai_data)))

        # Combine and shuffle so the two sources are interleaved.
        mixed = human_sample + ai_sample
        random.shuffle(mixed)

        print(f"Mixed dataset created:")
        print(f"  Human samples: {len(human_sample)}")
        print(f"  AI samples: {len(ai_sample)}")
        print(f"  Total: {len(mixed)}")

        return mixed

    def split_dataset(self, texts: list, train: float = 0.8, val: float = 0.1) -> dict:
        """Shuffle the records and partition them into train/validation/test.

        Args:
            texts: Records to split. The list itself is left untouched.
            train: Fraction of records assigned to the training split.
            val: Fraction of records assigned to the validation split; the
                remainder after train and validation becomes the test split.

        Returns:
            A dict with the keys ``'train'``, ``'validation'`` and ``'test'``,
            each mapping to a list of records.
        """
        # Shuffle a copy: reordering the caller's list in place would be a
        # hidden side effect on data the caller may still be using.
        shuffled = list(texts)
        random.shuffle(shuffled)

        n = len(shuffled)
        # Boundaries are truncated to whole indices.
        train_end = int(n * train)
        val_end = int(n * (train + val))

        splits = {
            'train': shuffled[:train_end],
            'validation': shuffled[train_end:val_end],
            'test': shuffled[val_end:]
        }

        print(f"\nDataset splits:")
        print(f"  Train: {len(splits['train'])} samples")
        print(f"  Validation: {len(splits['validation'])} samples")
        print(f"  Test: {len(splits['test'])} samples")

        return splits

    def save_dataset(self, data: list, filepath: str):
        """Write the records to ``filepath`` as JSON Lines (UTF-8).

        Missing parent directories are created. An existing file is
        overwritten.

        Args:
            data: JSON-serialisable records, written one per line.
            filepath: Destination path; may be a bare filename.

        Raises:
            OSError: If the directory or file cannot be created.
            TypeError: If a record is not JSON-serialisable.
        """
        parent = os.path.dirname(filepath)
        # A bare filename has an empty dirname, and os.makedirs('') raises
        # FileNotFoundError, so only create directories when there is one.
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.writelines(json.dumps(item) + "\n" for item in data)


def main(project_root="/content/drive/MyDrive/FinalProject",
         ai_model_name="gpt2_medium",
         seed=None):
    """Create, split and save the mixed human/AI dataset on Google Drive.

    Mounts Google Drive in Colab, loads the human and AI training files,
    builds a mixed dataset from them, splits it into train/validation/test
    and writes each split to ``<project_root>/mixed_data/<ai_model_name>/``.

    Args:
        project_root: Project folder that contains ``human_baseline_data``
            and ``ai_generated_data``.
        ai_model_name: Name of the sub-folder under ``ai_generated_data``
            holding the AI samples; change this to match your AI model folder.
        seed: Optional seed for the ``random`` module. When given, sampling
            and splitting are reproducible; ``None`` leaves the generator as is.

    Returns:
        The dict of splits produced by ``MixedDatasetCreator.split_dataset``.
    """
    # Imported here because google.colab only exists inside a Colab runtime.
    from google.colab import drive
    drive.mount('/content/drive')

    if seed is not None:
        random.seed(seed)

    # Input files and output folder
    human_train = f"{project_root}/human_baseline_data/train.jsonl"
    ai_train = f"{project_root}/ai_generated_data/{ai_model_name}/train.jsonl"
    mixed_data_path = f"{project_root}/mixed_data/{ai_model_name}"

    creator = MixedDatasetCreator()

    print("Loading datasets...")
    human_data = creator.load_dataset(human_train)
    ai_data = creator.load_dataset(ai_train)

    print(f"Loaded {len(human_data)} human samples")
    print(f"Loaded {len(ai_data)} AI samples\n")

    # Create mixed dataset (use the smaller dataset size × 2 to ensure 50/50)
    total_samples = min(len(human_data), len(ai_data)) * 2
    mixed_data = creator.create_mixed_dataset(human_data, ai_data, total_samples)

    # Split into train/val/test
    splits = creator.split_dataset(mixed_data)

    # Save all splits, one <split_name>.jsonl file each
    for split_name, split_data in splits.items():
        filepath = os.path.join(mixed_data_path, f"{split_name}.jsonl")
        creator.save_dataset(split_data, filepath)

    return splits


# Run the pipeline only when executed directly (not on import). The result is
# bound to a module-level name -- presumably so later notebook cells can
# inspect the splits; confirm before removing.
if __name__ == "__main__":
    dataset_splits = main()

Mounted at /content/drive
Loading datasets...
Loaded 3998 human samples
Loaded 3180 AI samples

Mixed dataset created:
  Human samples: 3180
  AI samples: 3180
  Total: 6360

Dataset splits:
  Train: 5088 samples
  Validation: 636 samples
  Test: 636 samples
