In [12]:
# create_test_split.py
import pandas as pd
import argparse
import os

def split_dataset(file_path="Twitter_Emotion_Dataset.csv", output_dir="natural_split/", val_ratio=0.05, seed=42):
    """Randomly split a CSV dataset into train and validation CSV files.

    The rows of ``file_path`` are shuffled with a fixed seed; the first
    ``int(n_rows * val_ratio)`` shuffled rows become the validation set and
    the remainder the training set. The split is a plain random split: it is
    NOT stratified by any column.

    Args:
        file_path: Path of the CSV file to read.
        output_dir: Directory that receives ``train.csv`` and ``val.csv``;
            created if it does not exist.
        val_ratio: Fraction of rows assigned to the validation set; must lie
            in the closed interval [0, 1].
        seed: ``random_state`` used for every shuffle, making the split
            reproducible.

    Returns:
        None. The splits are written to disk and a summary is printed.

    Raises:
        ValueError: If ``val_ratio`` is outside [0, 1].
    """
    # Reject bad ratios up front: a negative value would make the slice below
    # count from the end (putting almost every row in val), and a value above
    # 1 would silently leave the training set empty.
    if not 0 <= val_ratio <= 1:
        raise ValueError(f"val_ratio must be between 0 and 1, got {val_ratio!r}")

    df_orig = pd.read_csv(file_path)

    print(f"   Original: {len(df_orig)}")

    # Shuffle once, then cut: head -> validation, tail -> training.
    n_val = int(len(df_orig) * val_ratio)
    df_shuffled = df_orig.sample(frac=1, random_state=seed).reset_index(drop=True)
    val_part = df_shuffled.iloc[:n_val]
    train_part = df_shuffled.iloc[n_val:]

    # Each part is shuffled a second time with the same seed. This adds no
    # randomness, but it is kept so the row order of the written files stays
    # identical to what earlier runs of this function produced.
    df_train = train_part.sample(frac=1, random_state=seed).reset_index(drop=True)
    df_val = val_part.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Make output dir
    os.makedirs(output_dir, exist_ok=True)

    # Save all splits
    df_train.to_csv(os.path.join(output_dir, "train.csv"), index=False)
    df_val.to_csv(os.path.join(output_dir, "val.csv"), index=False)

    print(f"✅ Saved splits to {output_dir}")
    print(f"Train: {len(df_train)} | Val: {len(df_val)}")
# Run the split with the function's default arguments.
split_dataset()

   Original: 4401
✅ Saved splits to natural_split/
Train: 4181 | Val: 220


In [14]:
import pandas as pd

# Read the complete dataset and the subset whose rows should be excluded.
# NOTE(review): "train.csv" and "val.csv" are resolved against the current
# working directory, whereas split_dataset writes into its output_dir
# ("natural_split/" by default) -- confirm these are the intended files.
df_full = pd.read_csv("Twitter_Emotion_Dataset.csv")
df_sample = pd.read_csv("train.csv")

# Anti-join: left-merge the full data against the de-duplicated sample (no
# `on=` is given, so pandas joins on the columns the two frames share) and let
# the indicator column record where each row was found.
df_diff = pd.merge(
    df_full,
    df_sample.drop_duplicates(),
    how="left",
    indicator=True,
)

# Rows flagged "left_only" have no match in the sample; the helper column is
# dropped again before writing.
df_not_in_sample = df_diff.loc[df_diff["_merge"].eq("left_only")].drop(columns="_merge")

# Persist the remaining rows.
df_not_in_sample.to_csv("val.csv", index=False)