In [111]:
import os
import glob
import pandas as pd
import numpy as np

# Set a random seed for reproducibility
np.random.seed(42)

# --------------------------
# 1. Configuration
# --------------------------
# Base folder where the combined datasets are stored.
DATASET_FOLDER = os.path.join("..", "Data", "dataset")

# Create output directories for train and test splits
TRAIN_FOLDER = os.path.join(DATASET_FOLDER, "train")
TEST_FOLDER  = os.path.join(DATASET_FOLDER, "test")
os.makedirs(TRAIN_FOLDER, exist_ok=True)
os.makedirs(TEST_FOLDER, exist_ok=True)

# Define the test split fraction (e.g., 20% of speakers for testing)
TEST_FRACTION = 0.2

# --------------------------
# 2. Processing Each Dataset
# --------------------------
# Find all dataset CSV files in the dataset folder (exclude combined_dataset.csv if present)
dataset_files = glob.glob(os.path.join(DATASET_FOLDER, "*_dataset.csv"))
dataset_files = [f for f in dataset_files if "combined_dataset.csv" not in f]

for file_path in dataset_files:
    # Load the dataset
    df = pd.read_csv(file_path)
    
    # Get the unique speakers
    speakers = df['speaker'].unique()
    
    # Determine how many speakers to use for testing (at least one)
    n_test = max(1, int(len(speakers) * TEST_FRACTION))
    
    # Randomly choose speakers for the test set
    test_speakers = np.random.choice(speakers, size=n_test, replace=False)
    
    # Split the dataset: all rows with a speaker in test_speakers go to test set.
    train_df = df[~df['speaker'].isin(test_speakers)]
    test_df  = df[df['speaker'].isin(test_speakers)]
    
    # Derive a base name for the output files from the current dataset filename
    base_name = os.path.basename(file_path).replace("_dataset.csv", "")
    
    # Define output paths for train and test splits
    train_csv  = os.path.join(TRAIN_FOLDER, f"{base_name}_train.csv")
    train_json = os.path.join(TRAIN_FOLDER, f"{base_name}_train.json")
    test_csv   = os.path.join(TEST_FOLDER, f"{base_name}_test.csv")
    test_json  = os.path.join(TEST_FOLDER, f"{base_name}_test.json")
    
    # Save the splits as CSV
    train_df.to_csv(train_csv, index=False, encoding="utf-8")
    test_df.to_csv(test_csv, index=False, encoding="utf-8")
    
    # Save the splits as JSON (records-oriented)
    train_df.to_json(train_json, orient="records", force_ascii=False, indent=2)
    test_df.to_json(test_json, orient="records", force_ascii=False, indent=2)
    
    print(f"Processed {base_name}: {len(train_df)} train samples, {len(test_df)} test samples.")


Processed mon: 2147 train samples, 503 test samples.
Processed dat: 3424 train samples, 655 test samples.
Processed tim: 142 train samples, 27 test samples.
Processed id: 1952 train samples, 391 test samples.
Processed bcd: 644 train samples, 111 test samples.
Processed car: 571 train samples, 122 test samples.
Processed phn: 664 train samples, 125 test samples.
