In [1]:
import math

import pandas as pd
from sklearn.model_selection import train_test_split
def split_csv(input_csv, output_train, output_val, output_test, train_size=0.7, val_size=0.15, test_size=0.15, random_state=None):
    """Split a CSV file into train, validation and test CSV files.

    The input is read with ``pd.read_csv`` and split in two stages with
    ``train_test_split``: first into a training part and a remainder, then
    the remainder into validation and test parts. Each part is written
    without the index column and a short row-count summary is printed.

    Args:
        input_csv: Path of the CSV file to read.
        output_train: Destination path for the training rows.
        output_val: Destination path for the validation rows.
        output_test: Destination path for the test rows.
        train_size: Fraction of rows intended for the training set.
        val_size: Fraction of rows intended for the validation set.
        test_size: Fraction of rows intended for the test set.
        random_state: Passed unchanged to both ``train_test_split`` calls.

    Raises:
        ValueError: If the three fractions do not sum to 1.0 (within
            floating-point tolerance).
    """
    # Compare with a tolerance: valid ratios such as 0.7 + 0.2 + 0.1 evaluate
    # to 0.9999999999999999 in binary floating point, so `!= 1.0` would
    # reject them spuriously.
    total = train_size + val_size + test_size
    if not math.isclose(total, 1.0, rel_tol=1e-9, abs_tol=1e-9):
        raise ValueError("The sum of train_size, val_size, and test_size must equal 1.0")

    df = pd.read_csv(input_csv)

    # Stage 1: carve off the training rows; the remainder holds val + test.
    train_df, temp_df = train_test_split(df, test_size=(val_size + test_size), random_state=random_state)

    # Stage 2: the validation share is expressed relative to the remainder,
    # not to the full dataset.
    temp_val_size = val_size / (val_size + test_size)
    val_df, test_df = train_test_split(temp_df, test_size=(1 - temp_val_size), random_state=random_state)

    train_df.to_csv(output_train, index=False)
    val_df.to_csv(output_val, index=False)
    test_df.to_csv(output_test, index=False)

    print("Dataset split completed:")
    print(f"Training set: {len(train_df)} rows saved to {output_train}")
    print(f"Validation set: {len(val_df)} rows saved to {output_val}")
    print(f"Test set: {len(test_df)} rows saved to {output_test}")
# Run the split on the processed dataset and write the three partitions
# next to it in ./tabular_dataset/.
split_csv(
    # Split proportions (train / validation / test) and the shuffling seed.
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
    random_state=2508,
    # Source file followed by the three destination files.
    input_csv="./tabular_dataset/trieu_data_processed.csv",
    output_train="./tabular_dataset/train.csv",
    output_val="./tabular_dataset/val.csv",
    output_test="./tabular_dataset/test.csv",
)

Dataset split completed:
Training set: 205 rows saved to ./tabular_dataset/train.csv
Validation set: 44 rows saved to ./tabular_dataset/val.csv
Test set: 44 rows saved to ./tabular_dataset/test.csv
