In [1]:
import os
import pandas as pd
import random
from pathlib import Path
from typing import Union

def split_dataset(
    root_dir: Union[str, Path],
    val_ratio: float = 0.1,
    val_files_ratio: float = 1.0,
    output_dir: Union[str, Path] = "val_split",
    seed: int = 42,
) -> None:
    """
    Write the trailing portion of selected CSV files out as validation files.

    Every ``*.csv`` file below ``root_dir`` is a candidate. A seeded random
    subset of the candidates is chosen; for each chosen file the rows are
    sorted by the ``timestamp`` column (when present) and the last
    ``val_ratio`` share of rows is written to ``output_dir`` under the same
    relative path. Source files are never modified, and no training
    counterpart is written.

    Args:
        root_dir (str or Path): Root directory containing time series data.
        val_ratio (float): Proportion of each selected file's rows (taken from
            the end) to use for validation. Must be within [0, 1].
        val_files_ratio (float): Proportion of the discovered files to process.
            Must be within [0, 1].
        output_dir (str or Path): Root directory to save validation files.
            May live inside ``root_dir``; it is then excluded from the scan.
        seed (int): Random seed for the file selection.

    Raises:
        ValueError: If a ratio is outside [0, 1], or if ``output_dir`` and
            ``root_dir`` are the same directory (inputs would be overwritten).
        NotADirectoryError: If ``root_dir`` is not an existing directory.
    """
    if not 0.0 <= val_ratio <= 1.0:
        raise ValueError(f"val_ratio must be within [0, 1], got {val_ratio!r}")
    if not 0.0 <= val_files_ratio <= 1.0:
        raise ValueError(
            f"val_files_ratio must be within [0, 1], got {val_files_ratio!r}"
        )

    root_dir = Path(root_dir)
    output_dir = Path(output_dir)

    if not root_dir.is_dir():
        raise NotADirectoryError(f"root_dir is not a directory: {root_dir}")

    output_resolved = output_dir.resolve()
    if output_resolved == root_dir.resolve():
        raise ValueError(
            "output_dir must differ from root_dir, otherwise the source files "
            "would be overwritten"
        )

    # Collect all csv file paths
    all_csv_files = []
    for subdir, dirs, files in os.walk(root_dir):
        subdir_path = Path(subdir)
        # Prune the output tree so a re-run never re-splits its own results
        # when output_dir is nested inside root_dir.
        dirs[:] = [d for d in dirs if (subdir_path / d).resolve() != output_resolved]
        for file in files:
            if file.endswith(".csv"):
                all_csv_files.append(subdir_path / file)

    # os.walk yields entries in filesystem order, which differs between
    # machines; sort on the POSIX-style relative path so that a given seed
    # always selects the same files.
    all_csv_files.sort(key=lambda p: p.relative_to(root_dir).as_posix())

    # Select a subset of files for validation. A private generator is used so
    # the module-level `random` state seen by other code is left untouched.
    rng = random.Random(seed)
    num_val_files = int(len(all_csv_files) * val_files_ratio)
    selected_files = rng.sample(all_csv_files, num_val_files)

    for file_path in selected_files:
        print(f"Processing: {file_path}")

        # Read CSV
        df = pd.read_csv(file_path)

        # Sort by timestamp if necessary; a stable sort keeps rows with equal
        # timestamps in their original file order.
        if 'timestamp' in df.columns:
            df = df.sort_values(by='timestamp', kind='mergesort')

        # Split by proportion: everything from split_idx onwards is validation
        split_idx = int(len(df) * (1 - val_ratio))
        val_df = df.iloc[split_idx:]

        if val_df.empty:
            # Too few rows (or val_ratio == 0): do not emit a header-only CSV.
            print(f"Skipped (no validation rows): {file_path}")
            continue

        # Construct output path, mirroring the layout below root_dir
        relative_path = file_path.relative_to(root_dir)
        val_file_path = output_dir / relative_path

        # Ensure directory exists
        val_file_path.parent.mkdir(parents=True, exist_ok=True)

        # Save validation file
        val_df.to_csv(val_file_path, index=False)


In [4]:
# --- Run configuration for the validation split -----------------------------
# Input / output locations (relative to the notebook's working directory).
data_dir = r'v1_no_volume'        # root folder scanned recursively for CSV files
output_dir = r'val_v1_nv'         # root folder that receives the validation CSVs

# Sampling parameters.
val_file_ratio: float = 0.2       # share of the discovered files that get processed
val_ratio_each_f: float = 0.3     # share of rows taken from the end of each chosen file
random_seed = 42                  # seed controlling which files are chosen

# Keyword arguments only, one per line, so the mapping from the settings above
# to the function's parameters is easy to audit.
split_dataset(
    root_dir=data_dir,
    val_ratio=val_ratio_each_f,
    val_files_ratio=val_file_ratio,
    output_dir=output_dir,
    seed=random_seed,
)

Processing: v1_no_volume\stock_datasets\csi500\1h\601208.SS_1h.csv
Processing: v1_no_volume\stock_datasets\csi500\1d\301095.SZ_1d.csv
Processing: v1_no_volume\stock_datasets\ftse100\1h\BARC.L_1h.csv
Processing: v1_no_volume\stock_datasets\csi500\1wk\603730.SS_1wk.csv
Processing: v1_no_volume\stock_datasets\csi500\1wk\600751.SS_1wk.csv
Processing: v1_no_volume\stock_datasets\csi500\1h\688639.SS_1h.csv
Processing: v1_no_volume\stock_datasets\csi500\1h\600657.SS_1h.csv
Processing: v1_no_volume\stock_datasets\sp500\1m\VZ_1m.csv
Processing: v1_no_volume\stock_datasets\csi500\1h\301291.SZ_1h.csv
Processing: v1_no_volume\stock_datasets\sp500\1wk\SW_1wk.csv
Processing: v1_no_volume\stock_datasets\sp500\1d\JNPR_1d.csv
Processing: v1_no_volume\stock_datasets\csi500\1d\600158.SS_1d.csv
Processing: v1_no_volume\stock_datasets\csi500\1d\600058.SS_1d.csv
Processing: v1_no_volume\stock_datasets\csi500\1h\600312.SS_1h.csv
Processing: v1_no_volume\stock_datasets\csi500\1wk\600587.SS_1wk.csv
Processing:

In [6]:
# Completion marker: emits "done" so it is visible that execution reached this cell.
print("done")

done
