In [1]:
import os
import pandas as pd
import random
from pathlib import Path
from typing import Union

def split_dataset(
    root_dir: Union[str, Path],
    val_ratio: float = 0.1,
    val_files_ratio: float = 1.0,
    output_dir: Union[str, Path] = "val_split",
    seed: int = 42,
    min_length: int = 512,
):
    """
    Write the trailing rows of a random subset of CSV files to a validation tree.

    Every ``.csv`` file under ``root_dir`` that can be read and has at least
    ``min_length`` rows is a candidate. A seeded random sample of the
    candidates is taken, and for each sampled file the last rows are written
    to ``output_dir`` under the same path the file has relative to
    ``root_dir``. A ``Date`` column, if present, is renamed to ``Datetime`` in
    the written file. The source files are only read, never modified.

    Args:
        root_dir (str or Path): Root directory containing time series data.
        val_ratio (float): Proportion of each selected file's rows to use for
            validation. The resulting row count is raised to ``min_length``
            when smaller, and capped at the number of rows in the file.
        val_files_ratio (float): Proportion of candidate files to sample,
            in ``[0, 1]``.
        output_dir (str or Path): Root directory to save validation files.
        seed (int): Random seed for reproducibility of the file sample.
        min_length (int): Minimum number of rows a file must have to be
            considered, and lower bound on the number of validation rows.

    Raises:
        ValueError: If ``val_files_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= val_files_ratio <= 1.0:
        raise ValueError(
            f"val_files_ratio must be within [0, 1], got {val_files_ratio}"
        )

    root_dir = Path(root_dir)
    output_dir = Path(output_dir)

    # Collect all csv file paths that are readable and long enough.
    all_csv_files = []
    for subdir, dirs, files in os.walk(root_dir):
        # os.walk yields entries in arbitrary filesystem order; sorting both the
        # directories (in place, which steers the walk) and the files makes the
        # candidate list, and therefore the seeded sample, deterministic.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".csv"):
                continue

            file_path = Path(subdir) / file
            try:
                df = pd.read_csv(file_path)
            except Exception as e:
                # Best effort: report the unreadable file and keep scanning.
                print(f"❌ Error reading {file_path}: {e}")
                continue

            if len(df) < min_length:
                continue
            all_csv_files.append(file_path)

    # Select a subset of files for validation. A private Random instance gives
    # the same sample as seeding the module-level generator, without resetting
    # the global random state for the rest of the notebook.
    rng = random.Random(seed)
    num_val_files = int(len(all_csv_files) * val_files_ratio)
    selected_files = rng.sample(all_csv_files, num_val_files)

    for file_path in selected_files:
        print(f"Processing: {file_path}")

        # Read CSV (re-read rather than cached, so only one frame is held at a time)
        df = pd.read_csv(file_path).rename(columns={"Date": "Datetime"})

        # Never ask for more rows than the file has: a negative start index
        # would make iloc count from the end and return the wrong number of rows.
        split_len = min(max(int(len(df) * val_ratio), min_length), len(df))
        if split_len <= 0:
            print(f"⚠️ Skipping {file_path}: split length is 0")
            continue

        # Keep the last `split_len` rows as the validation slice.
        val_df = df.iloc[len(df) - split_len:]

        # Mirror the file's location relative to root_dir under output_dir.
        val_file_path = output_dir / file_path.relative_to(root_dir)

        # Ensure directory exists
        val_file_path.parent.mkdir(parents=True, exist_ok=True)

        # Save validation file
        val_df.to_csv(val_file_path, index=False)


In [2]:
# Split configuration. Paths are written with forward slashes, which pathlib and
# os.walk accept on every platform; a backslash-separated literal only resolves
# on Windows, and elsewhere the walk would silently find no files.
data_dir = 'finance_v1_nv/v1_no_volume'
val_ratio_each_f: float = 0.3
val_file_ratio: float = 0.2
output_dir = 'test_v1_nv'
random_seed = 42
min_length = 512

split_dataset(
    root_dir=data_dir,
    val_ratio=val_ratio_each_f,
    val_files_ratio=val_file_ratio,
    output_dir=output_dir,
    seed=random_seed,
    min_length=min_length,
)

Processing: finance_v1_nv\v1_no_volume\stock_datasets\csi500\1h\601678.SS_1h.csv
Processing: finance_v1_nv\v1_no_volume\stock_datasets\csi500\1d\301207.SZ_1d.csv
Processing: finance_v1_nv\v1_no_volume\stock_datasets\hsi\1d\1038.HK_1d.csv
Processing: finance_v1_nv\v1_no_volume\stock_datasets\ftse100\1h\MKS.L_1h.csv
Processing: finance_v1_nv\v1_no_volume\stock_datasets\csi500\1wk\601777.SS_1wk.csv
Processing: finance_v1_nv\v1_no_volume\stock_datasets\csi500\1m\301165.SZ_1m.csv
Processing: finance_v1_nv\v1_no_volume\stock_datasets\csi500\1h\600761.SS_1h.csv
Processing: finance_v1_nv\v1_no_volume\stock_datasets\sp500\1wk\SO_1wk.csv
Processing: finance_v1_nv\v1_no_volume\stock_datasets\csi500\1h\600098.SS_1h.csv
Processing: finance_v1_nv\v1_no_volume\stock_datasets\sp500\1h\ETR_1h.csv
Processing: finance_v1_nv\v1_no_volume\stock_datasets\csi500\1d\600252.SS_1d.csv
Processing: finance_v1_nv\v1_no_volume\stock_datasets\csi500\1d\600163.SS_1d.csv
Processing: finance_v1_nv\v1_no_volume\stock_da

In [5]:
# Prints a fixed marker string; presumably used to confirm the previous cell finished.
print("done")

done


In [3]:
# Restructure the dataset directory layout
import os
import shutil
from pathlib import Path

def restructure_dataset(parent_dir: str, result_dir: str):
    """
    Copy each subdirectory of ``parent_dir`` into a prefixed folder in ``result_dir``.

    For every immediate subdirectory ``<sub>`` of ``parent_dir``, the contents
    are copied to ``result_dir/<parent name>_<sub>``. Entries directly inside
    ``parent_dir`` that are not directories are skipped. Existing destination
    directories are reused.

    Args:
        parent_dir (str): Directory whose subdirectories are copied.
        result_dir (str): Directory that receives the prefixed folders;
            created if missing.

    Raises:
        AssertionError: If ``parent_dir`` does not exist.
    """
    src_root = Path(parent_dir).resolve()
    dst_root = Path(result_dir).resolve()

    assert src_root.exists(), f"Directory {parent_dir} does not exist."
    dst_root.mkdir(parents=True, exist_ok=True)

    # Visit the subdirectories in sorted order so the log output is stable.
    for sub in sorted(src_root.iterdir()):
        if not sub.is_dir():
            continue

        dest_dir = dst_root / "_".join((src_root.name, sub.name))
        dest_dir.mkdir(parents=True, exist_ok=True)

        # Copy every entry of the subdirectory: trees recursively, files with metadata.
        for entry in sub.iterdir():
            destination = dest_dir / entry.name
            if not entry.is_dir():
                shutil.copy2(entry, destination)
                continue
            shutil.copytree(entry, destination, dirs_exist_ok=True)

        print(f"Copied contents from {sub} → {dest_dir}")






In [6]:
# Replace these with your actual paths. Forward slashes are used because pathlib
# accepts them on every platform; a backslash-separated literal only resolves on Windows.
target_dir = 'test_v1_nv/futures_datasets'
result_dir = 'test_v1_nv_flat'
restructure_dataset(target_dir, result_dir=result_dir)

Copied contents from F:\program repo\Finance_data\test_v1_nv\futures_datasets\1d → F:\program repo\Finance_data\test_v1_nv_flat\futures_datasets_1d
Copied contents from F:\program repo\Finance_data\test_v1_nv\futures_datasets\1h → F:\program repo\Finance_data\test_v1_nv_flat\futures_datasets_1h
Copied contents from F:\program repo\Finance_data\test_v1_nv\futures_datasets\1m → F:\program repo\Finance_data\test_v1_nv_flat\futures_datasets_1m
Copied contents from F:\program repo\Finance_data\test_v1_nv\futures_datasets\1wk → F:\program repo\Finance_data\test_v1_nv_flat\futures_datasets_1wk


In [7]:
# Flatten the per-market stock dataset directories

In [8]:
import shutil
from pathlib import Path

def regroup_by_frequency_copy(root_dir: str, result_dir: str):
    """
    Copy a ``<root>/<market>/<freq>/`` tree into a frequency-first layout.

    Each second-level directory ``root_dir/<market>/<freq>`` has its contents
    copied to ``result_dir/<root name>_<freq>/<market>``. Entries that are not
    directories at the market or frequency level are skipped. Existing
    destination directories are reused.

    Args:
        root_dir (str): Directory containing one subdirectory per market.
        result_dir (str): Directory that receives the regrouped folders;
            created if missing.

    Raises:
        AssertionError: If ``root_dir`` does not exist.
    """

    def _copy_children(src_dir: Path, dst_dir: Path) -> None:
        # Copy every entry of src_dir into dst_dir: trees recursively, files with metadata.
        for child in src_dir.iterdir():
            destination = dst_dir / child.name
            if child.is_dir():
                shutil.copytree(child, destination, dirs_exist_ok=True)
            else:
                shutil.copy2(child, destination)

    src_root = Path(root_dir).resolve()
    dst_root = Path(result_dir).resolve()

    assert src_root.exists(), f"Directory {root_dir} does not exist."
    dst_root.mkdir(parents=True, exist_ok=True)

    for market_entry in sorted(src_root.iterdir()):
        if market_entry.is_dir():
            for freq_entry in sorted(market_entry.iterdir()):
                if freq_entry.is_dir():
                    # Target: result_dir/<root name>_<freq>/<market>/
                    grouped_dir = dst_root / f"{src_root.name}_{freq_entry.name}"
                    market_target = grouped_dir / market_entry.name
                    market_target.mkdir(parents=True, exist_ok=True)

                    _copy_children(freq_entry, market_target)

                    print(f"Copied {freq_entry} → {market_target}")

In [9]:
# Source and destination for the stock regrouping. Forward slashes are used because
# pathlib accepts them on every platform; a backslash-separated literal only resolves on Windows.
target_dir = 'test_v1_nv/stock_datasets'
result_dir = 'test_v1_nv_flat'
regroup_by_frequency_copy(root_dir=target_dir, result_dir=result_dir)

Copied F:\program repo\Finance_data\test_v1_nv\stock_datasets\csi500\1d → F:\program repo\Finance_data\test_v1_nv_flat\stock_datasets_1d\csi500
Copied F:\program repo\Finance_data\test_v1_nv\stock_datasets\csi500\1h → F:\program repo\Finance_data\test_v1_nv_flat\stock_datasets_1h\csi500
Copied F:\program repo\Finance_data\test_v1_nv\stock_datasets\csi500\1m → F:\program repo\Finance_data\test_v1_nv_flat\stock_datasets_1m\csi500
Copied F:\program repo\Finance_data\test_v1_nv\stock_datasets\csi500\1wk → F:\program repo\Finance_data\test_v1_nv_flat\stock_datasets_1wk\csi500
Copied F:\program repo\Finance_data\test_v1_nv\stock_datasets\ftse100\1d → F:\program repo\Finance_data\test_v1_nv_flat\stock_datasets_1d\ftse100
Copied F:\program repo\Finance_data\test_v1_nv\stock_datasets\ftse100\1h → F:\program repo\Finance_data\test_v1_nv_flat\stock_datasets_1h\ftse100
Copied F:\program repo\Finance_data\test_v1_nv\stock_datasets\ftse100\1m → F:\program repo\Finance_data\test_v1_nv_flat\stock_data

In [10]:
# Prints a fixed marker string; presumably used to confirm the previous cell finished.
print("done")

done
