In [1]:
import os
from typing import List, Optional

import pandas as pd

In [2]:
def process_csv_files(
    csv_file_list: List[str],
    output_dir: str,
    test_ratio: float = 0.2,
    drop_columns: Optional[List[str]] = None,
    date_column: str = "date"  # Name of the original date column
):
    """
    Write the trailing ``test_ratio`` share of each CSV file as a test set.

    For every path in ``csv_file_list`` the function:
    - reads the file with ``pd.read_csv``;
    - keeps the last ``int(len(df) * test_ratio)`` rows;
    - drops ``drop_columns`` (names that are not present are ignored);
    - renames ``date_column`` to 'Datetime' if that column is present;
    - saves the result, without the index, as
      ``<output_dir>/test_<file stem>_1d.csv`` and prints the saved path.

    Args:
        csv_file_list (List[str]): Paths to CSV files.
        output_dir (str): Directory where processed files are saved
            (created if it does not exist).
        test_ratio (float): Fraction of each CSV, taken from the end, to be
            used as test set. Must lie in [0, 1].
        drop_columns (Optional[List[str]]): Columns to drop. ``None`` (the
            default) drops nothing.
        date_column (str): Column to rename to 'Datetime'.

    Raises:
        ValueError: If ``test_ratio`` is outside [0, 1].
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be within [0, 1], got {test_ratio!r}")

    # Avoid a shared mutable default; None simply means "drop nothing".
    columns_to_drop = list(drop_columns) if drop_columns is not None else []

    os.makedirs(output_dir, exist_ok=True)

    for file_path in csv_file_list:
        # Read CSV
        df = pd.read_csv(file_path)

        # Compute the split as an explicit start offset. Slicing with
        # ``-test_size`` is wrong for test_size == 0, because ``iloc[-0:]``
        # selects every row instead of none.
        test_size = int(len(df) * test_ratio)
        split_start = len(df) - test_size

        # drop/rename return new frames, so the source frame is never mutated.
        # rename silently skips a missing key, so no membership check is needed.
        test_df = (
            df.iloc[split_start:]
            .drop(columns=columns_to_drop, errors='ignore')
            .rename(columns={date_column: 'Datetime'})
        )

        # Strip only the final extension so dotted names (e.g. "BRK.A_pct.csv")
        # keep their full stem and cannot collide with each other.
        base_name = os.path.splitext(os.path.basename(file_path))[0]
        output_path = os.path.join(output_dir, f"test_{base_name}_1d.csv")

        # Save
        test_df.to_csv(output_path, index=False)
        print(f"Saved: {output_path}")



In [3]:
def find_files_with_suffix(directory: str, suffix: str) -> List[str]:
    """
    Collect every file under `directory` (searched recursively) whose name ends with `suffix`.

    Directories are visited top-down; within each directory both the
    sub-directories and the file names are processed in sorted order, so the
    returned list has a deterministic ordering.

    Args:
        directory (str): Root directory of the search.
        suffix (str): File-name ending to match (e.g., ".csv", ".txt").

    Returns:
        List[str]: Matching paths, each built by joining the containing
        directory (as reported by `os.walk`) with the file name.
    """
    hits: List[str] = []
    for current_dir, subdirs, filenames in os.walk(directory):
        # Sorting in place steers os.walk's descent order.
        subdirs.sort()
        hits.extend(
            os.path.join(current_dir, name)
            for name in sorted(filenames)
            if name.endswith(suffix)
        )
    return hits

In [None]:
# Build the input path with os.path.join rather than a hard-coded backslash:
# a literal "\" is only a separator on Windows, so elsewhere os.walk would
# find nothing and the run would silently produce no files.
input_dir = os.path.join("pcie_set", "EcmP_stock_L_2016_24_mix")
output_directory = r"pcie_set/EcmP_stock_L_2016_24_testffm_0.2tc"
columns_to_drop = ["open", "high", "low", "volume", "open_pct_change", "high_pct_change", "low_pct_change", "close_pct_change"]
date_col_name = "date"  # Name of the date column in the source CSVs
split_ratio = 1  # 100% test according to pcie, do the actual split in ffm code

csv_files = find_files_with_suffix(input_dir, suffix='.csv')

process_csv_files(
    csv_file_list=csv_files,
    output_dir=output_directory,
    test_ratio=split_ratio,
    drop_columns=columns_to_drop,
    date_column=date_col_name
)

Saved: pcie_set/EcmP_stock_L_2016_24_testffm_0.2tc\test_AAPL_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2016_24_testffm_0.2tc\test_ABBV_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2016_24_testffm_0.2tc\test_AEP_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2016_24_testffm_0.2tc\test_AMGN_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2016_24_testffm_0.2tc\test_AMZN_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2016_24_testffm_0.2tc\test_BAC_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2016_24_testffm_0.2tc\test_BA_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2016_24_testffm_0.2tc\test_BCH_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2016_24_testffm_0.2tc\test_BHP_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2016_24_testffm_0.2tc\test_BP_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2016_24_testffm_0.2tc\test_BRK-A_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2016_24_testffm_0.2tc\test_BSAC_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2016_24_testffm_0.2tc\test_BUD_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2016_24_testffm_0.2tc\test_CAT_pct_1d.csv
S

In [7]:
# Completion marker: prints "done" when this cell is executed.
print("done")

done


## train split

In [8]:
def process_csv_files_train(
    csv_file_list: List[str],
    output_dir: str,
    train_ratio: float = 0.7,
    drop_columns: Optional[List[str]] = None,
    date_column: str = "date"  # Name of the original date column
):
    """
    Write the leading ``train_ratio`` share of each CSV file as a training set.

    For every path in ``csv_file_list`` the function:
    - reads the file with ``pd.read_csv``;
    - keeps the first ``int(len(df) * train_ratio)`` rows;
    - drops ``drop_columns`` (names that are not present are ignored);
    - renames ``date_column`` to 'Datetime' if that column is present;
    - saves the result, without the index, as
      ``<output_dir>/test_<file stem>_1d.csv`` and prints the saved path.

    Note that the output file name keeps the ``test_`` prefix even though the
    content is the training portion; the prefix is left unchanged so existing
    output names stay the same.

    Args:
        csv_file_list (List[str]): Paths to CSV files.
        output_dir (str): Directory where processed files are saved
            (created if it does not exist).
        train_ratio (float): Fraction of each CSV, taken from the start, to be
            used as training set. Must lie in [0, 1].
        drop_columns (Optional[List[str]]): Columns to drop. ``None`` (the
            default) drops nothing.
        date_column (str): Column to rename to 'Datetime'.

    Raises:
        ValueError: If ``train_ratio`` is outside [0, 1].
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio!r}")

    # Avoid a shared mutable default; None simply means "drop nothing".
    columns_to_drop = list(drop_columns) if drop_columns is not None else []

    os.makedirs(output_dir, exist_ok=True)

    for file_path in csv_file_list:
        # Read CSV
        df = pd.read_csv(file_path)

        # Number of leading rows that form the training portion
        train_size = int(len(df) * train_ratio)

        # drop/rename return new frames, so the source frame is never mutated.
        # rename silently skips a missing key, so no membership check is needed.
        train_df = (
            df.iloc[:train_size]
            .drop(columns=columns_to_drop, errors='ignore')
            .rename(columns={date_column: 'Datetime'})
        )

        # Strip only the final extension so dotted names (e.g. "BRK.A_pct.csv")
        # keep their full stem and cannot collide with each other.
        base_name = os.path.splitext(os.path.basename(file_path))[0]
        output_path = os.path.join(output_dir, f"test_{base_name}_1d.csv")

        # Save
        train_df.to_csv(output_path, index=False)
        print(f"Saved: {output_path}")

In [12]:
# Build the input path with os.path.join rather than a hard-coded backslash:
# a literal "\" is only a separator on Windows, so elsewhere os.walk would
# find nothing and the run would silently produce no files.
input_dir = os.path.join("pcie_set", "EcmP_stock_L_2005_24")
output_directory = r"pcie_set/EcmP_stock_L_2005_24_ft_0.7"
columns_to_drop = ["volume", "open_pct_change", "high_pct_change", "low_pct_change", "close_pct_change"]
date_col_name = "date"  # Name of the date column in the source CSVs
split_ratio = 0.7  # Passed as train_ratio: the first 70% of each file's rows are kept

csv_files = find_files_with_suffix(input_dir, suffix='.csv')

process_csv_files_train(
    csv_file_list=csv_files,
    output_dir=output_directory,
    train_ratio=split_ratio,
    drop_columns=columns_to_drop,
    date_column=date_col_name
)

Saved: pcie_set/EcmP_stock_L_2005_24_ft_0.7\test_AAPL_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2005_24_ft_0.7\test_AMZN_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2005_24_ft_0.7\test_BA_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2005_24_ft_0.7\test_BP_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2005_24_ft_0.7\test_CAT_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2005_24_ft_0.7\test_GE_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2005_24_ft_0.7\test_GOOG_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2005_24_ft_0.7\test_HSBC_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2005_24_ft_0.7\test_INTC_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2005_24_ft_0.7\test_LMT_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2005_24_ft_0.7\test_META_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2005_24_ft_0.7\test_MSFT_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2005_24_ft_0.7\test_TSM_pct_1d.csv
Saved: pcie_set/EcmP_stock_L_2005_24_ft_0.7\test_UPS_pct_1d.csv


In [11]:
# Completion marker: prints "done" when this cell is executed.
print("done")

done
