In [1]:
import os
# For Colab/Google Drive integration:
from google.colab import drive

# Mount point for Google Drive and the project folder inside it.
DRIVE_MOUNT_POINT = '/content/drive'
PROJECT_DIR = '/content/drive/MyDrive/FinRL/final'  # Change to your project folder in Drive

# Attach Drive, then make the project folder the working directory so the
# relative "./data/..." paths used by later cells resolve inside it.
drive.mount(DRIVE_MOUNT_POINT)
os.chdir(PROJECT_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
import pandas as pd
import numpy as np
from typing import Tuple, Dict, List
import warnings
warnings.filterwarnings('ignore')

In [3]:
def split_data_from_single_csv(original_csv_path: str, data_dir: str = "./data",
                              train_ratio: float = 0.7, val_ratio: float = 0.15,
                              test_ratio: float = 0.15, timeframe: str = "1sec") -> Tuple[str, str, str]:
    """
    Split a single CSV file into train, validation, and test sets chronologically.

    Rows are ordered by the 'system_time' column (when present) and then cut
    into three contiguous slices: the first ``train_ratio`` of the rows, the
    next ``val_ratio`` of the rows, and everything that remains. The slices
    are written to ``<data_dir>/<timeframe>/``.

    Args:
        original_csv_path: Path to the original CSV file
        data_dir: Directory to save split files
        train_ratio: Ratio for training data (default: 0.7)
        val_ratio: Ratio for validation data (default: 0.15)
        test_ratio: Ratio for test data (default: 0.15). The test split always
            receives the rows left after train and validation; this value is
            only used to warn when the three ratios do not sum to 1.
        timeframe: Timeframe identifier for file naming (default: "1sec")

    Returns:
        Tuple of (train_csv_path, val_csv_path, test_csv_path)

    Raises:
        ValueError: If any ratio lies outside [0, 1] or if
            train_ratio + val_ratio exceeds 1.

    Note:
        Output file names always end in ``_train_70``, ``_val_15`` and
        ``_test_15`` regardless of the ratios actually used.
    """
    # Reject ratios that cannot describe a partition before touching the disk.
    # The chained comparison is also False for NaN, so NaN is rejected too.
    for ratio_name, ratio_value in (("train_ratio", train_ratio),
                                    ("val_ratio", val_ratio),
                                    ("test_ratio", test_ratio)):
        if not 0.0 <= ratio_value <= 1.0:
            raise ValueError(f"{ratio_name} must be between 0 and 1, got {ratio_value}")

    ratio_tolerance = 1e-9  # absorbs float rounding such as 0.7 + 0.15 + 0.15
    if train_ratio + val_ratio > 1.0 + ratio_tolerance:
        raise ValueError(
            f"train_ratio + val_ratio must not exceed 1, got {train_ratio + val_ratio}"
        )

    ratio_sum = train_ratio + val_ratio + test_ratio
    if abs(ratio_sum - 1.0) > ratio_tolerance:
        print(f"Warning: split ratios sum to {ratio_sum:.4f} instead of 1.0; "
              "the test split will contain all rows after the validation split.")

    print(f"Splitting data from {original_csv_path} (timeframe: {timeframe})...")

    # Create timeframe-specific directory
    timeframe_dir = os.path.join(data_dir, timeframe)
    os.makedirs(timeframe_dir, exist_ok=True)
    print(f"Output directory: {timeframe_dir}")

    # Read the original CSV file
    df = pd.read_csv(original_csv_path)
    print(f"Original data shape: {df.shape}")

    # Ensure the data is sorted by time
    if 'system_time' in df.columns:
        # A stable sort keeps rows that share a timestamp in their original
        # file order, so the split boundaries are reproducible.
        df = df.sort_values('system_time', kind='mergesort')
        print("Sorted data by system_time")
    else:
        print("Warning: No 'system_time' column found. Data will not be sorted.")

    # Calculate split indices
    total_rows = len(df)
    train_end = int(total_rows * train_ratio)
    val_end = int(total_rows * (train_ratio + val_ratio))

    # Split the data into three contiguous, non-overlapping slices
    train_df = df.iloc[:train_end]
    val_df = df.iloc[train_end:val_end]
    test_df = df.iloc[val_end:]

    print(f"Split indices: Train=0:{train_end}, Val={train_end}:{val_end}, Test={val_end}:{total_rows}")
    print(f"Split sizes: Train={len(train_df):,}, Val={len(val_df):,}, Test={len(test_df):,}")

    # Generate output file paths with timeframe
    base_name = os.path.splitext(os.path.basename(original_csv_path))[0]
    # Remove timeframe suffix if it exists in the base name
    if base_name.endswith(('_1sec', '_1min', '_5min')):
        base_name = base_name.rsplit('_', 1)[0]

    train_csv_path = os.path.join(timeframe_dir, f"{base_name}_{timeframe}_train_70.csv")
    val_csv_path = os.path.join(timeframe_dir, f"{base_name}_{timeframe}_val_15.csv")
    test_csv_path = os.path.join(timeframe_dir, f"{base_name}_{timeframe}_test_15.csv")

    # Save split files
    train_df.to_csv(train_csv_path, index=False)
    val_df.to_csv(val_csv_path, index=False)
    test_df.to_csv(test_csv_path, index=False)

    print("✓ Split files saved:")
    print(f"  - Train: {train_csv_path}")
    print(f"  - Val: {val_csv_path}")
    print(f"  - Test: {test_csv_path}")

    return train_csv_path, val_csv_path, test_csv_path

In [4]:
def verify_split_files(train_csv_path: str, val_csv_path: str, test_csv_path: str, timeframe: str = "1sec") -> bool:
    """
    Verify that all split files exist and can be read as CSV.

    Each file is loaded with pandas and its row/column counts are printed. A
    warning is printed (validity is not affected) when the file's own name
    does not contain the timeframe identifier.

    Args:
        train_csv_path: Path to training CSV file
        val_csv_path: Path to validation CSV file
        test_csv_path: Path to test CSV file
        timeframe: Timeframe identifier expected in each file name

    Returns:
        True if all files exist and are readable, False otherwise
    """
    print(f"\nVerifying split files (timeframe: {timeframe})...")

    files_to_check = [
        ("Train", train_csv_path),
        ("Validation", val_csv_path),
        ("Test", test_csv_path)
    ]

    all_valid = True
    total_records = 0

    for split_name, file_path in files_to_check:
        if not os.path.exists(file_path):
            print(f"✗ {split_name}: {file_path} - File not found")
            all_valid = False
            continue

        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            # Any read failure (empty file, parse error, permissions, ...)
            # marks the split as invalid; keep checking the remaining files.
            print(f"✗ {split_name}: {file_path} - Error reading file: {e}")
            all_valid = False
            continue

        total_records += len(df)
        print(f"✓ {split_name}: {file_path} ({len(df):,} rows, {len(df.columns)} columns)")

        # Check the file name only: the parent directory is typically named
        # after the timeframe, so testing the full path would always pass.
        if timeframe not in os.path.basename(file_path):
            print(f"  ⚠️  Warning: Timeframe '{timeframe}' not found in filename")

    if all_valid:
        print(f"✓ All split files are valid! Total records: {total_records:,}")
    else:
        print("✗ Some split files have issues!")

    return all_valid

In [5]:
def get_split_file_paths(base_name: str, data_dir: str = "./data", timeframe: str = "1sec") -> Dict[str, str]:
    """
    Build the expected train/val/test CSV paths for a dataset and timeframe.

    A trailing '_1sec', '_1min' or '_5min' on ``base_name`` is dropped before
    the names are assembled. Nothing is read from or written to disk.

    Args:
        base_name: Base name of the original file (without extension)
        data_dir: Directory containing split files
        timeframe: Timeframe identifier ("1sec", "1min", "5min")

    Returns:
        Dictionary with keys 'train', 'val', 'test' and corresponding file paths
    """
    known_suffixes = ('_1sec', '_1min', '_5min')
    stem = base_name.rsplit('_', 1)[0] if base_name.endswith(known_suffixes) else base_name

    target_dir = os.path.join(data_dir, timeframe)

    # (dictionary key, file-name tag) for each split, in output order
    split_tags = (('train', 'train_70'), ('val', 'val_15'), ('test', 'test_15'))
    return {
        split_key: os.path.join(target_dir, f"{stem}_{timeframe}_{tag}.csv")
        for split_key, tag in split_tags
    }

In [6]:
def check_split_files_exist(base_name: str, data_dir: str = "./data", timeframe: str = "1sec") -> bool:
    """
    Report whether every expected split file for a timeframe is on disk.

    The expected paths come from get_split_file_paths(); one status line is
    printed per split, whether the file is present or missing.

    Args:
        base_name: Base name of the original file (without extension)
        data_dir: Directory containing split files
        timeframe: Timeframe identifier ("1sec", "1min", "5min")

    Returns:
        True if all split files exist, False otherwise
    """
    expected_paths = get_split_file_paths(base_name, data_dir, timeframe)

    missing_count = 0
    for split_label, candidate in expected_paths.items():
        if os.path.exists(candidate):
            print(f"✓ {split_label} file exists: {candidate}")
            continue
        print(f"Missing {split_label} file: {candidate}")
        missing_count += 1

    return missing_count == 0

In [7]:
def run_data_splitting_pipeline(input_file: str, timeframe: str = "1sec",
                               output_dir: str = "./data"):
    """
    Run the complete data splitting pipeline.

    Checks the input file and timeframe, splits the file 70/15/15 with
    split_data_from_single_csv(), then verifies the written files with
    verify_split_files(). Problems are reported by printing a message and
    returning None rather than by raising.

    Args:
        input_file: Path to input CSV file
        timeframe: Timeframe for splitting ("1sec", "1min", "5min")
        output_dir: Directory to save split files

    Returns:
        Tuple of (train_path, val_path, test_path) on success; None if the
        input file is missing, the timeframe is unsupported, the split raises,
        or the written files fail verification.
    """
    print("=" * 60)
    print(f"Data Splitting Pipeline - {timeframe}")
    print("=" * 60)

    # Check if input file exists
    if not os.path.exists(input_file):
        print(f"Error: Input file {input_file} not found!")
        return None

    # Validate timeframe
    supported_timeframes = ["1sec", "1min", "5min"]
    if timeframe not in supported_timeframes:
        print(f"Error: Unsupported timeframe '{timeframe}'. Supported: {', '.join(supported_timeframes)}")
        return None

    try:
        # Perform the split
        train_path, val_path, test_path = split_data_from_single_csv(
            original_csv_path=input_file,
            data_dir=output_dir,
            train_ratio=0.7,
            val_ratio=0.15,
            test_ratio=0.15,
            timeframe=timeframe
        )

        # Verify the split files; a failed verification must not be reported
        # as success, so bail out with None like the other error paths.
        if not verify_split_files(train_path, val_path, test_path, timeframe):
            print("Error: Split files failed verification.")
            return None

        print("\n" + "=" * 60)
        print("Data splitting completed successfully!")
        print("=" * 60)

        return train_path, val_path, test_path

    except Exception as e:
        # Keep the notebook running: report the failure and signal it via None.
        print(f"Error during data splitting: {e}")
        return None

In [8]:
# Configuration 1: 1-second data splitting
def config_1sec():
    """Run the splitting pipeline on the 1-second BTC dataset under ./data."""
    banner = "=" * 60
    print("\n" + banner)
    print("CONFIGURATION 1: 1-SECOND DATA SPLITTING")
    print(banner)

    source_csv = "./data/BTC_1sec_with_sentiment_risk_train.csv"

    # Nothing to do when the dataset is absent; report it and stop.
    if not os.path.exists(source_csv):
        print(f"File {source_csv} not found. Please ensure the file exists in the /data directory.")
        return

    # The pipeline returns None on any failure and the output paths otherwise.
    if run_data_splitting_pipeline(source_csv, timeframe="1sec"):
        print("✓ 1-second data splitting completed!")
    else:
        print("✗ 1-second data splitting failed!")

In [9]:
# Configuration 2: 1-minute data splitting
def config_1min():
    """Run the splitting pipeline on the 1-minute BTC dataset under ./data."""
    banner = "=" * 60
    print("\n" + banner)
    print("CONFIGURATION 2: 1-MINUTE DATA SPLITTING")
    print(banner)

    source_csv = "./data/BTC_1min_with_sentiment_risk_train.csv"

    # Nothing to do when the dataset is absent; report it and stop.
    if not os.path.exists(source_csv):
        print(f"File {source_csv} not found. Please ensure the file exists in the /data directory.")
        return

    # The pipeline returns None on any failure and the output paths otherwise.
    if run_data_splitting_pipeline(source_csv, timeframe="1min"):
        print("✓ 1-minute data splitting completed!")
    else:
        print("✗ 1-minute data splitting failed!")

In [10]:
# Configuration 3: 5-minute data splitting
def config_5min():
    """Run the splitting pipeline on the 5-minute BTC dataset under ./data."""
    banner = "=" * 60
    print("\n" + banner)
    print("CONFIGURATION 3: 5-MINUTE DATA SPLITTING")
    print(banner)

    source_csv = "./data/BTC_5min_with_sentiment_risk_train.csv"

    # Nothing to do when the dataset is absent; report it and stop.
    if not os.path.exists(source_csv):
        print(f"File {source_csv} not found. Please ensure the file exists in the /data directory.")
        return

    # The pipeline returns None on any failure and the output paths otherwise.
    if run_data_splitting_pipeline(source_csv, timeframe="5min"):
        print("✓ 5-minute data splitting completed!")
    else:
        print("✗ 5-minute data splitting failed!")

In [11]:
# Split the 1-second dataset (reads ./data/BTC_1sec_with_sentiment_risk_train.csv
# relative to the current working directory).
config_1sec()


CONFIGURATION 1: 1-SECOND DATA SPLITTING
Data Splitting Pipeline - 1sec
Splitting data from ./data/BTC_1sec_with_sentiment_risk_train.csv (timeframe: 1sec)...
Output directory: ./data/1sec
Original data shape: (494749, 158)
Sorted data by system_time
Split indices: Train=0:346324, Val=346324:420536, Test=420536:494749
Split sizes: Train=346,324, Val=74,212, Test=74,213
✓ Split files saved:
  - Train: ./data/1sec/BTC_1sec_with_sentiment_risk_train_1sec_train_70.csv
  - Val: ./data/1sec/BTC_1sec_with_sentiment_risk_train_1sec_val_15.csv
  - Test: ./data/1sec/BTC_1sec_with_sentiment_risk_train_1sec_test_15.csv

Verifying split files (timeframe: 1sec)...
✓ Train: ./data/1sec/BTC_1sec_with_sentiment_risk_train_1sec_train_70.csv (346,324 rows, 158 columns)
✓ Validation: ./data/1sec/BTC_1sec_with_sentiment_risk_train_1sec_val_15.csv (74,212 rows, 158 columns)
✓ Test: ./data/1sec/BTC_1sec_with_sentiment_risk_train_1sec_test_15.csv (74,213 rows, 158 columns)
✓ All split files are valid! Total 

In [12]:
# Split the 1-minute dataset (reads ./data/BTC_1min_with_sentiment_risk_train.csv
# relative to the current working directory).
config_1min()


CONFIGURATION 2: 1-MINUTE DATA SPLITTING
Data Splitting Pipeline - 1min
Splitting data from ./data/BTC_1min_with_sentiment_risk_train.csv (timeframe: 1min)...
Output directory: ./data/1min
Original data shape: (8249, 158)
Sorted data by system_time
Split indices: Train=0:5774, Val=5774:7011, Test=7011:8249
Split sizes: Train=5,774, Val=1,237, Test=1,238
✓ Split files saved:
  - Train: ./data/1min/BTC_1min_with_sentiment_risk_train_1min_train_70.csv
  - Val: ./data/1min/BTC_1min_with_sentiment_risk_train_1min_val_15.csv
  - Test: ./data/1min/BTC_1min_with_sentiment_risk_train_1min_test_15.csv

Verifying split files (timeframe: 1min)...
✓ Train: ./data/1min/BTC_1min_with_sentiment_risk_train_1min_train_70.csv (5,774 rows, 158 columns)
✓ Validation: ./data/1min/BTC_1min_with_sentiment_risk_train_1min_val_15.csv (1,237 rows, 158 columns)
✓ Test: ./data/1min/BTC_1min_with_sentiment_risk_train_1min_test_15.csv (1,238 rows, 158 columns)
✓ All split files are valid! Total records: 8,249

Data

In [13]:
# Split the 5-minute dataset (reads ./data/BTC_5min_with_sentiment_risk_train.csv
# relative to the current working directory).
config_5min()


CONFIGURATION 3: 5-MINUTE DATA SPLITTING
Data Splitting Pipeline - 5min
Splitting data from ./data/BTC_5min_with_sentiment_risk_train.csv (timeframe: 5min)...
Output directory: ./data/5min
Original data shape: (1651, 158)
Sorted data by system_time
Split indices: Train=0:1155, Val=1155:1403, Test=1403:1651
Split sizes: Train=1,155, Val=248, Test=248
✓ Split files saved:
  - Train: ./data/5min/BTC_5min_with_sentiment_risk_train_5min_train_70.csv
  - Val: ./data/5min/BTC_5min_with_sentiment_risk_train_5min_val_15.csv
  - Test: ./data/5min/BTC_5min_with_sentiment_risk_train_5min_test_15.csv

Verifying split files (timeframe: 5min)...
✓ Train: ./data/5min/BTC_5min_with_sentiment_risk_train_5min_train_70.csv (1,155 rows, 158 columns)
✓ Validation: ./data/5min/BTC_5min_with_sentiment_risk_train_5min_val_15.csv (248 rows, 158 columns)
✓ Test: ./data/5min/BTC_5min_with_sentiment_risk_train_5min_test_15.csv (248 rows, 158 columns)
✓ All split files are valid! Total records: 1,651

Data splitti