In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import pickle

In [2]:
def calculate_technical_indicators(df):
    """Build calendar, OHLCV and technical-indicator features from a price frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a datetime ``Date`` column (the ``.dt`` accessor is used) and
        ``Open``, ``High``, ``Low``, ``Close``, ``Volume`` and
        ``is_trading_day`` columns.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df``. Columns, in order: calendar fields, period-end
        flags, ``is_trading_day``, the OHLCV copies, then RSI, Bollinger-band
        width, directional index (named ``ADX_*``), volume rate of change,
        price z-score, rolling skewness, distance from moving averages and a
        20/100 volatility ratio. Rolling features are NaN until their window
        has filled; nothing is imputed here.
    """
    features = pd.DataFrame(index=df.index)
    dates = df['Date'].dt
    close = df['Close']
    volume = df['Volume']

    # Calendar features
    features['day_of_week'] = dates.dayofweek
    features['day_of_month'] = dates.day
    # isocalendar() returns a nullable UInt32 column. Mixing that extension
    # dtype with float columns can make DataFrame.values fall back to an
    # object array, so store it as a plain float (missing dates become NaN).
    features['week_of_year'] = dates.isocalendar().week.astype('float64')
    features['month'] = dates.month
    features['quarter'] = dates.quarter

    # Period-end indicators
    features['is_month_end'] = dates.is_month_end.astype(int)
    features['is_quarter_end'] = dates.is_quarter_end.astype(int)
    features['is_year_end'] = dates.is_year_end.astype(int)
    features['days_to_month_end'] = dates.days_in_month - dates.day

    # Trading-day flag supplied by the caller
    features['is_trading_day'] = df['is_trading_day']

    # Raw OHLCV copies (column order matters to positional consumers)
    for col in ['Open', 'High', 'Low', 'Close', 'Volume']:
        features[col] = df[col]

    # Window lengths used by the moving-average distance features below
    medium_term = [20, 50]
    long_term = [100, 200]

    # 1. RSI (simple-moving-average variant) over several look-backs
    delta = close.diff()
    gains = delta.where(delta > 0, 0)
    losses = -delta.where(delta < 0, 0)
    for period in [5, 14, 21, 50]:
        avg_gain = gains.rolling(window=period).mean()
        avg_loss = losses.rolling(window=period).mean()
        rs = avg_gain / avg_loss
        features[f'RSI_{period}'] = 100 - (100 / (1 + rs))

    # 2. Bollinger-band width relative to the moving average
    for period in [10, 20, 50]:
        ma = close.rolling(window=period).mean()
        std = close.rolling(window=period).std()
        upper_band = ma + 2 * std
        lower_band = ma - 2 * std
        features[f'BB_Width_{period}'] = (upper_band - lower_band) / ma

    # 3. Directional index over several look-backs
    prev_close = close.shift()
    true_range = pd.concat(
        [
            df['High'] - df['Low'],
            np.abs(df['High'] - prev_close),
            np.abs(df['Low'] - prev_close),
        ],
        axis=1,
    ).max(axis=1)

    # Both directional movements are non-negative magnitudes. The previous
    # code kept the downward movement negative, which turned the denominator
    # (+DI + -DI) into a difference and pushed the result to >= 100 or inf.
    plus_dm = df['High'].diff().clip(lower=0)
    minus_dm = (-df['Low'].diff()).clip(lower=0)

    for period in [14, 30, 50]:
        tr_period = true_range.rolling(period).sum()
        plus_di = 100 * (plus_dm.rolling(period).sum() / tr_period)
        minus_di = 100 * (minus_dm.rolling(period).sum() / tr_period)
        # This is the unsmoothed DX value (0-100); the column keeps its
        # historical ADX_* name so downstream feature names do not change.
        features[f'ADX_{period}'] = 100 * np.abs(plus_di - minus_di) / (plus_di + minus_di)

    # 4. Volume rate of change, in percent
    for period in [10, 20, 50]:
        past_volume = volume.shift(period)
        features[f'Volume_ROC_{period}'] = ((volume - past_volume) / past_volume) * 100

    # 5. Rolling z-score of the close
    for period in [10, 20, 50]:
        rolling_mean = close.rolling(window=period).mean()
        rolling_std = close.rolling(window=period).std()
        features[f'Price_Z_Score_{period}'] = (close - rolling_mean) / rolling_std

    # 6. Rolling skewness of the close
    for period in [10, 20, 50]:
        features[f'Skewness_{period}'] = close.rolling(period).skew()

    # 7. Percentage distance of the close from its moving averages
    for period in medium_term + long_term:
        ma = close.rolling(window=period).mean()
        features[f'Price_Distance_MA_{period}'] = (close - ma) / ma * 100

    # 8. Short-term vs long-term volatility
    features['Volatility_Ratio_Short_Long'] = (
        close.rolling(20).std() / close.rolling(100).std()
    )

    return features

In [3]:
def prepare_data_with_features(csv_path):
    """Load a price CSV and assemble raw, standardized and percentage-change features.

    NOTE: this notebook defines ``prepare_data_with_features`` again in the
    following cell; on a top-to-bottom run that later definition is the one
    in effect. One of the two copies should be removed.

    Parameters
    ----------
    csv_path : str or path-like
        CSV file with a ``Date`` column plus the OHLCV columns expected by
        ``calculate_technical_indicators``.

    Returns
    -------
    combined_features : pandas.DataFrame
        Calendar columns (unprefixed) interleaved with ``raw_*`` columns in
        indicator order, followed by ``norm_*`` (standardized) and ``pct_*``
        (percentage change) columns.
    scaler : StandardScaler
        Scaler fitted on every non-calendar raw feature.
    """
    df = pd.read_csv(csv_path)
    df['Date'] = pd.to_datetime(df['Date'], utc=True)

    # Monday-Friday flag (dayofweek 0-4), computed without a row-wise apply
    df['is_trading_day'] = (df['Date'].dt.dayofweek < 5).astype(int)

    features_df = calculate_technical_indicators(df)

    # Rolling indicators start with NaN warm-up rows. fillna(method=...) is
    # deprecated, so use the dedicated methods. Back-fill runs first, meaning
    # the leading rows take the first valid (later) observation.
    features_df = features_df.bfill().ffill()

    # Percentage-change features, collected in a dict and turned into a frame
    # once instead of growing a DataFrame column by column.
    pct_columns = {}

    for col in ['Open', 'High', 'Low', 'Close']:
        if col in features_df.columns:
            series = features_df[col]
            pct_columns[f'{col}_pct_change'] = series.pct_change()
            # Same values as the column above; kept so the feature set
            # (names, count, positions) stays unchanged.
            pct_columns[f'{col}_pct_change_1d'] = series.pct_change(periods=1)
            pct_columns[f'{col}_pct_change_5d'] = series.pct_change(periods=5)
            pct_columns[f'{col}_pct_change_20d'] = series.pct_change(periods=20)

    if 'Volume' in features_df.columns:
        volume = features_df['Volume']
        pct_columns['Volume_pct_change'] = volume.pct_change()
        pct_columns['Volume_pct_change_5d'] = volume.pct_change(periods=5)
        pct_columns['Volume_pct_change_20d'] = volume.pct_change(periods=20)

    technical_cols = [col for col in features_df.columns
                      if any(indicator in col for indicator in ['RSI', 'BB', 'ADX', 'ROC'])]
    for col in technical_cols:
        pct_columns[f'{col}_pct_change'] = features_df[col].pct_change()

    features_pct_change = pd.DataFrame(pct_columns)

    # A percentage change from a previous value of 0 is +/-inf, which fillna
    # does not touch. Treat those as missing before filling the gaps.
    features_pct_change = (
        features_pct_change
        .replace([np.inf, -np.inf], np.nan)
        .bfill()
        .ffill()
    )

    # Calendar features are passed through unscaled and unprefixed
    calendar_cols = ['day_of_week', 'day_of_month', 'week_of_year', 'month', 'quarter',
                     'is_month_end', 'is_quarter_end', 'is_year_end', 'days_to_month_end']

    features_to_normalize = features_df.drop(columns=calendar_cols, errors='ignore')

    scaler = StandardScaler()
    normalized_data = scaler.fit_transform(features_to_normalize)
    # Carry the original index so the later column-wise concat aligns rows
    # even when the frame does not have a default RangeIndex.
    features_normalized = pd.DataFrame(
        normalized_data,
        columns=features_to_normalize.columns,
        index=features_to_normalize.index,
    )

    # raw/calendar block first (original column order), then norm_, then pct_
    raw_part = features_df.rename(
        columns=lambda col: col if col in calendar_cols else f'raw_{col}'
    )
    norm_part = features_normalized.add_prefix('norm_')
    pct_part = features_pct_change.add_prefix('pct_')
    combined_features = pd.concat([raw_part, norm_part, pct_part], axis=1)

    return combined_features, scaler

In [4]:
def prepare_data_with_features(csv_path):
    """Load a price CSV and assemble raw, standardized and percentage-change features.

    NOTE: this cell re-defines the function from the preceding cell; on a
    top-to-bottom run this definition is the one callers get. One of the two
    copies should be removed.

    Parameters
    ----------
    csv_path : str or path-like
        CSV file with a ``Date`` column plus the OHLCV columns expected by
        ``calculate_technical_indicators``.

    Returns
    -------
    combined_features : pandas.DataFrame
        Calendar columns (unprefixed) interleaved with ``raw_*`` columns in
        indicator order, followed by ``norm_*`` (standardized) and ``pct_*``
        (percentage change) columns.
    scaler : StandardScaler
        Scaler fitted on every non-calendar raw feature.
    """
    df = pd.read_csv(csv_path)
    df['Date'] = pd.to_datetime(df['Date'], utc=True)

    # Monday-Friday flag (dayofweek 0-4), computed without a row-wise apply
    df['is_trading_day'] = (df['Date'].dt.dayofweek < 5).astype(int)

    features_df = calculate_technical_indicators(df)

    # Rolling indicators start with NaN warm-up rows. fillna(method=...) is
    # deprecated, so use the dedicated methods. Back-fill runs first, meaning
    # the leading rows take the first valid (later) observation.
    features_df = features_df.bfill().ffill()

    # Percentage-change features, collected in a dict and turned into a frame
    # once instead of growing a DataFrame column by column.
    pct_columns = {}

    for col in ['Open', 'High', 'Low', 'Close']:
        if col in features_df.columns:
            series = features_df[col]
            pct_columns[f'{col}_pct_change'] = series.pct_change()
            # Same values as the column above; kept so the feature set
            # (names, count, positions) stays unchanged.
            pct_columns[f'{col}_pct_change_1d'] = series.pct_change(periods=1)
            pct_columns[f'{col}_pct_change_5d'] = series.pct_change(periods=5)
            pct_columns[f'{col}_pct_change_20d'] = series.pct_change(periods=20)

    if 'Volume' in features_df.columns:
        volume = features_df['Volume']
        pct_columns['Volume_pct_change'] = volume.pct_change()
        pct_columns['Volume_pct_change_5d'] = volume.pct_change(periods=5)
        pct_columns['Volume_pct_change_20d'] = volume.pct_change(periods=20)

    technical_cols = [col for col in features_df.columns
                      if any(indicator in col for indicator in ['RSI', 'BB', 'ADX', 'ROC'])]
    for col in technical_cols:
        pct_columns[f'{col}_pct_change'] = features_df[col].pct_change()

    features_pct_change = pd.DataFrame(pct_columns)

    # A percentage change from a previous value of 0 is +/-inf, which fillna
    # does not touch. Treat those as missing before filling the gaps.
    features_pct_change = (
        features_pct_change
        .replace([np.inf, -np.inf], np.nan)
        .bfill()
        .ffill()
    )

    # Calendar features are passed through unscaled and unprefixed
    calendar_cols = ['day_of_week', 'day_of_month', 'week_of_year', 'month', 'quarter',
                     'is_month_end', 'is_quarter_end', 'is_year_end', 'days_to_month_end']

    features_to_normalize = features_df.drop(columns=calendar_cols, errors='ignore')

    scaler = StandardScaler()
    normalized_data = scaler.fit_transform(features_to_normalize)
    # Carry the original index so the later column-wise concat aligns rows
    # even when the frame does not have a default RangeIndex.
    features_normalized = pd.DataFrame(
        normalized_data,
        columns=features_to_normalize.columns,
        index=features_to_normalize.index,
    )

    # raw/calendar block first (original column order), then norm_, then pct_
    raw_part = features_df.rename(
        columns=lambda col: col if col in calendar_cols else f'raw_{col}'
    )
    norm_part = features_normalized.add_prefix('norm_')
    pct_part = features_pct_change.add_prefix('pct_')
    combined_features = pd.concat([raw_part, norm_part, pct_part], axis=1)

    return combined_features, scaler

In [5]:
def create_sliding_window_data(data, window_size, target_days, target_cols=slice(1, 3)):
    """Create sliding window data for individual prediction model.

    Parameters
    ----------
    data : array-like, 2-D (rows are time steps, columns are features)
    window_size : int
        Number of consecutive rows in each input window.
    target_days : int
        Number of rows immediately after the window used as the target.
    target_cols : slice, int or sequence of int, optional
        Column selector applied to the target rows. Defaults to
        ``slice(1, 3)``, the previously hard-coded columns 1 and 2.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, window_size, n_features)
    y : numpy.ndarray, shape (n_samples, target_days, n_target_cols)
        Both are empty 1-D arrays when ``data`` is too short for one sample.
    """
    data = np.asarray(data)
    n_samples = len(data) - window_size - target_days + 1

    X, y = [], []
    for start in range(n_samples):
        split = start + window_size  # first target row, one past the window
        X.append(data[start:split])
        y.append(data[split:split + target_days, target_cols])

    return np.array(X), np.array(y)

In [6]:
def create_sliding_window_data_a(i_from_window_size_loop, data, window_size, max_window_size, target_days,
                                 target_cols=slice(1, 3)):
    """Create sliding window data for all-in-1 model.

    Every window size shares the same sample positions: sample ``k`` ends at
    row ``max_window_size + k``, so the X arrays built for different window
    sizes line up row for row and can share one target array.

    Parameters
    ----------
    i_from_window_size_loop : int
        Position of ``window_size`` in the caller's loop. Targets are built
        only when this is 0; otherwise ``y`` comes back as an empty array.
    data : array-like, 2-D (rows are time steps, columns are features)
    window_size : int
        Number of rows in each input window; must not exceed
        ``max_window_size``.
    max_window_size : int
        Largest window size in use; fixes where the first sample starts.
    target_days : int
        Number of rows after the window used as the target.
    target_cols : slice, int or sequence of int, optional
        Column selector applied to the target rows. Defaults to
        ``slice(1, 3)``, the previously hard-coded columns 1 and 2.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, window_size, n_features)
    y : numpy.ndarray, shape (n_samples, target_days, n_target_cols), or an
        empty 1-D array when ``i_from_window_size_loop`` is not 0.

    Raises
    ------
    ValueError
        If ``window_size`` is larger than ``max_window_size``.
    """
    if window_size > max_window_size:
        # A larger window would make `end - window_size` negative for the
        # first samples and silently produce wrapped or empty windows.
        raise ValueError(
            f"window_size ({window_size}) must not exceed max_window_size ({max_window_size})"
        )

    data = np.asarray(data)
    build_targets = i_from_window_size_loop == 0

    X, y = [], []
    for end in range(max_window_size, len(data) - target_days + 1):
        if build_targets:
            y.append(data[end:end + target_days, target_cols])
        X.append(data[end - window_size:end])

    return np.array(X), np.array(y)

In [7]:
# Main execution
def process_data(csv_path, save_path_prefix):
    """Process data and save results with combined features.

    Builds the combined feature frame from ``csv_path``, prints a short
    summary, and pickles the following files (all prefixed with
    ``save_path_prefix``): the fitted scaler, the combined feature frame,
    per-window-size ``X``/``y`` arrays for the individual models
    (``*_i.pkl``), per-window-size ``X`` arrays plus one shared ``y`` for the
    all-in-1 model (``*_a.pkl``), and a dict of feature-name groups.

    Parameters
    ----------
    csv_path : str or path-like
        Input CSV handed to ``prepare_data_with_features``.
    save_path_prefix : str
        Prefix (may include a directory) for every output file.
    """
    def _dump(obj, suffix):
        # Pickle `obj` to '<save_path_prefix><suffix>'.
        with open(f'{save_path_prefix}{suffix}', 'wb') as file:
            pickle.dump(obj, file)

    # Prepare features
    combined_features, scaler = prepare_data_with_features(csv_path)

    window_sizes = [7, 14, 30, 60]
    target_days = 7

    # Save scaler and features for later use
    _dump(scaler, '_scaler.pkl')
    _dump(combined_features, '_combined_features.pkl')

    # Print feature information
    print("\nFeature set information:")
    print(f"Total number of features: {combined_features.shape[1]}")
    print("\nFeature groups:")
    prefixes = ('raw_', 'norm_', 'pct_')
    raw_features = [col for col in combined_features.columns if col.startswith('raw_')]
    norm_features = [col for col in combined_features.columns if col.startswith('norm_')]
    pct_features = [col for col in combined_features.columns if col.startswith('pct_')]
    calendar_features = [col for col in combined_features.columns
                         if not col.startswith(prefixes)]

    print(f"Raw features: {len(raw_features)}")
    print(f"Normalized features: {len(norm_features)}")
    print(f"Percentage change features: {len(pct_features)}")
    print(f"Calendar features: {len(calendar_features)}")

    # Materialize the feature matrix once, as plain float64. A nullable
    # integer column (e.g. the ISO week number) would otherwise make
    # `.values` an object array, and that is what would be windowed/pickled.
    feature_matrix = combined_features.astype('float64').to_numpy()

    # NOTE(review): both window helpers take columns 1:3 of this matrix as
    # the prediction target. In `combined_features` those positions hold
    # `day_of_month` and `week_of_year` (the calendar columns come first),
    # not price columns. Confirm the intended target (presumably
    # raw_High / raw_Low) before training on these `y` arrays.

    # Individual prediction model data
    print("\nCreating individual prediction model data...")
    for window_size in window_sizes:
        X, y = create_sliding_window_data(feature_matrix, window_size, target_days)
        print(f"\nWindow size: {window_size} - X: {X.shape}, y: {y.shape}")

        _dump(X, f'_X_{window_size}days_i.pkl')
        print("Done Saving X")

        _dump(y, f'_y_{window_size}days_i.pkl')
        print("Done Saving y")

    # All-in-1 model data: every window size shares the sample positions set
    # by the largest window, so the targets are built and saved only once
    # (on the first pass through the loop).
    print("\nCreating all-in-1 model data...")
    max_window_size = max(window_sizes)
    for i, window_size in enumerate(window_sizes):
        X, y = create_sliding_window_data_a(i, feature_matrix, window_size, max_window_size, target_days)
        print(f"\nWindow size: {window_size} - X: {X.shape}, y: {y.shape}")

        _dump(X, f'_X_{window_size}days_a.pkl')
        print("Done Saving X")

        if i == 0:
            _dump(y, '_y_a.pkl')
            print("Done Saving y")

    # Save feature names for reference
    feature_info = {
        'raw_features': raw_features,
        'normalized_features': norm_features,
        'percentage_features': pct_features,
        'calendar_features': calendar_features
    }
    _dump(feature_info, '_feature_info.pkl')


In [8]:
# Input price-history CSV; point this at your own file.
csv_path = "GLD-20041118-20250119.csv"
# Prefix prepended to the name of every pickle written by process_data.
save_path_prefix = "processed_data"

process_data(csv_path=csv_path, save_path_prefix=save_path_prefix)

  features_df = features_df.fillna(method='bfill').fillna(method='ffill')
  features_pct_change = features_pct_change.fillna(method='bfill').fillna(method='ffill')



Feature set information:
Total number of features: 101

Feature groups:
Raw features: 30
Normalized features: 30
Percentage change features: 32
Calendar features: 9

Creating individual prediction model data...

Window size: 7 - X: (5061, 7, 101), y: (5061, 7, 2)
Done Saving X
Done Saving y

Window size: 14 - X: (5054, 14, 101), y: (5054, 7, 2)
Done Saving X
Done Saving y

Window size: 30 - X: (5038, 30, 101), y: (5038, 7, 2)
Done Saving X
Done Saving y

Window size: 60 - X: (5008, 60, 101), y: (5008, 7, 2)
Done Saving X
Done Saving y

Creating all-in-1 model data...

Window size: 7 - X: (5008, 7, 101), y: (5008, 7, 2)
Done Saving X
Done Saving y

Window size: 14 - X: (5008, 14, 101), y: (0,)
Done Saving X

Window size: 30 - X: (5008, 30, 101), y: (0,)
Done Saving X

Window size: 60 - X: (5008, 60, 101), y: (0,)
Done Saving X
