In [146]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay,confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from model_module import model_preparation, best_model
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.pipeline import make_pipeline

In [147]:
df = pd.read_csv("./data/angelone_eq_15min_2018_2025.csv").set_index("datetime").drop(columns=["timestamp"])
# Convert df index to datetime (add this RIGHT AFTER loading/creating df)
df.index = pd.to_datetime(df.index)
df.head()

Unnamed: 0_level_0,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-10-05 09:45:00+05:30,275.0,275.0,275.0,275.0,306762
2020-10-05 10:00:00+05:30,275.0,275.0,275.0,275.0,0
2020-10-05 10:15:00+05:30,272.6,292.35,257.0,278.6,2886820
2020-10-05 10:30:00+05:30,278.6,292.4,275.0,286.7,949825
2020-10-05 10:45:00+05:30,286.7,294.8,285.55,294.0,831097


#### Creating Features

    Directional momentum

In [148]:
# Trend direction
df['green_candle'] = (df['close'] > df['open']).astype(int)
df['higher_high'] = (df['high'] > df['high'].shift(1)).astype(int)
df['lower_low'] = (df['low'] < df['low'].shift(1)).astype(int)
df['close_above_sma13'] = (df['close'] > df['close'].rolling(13).mean()).astype(int)
df['close_above_sma26'] = (df['close'] > df['close'].rolling(26).mean()).astype(int)

# # Momentum direction
# df['return_1_sign'] = np.sign(df['log_return'].shift(1))
# df['return_4_positive'] = (df['return_4'] > 0).astype(int)
# df['return_13_positive'] = (df['return_13'] > 0).astype(int)

In [149]:
# Time features (datetime is index)
df['hour'] = pd.to_datetime(df.index).hour
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
df['day_of_week'] = pd.to_datetime(df.index).dayofweek

In [150]:
df['hl_ratio'] = (df['high'] - df['low']) / df['close']
df['upper_shadow'] = (df['high'] - df[['open', 'close']].max(axis=1)) / df['close']
df['lower_shadow'] = (df[['open', 'close']].min(axis=1) - df['low']) / df['close']

    momentum Features

In [151]:
df['return_4'] = np.log(df['close'] / df['close'].shift(4))   # 1 hour
df['return_13'] = np.log(df['close'] / df['close'].shift(13)) # ~3 hours
df['return_26'] = np.log(df['close'] / df['close'].shift(26)) # 1 day

    Volume

In [152]:
df = df.copy()
lookback = 20  # 20 bars = 5 hours of 15min data

df["volume_ma"] = df["volume"].rolling(window=lookback).mean()
df["volume_ratio"] = df["volume"] / df["volume_ma"]

# Clean up
df = df.dropna(how="any")

    RSI

In [153]:
# RSI calculation
def calculate_rsi(data, period=14):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=period).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

# Add RSI with standard 14-period
df["rsi_14"] = calculate_rsi(df["close"], period=14)

# Clean up
df = df.dropna(how="any")

# Check the feature
print("RSI Statistics:")
print(df["rsi_14"].describe())
print(f"\nOverbought (>70): {(df['rsi_14'] > 70).sum()} bars")
print(f"Oversold (<30): {(df['rsi_14'] < 30).sum()} bars")


RSI Statistics:
count    32008.000000
mean        49.333163
std         19.061507
min          0.000000
25%         35.177163
50%         49.368087
75%         63.157895
max        100.000000
Name: rsi_14, dtype: float64

Overbought (>70): 4874 bars
Oversold (<30): 5469 bars


    MACD

In [154]:
# MACD calculation
def calculate_macd(data, fast=12, slow=26, signal=9):
    ema_fast = data.ewm(span=fast, adjust=False).mean()
    ema_slow = data.ewm(span=slow, adjust=False).mean()
    macd_line = ema_fast - ema_slow
    signal_line = macd_line.ewm(span=signal, adjust=False).mean()
    macd_histogram = macd_line - signal_line
    return macd_line, signal_line, macd_histogram

df["macd"], df["macd_signal"], df["macd_hist"] = calculate_macd(df["close"])

# Normalize MACD features by price (make them relative)
df["macd_norm"] = df["macd"] / df["close"]
df["macd_hist_norm"] = df["macd_hist"] / df["close"]

# Clean up
df = df.dropna(how="any")

print("MACD Statistics:")
print(df[["macd_norm", "macd_hist_norm"]].describe())

MACD Statistics:
          macd_norm  macd_hist_norm
count  32008.000000    32008.000000
mean       0.000345        0.000005
std        0.007443        0.002041
min       -0.049496       -0.016980
25%       -0.003475       -0.000954
50%        0.000006       -0.000060
75%        0.003886        0.000902
max        0.045701        0.016795


    Moving Averages

In [155]:
# Moving averages (just for calculation, not direct features)
df["5ma"] = df["close"].rolling(5).mean()
df["10ma"] = df["close"].rolling(10).mean()
df["50ma"] = df["close"].rolling(50).mean()
df["200ma"] = df["close"].rolling(200).mean()  # Fixed: was 100

# RELATIVE features (normalized by price)
df["5_10_ma_diff_pct"] = (df["5ma"] - df["10ma"]) / df["close"]
df["close_5ma_diff_pct"] = (df["close"] - df["5ma"]) / df["close"]
df["close_10ma_diff_pct"] = (df["close"] - df["10ma"]) / df["close"]
df["golden_cross_pct"] = (df["50ma"] - df["200ma"]) / df["close"]

# Additional useful MA features
df["close_50ma_diff_pct"] = (df["close"] - df["50ma"]) / df["close"]
df["close_200ma_diff_pct"] = (df["close"] - df["200ma"]) / df["close"]

# Slope/momentum of MAs (rate of change)
df["5ma_roc"] = df["5ma"].pct_change(periods=5)
df["50ma_roc"] = df["50ma"].pct_change(periods=10)


In [156]:
# High-Low range features
df["hl_range"] = (df["high"] - df["low"]) / df["close"]  # Normalized
df["hl_range_ma"] = df["hl_range"].rolling(20).mean()
df["hl_range_ratio"] = df["hl_range"] / df["hl_range_ma"]  # Current vs average

# Close position within bar
df["close_position"] = (df["close"] - df["low"]) / (df["high"] - df["low"])  # 0 to 1

df = df.dropna()

    Candlestick/Microstructure Features

In [157]:
# === CANDLESTICK BODY & WICKS ===
df["body_size"] = abs(df["close"] - df["open"]) / df["close"]
df["upper_wick"] = (df["high"] - df[["open", "close"]].max(axis=1)) / df["close"]
df["lower_wick"] = (df[["open", "close"]].min(axis=1) - df["low"]) / df["close"]

# Body direction
df["body_direction"] = np.sign(df["close"] - df["open"])

# === OPEN-CLOSE RELATIONSHIPS ===
df["open_close_diff"] = (df["close"] - df["open"]) / df["close"]
df["high_close_diff"] = (df["high"] - df["close"]) / df["close"]
df["low_close_diff"] = (df["close"] - df["low"]) / df["close"]

# === VOLUME-WEIGHTED PRICE (VWAP) ===
df["vwap_5"] = (df["close"] * df["volume"]).rolling(5).sum() / df["volume"].rolling(5).sum()
df["vwap_10"] = (df["close"] * df["volume"]).rolling(10).sum() / df["volume"].rolling(10).sum()
df["vwap_5_diff"] = (df["close"] - df["vwap_5"]) / df["close"]
df["vwap_10_diff"] = (df["close"] - df["vwap_10"]) / df["close"]

# === MOMENTUM OF MICROSTRUCTURE ===
df["close_position_change"] = df["close_position"].diff()
df["body_size_ratio"] = df["body_size"] / df["body_size"].rolling(10).mean()

# === BUYING/SELLING PRESSURE PROXIES ===
# Approximation: If close near high = buying pressure
df["buying_pressure"] = (df["close"] - df["low"]) / (df["high"] - df["low"] + 1e-10)
df["selling_pressure"] = (df["high"] - df["close"]) / (df["high"] - df["low"] + 1e-10)

# Clean up
df = df.dropna()

print(f"Added microstructure features. New shape: {df.shape}")

Added microstructure features. New shape: (31661, 59)


#### Creating Target

In [158]:
df["close_log_return"] = np.log(df["close"] / df["close"].shift(1))
df = df.dropna(how="any")

    Shift to avoid lookahead bias

In [159]:
df = df.copy()
df["shifted_log_return"] = df["close_log_return"].shift(-1)

In [160]:
df["close_log_return_lag1"] = df["shifted_log_return"].shift(1)
df["close_log_return_lag2"] = df["shifted_log_return"].shift(2)
df["close_log_return_lag3"] = df["shifted_log_return"].shift(3)
df = df.dropna(how="any")

#### Visualization discarded for later

In [161]:
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# from scipy import stats

# # Assuming your data is loaded
# # df has columns: ['open', 'high', 'low', 'close', 'volume']

# # Calculate log returns (1-bar forward)
# df['log_return'] = np.log(df['close'] / df['close'].shift(1))

# # Remove NaN
# df = df.dropna()

# # ============================================
# # 1. BASIC STATISTICS
# # ============================================
# print("="*60)
# print("LOG RETURNS DISTRIBUTION ANALYSIS")
# print("="*60)
# print(f"Total samples: {len(df)}")
# print(f"Mean: {df['log_return'].mean():.6f}")
# print(f"Std Dev: {df['log_return'].std():.6f}")
# print(f"Skewness: {df['log_return'].skew():.4f}")
# print(f"Kurtosis: {df['log_return'].kurtosis():.4f}")
# print(f"Min: {df['log_return'].min():.4f}")
# print(f"Max: {df['log_return'].max():.4f}")
# print()

# # ============================================
# # 2. PERCENTILE ANALYSIS (CRITICAL)
# # ============================================
# percentiles = [1, 5, 10, 20, 25, 40, 50, 60, 75, 80, 90, 95, 99]
# print("PERCENTILE BREAKDOWN:")
# print("-" * 60)
# for p in percentiles:
#     val = np.percentile(df['log_return'], p)
#     print(f"{p:3d}th percentile: {val:8.4f} ({val*100:6.2f}%)")
# print()

# # ============================================
# # 3. PROPOSED CLASS BOUNDARIES
# # ============================================
# # Based on symmetric percentiles
# p10 = np.percentile(df['log_return'], 10)  # Class 4: Big Down
# p30 = np.percentile(df['log_return'], 30)  # Class 2: Small Down
# p70 = np.percentile(df['log_return'], 70)  # Class 1: Small Up
# p90 = np.percentile(df['log_return'], 90)  # Class 3: Big Up

# print("PROPOSED CLASS BOUNDARIES (Percentile-based):")
# print("-" * 60)
# print(f"Class 4 (Big Down):    return < {p10:.4f} ({p10*100:.2f}%)")
# print(f"Class 2 (Small Down):  {p10:.4f} to {p30:.4f}")
# print(f"Class 0 (Neutral):     {p30:.4f} to {p70:.4f}")
# print(f"Class 1 (Small Up):    {p70:.4f} to {p90:.4f}")
# print(f"Class 3 (Big Up):      return > {p90:.4f} ({p90*100:.2f}%)")
# print()

# # Alternative: Fixed percentage boundaries
# print("ALTERNATIVE: FIXED PERCENTAGE BOUNDARIES:")
# print("-" * 60)
# print(f"Class 4 (Big Down):    return < -0.50%")
# print(f"Class 2 (Small Down):  -0.50% to -0.15%")
# print(f"Class 0 (Neutral):     -0.15% to +0.15%")
# print(f"Class 1 (Small Up):    +0.15% to +0.50%")
# print(f"Class 3 (Big Up):      return > +0.50%")
# print()

# # Count samples in fixed boundaries
# class_4_count = len(df[df['log_return'] < -0.005])
# class_2_count = len(df[(df['log_return'] >= -0.005) & (df['log_return'] < -0.0015)])
# class_0_count = len(df[(df['log_return'] >= -0.0015) & (df['log_return'] < 0.0015)])
# class_1_count = len(df[(df['log_return'] >= 0.0015) & (df['log_return'] < 0.005)])
# class_3_count = len(df[df['log_return'] >= 0.005])

# print("Sample counts with fixed boundaries:")
# print(f"Class 0 (Neutral): {class_0_count} ({class_0_count/len(df)*100:.1f}%)")
# print(f"Class 1 (Small Up): {class_1_count} ({class_1_count/len(df)*100:.1f}%)")
# print(f"Class 2 (Small Down): {class_2_count} ({class_2_count/len(df)*100:.1f}%)")
# print(f"Class 3 (Big Up): {class_3_count} ({class_3_count/len(df)*100:.1f}%)")
# print(f"Class 4 (Big Down): {class_4_count} ({class_4_count/len(df)*100:.1f}%)")
# print()

# # ============================================
# # 4. VISUALIZATION
# # ============================================
# fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# # Histogram with KDE
# axes[0, 0].hist(df['log_return'], bins=100, alpha=0.7, color='steelblue', edgecolor='black')
# axes[0, 0].axvline(df['log_return'].mean(), color='red', linestyle='--', label=f'Mean: {df["log_return"].mean():.4f}')
# axes[0, 0].axvline(0, color='black', linestyle='-', alpha=0.3, label='Zero')
# axes[0, 0].set_xlabel('Log Return')
# axes[0, 0].set_ylabel('Frequency')
# axes[0, 0].set_title('Distribution of Log Returns')
# axes[0, 0].legend()
# axes[0, 0].grid(alpha=0.3)

# # Q-Q Plot (Test for normality)
# stats.probplot(df['log_return'], dist="norm", plot=axes[0, 1])
# axes[0, 1].set_title('Q-Q Plot (Normality Test)')
# axes[0, 1].grid(alpha=0.3)

# # Boxplot
# axes[1, 0].boxplot(df['log_return'], vert=True)
# axes[1, 0].set_ylabel('Log Return')
# axes[1, 0].set_title('Boxplot (Outlier Detection)')
# axes[1, 0].axhline(0, color='red', linestyle='--', alpha=0.5)
# axes[1, 0].grid(alpha=0.3)

# # Class boundaries visualization
# axes[1, 1].hist(df['log_return'], bins=100, alpha=0.5, color='gray', edgecolor='black')
# axes[1, 1].axvline(p10, color='red', linestyle='--', linewidth=2, label=f'10th: {p10:.4f}')
# axes[1, 1].axvline(p30, color='orange', linestyle='--', linewidth=2, label=f'30th: {p30:.4f}')
# axes[1, 1].axvline(p70, color='green', linestyle='--', linewidth=2, label=f'70th: {p70:.4f}')
# axes[1, 1].axvline(p90, color='blue', linestyle='--', linewidth=2, label=f'90th: {p90:.4f}')
# axes[1, 1].set_xlabel('Log Return')
# axes[1, 1].set_ylabel('Frequency')
# axes[1, 1].set_title('Proposed Class Boundaries (Percentile-based)')
# axes[1, 1].legend()
# axes[1, 1].grid(alpha=0.3)

# plt.tight_layout()
# plt.savefig('log_returns_distribution_analysis.png', dpi=300, bbox_inches='tight')
# plt.show()

# # ============================================
# # 5. NORMALITY TEST
# # ============================================
# # Jarque-Bera test
# jb_stat, jb_pvalue = stats.jarque_bera(df['log_return'])
# print("NORMALITY TESTS:")
# print("-" * 60)
# print(f"Jarque-Bera statistic: {jb_stat:.2f}")
# print(f"Jarque-Bera p-value: {jb_pvalue:.6f}")
# if jb_pvalue < 0.05:
#     print("→ Returns are NOT normally distributed (reject null)")
# else:
#     print("→ Returns appear normally distributed")
# print()

# # ============================================
# # 6. AUTOCORRELATION CHECK
# # ============================================
# from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# fig, axes = plt.subplots(1, 2, figsize=(15, 4))
# plot_acf(df['log_return'].dropna(), lags=50, ax=axes[0])
# axes[0].set_title('Autocorrelation Function (ACF)')
# plot_pacf(df['log_return'].dropna(), lags=50, ax=axes[1])
# axes[1].set_title('Partial Autocorrelation Function (PACF)')
# plt.tight_layout()
# plt.savefig('autocorrelation_analysis.png', dpi=300, bbox_inches='tight')
# plt.show()

# print("ACF/PACF analysis saved. Check for:")
# print("- Significant lags → momentum/mean reversion patterns")
# print("- All lags near zero → random walk (no predictability)")
# print()

# # ============================================
# # 7. TIME-BASED ANALYSIS
# # ============================================
# # Assuming you have datetime index
# if 'datetime' in df.columns or isinstance(df.index, pd.DatetimeIndex):
#     if 'datetime' not in df.columns:
#         df['datetime'] = df.index
    
#     df['hour'] = pd.to_datetime(df['datetime']).dt.hour
#     df['year'] = pd.to_datetime(df['datetime']).dt.year
    
#     print("VOLATILITY BY HOUR (Intraday Pattern):")
#     print("-" * 60)
#     hourly_stats = df.groupby('hour')['log_return'].agg(['mean', 'std', 'count'])
#     print(hourly_stats)
#     print()
    
#     print("ANNUAL STATISTICS (Regime Changes):")
#     print("-" * 60)
#     yearly_stats = df.groupby('year')['log_return'].agg(['mean', 'std', 'count'])
#     print(yearly_stats)
#     print()

# print("="*60)
# print("ANALYSIS COMPLETE")
# print("="*60)

#### Model Preparation

    Creating Target and Deciding Features

     Top 10 features by importance

In [162]:
X = df[[
   
    "hl_ratio",              # 26.9% - #1
    "body_size",             # 13.0% - #2
    "hour_cos",              # 10.5% - #3
    "volume_ratio",          # 9.4%  - #4
    "hour_sin",              # 8.1%  - #5
    "high_close_diff",       # 7.7%  - #6
    "close_5ma_diff_pct",    # 5.7%  - #7
    "close_log_return_lag1", # 4.1%  - #8
    "upper_shadow",          # 3.2%  - #9
    "lower_shadow"           # 3.0%  - #10
]]
# y = (df["shifted_log_return"] > 0).astype(int)

In [163]:
# Calculate expanding percentiles on PAST data only
df['p10'] = df['shifted_log_return'].expanding(min_periods=1000).quantile(0.10).shift(1)
df['p30'] = df['shifted_log_return'].expanding(min_periods=1000).quantile(0.30).shift(1)
df['p70'] = df['shifted_log_return'].expanding(min_periods=1000).quantile(0.70).shift(1)
df['p90'] = df['shifted_log_return'].expanding(min_periods=1000).quantile(0.90).shift(1)

def label_class_expanding(row):
    ret = row['shifted_log_return']
    p10, p30, p70, p90 = row['p10'], row['p30'], row['p70'], row['p90']
    
    if ret < p10: return 4      # Big Down
    elif ret < p30: return 2    # Small Down
    elif ret < p70: return 0    # Neutral
    elif ret < p90: return 1    # Small Up
    else: return 3              # Big Up

df['target'] = df.apply(label_class_expanding, axis=1)

# Drop the percentile columns (don't use as features)
df = df.drop(columns=['p10', 'p30', 'p70', 'p90'])

# Drop NaN
df = df.dropna()

print(df['target'].value_counts().sort_index())

target
0    12316
1     6543
2     6365
3     3732
4     2700
Name: count, dtype: int64


In [164]:
y = df["target"]

### Model

#### Best model after hyperparameter tuning

In [165]:

    
# # Step 1: 60% train, 40% remaining
# X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, shuffle=False)

# # Step 2: Split 40% into 50/50 → each gets 20% of original
# X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False)

# model = RandomForestClassifier(
#     n_estimators=500,
#     max_depth=3,
#     min_samples_split=100,
#     min_samples_leaf=50,
#     class_weight='balanced',
#     random_state=42,
#     n_jobs=-1
# )
# model.fit(X_train, y_train)

# print(f"Train score: {accuracy_score(y_train, model.predict(X_train)):.4f}")
# print(f"Val score: {accuracy_score(y_val, model.predict(X_val)):.4f}")
# print(f"Test score: {accuracy_score(y_test, model.predict(X_test)):.4f}")
# print(f"\nTest Confusion Matrix:\n{confusion_matrix(y_test, model.predict(X_test))}")
# print(f"\nClassification Report:\n{classification_report(y_test, model.predict(X_test))}")




#### Walk forward Validation

In [166]:
# Define the initial training cutoff date
train_end_date = '2022-12-31'

# Split into initial training set and walk-forward test set
X_train_initial = X[:train_end_date]
y_train_initial = y[:train_end_date]

X_walkforward = X[train_end_date:]
y_walkforward = y[train_end_date:]

print(f"Initial Training Set: {X_train_initial.index[0]} to {X_train_initial.index[-1]}")
print(f"Training samples: {len(X_train_initial)}")
print(f"\nWalk-Forward Set: {X_walkforward.index[0]} to {X_walkforward.index[-1]}")
print(f"Walk-forward samples: {len(X_walkforward)}")
print(f"\nTraining period: ~{len(X_train_initial) / len(X) * 100:.1f}% of data")

Initial Training Set: 2020-10-19 10:00:00+05:30 to 2022-12-30 15:30:00+05:30
Training samples: 13456

Walk-Forward Set: 2023-01-02 09:30:00+05:30 to 2025-12-12 15:15:00+05:30
Walk-forward samples: 18200

Training period: ~42.5% of data


In [167]:
# Train initial Random Forest model
model = RandomForestClassifier(
    n_estimators=500,
    max_depth=3,
    min_samples_split=100,
    min_samples_leaf=50,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

print("Training Random Forest on initial training set...")
model.fit(X_train_initial, y_train_initial)
print("✓ Model training complete")

# Quick sanity check
train_score = model.score(X_train_initial, y_train_initial)
print(f"\nTraining accuracy: {train_score:.4f}")

Training Random Forest on initial training set...
✓ Model training complete

Training accuracy: 0.3396


In [168]:
# Get first 3 months of walk-forward data for testing
test_end_date = '2025-03-31'

X_test_3m = X_walkforward[:test_end_date]
y_test_3m = y_walkforward[:test_end_date]

print(f"Test Period: {X_test_3m.index[0]} to {X_test_3m.index[-1]}")
print(f"Test samples: {len(X_test_3m)}")

# Predict on first 3 months
y_pred_3m = model.predict(X_test_3m)

# Evaluate
accuracy_3m = accuracy_score(y_test_3m, y_pred_3m)
print(f"\nAccuracy on first 3 months (Jan-Mar 2023): {accuracy_3m:.4f}")

# Show classification report
print("\nClassification Report:")
print(classification_report(y_test_3m, y_pred_3m))

Test Period: 2023-01-02 09:30:00+05:30 to 2025-03-28 15:30:00+05:30
Test samples: 13847

Accuracy on first 3 months (Jan-Mar 2023): 0.3783

Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.64      0.59      5756
           1       0.26      0.19      0.22      2815
           2       0.26      0.09      0.14      2786
           3       0.15      0.13      0.14      1269
           4       0.20      0.50      0.28      1221

    accuracy                           0.38     13847
   macro avg       0.28      0.31      0.27     13847
weighted avg       0.37      0.38      0.36     13847



In [169]:
# Get predictions and probabilities for the 3-month test period
y_pred_3m = model.predict(X_test_3m)
y_proba_3m = model.predict_proba(X_test_3m)

# Create results dataframe
results_3m = pd.DataFrame({
    'actual': y_test_3m.values,
    'predicted': y_pred_3m
}, index=y_test_3m.index)

# Add probabilities for each class
for i in range(5):
    results_3m[f'prob_class_{i}'] = y_proba_3m[:, i]

# Get actual returns
results_3m['actual_return'] = df.loc[results_3m.index, 'shifted_log_return']

# Trading logic: Only trade on class 3 (Big Up) and class 4 (Big Down) with >20% confidence
results_3m['position'] = 0  # Default: no position

# Long when predicted class 3 with >20% confidence
long_signal = (results_3m['predicted'] == 3) & (results_3m['prob_class_3'] > 0.20)
results_3m.loc[long_signal, 'position'] = 1

# Short when predicted class 4 with >20% confidence
short_signal = (results_3m['predicted'] == 4) & (results_3m['prob_class_4'] > 0.20)
results_3m.loc[short_signal, 'position'] = -1

# Strategy returns
results_3m['strategy_return'] = results_3m['position'] * results_3m['actual_return']

# Transaction costs: 0.04% per round trip
transaction_cost = 0.0004
results_3m['position_change'] = results_3m['position'].diff().fillna(results_3m['position'])
results_3m['trade_occurred'] = (results_3m['position_change'] != 0).astype(int)
results_3m['transaction_cost'] = results_3m['trade_occurred'] * transaction_cost
results_3m['strategy_return_net'] = results_3m['strategy_return'] - results_3m['transaction_cost']

# Performance metrics
total_return_gross = results_3m['strategy_return'].sum()
total_return_net = results_3m['strategy_return_net'].sum()
sharpe_ratio_net = results_3m['strategy_return_net'].mean() / results_3m['strategy_return_net'].std() * np.sqrt(252 * 26)

print("=== Strategy: Trade only Class 3 & 4 with >20% confidence ===")
print(f"\nTrades executed:")
print(f"  Long (Class 3): {long_signal.sum()}")
print(f"  Short (Class 4): {short_signal.sum()}")
print(f"  No position: {(results_3m['position'] == 0).sum()}")
print(f"\nGross Return: {total_return_gross:.4f} ({total_return_gross*100:.2f}%)")
print(f"Transaction Costs: {results_3m['transaction_cost'].sum():.4f}")
print(f"Net Return: {total_return_net:.4f} ({total_return_net*100:.2f}%)")
print(f"Sharpe Ratio (Annualized): {sharpe_ratio_net:.4f}")

=== Strategy: Trade only Class 3 & 4 with >20% confidence ===

Trades executed:
  Long (Class 3): 1050
  Short (Class 4): 3107
  No position: 9690

Gross Return: -0.1721 (-17.21%)
Transaction Costs: 1.2052
Net Return: -1.3773 (-137.73%)
Sharpe Ratio (Annualized): -1.6740


In [170]:
# Test different confidence thresholds
confidence_thresholds = np.arange(0.21, 0.41, 0.01)  # 21% to 40%

results_summary = []

for conf_threshold in confidence_thresholds:
    # Create temporary results
    temp_results = results_3m.copy()
    
    # Reset positions
    temp_results['position'] = 0
    
    # Long when predicted class 3 with >= threshold confidence
    long_signal = (temp_results['predicted'] == 3) & (temp_results['prob_class_3'] >= conf_threshold)
    temp_results.loc[long_signal, 'position'] = 1
    
    # Short when predicted class 4 with >= threshold confidence
    short_signal = (temp_results['predicted'] == 4) & (temp_results['prob_class_4'] >= conf_threshold)
    temp_results.loc[short_signal, 'position'] = -1
    
    # Calculate returns
    temp_results['strategy_return'] = temp_results['position'] * temp_results['actual_return']
    
    # Transaction costs
    temp_results['position_change'] = temp_results['position'].diff().fillna(temp_results['position'])
    temp_results['trade_occurred'] = (temp_results['position_change'] != 0).astype(int)
    temp_results['transaction_cost'] = temp_results['trade_occurred'] * transaction_cost
    temp_results['strategy_return_net'] = temp_results['strategy_return'] - temp_results['transaction_cost']
    
    # Metrics
    num_trades = (temp_results['position'] != 0).sum()
    num_longs = (temp_results['position'] == 1).sum()
    num_shorts = (temp_results['position'] == -1).sum()
    gross_return = temp_results['strategy_return'].sum()
    net_return = temp_results['strategy_return_net'].sum()
    total_costs = temp_results['transaction_cost'].sum()
    
    if temp_results['strategy_return_net'].std() > 0:
        sharpe = temp_results['strategy_return_net'].mean() / temp_results['strategy_return_net'].std() * np.sqrt(252 * 26)
    else:
        sharpe = 0
    
    results_summary.append({
        'confidence': conf_threshold,
        'num_trades': num_trades,
        'num_longs': num_longs,
        'num_shorts': num_shorts,
        'gross_return': gross_return,
        'transaction_costs': total_costs,
        'net_return': net_return,
        'sharpe_ratio': sharpe
    })

# Convert to DataFrame for easy viewing
summary_df = pd.DataFrame(results_summary)

print("=== Confidence Threshold Analysis (21% - 40%) ===\n")
print(summary_df.to_string(index=False))

# Find best performing threshold
best_sharpe_idx = summary_df['sharpe_ratio'].idxmax()
best_return_idx = summary_df['net_return'].idxmax()

print(f"\n=== Best Configurations ===")
print(f"\nBest Sharpe Ratio: {summary_df.loc[best_sharpe_idx, 'confidence']:.2f} "
      f"(Sharpe: {summary_df.loc[best_sharpe_idx, 'sharpe_ratio']:.4f}, "
      f"Return: {summary_df.loc[best_sharpe_idx, 'net_return']*100:.2f}%)")

print(f"\nBest Net Return: {summary_df.loc[best_return_idx, 'confidence']:.2f} "
      f"(Return: {summary_df.loc[best_return_idx, 'net_return']*100:.2f}%, "
      f"Sharpe: {summary_df.loc[best_return_idx, 'sharpe_ratio']:.4f})")

=== Confidence Threshold Analysis (21% - 40%) ===

 confidence  num_trades  num_longs  num_shorts  gross_return  transaction_costs  net_return  sharpe_ratio
       0.21        4156       1050        3106     -0.167701             1.2060   -1.373701     -1.669622
       0.22        4061       1027        3034     -0.161931             1.1948   -1.356731     -1.659974
       0.23        3733        919        2814     -0.215646             1.1400   -1.355646     -1.691012
       0.24        3387        814        2573     -0.420658             1.0684   -1.489058     -1.897045
       0.25        3003        673        2330     -0.248975             1.0056   -1.254575     -1.648454
       0.26        2521        442        2079     -0.194450             0.9428   -1.137250     -1.609825
       0.27        1879        129        1750     -0.497937             0.8116   -1.309537     -2.089652
       0.28        1501          1        1500     -0.663040             0.6652   -1.328240     -2.27

### Full Walk forward Validation.

In [171]:
# Walk-forward validation setup
confidence_threshold = 0.26
transaction_cost = 0.0004

In [172]:
# Initialize storage for results
all_predictions = []
all_actuals = []
all_probabilities = []  # Store full probability arrays
all_returns = []
window_metrics = []

In [173]:
# Convert string index to datetime BEFORE the loop
X_walkforward.index = pd.to_datetime(X_walkforward.index)
y_walkforward.index = pd.to_datetime(y_walkforward.index)

# Get walk-forward period boundaries
walkforward_start = X_walkforward.index[0]
walkforward_end = X_walkforward.index[-1]
current_date = walkforward_start
window_num = 0


In [174]:
print(walkforward_start)
print(walkforward_end)
print(current_date)

2023-01-02 09:30:00+05:30
2025-12-12 15:15:00+05:30
2023-01-02 09:30:00+05:30


In [175]:
# Initialize storage for predictions only
window_predictions_list = []

print("=== Walk-Forward Validation (3-Month Windows, Retrain After Each) ===\n")

while current_date < walkforward_end:
    window_num += 1
    
    # Define 3-month test window
    test_start = current_date
    test_end = test_start + pd.DateOffset(months=3)
    
    # Get test data for this window
    X_test_window = X_walkforward.loc[test_start:test_end]
    y_test_window = y_walkforward.loc[test_start:test_end]
    
    if len(X_test_window) == 0:
        break
    
    print(f"Window {window_num}: {X_test_window.index[0].date()} to {X_test_window.index[-1].date()} ({len(X_test_window)} bars)")
    
    # Predict on this window using current model
    y_pred = model.predict(X_test_window)
    y_proba = model.predict_proba(X_test_window)
    
    # Create predictions dataframe for this window
    window_predictions = pd.DataFrame(index=X_test_window.index)
    window_predictions['actual'] = y_test_window.values
    window_predictions['predicted'] = y_pred
    
    # Add probability scores for all classes
    for i in range(5):
        window_predictions[f'prob_class_{i}'] = y_proba[:, i]
    
    # Save this window's predictions
    window_predictions_list.append(window_predictions)
    
    # Retrain model: Add this window to training data
    X_train_expanded = pd.concat([X_train_initial, X_walkforward.loc[walkforward_start:test_end]])
    y_train_expanded = pd.concat([y_train_initial, y_walkforward.loc[walkforward_start:test_end]])
    
    print(f"  Retraining model with {len(X_train_expanded)} samples...")
    model.fit(X_train_expanded, y_train_expanded)
    print(f"  ✓ Model retrained\n")
    
    # Move to next window
    current_date = test_end

print("=== Walk-Forward Loop Complete ===\n")

=== Walk-Forward Validation (3-Month Windows, Retrain After Each) ===

Window 1: 2023-01-02 to 2023-03-31 (1550 bars)
  Retraining model with 15006 samples...
  ✓ Model retrained

Window 2: 2023-04-03 to 2023-06-30 (1500 bars)
  Retraining model with 16506 samples...
  ✓ Model retrained

Window 3: 2023-07-03 to 2023-09-29 (1575 bars)
  Retraining model with 18081 samples...
  ✓ Model retrained

Window 4: 2023-10-03 to 2024-01-02 (1530 bars)
  Retraining model with 19611 samples...
  ✓ Model retrained

Window 5: 2024-01-02 to 2024-04-02 (1533 bars)
  Retraining model with 21143 samples...
  ✓ Model retrained

Window 6: 2024-04-02 to 2024-07-02 (1508 bars)
  Retraining model with 22650 samples...
  ✓ Model retrained

Window 7: 2024-07-02 to 2024-10-01 (1600 bars)
  Retraining model with 24249 samples...
  ✓ Model retrained

Window 8: 2024-10-03 to 2025-01-02 (1530 bars)
  Retraining model with 25779 samples...
  ✓ Model retrained

Window 9: 2025-01-02 to 2025-04-02 (1551 bars)
  Retraini

In [176]:
# Combine all window predictions into one dataframe
all_predictions_df = pd.concat(window_predictions_list)

print(f"Total predictions collected: {len(all_predictions_df)}")
print(f"Date range: {all_predictions_df.index[0].date()} to {all_predictions_df.index[-1].date()}\n")

Total predictions collected: 18206
Date range: 2023-01-02 to 2025-12-12



In [177]:
all_predictions_df

Unnamed: 0_level_0,actual,predicted,prob_class_0,prob_class_1,prob_class_2,prob_class_3,prob_class_4
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-01-02 09:30:00+05:30,3,4,0.084137,0.152246,0.147651,0.267892,0.348074
2023-01-02 09:45:00+05:30,0,4,0.105732,0.170619,0.181933,0.243290,0.298426
2023-01-02 10:00:00+05:30,0,0,0.297611,0.216539,0.213443,0.164041,0.108365
2023-01-02 10:15:00+05:30,0,0,0.318316,0.201793,0.216244,0.164189,0.099458
2023-01-02 10:30:00+05:30,0,0,0.308753,0.206530,0.216059,0.164850,0.103808
...,...,...,...,...,...,...,...
2025-12-12 14:15:00+05:30,0,2,0.183210,0.205668,0.214045,0.196510,0.200568
2025-12-12 14:30:00+05:30,2,1,0.231661,0.231879,0.220961,0.159948,0.155551
2025-12-12 14:45:00+05:30,1,1,0.210920,0.230872,0.216655,0.167946,0.173606
2025-12-12 15:00:00+05:30,1,3,0.209406,0.199524,0.197164,0.215359,0.178548


In [178]:
# Add actual returns to the predictions dataframe
all_predictions_df['actual_return'] = df.loc[all_predictions_df.index, 'shifted_log_return'].values

In [179]:
all_predictions_df

Unnamed: 0_level_0,actual,predicted,prob_class_0,prob_class_1,prob_class_2,prob_class_3,prob_class_4,actual_return
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-01-02 09:30:00+05:30,3,4,0.084137,0.152246,0.147651,0.267892,0.348074,0.006883
2023-01-02 09:45:00+05:30,0,4,0.105732,0.170619,0.181933,0.243290,0.298426,-0.000762
2023-01-02 10:00:00+05:30,0,0,0.297611,0.216539,0.213443,0.164041,0.108365,0.000038
2023-01-02 10:15:00+05:30,0,0,0.318316,0.201793,0.216244,0.164189,0.099458,0.000191
2023-01-02 10:30:00+05:30,0,0,0.308753,0.206530,0.216059,0.164850,0.103808,-0.000229
...,...,...,...,...,...,...,...,...
2025-12-12 14:15:00+05:30,0,2,0.183210,0.205668,0.214045,0.196510,0.200568,-0.000578
2025-12-12 14:30:00+05:30,2,1,0.231661,0.231879,0.220961,0.159948,0.155551,-0.004480
2025-12-12 14:45:00+05:30,1,1,0.210920,0.230872,0.216655,0.167946,0.173606,0.002127
2025-12-12 15:00:00+05:30,1,3,0.209406,0.199524,0.197164,0.215359,0.178548,0.004278


In [180]:
all_predictions_df["acc"] = (all_predictions_df["actual"] == all_predictions_df["predicted"])

In [181]:
all_predictions_df["acc"].value_counts()

acc
False    11419
True      6787
Name: count, dtype: int64