In [137]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plts
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay,confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from model_module import model_preparation, best_model_flexible, best_model
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.pipeline import make_pipeline
from joblib import dump, load

In [138]:
# Instrument name typed by the user; it is interpolated into the model path.
# Surrounding whitespace (e.g. from a paste) is removed so it cannot leak into
# the file name, and an empty answer fails here with a clear message instead of
# surfacing later as a confusing "file not found" when the model is loaded.
name_ = input("name: ").strip()
if not name_:
    raise ValueError("name must not be empty")

In [139]:
# Path of the serialized model for the chosen name.
# NOTE(review): the "5" and "XGB" parts are hard-coded here and repeated as
# `timeframe` / `version` in a later cell — keep them in sync.
model_file = f"./deployable_models/{name_}_5_XGB.joblib"

In [140]:
# Deserialize the fitted model from disk.
# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code — only load model files from a trusted source.
model = load(model_file)

In [141]:
version = "XGB"
timeframe = 5

# Recover the instrument name from the model path. The basename has the form
# "<name>_<timeframe>_<version>.joblib", so the two right-most "_" separators
# are split off. Splitting from the right keeps names that themselves contain
# underscores intact (the previous left-split kept only the part before the
# first "_"), and taking the last path component removes the dependency on the
# directory depth of `model_file`.
model_basename = model_file.rsplit("/", 1)[-1]
filename = model_basename.rsplit("_", 2)[0]

In [142]:
# Columns removed from the raw CSV right after loading. `atr`, `support` and
# `resistance` are recomputed by the feature-engineering cells further down.
columns_to_drop = [
    "atr",
    "rsi",
    "sma_50",
    "sma_200",
    "sma_20",
    "uptrend",
    "support",
    "resistance",
]

In [143]:
# Load the raw bars for the instrument/timeframe, index them by timestamp and
# discard the columns listed in `columns_to_drop`.
raw_bars = pd.read_csv(f"../raw_data_for_machine_learning/{filename}_{timeframe}.csv")
raw_bars = raw_bars.set_index("datetime")
df = raw_bars.drop(columns=columns_to_drop)

# The index is read as text; convert it so `.hour` / `.minute` work later on.
df.index = pd.to_datetime(df.index)

In [144]:
# Peek at the first five rows of the loaded frame.
df.head(5)

Unnamed: 0_level_0,open,high,low,close,volume,ml_target,ml_target_short
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-01 10:10:00+05:30,455.56,455.58,454.99,455.41,57604,,
2018-01-01 10:15:00+05:30,455.41,455.58,455.21,455.43,19862,,
2018-01-01 10:20:00+05:30,455.43,455.61,454.94,455.61,32768,,
2018-01-01 10:25:00+05:30,455.61,455.68,455.43,455.66,20440,,
2018-01-01 10:30:00+05:30,455.66,455.68,455.19,455.36,17808,,


In [145]:
# How many rows carry a long-side label (False) versus none (True).
pd.isna(df["ml_target"]).value_counts()

ml_target
True     146051
False      3686
Name: count, dtype: int64

In [146]:
# Same check for the short-side label column.
pd.isna(df["ml_target_short"]).value_counts()

ml_target_short
True     146299
False      3438
Name: count, dtype: int64

#### Feature Engineering

##### Calculating Support and Resistance

In [147]:
# Resistance / support = highest / lowest close over the 10 bars that precede
# the current one (the close is shifted first, so the current bar is excluded).
lagged_close = df["close"].shift()
df["resistance"] = lagged_close.rolling(10).max()
df["support"] = lagged_close.rolling(10).min()

##### Calculating Technical Indicators

In [148]:
# True Range: the largest of
#   high - low, |high - previous close|, |low - previous close|.
df['prev_close'] = df['close'].shift(1)

# Vectorized row-wise maximum instead of a per-row Python lambda. `max(axis=1)`
# skips NaN, so the first bar (which has no previous close) still gets
# high - low, exactly as the element-wise `max(...)` call did.
true_range_candidates = pd.concat(
    [
        df['high'] - df['low'],
        (df['high'] - df['prev_close']).abs(),
        (df['low'] - df['prev_close']).abs(),
    ],
    axis=1,
)
df['tr'] = true_range_candidates.max(axis=1)

# ATR here is the simple 14-bar mean of the *previous* bars' true range
# (shifted by one bar); it is not Wilder's exponentially smoothed ATR.
df['atr'] = df['tr'].shift().rolling(14).mean()

    Time-of-day features

In [149]:
# Session-position features derived from the bar timestamp.

# Session boundaries expressed as minutes since midnight.
market_open_minutes = 9 * 60 + 15      # 09:15 -> 555
market_close_minutes = 15 * 60 + 30    # 15:30 -> 930
total_trading_minutes = market_close_minutes - market_open_minutes  # 375

# Clock time of each bar.
df["hour"] = df.index.hour
df['current_minutes'] = df['hour'] * 60 + df.index.minute

# Fraction of the session elapsed: 0 at the open, 1 at the close.
df['session_progress'] = (
    df['current_minutes'].sub(market_open_minutes).div(total_trading_minutes)
)

# 0/1 flags for roughly the first and the last third of the session.
df['is_early_session'] = df['session_progress'].lt(0.33).astype(int)
df['is_late_session'] = df['session_progress'].gt(0.67).astype(int)

# `current_minutes` is only an intermediate column; it is kept in the frame.

    Intrabar volatility

In [150]:
# High-low range of the previous candle, relative to that candle's close.
prev_bar_range = df['high'].shift(1) - df['low'].shift(1)
df['hl_ratio'] = prev_bar_range / df['close'].shift(1)


    Price positioning feature

In [151]:
# Distance of the current open above the support level, as a fraction of the
# open. This column is not part of the model feature list defined later.
df['open_to_support_dist'] = df['open'].sub(df['support']).div(df['open'])

    Volume

In [152]:
# Work on a fresh copy of the frame (presumably to de-fragment it after the
# many column insertions above — TODO confirm).
df = df.copy()

# Look-back window in bars. With the 5-minute bars loaded above (timeframe = 5)
# 20 bars span 100 minutes.
lookback = 20

# Both features use only volumes of bars that are already closed.
prior_volume = df["volume"].shift()
df["volume_ma"] = prior_volume.rolling(window=lookback).mean()
df["volume_ratio"] = prior_volume / df["volume_ma"]

    RSI

In [153]:
def calculate_rsi(data, period=14):
    """Return a lagged, simple-moving-average RSI of `data`.

    The one-step change of `data` is shifted by one position, so the value at
    position t only uses changes up to t-1. Gains and losses are averaged with
    a plain rolling mean over `period` values (not Wilder smoothing).

    Missing changes (the first two positions) count as zero gain and zero
    loss. A window with no losses yields 100; a window with neither gains nor
    losses yields NaN.

    Parameters
    ----------
    data : pandas.Series
        Price series.
    period : int, default 14
        Rolling window length.

    Returns
    -------
    pandas.Series
        RSI values on a 0-100 scale, NaN until `period` values are available.
    """
    lagged_change = data.diff().shift(1)

    # Keep only the positive (resp. negative) moves; everything else becomes 0.
    gains_only = lagged_change.where(lagged_change > 0, 0)
    losses_only = lagged_change.where(lagged_change < 0, 0).mul(-1)

    avg_gain = gains_only.rolling(window=period).mean()
    avg_loss = losses_only.rolling(window=period).mean()

    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))

# 14-bar lagged RSI of the close.
df["rsi_14"] = calculate_rsi(df["close"], 14)

    MACD

In [154]:
def calculate_macd(data, fast=12, slow=26, signal=9):
    """Return lagged MACD line, signal line and histogram for `data`.

    `data` is shifted by one position before any smoothing, so the values at
    position t only depend on observations up to t-1.

    Parameters
    ----------
    data : pandas.Series
        Price series.
    fast, slow : int
        Spans of the fast and slow exponential moving averages.
    signal : int
        Span of the exponential moving average applied to the MACD line.

    Returns
    -------
    tuple of pandas.Series
        (macd_line, signal_line, macd_histogram), where the histogram is the
        MACD line minus the signal line.
    """
    lagged = data.shift(1)

    def _ema(series, span):
        # Recursive (adjust=False) exponential moving average.
        return series.ewm(span=span, adjust=False).mean()

    macd_line = _ema(lagged, fast) - _ema(lagged, slow)
    signal_line = _ema(macd_line, signal)
    return macd_line, signal_line, macd_line - signal_line

# Lagged MACD components of the close.
macd_line, macd_signal_line, macd_histogram = calculate_macd(df["close"])
df["macd"] = macd_line
df["macd_signal"] = macd_signal_line
df["macd_hist"] = macd_histogram

# Scale by the previous bar's close (not the current one) so the normalized
# values only use information from closed bars.
prior_close = df["close"].shift(1)
df["macd_norm"] = df["macd"] / prior_close
df["macd_hist_norm"] = df["macd_hist"] / prior_close

    Moving Averages

In [155]:
# All moving-average features are built from the previous bar's close.
lagged_close = df["close"].shift()

# Simple moving averages of the lagged close.
# NOTE(review): these raw MA columns are also listed among the model features
# in the feature list further down.
ma_windows = {"5ma": 5, "10ma": 10, "50ma": 50, "200ma": 200}
for ma_column, window in ma_windows.items():
    df[ma_column] = lagged_close.rolling(window).mean()

# Spreads expressed as a fraction of the lagged close.
df["5_10_ma_diff_pct"] = (df["5ma"] - df["10ma"]) / lagged_close
df["close_5ma_diff_pct"] = (lagged_close - df["5ma"]) / lagged_close
df["close_10ma_diff_pct"] = (lagged_close - df["10ma"]) / lagged_close
df["golden_cross_pct"] = (df["50ma"] - df["200ma"]) / lagged_close
df["close_50ma_diff_pct"] = (lagged_close - df["50ma"]) / lagged_close
df["close_200ma_diff_pct"] = (lagged_close - df["200ma"]) / lagged_close

# Rate of change of the averages: 5 bars for the 5ma, 10 bars for the 50ma.
df["5ma_roc"] = df["5ma"].pct_change(periods=5)
df["50ma_roc"] = df["50ma"].pct_change(periods=10)


    Pressure Features

In [156]:
# Previous candle's body and range, each divided by that candle's volume.
prior_volume = df["volume"].shift(1)
prior_body = df["close"].shift(1) - df["open"].shift(1)
prior_range = df["high"].shift(1) - df["low"].shift(1)

df["volume_pressure_prev"] = prior_body / prior_volume
df["range_per_volume_prev"] = prior_range / prior_volume


    Gap Pressure

In [157]:
# Opening gap: current open versus the previous close, relative to that close.
prior_close = df["close"].shift(1)
df["gap_pct"] = df["open"].sub(prior_close).div(prior_close)


    Candle type

In [158]:
# Signed body-to-range ratio of the PREVIOUS candle. A candle with
# high == low produces a zero denominator.
prior_body = df["close"].shift() - df["open"].shift()
prior_range = df["high"].shift() - df["low"].shift()
df["candle_type"] = prior_body / prior_range

# `candle_type` already refers to the previous candle, so these lags describe
# the candles two and three bars back respectively.
df["candle_type_lag1"] = df["candle_type"].shift(1)
df["candle_type_lag2"] = df["candle_type"].shift(2)


    Support and Resistance difference percentage

In [159]:
# Position of the open relative to the support/resistance channel. Every
# column is shifted by one bar, so row t holds the value computed on bar t-1.
# None of these four columns appears in the model feature list defined later.
channel_width = df['resistance'] - df['support']

# Distance of the open beyond each level, in ATR units.
df['open_above_resistance'] = (df['open'] - df['resistance']).div(df['atr']).shift(1)
df['open_below_support'] = (df['support'] - df['open']).div(df['atr']).shift(1)

# Where the open sits inside the channel:
# 0 = at support, 1 = at resistance, >1 = above, <0 = below.
df['open_sr_position'] = (df['open'] - df['support']).div(channel_width).shift(1)

# Channel width in ATR units.
df['sr_range_atr'] = channel_width.div(df['atr']).shift(1)

    Breakout Features

In [160]:
# Breakout features. Everything is lagged by one bar so that row t only uses
# bars that are already closed.
close_over_resistance = df['close'] - df['resistance']
close_under_support = df['support'] - df['close']

# Strength: distance beyond the level in ATR units, floored at 0.
df['resistance_breakout_strength'] = (
    close_over_resistance.div(df['atr']).clip(lower=0).shift(1)
)
df['support_breakdown_strength'] = (
    close_under_support.div(df['atr']).clip(lower=0).shift(1)
)

# Conviction: the same distance relative to the bar's own range, limited to
# [-1, 1]. Zero-range bars are turned into NaN to avoid dividing by zero.
rng = (df['high'] - df['low']).replace(0, np.nan)
df['resistance_breakout_conviction'] = (
    close_over_resistance.div(rng).clip(-1, 1).shift(1)
)
df['support_breakdown_conviction'] = (
    close_under_support.div(rng).clip(-1, 1).shift(1)
)

# Volume relative to its 20-bar mean, lagged by one bar.
# NOTE(review): this appears to be the same quantity as `volume_ratio`
# (previous volume / mean of the previous 20 volumes) — confirm.
mean_volume_20 = df['volume'].rolling(20).mean()
df['breakout_volume_ratio'] = df['volume'].div(mean_volume_20).shift(1)

# Number of bars among the last 10 whose previous high (low) reached the
# previous bar's resistance (support) level.
prev_high_hit_resistance = df['high'].shift(1) >= df['resistance'].shift(1)
prev_low_hit_support = df['low'].shift(1) <= df['support'].shift(1)
df['resistance_touch_count'] = prev_high_hit_resistance.rolling(10).sum()
df['support_touch_count'] = prev_low_hit_support.rolling(10).sum()

#### Target Creation

In [161]:
def _encode_signal(raw):
    """Map a raw label column to {0, 1, -1}: NaN -> 0, > 0 -> 1, otherwise -1."""
    direction = np.where(raw > 0, 1, -1)
    return np.where(raw.isna(), 0, direction)


# Encode both label columns in place.
# CAUTION: this cell is not idempotent. After the first run the columns no
# longer contain NaN, so running it again turns every 0 into -1.
for label_column in ('ml_target', 'ml_target_short'):
    df[label_column] = _encode_signal(df[label_column])


In [162]:
# Class balance of each encoded label column (NaN would be listed too).
for label_column in ('ml_target', 'ml_target_short'):
    print(df[label_column].value_counts(dropna=False))


ml_target
 0    146051
 1      2168
-1      1518
Name: count, dtype: int64
ml_target_short
 0    146299
 1      2156
-1      1282
Name: count, dtype: int64


In [163]:
# Combined label: take the long-side label, and wherever that is 0 fall back
# to the short-side label.
no_long_label = df['ml_target'] == 0
df['ml_target_combined'] = df['ml_target'].mask(no_long_label, df['ml_target_short'])


In [164]:
# Class balance of the combined label.
df["ml_target_combined"].value_counts()


ml_target_combined
 0    142613
 1      4324
-1      2800
Name: count, dtype: int64

In [165]:
# Columns used for modelling: 40 feature columns followed by the target.
# The order of the entries is significant and must not be changed casually.
# ('atr' is deliberately left out.)
feature_and_target = [
    # --- time of day ---
    'hour', 'session_progress', 'is_early_session', 'is_late_session',

    # --- volatility ---
    'hl_ratio',

    # --- volume ---
    'volume_ma', 'volume_ratio',
    'volume_pressure_prev', 'range_per_volume_prev',
    'breakout_volume_ratio',

    # --- oscillators ---
    'rsi_14',
    'macd', 'macd_signal', 'macd_hist',
    'macd_norm', 'macd_hist_norm',

    # --- moving averages: raw levels, then relative spreads and slopes ---
    '5ma', '10ma', '50ma', '200ma',
    '5_10_ma_diff_pct',
    'close_5ma_diff_pct', 'close_10ma_diff_pct',
    'golden_cross_pct',
    'close_50ma_diff_pct', 'close_200ma_diff_pct',
    '5ma_roc', '50ma_roc',

    # --- opening gap ---
    'gap_pct',

    # --- candle shape ---
    'candle_type', 'candle_type_lag1', 'candle_type_lag2',

    # --- support / resistance levels ---
    'resistance', 'support',

    # --- breakout ---
    'resistance_breakout_strength', 'support_breakdown_strength',
    'resistance_breakout_conviction', 'support_breakdown_conviction',
    'resistance_touch_count', 'support_touch_count',

    # --- target ---
    "ml_target_combined",
]

In [166]:

# Restrict the frame to the modelling columns and discard incomplete rows,
# printing the row count before and after.
modelling_columns = df[feature_and_target]
print(len(modelling_columns))
df_feature_target = modelling_columns.dropna()
print(len(df_feature_target))


149737
149488


In [167]:
# Keep only the rows that carry a directional label (-1 or 1).
has_direction = df_feature_target['ml_target_combined'].isin([-1, 1])
df_feature_target = df_feature_target[has_direction]


In [168]:
# Split the labelled rows into a feature matrix and a target vector.
target_column = "ml_target_combined"
X = df_feature_target.drop(columns=[target_column])
y = df_feature_target[target_column]

#### Applying Machine learning filter to strategy.

#### Adding Model to the strategy

In [169]:
# Columns fed to the model: the modelling column list defined earlier in the
# notebook without the target column. Deriving it here, instead of repeating
# the 40 names, keeps the two lists from drifting apart and preserves the
# column order. The name `feature_and_target` is rebound for backward
# compatibility with the original cell; re-running this cell is a no-op for
# the list because the target has already been removed.
feature_and_target = [
    column for column in feature_and_target if column != "ml_target_combined"
]

# Rows with any missing feature cannot be scored, so drop them first.
df = df.dropna(subset=feature_and_target).copy()

# Score every remaining bar with the loaded model.
model_inputs = df[feature_and_target]
df["y_pred"] = model.predict(model_inputs)

# Class probabilities are computed for inspection only; they are not added to
# `df` and therefore do not end up in any file written from `df`.
y_predict_probability = model.predict_proba(model_inputs)

In [170]:
# Write the frame, including the `y_pred` column, to the models directory.
predictions_csv = f"./deployable_models/{filename}_xgb_{timeframe}.csv"
df.to_csv(predictions_csv)