In [83]:
from dbConn import engine
import pandas as pd
import numpy as np

In [84]:
data = pd.read_sql_query('''
     SELECT 
        spx.Datetime,
        spx.Open AS Open_SPX,
        spx.High AS High_SPX,
        spx.Low AS Low_SPX,
        spx.Close AS Close_SPX,
        ndx.Open AS Open_NDX,
        ndx.High AS High_NDX,
        ndx.Low AS Low_NDX,
        ndx.Close AS Close_NDX,
        rut.Open AS Open_RUT,
        rut.High AS High_RUT,
        rut.Low AS Low_RUT,
        rut.Close AS Close_RUT
       FROM (
        SELECT * FROM SPX_full_5min
        ) spx
        JOIN (
        SELECT * FROM NDX_full_5min
        ) ndx ON spx.Datetime = ndx.Datetime
        JOIN (
        SELECT * FROM RUT_full_5min
        ) rut ON spx.Datetime = rut.Datetime
        WHERE spx.Datetime >= '2020-01-01'
''', con = engine.connect())

In [85]:
data['Date'] = data['Datetime'].dt.date
data['Time'] = data['Datetime'].dt.time

In [86]:
data.head()

Unnamed: 0,Datetime,Open_SPX,High_SPX,Low_SPX,Close_SPX,Open_NDX,High_NDX,Low_NDX,Close_NDX,Open_RUT,High_RUT,Low_RUT,Close_RUT,Date,Time
0,2020-01-02 09:30:00,3244.67,3248.08,3244.16,3246.92,8802.22,8817.35,8796.83,8814.99,1675.9,1678.14,1673.69,1674.68,2020-01-02,09:30:00
1,2020-01-02 09:35:00,3246.89,3249.63,3246.89,3248.39,8815.19,8829.71,8815.19,8826.76,1674.57,1675.63,1674.45,1674.75,2020-01-02,09:35:00
2,2020-01-02 09:40:00,3248.39,3249.88,3247.81,3248.69,8826.72,8831.23,8822.25,8824.69,1674.77,1674.79,1671.56,1671.56,2020-01-02,09:40:00
3,2020-01-02 09:45:00,3248.67,3250.04,3247.22,3247.39,8824.52,8825.96,8817.1,8817.74,1671.53,1671.54,1668.07,1668.07,2020-01-02,09:45:00
4,2020-01-02 09:50:00,3247.38,3248.14,3245.61,3246.18,8817.85,8819.86,8814.75,8817.78,1668.0,1668.0,1666.29,1666.39,2020-01-02,09:50:00


In [87]:
import datetime

In [124]:
ticker_dict = {
    "^GSPC":{
        "hod_model":"models/hod_model_spx.joblib",
        "lod_model":"models/lod_model_spx.joblib",
        "suffix":"SPX"
    },
    "^NDX":{
        "hod_model":"models/hod_model_ndx.joblib",
        "lod_model":"models/lod_model_ndx.joblib",
        "suffix":"NDX"
    },
    "^RUT":{
        "hod_model":"models/hod_model_rut.joblib",
        "lod_model":"models/lod_model_rut.joblib",
        "suffix":"RUT"
    }
}

tck = '^NDX'

spx_data = data.loc[:,[
    'Datetime',
    f'Open_{ticker_dict[tck]["suffix"]}',
    f'High_{ticker_dict[tck]["suffix"]}',
    f'Low_{ticker_dict[tck]["suffix"]}',
    f'Close_{ticker_dict[tck]["suffix"]}'
]]
spx_data.columns = ['Datetime','open','high','low','close']
spx_data['time'] = spx_data['Datetime'].dt.time
spx_data = spx_data.loc[(spx_data['time'] < datetime.time(16,0)) & (spx_data['time'] >= datetime.time(9,30))]

In [125]:
spx_data = spx_data.set_index('Datetime')

In [126]:
# Get the last bar of each date
spx_data['eod_close'] = spx_data.groupby(spx_data.index.date)['close'].tail(1)
spx_data['prev_close'] = spx_data['eod_close'].shift(1)
spx_data['prev_close'] = spx_data['prev_close'].ffill()
spx_data['eod_close'] = spx_data['eod_close'].bfill()
spx_data['green_day'] = spx_data['eod_close'] > spx_data['prev_close']

spx_data['eod_close_pts'] = spx_data['eod_close'] - spx_data['prev_close']
spx_data['eod_close_pct'] = spx_data['eod_close_pts'] / spx_data['prev_close']

In [127]:
spx_data.tail()

Unnamed: 0_level_0,open,high,low,close,time,eod_close,prev_close,green_day,eod_close_pts,eod_close_pct
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-12-29 15:35:00,16838.74,16842.86,16838.01,16841.75,15:35:00,16828.29,16902.5,False,-74.21,-0.00439
2023-12-29 15:40:00,16841.74,16847.36,16834.87,16835.1,15:40:00,16828.29,16902.5,False,-74.21,-0.00439
2023-12-29 15:45:00,16834.79,16837.59,16830.15,16830.62,15:45:00,16828.29,16902.5,False,-74.21,-0.00439
2023-12-29 15:50:00,16826.83,16840.57,16808.74,16808.88,15:50:00,16828.29,16902.5,False,-74.21,-0.00439
2023-12-29 15:55:00,16804.76,16841.42,16800.79,16828.29,15:55:00,16828.29,16902.5,False,-74.21,-0.00439


In [128]:
def create_shifted_cols(dataframe, col_name, shift_per):
    for i in range(1, shift_per+1):
        dataframe[f'{col_name}_shifted_{i}'] = dataframe[col_name].shift(i)

In [129]:
import numpy as np
import pandas as pd
import datetime
from pandas.tseries.offsets import BDay

spx_data['label'] = np.nan

# Per date, run it.
for day in sorted(set(spx_data.index.date)):
    day_str = datetime.datetime.strftime(day, '%Y-%m-%d')
    day_open = spx_data.loc[day_str, 'open'].iloc[0]
    spx_data.loc[day_str, 'lod'] = spx_data.loc[day_str, 'low'].min()
    spx_data.loc[day_str, 'label_lod'] = (spx_data.loc[day_str, 'low'] == spx_data.loc[day_str, 'lod']).astype(int)
    
    spx_data.loc[day_str, 'hod'] = spx_data.loc[day_str, 'high'].max()
    spx_data.loc[day_str, 'label'] = (spx_data.loc[day_str, 'high'] == spx_data.loc[day_str, 'hod']).astype(int)
    spx_data.loc[day_str, 'day_open'] = day_open
    spx_data.loc[day_str, 'day_open_pts'] = spx_data.loc[day_str, 'close'] - spx_data.loc[day_str, 'day_open']
    spx_data.loc[day_str, 'day_open_pct'] = spx_data.loc[day_str, 'day_open_pts'] / spx_data.loc[day_str, 'day_open']
    spx_data.loc[day_str, 'prev_close_pts'] = spx_data.loc[day_str, 'close'] - spx_data.loc[day_str, 'prev_close']
    spx_data.loc[day_str, 'prev_close_pct'] = spx_data.loc[day_str, 'prev_close_pts'] / spx_data.loc[day_str, 'prev_close']

    # Shifted
    spx_data.loc[day_str, 'prev_close_pct_n1'] = spx_data.loc[day_str, 'prev_close_pct'].shift(1)
    spx_data.loc[day_str, 'prev_close_pct_n2'] = spx_data.loc[day_str, 'prev_close_pct'].shift(2)
    spx_data.loc[day_str, 'prev_close_pct_n3'] = spx_data.loc[day_str, 'prev_close_pct'].shift(3)

    spx_data.loc[day_str, 'day_open_pct_n1'] = spx_data.loc[day_str, 'day_open_pct'].shift(1)
    spx_data.loc[day_str, 'day_open_pct_n2'] = spx_data.loc[day_str, 'day_open_pct'].shift(2)
    spx_data.loc[day_str, 'day_open_pct_n3'] = spx_data.loc[day_str, 'day_open_pct'].shift(3)



    # Label the data: 1 if the current high is the HOD, 0 otherwise

    # # Find the index of HOD
    # hod_index = spx_data.loc[day_str][spx_data.loc[day_str, 'label'] == 1].index

    # # If HOD is found within the first 5 bars, label the first 5 bars as 1
    # if not hod_index.empty:
    #     hod_index = hod_index[0]  # In case there are multiple, take the first
    #     if hod_index <= spx_data.loc[day_str].index[4]:  # Check if HOD is within the first 5 bars
    #         spx_data.loc[spx_data.loc[day_str].index[:5], 'label'] = 1
    #     else:
    #         start_time = spx_data.loc[day_str].index[0]
    #         end_time = hod_index
    #         # Calculate the start index using Timedelta
    #         start_index = end_time - pd.Timedelta(minutes=5 * 4)
    #         start_index = max(start_index, start_time)
    #         spx_data.loc[start_index:end_time, 'label'] = 1

# spx_data.head()


In [130]:
spx_data.loc['2020-01-04':, ['prev_close_pct_n1','prev_close_pct_n2','prev_close_pct_n3']].dropna()

Unnamed: 0_level_0,prev_close_pct_n1,prev_close_pct_n2,prev_close_pct_n3
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-06 09:45:00,-0.003943,-0.005310,-0.005422
2020-01-06 09:50:00,-0.002972,-0.003943,-0.005310
2020-01-06 09:55:00,-0.002741,-0.002972,-0.003943
2020-01-06 10:00:00,-0.001599,-0.002741,-0.002972
2020-01-06 10:05:00,-0.001314,-0.001599,-0.002741
...,...,...,...
2023-12-29 15:35:00,-0.003749,-0.003197,-0.003724
2023-12-29 15:40:00,-0.003594,-0.003749,-0.003197
2023-12-29 15:45:00,-0.003988,-0.003594,-0.003749
2023-12-29 15:50:00,-0.004253,-0.003988,-0.003594


In [131]:
spx_data['gap_open'] = spx_data['day_open'] - spx_data['prev_close']
spx_data['gap_open_pct'] = spx_data['gap_open'] / spx_data['prev_close']

# Dissecting Tops
Purpose
- Determine what a top looks like.

Questions
- When do they happen?
- Before they happen, what does action look like?
- When they happen, what does the action look like after?

In [132]:
hod_in = []
for num, row in spx_data.iterrows():
    if row['hod'] == row['high']:
        hod_in.append(1)
    else:
        hod_in.append(np.nan)
spx_data['hod_in'] = hod_in

In [133]:
lod_in = []
for num, row in spx_data.iterrows():
    if row['lod'] == row['low']:
        lod_in.append(1)
    else:
        lod_in.append(np.nan)
spx_data['lod_in'] = lod_in

In [134]:
# Per day, ffill for the 1s, and then fillna 0
for day in sorted(set(spx_data.index.date)):
    day_str = datetime.datetime.strftime(day, '%Y-%m-%d')
    spx_data.loc[day_str, 'lod_in'] = spx_data.loc[day_str, 'lod_in'].ffill()
    spx_data.loc[day_str, 'lod_in'] = spx_data.loc[day_str, 'lod_in'].fillna(0)
    spx_data.loc[day_str, 'hod_in'] = spx_data.loc[day_str, 'hod_in'].ffill()
    spx_data.loc[day_str, 'hod_in'] = spx_data.loc[day_str, 'hod_in'].fillna(0)

In [135]:
def get_top_probability(time, prev_close_pct):
    '''
    Function that returns the historical probability that the top is in, given a time of day and magnitude.
    '''
    # Filter the data for the given time and magnitude
    relevant_data = spx_data[(spx_data['time'] == time) & (spx_data['prev_close_pct'] >= prev_close_pct)]
    
    # Calculate the probability
    top_probability = relevant_data['hod_in'].mean()
    # bot_probability = relevant_data['lod_in'].mean()
    
    return top_probability

def get_low_probability(time, prev_close_pct):
    '''
    Function that returns the historical probability that the low is in, given a time of day and magnitude.
    '''
    # Filter the data for the given time and magnitude
    relevant_data = spx_data[(spx_data['time'] == time) & (spx_data['prev_close_pct'] <= prev_close_pct)]
    
    # Calculate the probability
    probability = relevant_data['lod_in'].mean()
    
    return probability

In [136]:
time = datetime.time(15,0)
prev_close_pct = -0.007
# spx_data.loc[(spx_data['time'] == time) & (spx_data['prev_close_pct'] >= prev_close_pct)]
get_top_probability(time, prev_close_pct)

0.6058981233243967

In [137]:
get_low_probability(time, prev_close_pct)

0.5019762845849802

In [138]:
spx_data[['time','hod_in','lod_in','gap_open_pct','open']]

Unnamed: 0_level_0,time,hod_in,lod_in,gap_open_pct,open
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02 09:30:00,09:30:00,0.0,0.0,,8802.22
2020-01-02 09:35:00,09:35:00,0.0,0.0,,8815.19
2020-01-02 09:40:00,09:40:00,0.0,0.0,,8826.72
2020-01-02 09:45:00,09:45:00,0.0,0.0,,8824.52
2020-01-02 09:50:00,09:50:00,0.0,0.0,,8817.85
...,...,...,...,...,...
2023-12-29 15:35:00,15:35:00,1.0,1.0,-0.000004,16838.74
2023-12-29 15:40:00,15:40:00,1.0,1.0,-0.000004,16841.74
2023-12-29 15:45:00,15:45:00,1.0,1.0,-0.000004,16834.79
2023-12-29 15:50:00,15:50:00,1.0,1.0,-0.000004,16826.83


In [139]:
def get_low_probability(time, prev_close_pct):
    '''
    Function that returns the historical probability that the low is in, given a time of day and magnitude.
    '''
    # Filter the data for the given time and magnitude
    relevant_data = spx_data[(spx_data['time'] == time) & (spx_data['prev_close_pct'] <= prev_close_pct)]
    
    # Calculate the probability
    probability = relevant_data['lod_in'].mean()
    
    return probability

In [140]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer, StandardScaler, RobustScaler
import datetime

# Function to convert time string to seconds past midnight
def convert_time_to_seconds(time_series):
    return time_series.apply(lambda x: x.hour * 3600 + x.minute * 60 + x.second)

def apply_convert_time_to_seconds(x):
    return x.apply(convert_time_to_seconds)

time_to_seconds = FunctionTransformer(apply_convert_time_to_seconds, validate=False)

# Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('time', make_pipeline(time_to_seconds, StandardScaler()), ['time']),
        ('prev_close_pct', StandardScaler(), ['prev_close_pct']),
        ('prev_close_pct_n1', StandardScaler(), ['prev_close_pct_n1']),
        ('prev_close_pct_n2', StandardScaler(), ['prev_close_pct_n2']),
        ('prev_close_pct_n3', StandardScaler(), ['prev_close_pct_n3']),
        ('day_open_pct', StandardScaler(), ['day_open_pct']),
        ('day_open_pct_n1', StandardScaler(), ['day_open_pct_n1']),
        ('day_open_pct_n2', StandardScaler(), ['day_open_pct_n2']),
        ('day_open_pct_n3', StandardScaler(), ['day_open_pct_n3']),
        ('gap_open_pct', StandardScaler(), ['gap_open_pct'])
    ])

# Create the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
    # ('classifier', lgb.LGBMClassifier())
    # ('classifier', xgb.XGBClassifier())
])

df_use = spx_data.dropna(subset=[
    'prev_close_pct',
    'gap_open_pct',
    'day_open_pct_n3',
    'prev_close_pct_n3'
    ])

# Train the model
X = df_use[[
    'time',
    'prev_close_pct',
    'prev_close_pct_n1',
    'prev_close_pct_n2',
    'prev_close_pct_n3',
    'day_open_pct',
    'day_open_pct_n1',
    'day_open_pct_n2',
    'day_open_pct_n3',
    'gap_open_pct'
]]
y_flags = df_use[['lod_in','hod_in']]
# pipeline.fit(X, y)

In [141]:
from sklearn.base import clone

pipeline_lod = clone(pipeline)

In [142]:
# 80/20 split, where date at 80th pct is the cutoff date
date_idx = int(np.floor(len(X.index.date) * .9))
cutoff_date = X.index.date[date_idx]

X_train = X.loc[:cutoff_date]
y_train = y_flags.loc[:cutoff_date]
X_test = X.loc[cutoff_date:]
y_test = y_flags.loc[cutoff_date:]

In [143]:
len(y_train)

67636

In [144]:
len(y_test)

7542

In [145]:
y_train.mean()

lod_in    0.633982
hod_in    0.542315
dtype: float64

In [146]:
y_test.mean()

lod_in    0.621188
hod_in    0.593874
dtype: float64

In [147]:
X.isnull().sum()

time                 0
prev_close_pct       0
prev_close_pct_n1    0
prev_close_pct_n2    0
prev_close_pct_n3    0
day_open_pct         0
day_open_pct_n1      0
day_open_pct_n2      0
day_open_pct_n3      0
gap_open_pct         0
dtype: int64

In [148]:
import joblib
hod_model = pipeline.fit(X_train, y_train['hod_in'])
lod_model = pipeline_lod.fit(X_train, y_train['lod_in'])

# Save models
joblib.dump(hod_model, ticker_dict[tck]["hod_model"])
joblib.dump(lod_model, ticker_dict[tck]["lod_model"])

['models/lod_model_ndx.joblib']

In [149]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score

# Metrics for HOD model
y_pred_hod = hod_model.predict_proba(X_test)[:,-1]

roc_auc_hod = roc_auc_score(y_test['hod_in'], y_pred_hod)
precision_hod = precision_score(y_test['hod_in'], y_pred_hod > 0.5)
recall_hod = recall_score(y_test['hod_in'], y_pred_hod > 0.5)

print(f"ROC AUC Score: {roc_auc_hod}")
print(f"Precision Score: {precision_hod}")
print(f"Recall Score: {recall_hod}")

ROC AUC Score: 0.8503636916412697
Precision Score: 0.8202378176731173
Recall Score: 0.7854431792810895


In [150]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score

# Metrics for LOD model
y_pred_lod = lod_model.predict_proba(X_test)[:,-1]

roc_auc_lod = roc_auc_score(y_test['lod_in'], y_pred_lod)
precision_lod = precision_score(y_test['lod_in'], y_pred_lod > 0.5)
recall_lod = recall_score(y_test['lod_in'], y_pred_lod > 0.5)

print(f"ROC AUC Score: {roc_auc_lod}")
print(f"Precision Score: {precision_lod}")
print(f"Recall Score: {recall_lod}")

ROC AUC Score: 0.8470618514917209
Precision Score: 0.7589961247462631
Recall Score: 0.8779082177161153


In [151]:
# feature_names = pipeline.feature_names_in_
# coefficients = pipeline.named_steps['classifier'].feature_importances_
# # coefficients = pipeline.named_steps['classifier'].coef_[0]
# for feature, coef in zip(feature_names, coefficients):
#     print(f"{feature}: {coef}")

In [152]:
import datetime
import plotly.express as px


time_range = [datetime.time(hour, minute) for hour in range(9, 16) for minute in range(0, 60, 5)]
time_range = [x for x in time_range if x >= datetime.time(9,45)]
df_val = pd.DataFrame()
for t in time_range:
    X_val = X_test.loc[X_test['time'] <= t].copy()
    y_val = y_test.loc[X_val.index, 'hod_in'].copy()

    y_pred_val = pipeline.predict_proba(X_val)[:,-1]

    df_val.loc[t, [
        'roc_auc',
        'precision',
        'recall'
    ]] =  roc_auc_score(y_val, y_pred_val), precision_score(y_val, y_pred_val > 0.5), recall_score(y_val, y_pred_val > 0.5)


fig = px.line(df_val, title='Validation Metrics Over Time (HOD Model)')
fig.show()

In [153]:
import datetime
import plotly.express as px


time_range = [datetime.time(hour, minute) for hour in range(9, 16) for minute in range(0, 60, 5)]
time_range = [x for x in time_range if x >= datetime.time(9,45)]
df_val = pd.DataFrame()
for t in time_range:
    X_val = X_test.loc[X_test['time'] <= t].copy()
    y_val = y_test.loc[X_val.index, 'lod_in'].copy()

    y_pred_val = pipeline_lod.predict_proba(X_val)[:,-1]

    df_val.loc[t, [
        'roc_auc',
        'precision',
        'recall'
    ]] =  roc_auc_score(y_val, y_pred_val), precision_score(y_val, y_pred_val > 0.5), recall_score(y_val, y_pred_val > 0.5)


fig = px.line(df_val, title='Validation Metrics Over Time (LOD Model)')
fig.show()

In [154]:
df_results = X_test.copy()
df_results['pred_hod'] = y_pred_hod
df_results['pred_lod'] = y_pred_lod

In [155]:
X_test.index[0]

Timestamp('2023-08-08 09:45:00')

In [156]:
import plotly.graph_objects as go

# Pick a day in the test set
test_day = '2023-12-29'
test_df = df_results.loc[test_day:test_day]
X_prices = spx_data.loc[test_day:test_day, ['open','high','low','close']]

df_viz = test_df.merge(X_prices, left_index=True, right_index=True)

fig = go.Figure(data=[go.Candlestick(x=df_viz.index,
                open=df_viz['open'],
                high=df_viz['high'],
                low=df_viz['low'],
                close=df_viz['close'])])

fig.add_trace(go.Scatter(x=df_viz.index, y=df_viz['pred_hod'], mode='lines', name='pred_hod', yaxis='y2', line=dict(color='#ff5f5f')))
fig.add_trace(go.Scatter(x=df_viz.index, y=df_viz['pred_lod'], mode='lines', name='pred_lod', yaxis='y2', line=dict(color='#3399cc')))

fig.add_shape(
    type="line",
    x0=df_viz.index.min(),
    x1=df_viz.index.max(),
    y0=0.5,
    y1=0.5,
    yref='y2',
    line=dict(
        color="Red",
        width=1.5,
        dash="dash",
    )
)

fig.update_layout(
    template='plotly_dark',
    yaxis=dict(
        tickformat='.0f'
    ),
    yaxis2=dict(
        overlaying='y',
        side='right',
        tickformat=".0%"
    ),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    title='OHLC vs Prediction Over Time',
    xaxis_rangeslider_visible=False
)

fig.show()

# Inference Pipeline

In [157]:
# Get data
import yfinance as yf
import numpy as np
import pandas as pd
import datetime

def create_features(ticker_str):
    '''
    Function to create dataframe of features for top/bottom model.
    '''
    ticker = yf.Ticker(ticker_str)
    df = ticker.history(period='5d',interval='5m')
    df = df.loc[
        (df.index.time >= datetime.time(9,45)) &\
        (df.index.time < datetime.time(16,0)),
    ['Open','High','Low','Close']]
    df.columns = ['open','high','low','close']

    df['time'] = df.index.time
    df['eod_close'] = df.groupby(df.index.date)['close'].tail(1)
    df['prev_close'] = df['eod_close'].shift(1)
    df['prev_close'] = df['prev_close'].ffill()
    df['eod_close'] = df['eod_close'].bfill()
    df['green_day'] = df['eod_close'] > df['prev_close']

    df['eod_close_pts'] = df['eod_close'] - df['prev_close']
    df['eod_close_pct'] = df['eod_close_pts'] / df['prev_close']

    for day in sorted(set(df.index.date)):
        day_str = datetime.datetime.strftime(day, '%Y-%m-%d')
        day_open = df.loc[day_str, 'open'].iloc[0]
        df.loc[day_str, 'lod'] = df.loc[day_str, 'low'].min()
        df.loc[day_str, 'label_lod'] = (df.loc[day_str, 'low'] == df.loc[day_str, 'lod']).astype(int)
        
        df.loc[day_str, 'hod'] = df.loc[day_str, 'high'].max()
        df.loc[day_str, 'label'] = (df.loc[day_str, 'high'] == df.loc[day_str, 'hod']).astype(int)
        df.loc[day_str, 'day_open'] = day_open
        df.loc[day_str, 'day_open_pts'] = df.loc[day_str, 'close'] - df.loc[day_str, 'day_open']
        df.loc[day_str, 'day_open_pct'] = df.loc[day_str, 'day_open_pts'] / df.loc[day_str, 'day_open']
        df.loc[day_str, 'prev_close_pts'] = df.loc[day_str, 'close'] - df.loc[day_str, 'prev_close']
        df.loc[day_str, 'prev_close_pct'] = df.loc[day_str, 'prev_close_pts'] / df.loc[day_str, 'prev_close']

        # Shifted
        df.loc[day_str, 'prev_close_pct_n1'] = df.loc[day_str, 'prev_close_pct'].shift(1)
        df.loc[day_str, 'prev_close_pct_n2'] = df.loc[day_str, 'prev_close_pct'].shift(2)
        df.loc[day_str, 'prev_close_pct_n3'] = df.loc[day_str, 'prev_close_pct'].shift(3)

        df.loc[day_str, 'day_open_pct_n1'] = df.loc[day_str, 'day_open_pct'].shift(1)
        df.loc[day_str, 'day_open_pct_n2'] = df.loc[day_str, 'day_open_pct'].shift(2)
        df.loc[day_str, 'day_open_pct_n3'] = df.loc[day_str, 'day_open_pct'].shift(3)

    df['gap_open'] = df['day_open'] - df['prev_close']
    df['gap_open_pct'] = df['gap_open'] / df['prev_close']
    return df.dropna(subset=[
        'prev_close_pct',
        'gap_open_pct',
        'day_open_pct_n3',
        'prev_close_pct_n3'
    ])

In [158]:
import joblib

df_feats = create_features(t)

hod_model1 = joblib.load(ticker_dict[tck]['hod_model'])
lod_model1 = joblib.load(ticker_dict[tck]['lod_model'])

def create_preds_df(df_feats):
    predicted_proba_hod = hod_model1.predict_proba(df_feats)[:,-1]
    predicted_proba_lod = lod_model1.predict_proba(df_feats)[:,-1]
    df_viz = df_feats.copy()
    df_viz['pred_hod'] = predicted_proba_hod
    df_viz['pred_lod'] = predicted_proba_lod
    return df_viz

df_viz = create_preds_df(df_feats)

AttributeError: 'datetime.time' object has no attribute 'upper'

In [None]:
# Create df for visual
def create_viz(df_viz, date_str):

    df_use = df_viz.loc[date_str:date_str]
    
    fig = go.Figure(data=[go.Candlestick(x=df_use.index,
                open=df_use['open'],
                high=df_use['high'],
                low=df_use['low'],
                close=df_use['close'])])

    fig.add_trace(go.Scatter(x=df_use.index, y=df_use['pred_hod'], mode='lines', name='pred_hod', yaxis='y2', line=dict(color='#ff5f5f')))
    fig.add_trace(go.Scatter(x=df_use.index, y=df_use['pred_lod'], mode='lines', name='pred_lod', yaxis='y2', line=dict(color='#3399cc')))

    fig.add_shape(
        type="line",
        x0=df_use.index.min(),
        x1=df_use.index.max(),
        y0=0.5,
        y1=0.5,
        yref='y2',
        line=dict(
            color="Red",
            width=1.5,
            dash="dash",
        )
    )

    max_high = df_use['high'].max()
    max_high_time = df_use['high'].idxmax()

    min_low = df_use['low'].min()
    min_low_time = df_use['low'].idxmin()

    fig.add_annotation(
        x=max_high_time,
        y=max_high,
        text=f"{str(int(max_high))}",
        showarrow=True,
        font=dict(
            family="Courier New, monospace",
            size=12,
            color="#ffffff"
        ),
        align="center",
        arrowhead=2,
        arrowsize=1,
        arrowwidth=2,
        arrowcolor="#636363",
        ax=20,
        ay=-30,
        bordercolor="#c7c7c7",
        borderwidth=1,
        borderpad=1,
        bgcolor="#ff5f5f",
        opacity=0.8
    )

    fig.add_annotation(
        x=min_low_time,
        y=min_low,
        text=f"{str(int(min_low))}",
        showarrow=True,
        font=dict(
            family="Courier New, monospace",
            size=12,
            color="#ffffff"
        ),
        align="center",
        arrowhead=2,
        arrowsize=1,
        arrowwidth=2,
        arrowcolor="#636363",
        ax=20,
        ay=30,  # Adjust the y offset for the annotation
        bordercolor="#c7c7c7",
        borderwidth=1,
        borderpad=1,
        bgcolor="#3399cc",
        opacity=0.8
    )

    fig.update_layout(
        template='plotly_dark',
        yaxis=dict(
            tickformat='.0f'
        ),
        yaxis2=dict(
            overlaying='y',
            side='right',
            tickformat=".0%"
        ),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        ),
        title='OHLC vs Prediction Over Time',
        xaxis_rangeslider_visible=False
    )

    fig.show()
    

In [159]:
create_viz(df_viz, '2023-12-28')

ValueError: attempt to get argmax of an empty sequence

In [None]:
XXX

NameError: name 'XXX' is not defined

In [None]:
time_value = datetime.time(13,0)
prev_close_pct = 0.005
day_open_pct = 0.00
gap_open_pct = -0.005

input_data = pd.DataFrame({
    'time': [time_value], 
    'prev_close_pct': [prev_close_pct],
    'day_open_pct': [day_open_pct],
    'gap_open_pct': [gap_open_pct],
    })
predicted_proba = pipeline.predict_proba(input_data)[:,-1]

print(f"Predicted Probability: {predicted_proba[0]}")

In [None]:
get_top_probability(time_value, prev_close_pct)

In [None]:
XXX

In [None]:
daily_data = spx_data.groupby(spx_data.index.date)['eod_close_pct'].max()

In [None]:
daily_data.describe()

In [None]:
daily_data.clip(-0.05,0.05).hist()

In [None]:
spx_data.loc[(spx_data['time'] < datetime.time(10,5)) & (spx_data['label'] == 1), 'prev_close_pct'].describe()

In [None]:
len(set(spx_data.index.date))

In [None]:
import plotly.graph_objects as go

# df_use = spx_data.query('(eod_close_pct > -0.007) & (eod_close_pct < 0.007)')
df_use = spx_data.query('(eod_close_pct < 0.01) & (eod_close_pct > -0.01)')
# df_use = spx_data.copy()
total_days = len(set(spx_data.index.date))
set_days = len(set(df_use.index.date))
# Convert times to string
times = df_use['time'].unique()
times = [t.strftime('%H:%M:%S') for t in times]
freqs = df_use.groupby('time')['label'].mean().cumsum()

# Calculate means
means = df_use.groupby('time')['prev_close_pct'].mean()

# Create a Plotly figure
fig = go.Figure()

# Add a line trace
fig.add_trace(go.Scatter(x=times, y=means, mode='lines', name='Mean'))

fig.add_trace(go.Scatter(x=times, y=freqs, mode='lines', name='Frequency', yaxis='y2'))
fig.update_layout(
    yaxis2=dict(
        title='Frequency',
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
        overlaying='y',
        side='right'
    )
)
fig.update_yaxes(
    tickformat=".3%",
)

# Set layout properties
fig.update_layout(
    title=f'Mean vs Time ({str(set_days)} -- {set_days / total_days:.1%})',
    xaxis_title='Time',
    yaxis_title='Mean',
    autosize=False,
    width=1000,
    height=500,
)

# Show the figure
fig.show()

In [None]:
import numpy as np
import pandas as pd
import datetime

spx_data['label_lod'] = np.nan

# Per date, run it.
for day in sorted(set(spx_data.index.date)):
    day_str = datetime.datetime.strftime(day, '%Y-%m-%d')
    spx_data.loc[day_str, 'lod'] = spx_data.loc[day_str, 'low'].min()

    # Label the data: 1 if the current low is the LOD, 0 otherwise
    spx_data.loc[day_str, 'label_lod'] = (spx_data.loc[day_str, 'low'] == spx_data.loc[day_str, 'lod']).astype(int)

    # Find the index of LOD
    lod_index = spx_data.loc[day_str][spx_data.loc[day_str, 'label_lod'] == 1].index

    # If LOD is found within the first 5 bars, label the first 5 bars as 1
    if not lod_index.empty:
        lod_index = lod_index[0]  # In case there are multiple, take the first
        if lod_index <= spx_data.loc[day_str].index[4]:  # Check if LOD is within the first 5 bars
            spx_data.loc[spx_data.loc[day_str].index[:5], 'label_lod'] = 1
        else:
            start_time = spx_data.loc[day_str].index[0]
            end_time = lod_index
            # Calculate the start index using Timedelta
            start_index = end_time - pd.Timedelta(minutes=5 * 4)
            start_index = max(start_index, start_time)
            spx_data.loc[start_index:end_time, 'label_lod'] = 1

spx_data.head()

In [None]:
spx_data['time'] = spx_data.index.time

In [None]:
spx_data['label'].sum()

In [None]:
pd.spx_data.groupby('time')['label'].mean()

In [None]:
import matplotlib.pyplot as plt
d = '2023-12-20'
spx_data.loc[d,'label'].plot();
spx_data.loc[d,'label_lod'].plot();
plt.legend();
print(spx_data.loc[d,'label_lod'].sum())

In [None]:
spx_data['label'].value_counts()

In [None]:
len(set(spx_data.index.date))

In [None]:
spx_data.loc[day]

In [None]:
import pandas as pd

# Load data
# data = pd.read_csv('your_data_file.csv', parse_dates=True)

# Calculate P as the average of open, high, low, and close
spx_data['P'] = spx_data[['open', 'high', 'low', 'close']].mean(axis=1)

# Initialize columns for ON, O1, O2, O3
spx_data['ON'] = spx_data['P'].rolling(window=6).max()
spx_data['O1'] = spx_data['P'].rolling(window=6).apply(lambda x: x.nsmallest(2).iloc[0] if x.idxmax() != x.index[-1] else x.nsmallest(2).iloc[1])
spx_data['O2'] = spx_data['P'].rolling(window=6).apply(lambda x: x.nsmallest(3).iloc[1] if x.idxmax() == x.index[-2] else (x.nsmallest(3).iloc[0] if x.idxmax() != x.index[-1] else x.nsmallest(3).iloc[2]))
spx_data['O3'] = spx_data['P'].rolling(window=6).apply(lambda x: x.nsmallest(3).iloc[2])

# Set iMA (fast_ma)
spx_data['fast_ma'] = spx_data['O3']


In [None]:

# Display the result
spx_data[['Datetime','P', 'ON', 'O1', 'O2', 'O3', 'fast_ma']].tail()


In [None]:
# Initializing the required columns
spx_data['a'] = 0
spx_data['b'] = 0
spx_data['z'] = 0
spx_data['c'] = 0
spx_data['l'] = 0
spx_data['s'] = 0

# Helper functions to calculate lowest_low, highest_high, avg_low, avg_high
spx_data['lowest_low'] = spx_data['low'].rolling(window=5).min()
spx_data['highest_high'] = spx_data['high'].rolling(window=5).max()
spx_data['avg_low'] = spx_data['low'].rolling(window=5).mean()
spx_data['avg_high'] = spx_data['high'].rolling(window=5).mean()

for i, row in spx_data.iterrows():
    if i == 0:
        continue  # Skip the first row

    a_prev = spx_data.at[i - 1, 'a']
    low_prev = spx_data.at[i - 1, 'low']
    high_prev = spx_data.at[i - 1, 'high']

    if a_prev == 1:
        spx_data.at[i, 'b'] = max(spx_data.at[i, 'lowest_low'], spx_data.at[i - 1, 'b'])
        spx_data.at[i, 'z'] = 1 if spx_data.at[i, 'avg_high'] < spx_data.at[i - 1, 'b'] and spx_data.at[i, 'close'] < low_prev else spx_data.at[i - 1, 'z']
        spx_data.at[i, 'a'] = 0 if spx_data.at[i, 'avg_high'] < spx_data.at[i - 1, 'b'] and spx_data.at[i, 'close'] < low_prev else a_prev
        spx_data.at[i, 'c'] = spx_data.at[i, 'highest_high'] if spx_data.at[i, 'avg_high'] < spx_data.at[i - 1, 'b'] and spx_data.at[i, 'close'] < low_prev else spx_data.at[i - 1, 'c']
    elif a_prev == 0:
        spx_data.at[i, 'c'] = min(spx_data.at[i, 'highest_high'], spx_data.at[i - 1, 'c'])
        spx_data.at[i, 'z'] = 0 if spx_data.at[i, 'avg_low'] > spx_data.at[i - 1, 'c'] and spx_data.at[i, 'close'] > high_prev else spx_data.at[i - 1, 'z']
        spx_data.at[i, 'a'] = 1 if spx_data.at[i, 'avg_low'] > spx_data.at[i - 1, 'c'] and spx_data.at[i, 'close'] > high_prev else a_prev
        spx_data.at[i, 'b'] = spx_data.at[i, 'lowest_low'] if spx_data.at[i, 'avg_low'] > spx_data.at[i - 1, 'c'] and spx_data.at[i, 'close'] > high_prev else spx_data.at[i - 1, 'b']
    else:
        # Maintain the previous values
        spx_data.at[i, 'b'] = spx_data.at[i - 1, 'b']
        spx_data.at[i, 'z'] = spx_data.at[i - 1, 'z']
        spx_data.at[i, 'a'] = a_prev
        spx_data.at[i, 'c'] = spx_data.at[i - 1, 'c']

    # Adjusting 'l' and 's'
    if spx_data.at[i, 'z'] == 0:
        spx_data.at[i, 'l'] = spx_data.at[i - 1, 's'] if spx_data.at[i - 1, 'z'] != 0 else max(spx_data.at[i - 1, 'b'], spx_data.at[i - 1, 'l'])
        spx_data.at[i, 's'] = 0
    elif spx_data.at[i - 1, 'z'] != 1:
        spx_data.at[i, 's'] = spx_data.at[i - 1, 'l']
        spx_data.at[i, 'l'] = 0
    elif spx_data.at[i, 'z'] == 1:
        spx_data.at[i, 's'] = min(spx_data.at[i, 'c'], spx_data.at[i - 1, 's'])
        spx_data.at[i, 'l'] = spx_data.at[i - 1, 'l']
    else:
        spx_data.at[i, 'l'] = spx_data.at[i - 1, 'l']
        spx_data.at[i, 's'] = spx_data.at[i - 1, 's']

    # Setting 'trail'
    spx_data.at[i, 'trail'] = spx_data.at[i, 'l'] if spx_data.at[i, 'l'] > 0 else spx_data.at[i, 's']

# # Display the result
# print(spx_data[['a', 'b', 'z', 'c', 'l', 's', 'trail']])


In [None]:
spx_data[['close', 'fast_ma']].tail().plot()

# Backtesting

In [None]:
import bt
import pandas as pd

# Load your data
# data = pd.read_csv('your_data_file.csv', index_col='Date', parse_dates=True)

# Custom Algo for selecting where to trade
class SelectWhere(bt.Algo):
    def __init__(self, signal):
        self.signal = signal

    def __call__(self, target):
        if target.now in self.signal.index:
            signal_value = self.signal.loc[target.now]
            target.temp['weights'] = signal_value.astype(float)
        return True

# Create the signal for the strategy
signal = (data['close'] > data['fast_ma']) & (data['close'].shift() < data['slow_ma'])

# Create the strategy
strategy = bt.Strategy('AboveFastMA',
                       [SelectWhere(signal),
                        bt.algos.WeighTarget('weights'),
                        bt.algos.Rebalance()])

# Create a backtest and run it
test = bt.Backtest(strategy, data)

# Set up the backtest with transaction costs and slippage
commissions = bt.fees.FixedPerTrade(1) # $1 per trade
slippage = bt.slippage.FixedSlippage(0.05) # Fixed slippage per share, adjust as needed

# Set the commissions and slippage to the backtest
test.set_fees(commissions=commissions, slippage=slippage)

# Run the backtest
res = bt.run(test)

# Plot the results
res.plot()
