In [1]:
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the SPY candle history from data/SPY.csv.gz; "date" is parsed into datetimes.
candles = pd.read_csv(
    "data/SPY.csv.gz",
    parse_dates=["date"],
)
candles

Unnamed: 0,date,open,high,low,close,volume
0,2000-01-03,98.3617,98.3617,95.4590,96.4957,8164300
1,2000-01-04,95.2309,95.5834,92.6495,92.7221,8089800
2,2000-01-05,92.8465,93.9039,91.0634,92.8880,12177900
3,2000-01-06,92.6392,93.8832,91.3951,91.3951,6227200
4,2000-01-07,93.0953,96.7030,92.9295,96.7030,8066500
...,...,...,...,...,...,...
5675,2022-07-25,395.7500,396.4700,393.2100,395.5700,47399696
5676,2022-07-26,393.8400,394.0600,389.9500,390.8900,52491160
5677,2022-07-27,394.3600,402.8800,394.0500,401.0400,82180077
5678,2022-07-28,401.8900,406.8000,398.1500,406.0700,73761459


In [3]:
# Build the per-day gap table in a single chain:
#   prev_close  - the previous row's close
#   gap         - open minus prev_close, in price units
#   gap_percent - gap relative to prev_close, in percent
#   gap_filled  - True when prev_close lies inside the day's [low, high] range
# Rows containing NaN (the first row has no prev_close) are dropped, and only
# gaps of at least 0.05% in magnitude are kept.
candles = (
    candles.drop(columns="volume")
    .assign(
        prev_close=lambda df: df["close"].shift(1),
        gap=lambda df: df["open"] - df["prev_close"],
        gap_percent=lambda df: df["gap"] / df["prev_close"] * 100,
        gap_filled=lambda df: df["prev_close"].between(df["low"], df["high"]),
    )
    .dropna(axis="rows")
    .loc[lambda df: df["gap_percent"].abs() >= 0.05]
    .reset_index(drop=True)
)
candles

Unnamed: 0,date,open,high,low,close,prev_close,gap,gap_percent,gap_filled
0,2000-01-04,95.2309,95.5834,92.6495,92.7221,96.4957,-1.2648,-1.310732,False
1,2000-01-05,92.8465,93.9039,91.0634,92.8880,92.7221,0.1244,0.134164,True
2,2000-01-06,92.6392,93.8832,91.3951,91.3951,92.8880,-0.2488,-0.267849,True
3,2000-01-07,93.0953,96.7030,92.9295,96.7030,91.3951,1.7002,1.860275,False
4,2000-01-10,97.0348,97.4701,96.2261,97.0348,96.7030,0.3318,0.343112,True
...,...,...,...,...,...,...,...,...,...
5093,2022-07-25,395.7500,396.4700,393.2100,395.5700,395.0900,0.6600,0.167051,True
5094,2022-07-26,393.8400,394.0600,389.9500,390.8900,395.5700,-1.7300,-0.437344,False
5095,2022-07-27,394.3600,402.8800,394.0500,401.0400,390.8900,3.4700,0.887718,False
5096,2022-07-28,401.8900,406.8000,398.1500,406.0700,401.0400,0.8500,0.211949,True


In [4]:
# Number of qualifying gap days that filled (True) versus did not fill (False).
gap_fill_count = candles.groupby(by="gap_filled").size()
gap_fill_count

gap_filled
False    1796
True     3302
dtype: int64

In [5]:
# Share of all qualifying gaps that filled, reported as a percentage.
naive_fill_rate = gap_fill_count[True] / gap_fill_count.sum() * 100
print("Naive gap fill rate {:.2f}%".format(naive_fill_rate))

Naive gap fill rate 64.77%


In [6]:
# Attach the weekday and month names of each trading date as categorical features.
candles = candles.assign(
    day_of_week=lambda df: df["date"].dt.day_name(),
    month=lambda df: df["date"].dt.month_name(),
)
candles

Unnamed: 0,date,open,high,low,close,prev_close,gap,gap_percent,gap_filled,day_of_week,month
0,2000-01-04,95.2309,95.5834,92.6495,92.7221,96.4957,-1.2648,-1.310732,False,Tuesday,January
1,2000-01-05,92.8465,93.9039,91.0634,92.8880,92.7221,0.1244,0.134164,True,Wednesday,January
2,2000-01-06,92.6392,93.8832,91.3951,91.3951,92.8880,-0.2488,-0.267849,True,Thursday,January
3,2000-01-07,93.0953,96.7030,92.9295,96.7030,91.3951,1.7002,1.860275,False,Friday,January
4,2000-01-10,97.0348,97.4701,96.2261,97.0348,96.7030,0.3318,0.343112,True,Monday,January
...,...,...,...,...,...,...,...,...,...,...,...
5093,2022-07-25,395.7500,396.4700,393.2100,395.5700,395.0900,0.6600,0.167051,True,Monday,July
5094,2022-07-26,393.8400,394.0600,389.9500,390.8900,395.5700,-1.7300,-0.437344,False,Tuesday,July
5095,2022-07-27,394.3600,402.8800,394.0500,401.0400,390.8900,3.4700,0.887718,False,Wednesday,July
5096,2022-07-28,401.8900,406.8000,398.1500,406.0700,401.0400,0.8500,0.211949,True,Thursday,July


In [7]:
# Bucket |gap_percent| into fixed-width size classes.
# Bucket i starts at min_val + i * interval_size and is labelled with that start
# plus half an interval (0.1, 0.2, ...). The last bucket is open-ended: it
# collects every gap larger than its lower edge and keeps the label 1.0.
number_of_buckets = 10
interval_size = 0.1
min_val = 0.05

# Derive the edges from integer bucket indices instead of a float-stepped
# np.arange: with a non-integer step the number of generated elements is not
# reliable, which would leave bins and labels with mismatched lengths.
bucket_starts = min_val + interval_size * np.arange(number_of_buckets)
cut_bins = [round(edge, 2) for edge in bucket_starts] + [np.inf]
cut_labels = [round(edge + interval_size / 2, 2) for edge in bucket_starts]

# include_lowest=True makes the first bucket closed on the left, so a gap of
# exactly min_val (which the earlier ">= 0.05" row filter keeps) lands in the
# first bucket instead of becoming NaN.
candles["gap_size"] = pd.cut(
    candles["gap_percent"].abs(),
    bins=cut_bins,
    labels=cut_labels,
    include_lowest=True,
)
candles

Unnamed: 0,date,open,high,low,close,prev_close,gap,gap_percent,gap_filled,day_of_week,month,gap_size
0,2000-01-04,95.2309,95.5834,92.6495,92.7221,96.4957,-1.2648,-1.310732,False,Tuesday,January,1.0
1,2000-01-05,92.8465,93.9039,91.0634,92.8880,92.7221,0.1244,0.134164,True,Wednesday,January,0.1
2,2000-01-06,92.6392,93.8832,91.3951,91.3951,92.8880,-0.2488,-0.267849,True,Thursday,January,0.3
3,2000-01-07,93.0953,96.7030,92.9295,96.7030,91.3951,1.7002,1.860275,False,Friday,January,1.0
4,2000-01-10,97.0348,97.4701,96.2261,97.0348,96.7030,0.3318,0.343112,True,Monday,January,0.3
...,...,...,...,...,...,...,...,...,...,...,...,...
5093,2022-07-25,395.7500,396.4700,393.2100,395.5700,395.0900,0.6600,0.167051,True,Monday,July,0.2
5094,2022-07-26,393.8400,394.0600,389.9500,390.8900,395.5700,-1.7300,-0.437344,False,Tuesday,July,0.4
5095,2022-07-27,394.3600,402.8800,394.0500,401.0400,390.8900,3.4700,0.887718,False,Wednesday,July,0.9
5096,2022-07-28,401.8900,406.8000,398.1500,406.0700,401.0400,0.8500,0.211949,True,Thursday,July,0.2


In [8]:
# Fill / no-fill counts per gap-size bucket, expressed as a percentage of each bucket.
gap_fill_by_size = candles.groupby(["gap_size", "gap_filled"]).size()
# transform("sum") broadcasts each bucket's total back onto the original index,
# so the result keeps the (gap_size, gap_filled) index regardless of how the
# installed pandas version treats group keys in groupby.apply.
gap_fill_by_size / gap_fill_by_size.groupby(level="gap_size").transform("sum") * 100

gap_size  gap_filled
0.1       False         10.982659
          True          89.017341
0.2       False         26.460860
          True          73.539140
0.3       False         31.367292
          True          68.632708
0.4       False         37.407407
          True          62.592593
0.5       False         44.418052
          True          55.581948
0.6       False         49.013158
          True          50.986842
0.7       False         48.945148
          True          51.054852
0.8       False         58.064516
          True          41.935484
0.9       False         57.246377
          True          42.753623
1.0       False         62.908497
          True          37.091503
dtype: float64

The gap fill rate decreases as the gap size increases.

In [9]:
# Fill / no-fill counts per calendar month, expressed as a percentage of each month.
gap_fill_by_month = candles.groupby(["month", "gap_filled"]).size()
# transform("sum") broadcasts each month's total back onto the original index,
# so the result keeps the (month, gap_filled) index regardless of how the
# installed pandas version treats group keys in groupby.apply.
gap_fill_by_month / gap_fill_by_month.groupby(level="month").transform("sum") * 100

month      gap_filled
April      False         33.333333
           True          66.666667
August     False         35.147392
           True          64.852608
December   False         35.960591
           True          64.039409
February   False         29.743590
           True          70.256410
January    False         32.619048
           True          67.380952
July       False         36.986301
           True          63.013699
June       False         33.634312
           True          66.365688
March      False         36.129032
           True          63.870968
May        False         38.073394
           True          61.926606
November   False         37.121212
           True          62.878788
October    False         34.909910
           True          65.090090
September  False         38.847118
           True          61.152882
dtype: float64

Month has no discernible effect on gap fill rate.

In [10]:
# Fill / no-fill counts per weekday, expressed as a percentage of each weekday.
gap_fill_by_day_of_week = candles.groupby(["day_of_week", "gap_filled"]).size()
# transform("sum") broadcasts each weekday's total back onto the original index,
# so the result keeps the (day_of_week, gap_filled) index regardless of how the
# installed pandas version treats group keys in groupby.apply.
gap_fill_by_day_of_week / gap_fill_by_day_of_week.groupby(level="day_of_week").transform("sum") * 100

day_of_week  gap_filled
Friday       False         35.009671
             True          64.990329
Monday       False         41.393875
             True          58.606125
Thursday     False         34.387352
             True          65.612648
Tuesday      False         35.627907
             True          64.372093
Wednesday    False         30.194175
             True          69.805825
dtype: float64

Monday has a slightly lower gap fill rate.

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

In [12]:
# Feature matrix: the gap-size bucket followed by one indicator column per
# weekday name and per month name (one-hot encoding of the categorical features).
day_of_week = pd.get_dummies(candles["day_of_week"])
month = pd.get_dummies(candles["month"])
x = pd.concat([candles[["gap_size"]], day_of_week, month], axis=1)
x

Unnamed: 0,gap_size,Friday,Monday,Thursday,Tuesday,Wednesday,April,August,December,February,January,July,June,March,May,November,October,September
0,1.0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
1,0.1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0
2,0.3,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,1.0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,0.3,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5093,0.2,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
5094,0.4,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
5095,0.9,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
5096,0.2,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [13]:
# Target labels: turn the boolean gap_filled flag into readable class names.
class_names = {True: "Filled", False: "NoFill"}
y = candles["gap_filled"].map(class_names)
y

0       NoFill
1       Filled
2       Filled
3       NoFill
4       Filled
         ...  
5093    Filled
5094    NoFill
5095    NoFill
5096    Filled
5097    NoFill
Name: gap_filled, Length: 5098, dtype: object

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Fit a logistic-regression classifier with scikit-learn's default settings.
logistic = LogisticRegression()
logistic.fit(X=x_train, y=y_train)

In [16]:
# Score the logistic model on the held-out split.
logistic_predictions = logistic.predict(X=x_test)
print(
    "Logistic Regression accuracy {:.1%}".format(
        metrics.accuracy_score(y_true=y_test, y_pred=logistic_predictions)
    )
)

Logistic Regression accuracy 66.7%


In [17]:
# Plot the logistic model's confusion matrix for the held-out split.
metrics.ConfusionMatrixDisplay.from_estimator(
    estimator=logistic,
    X=x_test,
    y=y_test,
)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7fbb8f5aed90>

In [18]:
# Fit a linear support-vector classifier.
# random_state is pinned (same seed as the train/test split) because LinearSVC
# shuffles the data with a pseudo-random generator during optimisation; without
# a seed the fitted model, and therefore the reported accuracy, can differ
# between runs.
svm_model = svm.LinearSVC(random_state=42)
svm_model.fit(x_train, y_train)

In [19]:
# Score the linear SVM on the held-out split.
svm_predictions = svm_model.predict(X=x_test)
print(
    "SVM accuracy {:.1%}".format(
        metrics.accuracy_score(y_true=y_test, y_pred=svm_predictions)
    )
)

SVM accuracy 66.8%


In [20]:
# Fit a shallow decision tree (at most 4 levels).
# random_state is pinned (same seed as the train/test split) because the tree
# builder permutes the features randomly at every split, so ties between
# equally good splits would otherwise be broken differently from run to run.
tree = DecisionTreeClassifier(max_depth=4, random_state=42)
tree.fit(x_train, y_train)

In [21]:
# Score the decision tree on the held-out split. The printed label names the
# model that was actually fitted: a single DecisionTreeClassifier, not a random forest.
tree_predictions = tree.predict(x_test)
print("Decision Tree accuracy {:.1%}".format(metrics.accuracy_score(y_test, tree_predictions)))

Random Forest accuracy 67.3%


In [22]:
# Partial gap fill: for gaps that did NOT fill completely, measure how far the
# price moved back toward the previous close.
#   remaining_gap            - what is still left of the gap at the day's closest
#                              approach to prev_close: low - prev_close for up-gaps,
#                              high - prev_close for down-gaps, NaN for filled gaps
#   partial_gap_fill_percent - 1 - remaining_gap / gap, i.e. the retraced share of
#                              the gap as a fraction (0.28 means 28%), not on a 0-100 scale
# np.select takes the first matching condition, so the "filled" test comes first.
conditions = [candles["gap_filled"], candles["gap_percent"] > 0, candles["gap_percent"] < 0]
choices = [np.nan, candles["low"] - candles["prev_close"], candles["high"] - candles["prev_close"]]

candles["remaining_gap"] = np.select(conditions, choices)
candles["partial_gap_fill_percent"] = 1 - candles["remaining_gap"] / candles["gap"]
# Keep only the rows with a remaining gap (the gaps that did not fill). Limiting
# dropna to this column avoids silently discarding unfilled gaps that merely
# have a NaN in some unrelated column.
partial_fill = candles.dropna(subset=["remaining_gap"])
partial_fill

Unnamed: 0,date,open,high,low,close,prev_close,gap,gap_percent,gap_filled,day_of_week,month,gap_size,remaining_gap,partial_gap_fill_percent
0,2000-01-04,95.2309,95.5834,92.6495,92.7221,96.4957,-1.2648,-1.310732,False,Tuesday,January,1.0,-0.9123,0.278700
3,2000-01-07,93.0953,96.7030,92.9295,96.7030,91.3951,1.7002,1.860275,False,Friday,January,1.0,1.5344,0.097518
5,2000-01-11,96.7445,96.9311,95.2102,95.8737,97.0348,-0.2903,-0.299171,False,Tuesday,January,0.3,-0.1037,0.642783
7,2000-01-13,95.8529,96.7030,95.0650,96.2054,94.9199,0.9330,0.982934,False,Thursday,January,1.0,0.1451,0.844480
8,2000-01-14,97.2213,97.8434,96.8481,97.5116,96.2054,1.0159,1.055970,False,Friday,January,1.0,0.6427,0.367359
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5088,2022-07-15,382.5500,385.2500,380.5400,385.1300,377.9100,4.6400,1.227806,False,Friday,July,1.0,2.6300,0.433190
5090,2022-07-19,386.0800,392.8700,385.3900,392.2700,381.9500,4.1300,1.081293,False,Tuesday,July,1.0,3.4400,0.167070
5094,2022-07-26,393.8400,394.0600,389.9500,390.8900,395.5700,-1.7300,-0.437344,False,Tuesday,July,0.4,-1.5100,0.127168
5095,2022-07-27,394.3600,402.8800,394.0500,401.0400,390.8900,3.4700,0.887718,False,Wednesday,July,0.9,3.1600,0.089337


In [23]:
# Mean retraced share over all unfilled gaps, then its 30th percentile per gap-size bucket.
retraced_share = partial_fill["partial_gap_fill_percent"]
print("Naive partial gap fill reaches {:.2f}% on average".format(retraced_share.mean()*100))
print(retraced_share.groupby(partial_fill["gap_size"]).quantile(0.3))

Naive partial gap fill reaches 49.26% on average
gap_size
0.1    0.315509
0.2    0.333333
0.3    0.282632
0.4    0.236374
0.5    0.245923
0.6    0.272563
0.7    0.231221
0.8    0.254145
0.9    0.194804
1.0    0.220689
Name: partial_gap_fill_percent, dtype: float64


The 30th percentile is roughly 20–30% across the gap-size buckets (i.e., about 70% of the gaps that did not fill still retraced at least roughly 20% of the gap). This implies that there is still a trading opportunity when the gap size is large, provided a more conservative target price is used.