In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import numpy as np
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

In [None]:
data = pd.read_csv('/content/drive/MyDrive/s1.csv')
data.head()

Unnamed: 0,date,time,open,high,low,close,volume,open_interest
0,2017-01-02,09:15:00,100.0,100.28,100.0,100.02,7000.0,100000.0
1,2017-01-02,09:16:00,100.02,100.07,100.02,100.06,3509.0,101212.0
2,2017-01-02,09:17:00,100.05,100.06,99.7,99.73,5105.0,101212.0
3,2017-01-02,09:18:00,99.71,99.79,99.63,99.79,4959.0,101212.0
4,2017-01-02,09:19:00,99.78,99.82,99.69,99.7,2398.0,102480.0


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 673384 entries, 0 to 673383
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   date           673384 non-null  object 
 1   time           673384 non-null  object 
 2   open           673384 non-null  float64
 3   high           673384 non-null  float64
 4   low            673384 non-null  float64
 5   close          673384 non-null  float64
 6   volume         673384 non-null  float64
 7   open_interest  673384 non-null  float64
dtypes: float64(6), object(2)
memory usage: 41.1+ MB


In [None]:
data.describe()

Unnamed: 0,open,high,low,close,volume,open_interest
count,673384.0,673384.0,673384.0,673384.0,673384.0,673384.0
mean,176.523172,176.580113,176.466306,176.52331,1474.010413,101125.616752
std,44.845303,44.850573,44.839592,44.845318,1896.214092,37082.227375
min,88.27,88.76,87.98,88.28,0.0,0.0
25%,139.49,139.53,139.46,139.49,446.0,82293.0
50%,169.29,169.35,169.23,169.29,892.0,98964.0
75%,210.53,210.6,210.46,210.53,1795.0,117471.0
max,274.03,274.17,273.91,274.03,293657.0,908767.0


In [None]:
data.date.value_counts().value_counts().sort_index()

count
220       1
271       1
316       1
317       1
347       1
358       2
365       1
368       1
370       2
371       1
372       3
373       4
374       5
375    1773
Name: count, dtype: int64

In [None]:
cts = data.groupby('date').size()
less = cts[cts < 370].reset_index().groupby(0).agg(list)
less

Unnamed: 0_level_0,date
0,Unnamed: 1_level_1
220,[2017-07-10]
271,[2021-01-11]
316,[2020-03-13]
317,[2020-03-23]
347,[2023-11-28]
358,"[2023-10-23, 2024-02-27]"
365,[2023-08-25]
368,[2024-03-27]


In [None]:
#days older than 2022 with less than 370 candles will be discarded
olddates = data[data['date'] < '2022'].groupby('date').size()
removedates = olddates[olddates < 370].index
cd = data.drop(data[data['date'].isin(removedates)].index)

In [None]:
cd.date.value_counts().value_counts().sort_index()

count
347       1
358       2
365       1
368       1
370       2
371       1
372       3
373       4
374       5
375    1773
Name: count, dtype: int64

In [None]:
cd.to_csv('/content/drive/MyDrive/cd.csv', index=False)

In [None]:
df = pd.DataFrame({'datetime': pd.to_datetime(cd['date'] + ' ' + cd['time']),
                   'day': pd.to_datetime(cd['date'] + ' ' + cd['time']).dt.dayofweek,
                   'time': cd['time'].str[:2],
                   'oc': cd['close'] - cd['open'],
                   'hl': cd['high'] - cd['low'],
                   'vol': cd['volume'],
                   'opin': cd['open_interest']})
#another column 'colour' would be added once the threshold for neutrality is decided

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 672260 entries, 0 to 673383
Data columns (total 7 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   datetime  672260 non-null  datetime64[ns]
 1   day       672260 non-null  int32         
 2   time      672260 non-null  object        
 3   oc        672260 non-null  float64       
 4   hl        672260 non-null  float64       
 5   vol       672260 non-null  float64       
 6   opin      672260 non-null  float64       
dtypes: datetime64[ns](1), float64(4), int32(1), object(1)
memory usage: 38.5+ MB


In [None]:
n = 10
df['ema'] = np.zeros(len(df))
dates = cd['date'].unique()
df['ema'] = cd['close'].ewm(span=n, adjust=False).mean()

In [None]:
df['opnorm'] = cd['open'] - df['ema']
df['hinorm'] = cd['high'] - df['ema']
df['lonorm'] = cd['low'] - df['ema']
df['clnorm'] = cd['close'] - df['ema']
df.tail(15)

Unnamed: 0,datetime,day,time,oc,hl,vol,opin,ema,opnorm,hinorm,lonorm,clnorm
673369,2024-05-24 15:15:00,4,15,-0.04,0.23,3688.0,111354.0,269.42292,0.00708,0.08708,-0.14292,-0.03292
673370,2024-05-24 15:16:00,4,15,0.0,0.1,1243.0,111900.0,269.426025,0.013975,0.083975,-0.016025,0.013975
673371,2024-05-24 15:17:00,4,15,-0.04,0.11,544.0,111900.0,269.417657,0.002343,0.062343,-0.047657,-0.037657
673372,2024-05-24 15:18:00,4,15,-0.03,0.13,934.0,111900.0,269.405356,-0.025356,0.024644,-0.105356,-0.055356
673373,2024-05-24 15:19:00,4,15,0.06,0.11,866.0,111995.0,269.408018,-0.048018,0.051982,-0.058018,0.011982
673374,2024-05-24 15:20:00,4,15,0.04,0.07,835.0,111995.0,269.41747,0.00253,0.05253,-0.01747,0.04253
673375,2024-05-24 15:21:00,4,15,-0.01,0.06,792.0,111995.0,269.423384,0.036616,0.046616,-0.013384,0.026616
673376,2024-05-24 15:22:00,4,15,-0.04,0.09,590.0,111655.0,269.420951,0.029049,0.029049,-0.060951,-0.010951
673377,2024-05-24 15:23:00,4,15,0.01,0.04,827.0,111655.0,269.417141,-0.027141,-0.017141,-0.057141,-0.017141
673378,2024-05-24 15:24:00,4,15,-0.03,0.07,2121.0,111655.0,269.40857,-0.00857,0.02143,-0.04857,-0.03857


In [None]:
#data subsets for checking clusters and running EDA
usecd = cd.tail(60000)
usedf = df.tail(60000)

In [None]:
#collapsing plots not needed now
#they affected performance negatively
'''import plotly.graph_objects as go
fig = go.Figure(data=[go.Candlestick(x=usedf['datetime'],
                open=usecd['open'],
                high=usecd['high'],
                low=usecd['low'],
                close=usecd['close'])])
fig.show()'''

'''fig = go.Figure(data=[go.Candlestick(x=usedf['datetime'],
                open=usedf['opnorm'],
                high=usedf['hinorm'],
                low=usedf['lonorm'],
                close=usedf['clnorm'])])
fig.show()'''

"fig = go.Figure(data=[go.Candlestick(x=usedf['datetime'],\n                open=usedf['opnorm'],\n                high=usedf['hinorm'],\n                low=usedf['lonorm'],\n                close=usedf['clnorm'])])\nfig.show()"

In [None]:
#scatter plots

'''plt.scatter(usedf['day'], usedf['oc'])
plt.show()'''
#tuesday had more spikes and falls, saturday is latent, more fluctuations are at the start and end of work week

'''sns.boxplot(x='time', y='oc', data=usedf)
plt.xlabel('time')
plt.ylabel('C-O')
plt.title('Boxplot with Categories on X-axis')
plt.show()
plt.show()'''
#high volatility in first two hours and the last hour before market closing, tranquil at the closing and during mealtime
#volatility generally sees a slowing trend

'''plt.scatter(usecd['open_interest'], usedf['oc'])
plt.show()'''
#needs more analysis

'''plt.scatter(usedf['oc'], usedf['vol'])
plt.show()'''
#this relationship needs more analysis

'''plt.scatter(usedf['day'], usedf['vol'])
plt.show()'''
#trading goes slow on saturday
#tuesday, which had been found to be volatile, has fewer transactions
#wednesday was tranquil, but has volume spikes
#wednesday is the day for prudent trading, as fluctuations are low and volume is high

'''plt.scatter(df['time'].str[:2], df['vol'])
plt.show()'''
#volume slumps in midday, and reduces at the closing time

'''plt.scatter(usedf['vol'], usecd['open_interest'])
plt.show()'''
#more analysis is required

'''plt.scatter(usedf['ema'], usedf['oc'])
plt.show()'''
#more clarity is needed

'''plt.scatter(df['oc'], df['hl'])
plt.show()'''
#as expected, high hl leads to high oc magnitude

'''plt.scatter(usedf['lonorm'], usedf['oc'])
plt.show()'''
#obvious: oc and clnorm are linearly related, oc and opnorm are inversely related
#+ve hinorm and -ve lonorm usually lead to +ve oc

'''plt.scatter(usedf['ema'], usedf['vol'])
plt.show()'''
#more analysis is needed

'''plt.plot(usedf['ema'], usecd['open_interest'])
plt.show()'''
#caught off guard, needs to be scrutinised

'''plt.scatter(usedf['hinorm'], usedf['clnorm'])
plt.show()'''
#OHCL relations: nothing interesting at the first glance, may become useful later

"plt.scatter(usedf['hinorm'], usedf['clnorm'])\nplt.show()"

In [None]:
#using oc as a check for candle colour
cdf = usedf[['ema', 'oc', 'vol', 'opin']]
kmeans = KMeans(n_clusters=3)
kmeans.fit(cdf)
labels = kmeans.predict(cdf)
u_labels = np.unique(labels)
pcdf = PCA(2).fit_transform(cdf)
'''for i in u_labels:
    plt.scatter(pcdf[labels == i, 0], pcdf[labels == i, 1], label = i)
plt.legend()
plt.show()'''

'for i in u_labels:\n    plt.scatter(pcdf[labels == i, 0], pcdf[labels == i, 1], label = i)\nplt.legend()\nplt.show()'

In [None]:
kmeans.cluster_centers_

array([[ 2.53402350e+02,  7.02390886e-04,  9.14430142e+02,
         1.15908443e+05],
       [ 2.58157180e+02, -5.47180346e-04,  8.82946119e+02,
         9.23707342e+04],
       [ 2.52583907e+02,  8.65136493e-04,  1.29145919e+03,
         1.51800389e+05]])

In [None]:
#distribution plots

'''mean = np.mean(usedf[['day', 'oc']])
std = np.std(usedf[['day', 'oc']])
pdf = stats.norm.pdf(usedf[['day', 'oc']], mean, std)
plt.plot(usedf[['day', 'oc']], pdf)
plt.show()'''
#usedf[['day', 'oc']].plot.kde()
'''k = stats.gaussian_kde(usedf['day'], 'oc']])
plt.scatter(usedf[['day', 'oc']], k)
plt.show()'''
#sns.displot(usedf, x='day', y='oc')

#sns.displot(usedf, x='time', y='oc')
#sns.displot(usedf, x='opin', y='oc')
#sns.displot(usedf, x='vol', y='oc')
#sns.displot(usedf, x='day', y='vol')
#sns.displot(usedf, x='time', y='vol')
#sns.displot(usedf, x='opin', y='vol')
#sns.displot(usedf, x='ema', y='oc')
#sns.displot(usedf, x='ema', y='vol')
#sns.displot(usedf, x='ema', y='opin')
#sns.displot(df, x='ema', y='opin')

"k = stats.gaussian_kde(usedf['day'], 'oc']])\nplt.scatter(usedf[['day', 'oc']], k)\nplt.show()"

In [None]:
df[['ema_1st_grad', 'ema_2nd_grad', 'vol_1st_grad', 'vol_2nd_grad', 'vol_3rd_grad', 'opin_1st_grad', 'opin_2nd_grad']] = np.zeros((len(df), 7))

df['ema_1st_grad'] = df['ema'].rolling(window=n, min_periods=1).apply(lambda x: np.gradient(x)[-1] if len(x) > 1 else np.array([x]), raw=True)

df['ema_2nd_grad'] = df['ema_1st_grad'].rolling(window=n, min_periods=1).apply(lambda x: np.gradient(x)[-1] if len(x) > 1 else np.array([x]), raw=True)

df['vol_1st_grad'] = df['vol'].rolling(window=n, min_periods=1).apply(lambda x: np.gradient(x)[-1] if len(x) > 1 else np.array([x]), raw=True)

df['vol_2nd_grad'] = df['vol_1st_grad'].rolling(window=n, min_periods=1).apply(lambda x: np.gradient(x)[-1] if len(x) > 1 else np.array([x]), raw=True)

df['vol_3rd_grad'] = df['vol_2nd_grad'].rolling(window=n, min_periods=1).apply(lambda x: np.gradient(x)[-1] if len(x) > 1 else np.array([x]), raw=True)

df['opin_1st_grad'] = df['opin'].rolling(window=n, min_periods=1).apply(lambda x: np.gradient(x)[-1] if len(x) > 1 else np.array([x]), raw=True)

df['opin_2nd_grad'] = df['opin_1st_grad'].rolling(window=n, min_periods=1).apply(lambda x: np.gradient(x)[-1] if len(x) > 1 else np.array([x]), raw=True)

In [None]:
df[['ema_1st_grad', 'ema_2nd_grad', 'vol_1st_grad', 'vol_2nd_grad', 'vol_3rd_grad', 'opin_1st_grad', 'opin_2nd_grad']].head(15)

Unnamed: 0,ema_1st_grad,ema_2nd_grad,vol_1st_grad,vol_2nd_grad,vol_3rd_grad,opin_1st_grad,opin_2nd_grad
0,100.02,100.02,7000.0,7000.0,7000.0,100000.0,100000.0
1,0.007273,-100.012727,-3491.0,-10491.0,-17491.0,1212.0,-98788.0
2,-0.05405,-0.061322,1596.0,5087.0,15578.0,0.0,-1212.0
3,-0.033313,0.020736,-146.0,-1742.0,-6829.0,0.0,0.0
4,-0.04362,-0.010307,-2561.0,-2415.0,-673.0,1268.0,1268.0
5,-0.062962,-0.019342,2613.0,5174.0,7589.0,0.0,-1268.0
6,-0.018787,0.044175,-1093.0,-3706.0,-8880.0,0.0,0.0
7,-0.02628,-0.007493,-1134.0,-41.0,3665.0,664.0,664.0
8,-0.028775,-0.002495,-930.0,204.0,245.0,0.0,-664.0
9,-0.023543,0.005232,-597.0,333.0,129.0,0.0,0.0


In [None]:
df[['sma', 'vwap', 'macd-signal', '50sma-200sma', 'ultimate_oscillator', 'disparity_index', 'obv', 'a/d', 'rsi', 'adx', 'vol_freq', 'hi_freq', 'lo_freq']] = np.zeros((len(df), 13))

df['sma'] = cd['close'].rolling(window=n, min_periods=1).mean()

cvp = (cd['volume'] * cd['close']).cumsum()
cv = cd['volume'].cumsum()
df['vwap'] = cvp / cv

df['disparity_index'] = ((cd['close'] - df['sma']) / df['sma']) * 100

clv = ((cd['close'] - cd['low']) - (cd['high'] - cd['close'])) / (cd['high'] - cd['low'])
clv = clv.fillna(0)
df['a/d'] = (clv * df['vol']).cumsum()

df['vol_freq'] = df['vol'].rolling(window=n, min_periods=1).apply(lambda x: x.value_counts().get(x.iloc[-1], 0))

df['hi_freq'] = cd['high'].rolling(window=n, min_periods=1).apply(lambda x: x.value_counts().get(x.iloc[-1], 0))

df['lo_freq'] = cd['low'].rolling(window=n, min_periods=1).apply(lambda x: x.value_counts().get(x.iloc[-1], 0))

In [None]:
df[['sma', 'vwap', 'macd-signal']].head(15)

Unnamed: 0,sma,vwap,macd-signal
0,100.02,100.02,0.0
1,100.04,100.033356,0.0
2,99.936667,99.934174,0.0
3,99.9,99.899422,0.0
4,99.86,99.878603,0.0
5,99.808333,99.819757,0.0
6,99.797143,99.808733,0.0
7,99.78125,99.797597,0.0
8,99.764444,99.789093,0.0
9,99.751,99.783802,0.0


In [None]:
macd = cd['close'].ewm(span=12, adjust=False).mean() - cd['close'].ewm(span=26, adjust=False).mean()
df['macd-signal'] = macd - macd.ewm(span=9, adjust=False).mean()

In [None]:
sma50 = cd['close'].rolling(window=50, min_periods=1).mean()
sma200 = cd['close'].rolling(window=200, min_periods=1).mean()
df['50sma-200sma'] = sma50 - sma200

In [None]:
priorclose = cd['close'].shift(1).fillna(0)
bp = cd['close'] - pd.DataFrame({'l': cd['low'], 'pc': priorclose}).min(axis=1)
tr = pd.DataFrame({'h': cd['high'], 'pc': priorclose}).max(axis=1) - pd.DataFrame({'l': cd['low'], 'pc': priorclose}).min(axis=1)
epsilon = 1e-10
tr = tr.replace(0, epsilon)
avg1 = bp.rolling(window=7, min_periods=1).sum() / tr.rolling(window=7, min_periods=1).sum()
avg2 = bp.rolling(window=14, min_periods=1).sum() / tr.rolling(window=14, min_periods=1).sum()
avg3 = bp.rolling(window=28, min_periods=1).sum() / tr.rolling(window=28, min_periods=1).sum()
df['ultimate_oscillator'] = 100 * (4 * avg1 + 2 * avg2 + avg3) / (4 + 2 + 1)

In [None]:
pc = cd['close'].diff()
df.loc[pc > 0, 'obv'] = df['vol']
df.loc[pc < 0, 'obv'] = -df['vol']
df['obv'] = df['obv'].cumsum()

In [None]:
g = pc.where(pc > 0, 0)
l = pc.where(pc < 0, 0)
ag = g.rolling(window=n, min_periods=1).mean()
al = l.rolling(window=n, min_periods=1).mean()
al = al.replace(0, epsilon)
rs = ag / al
df['rsi'] = 100 - (100 / (1 + rs))
df['rsi'].iloc[0] = 50

In [None]:
hicl = (cd['high'] - cd['close'].shift(1)).abs().fillna(0)
locl = (cd['low'] - cd['close'].shift(1)).abs().fillna(0)
tr = pd.DataFrame({'hl': df['hl'], 'hicl': hicl, 'locl': locl}).max(axis=1)
up = cd['high'] - cd['high'].shift(1).fillna(0)
down = cd['low'] - cd['low'].shift(1).fillna(0)
plusdm = np.where((up > down) & (up > 0), up, 0)
minusdm = np.where((down > up) & (down > 0), down, 0)
plusdm = pd.Series(plusdm, index=cd.index)
minusdm = pd.Series(minusdm, index=cd.index)
atr = tr.rolling(window=n, min_periods=1).mean()
atr = atr.replace(0, epsilon)
plusdm = plusdm.rolling(window=n, min_periods=1).mean()
minusdm = minusdm.rolling(window=n, min_periods=1).mean()
plusdi = (plusdm / atr) * 100
minusdi = (minusdm / atr) * 100
dx = (abs(plusdi - minusdi) / abs(plusdi + minusdi + epsilon)) * 100
df['adx'] = dx.rolling(window=n, min_periods=1).mean()

In [None]:
df.head(15)

Unnamed: 0,datetime,day,time,oc,hl,vol,opin,ema,opnorm,hinorm,...,50sma-200sma,ultimate_oscillator,disparity_index,obv,a/d,rsi,adx,vol_freq,hi_freq,lo_freq
0,2017-01-02 09:15:00,0,9,0.02,0.28,7000.0,100000.0,100.02,-0.02,0.26,...,0.0,7.142857,0.0,0.0,-6000.0,50.0,100.0,1.0,1.0,1.0
1,2017-01-02 09:16:00,0,9,0.04,0.05,3509.0,101212.0,100.027273,-0.007273,0.042727,...,0.0,18.181818,0.019992,3509.0,-3894.6,100.0,99.98006,1.0,1.0,1.0
2,2017-01-02 09:17:00,0,9,-0.32,0.36,5105.0,101212.0,99.973223,0.076777,0.086777,...,0.0,13.043478,-0.206798,-1596.0,-8148.766667,-13.793103,99.973413,1.0,1.0,1.0
3,2017-01-02 09:18:00,0,9,0.08,0.16,4959.0,101212.0,99.93991,-0.22991,-0.14991,...,0.0,29.411765,-0.11011,3363.0,-3189.766667,-43.478261,99.97009,1.0,1.0,1.0
4,2017-01-02 09:19:00,0,9,-0.08,0.13,2398.0,102480.0,99.89629,-0.11629,-0.07629,...,0.0,26.530612,-0.160224,965.0,-5218.84359,-31.25,99.944187,1.0,1.0,1.0
5,2017-01-02 09:20:00,0,9,-0.14,0.16,5011.0,102480.0,99.833328,-0.143328,-0.123328,...,0.0,22.807018,-0.258829,-4046.0,-10229.84359,-21.276596,99.926918,1.0,1.0,1.0
6,2017-01-02 09:21:00,0,9,0.16,0.19,3918.0,102480.0,99.814541,-0.244541,-0.084541,...,0.0,33.834586,-0.067279,-128.0,-6311.84359,-96.551724,99.914587,1.0,1.0,1.0
7,2017-01-02 09:22:00,0,9,-0.06,0.15,2784.0,103144.0,99.788261,-0.058261,-0.058261,...,0.0,40.39897,-0.111494,-2912.0,-5755.04359,-80.0,99.89539,1.0,2.0,1.0
8,2017-01-02 09:23:00,0,9,-0.04,0.12,1854.0,103144.0,99.759486,-0.089486,-0.059486,...,0.0,39.650591,-0.134762,-4766.0,-6064.04359,-71.794872,99.880458,1.0,1.0,2.0
9,2017-01-02 09:24:00,0,9,0.0,0.07,1257.0,103144.0,99.735943,-0.105943,-0.095943,...,0.0,49.333985,-0.121302,-4766.0,-5166.186447,-71.794872,99.868512,1.0,1.0,1.0


In [None]:
#saving df to avoid recomputation
df.to_csv('/content/drive/MyDrive/df.csv', index=False)