### Predicting Stock market Movement with markov Chain

In [1]:
%matplotlib inline

import io, base64, os, json, re
import pandas as pd
import numpy as np
import datetime
from random import randint
import ast
from tqdm import tqdm

import matplotlib
import matplotlib.pyplot as plt

In [2]:
stock_data = pd.read_csv('data/HINDUNILVR.csv')

In [3]:
stock_data.head()

Unnamed: 0,Date,Symbol,Series,Prev Close,Open,High,Low,Last,Close,VWAP,Volume,Turnover,Trades,Deliverable Volume,%Deliverble
0,2000-01-03,HINDLEVER,EQ,2136.6,2300.0,2307.55,2300.0,2307.55,2307.55,2307.08,24903,5745330000000.0,,,
1,2000-01-04,HINDLEVER,EQ,2307.55,2381.0,2381.0,2250.0,2270.0,2281.05,2298.15,58514,13447400000000.0,,,
2,2000-01-05,HINDLEVER,EQ,2281.05,2200.0,2250.0,2105.0,2183.5,2199.9,2211.17,80296,17754820000000.0,,,
3,2000-01-06,HINDLEVER,EQ,2199.9,2224.9,2360.0,2210.0,2300.0,2288.2,2301.8,190032,43741570000000.0,,,
4,2000-01-07,HINDLEVER,EQ,2288.2,2300.0,2452.0,2282.25,2395.0,2412.95,2380.24,192582,45839190000000.0,,,


In [4]:
stock_data.columns

Index(['Date', 'Symbol', 'Series', 'Prev Close', 'Open', 'High', 'Low', 'Last',
       'Close', 'VWAP', 'Volume', 'Turnover', 'Trades', 'Deliverable Volume',
       '%Deliverble'],
      dtype='object')

In [5]:
stock_data = stock_data[['Date','Open', 'High', 'Low','Close','Volume']]
stock_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2000-01-03,2300.0,2307.55,2300.0,2307.55,24903
1,2000-01-04,2381.0,2381.0,2250.0,2281.05,58514
2,2000-01-05,2200.0,2250.0,2105.0,2199.9,80296
3,2000-01-06,2224.9,2360.0,2210.0,2288.2,190032
4,2000-01-07,2300.0,2452.0,2282.25,2412.95,192582


In [6]:
randint(10, 30)

12

In [7]:
stock_data.shape

(5306, 6)

In [8]:
stock_data['Open_Change'] = (stock_data['Open'] - stock_data['Open'].shift(1)) / stock_data['Open'].shift(1)
stock_data['High_Change'] = (stock_data['High'] - stock_data['High'].shift(1)) / stock_data['High'].shift(1)
stock_data['Low_Change'] = (stock_data['Low'] - stock_data['Low'].shift(1)) / stock_data['Low'].shift(1)
stock_data['Close_Change'] = (stock_data['Close'] - stock_data['Close'].shift(1)) / stock_data['Close'].shift(1)
stock_data['Volume_Change'] = (stock_data['Volume'] - stock_data['Volume'].shift(1)) / stock_data['Volume'].shift(1)

In [9]:
stock_data.dropna(inplace=True)
stock_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Open_Change,High_Change,Low_Change,Close_Change,Volume_Change
1,2000-01-04,2381.0,2381.0,2250.0,2281.05,58514,0.035217,0.03183,-0.021739,-0.011484,1.349677
2,2000-01-05,2200.0,2250.0,2105.0,2199.9,80296,-0.076018,-0.055019,-0.064444,-0.035576,0.372253
3,2000-01-06,2224.9,2360.0,2210.0,2288.2,190032,0.011318,0.048889,0.049881,0.040138,1.366643
4,2000-01-07,2300.0,2452.0,2282.25,2412.95,192582,0.033754,0.038983,0.032692,0.054519,0.013419
5,2000-01-10,2445.0,2499.0,2410.0,2431.25,86591,0.063043,0.019168,0.055975,0.007584,-0.550368


In [10]:
# Creating Flags based on percentage changes to build transition matrix

stock_data['Open_Ind'] = pd.qcut(stock_data['Open_Change'], 3, labels=["L", "M", "H"])
stock_data['High_Ind'] = pd.qcut(stock_data['High_Change'], 3, labels=["L", "M", "H"])
stock_data['Low_Ind'] = pd.qcut(stock_data['Low_Change'], 3, labels=["L", "M", "H"])
stock_data['Close_Ind'] = pd.qcut(stock_data['Close_Change'], 3, labels=["L", "M", "H"])
stock_data['Volume_Ind'] = pd.qcut(stock_data['Volume_Change'], 3, labels=["L", "M", "H"])

stock_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Open_Change,High_Change,Low_Change,Close_Change,Volume_Change,Open_Ind,High_Ind,Low_Ind,Close_Ind,Volume_Ind
1,2000-01-04,2381.0,2381.0,2250.0,2281.05,58514,0.035217,0.03183,-0.021739,-0.011484,1.349677,H,H,L,L,H
2,2000-01-05,2200.0,2250.0,2105.0,2199.9,80296,-0.076018,-0.055019,-0.064444,-0.035576,0.372253,L,L,L,L,H
3,2000-01-06,2224.9,2360.0,2210.0,2288.2,190032,0.011318,0.048889,0.049881,0.040138,1.366643,H,H,H,H,H
4,2000-01-07,2300.0,2452.0,2282.25,2412.95,192582,0.033754,0.038983,0.032692,0.054519,0.013419,H,H,H,H,M
5,2000-01-10,2445.0,2499.0,2410.0,2431.25,86591,0.063043,0.019168,0.055975,0.007584,-0.550368,H,H,H,H,L


In [11]:
cols = ['Open_Ind','High_Ind','Low_Ind','Close_Ind','Volume_Ind']
stock_data['predictor_flag'] = stock_data[cols].apply(lambda row: ''.join(row.values.astype(str)), axis=1)
stock_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Open_Change,High_Change,Low_Change,Close_Change,Volume_Change,Open_Ind,High_Ind,Low_Ind,Close_Ind,Volume_Ind,predictor_flag
1,2000-01-04,2381.0,2381.0,2250.0,2281.05,58514,0.035217,0.03183,-0.021739,-0.011484,1.349677,H,H,L,L,H,HHLLH
2,2000-01-05,2200.0,2250.0,2105.0,2199.9,80296,-0.076018,-0.055019,-0.064444,-0.035576,0.372253,L,L,L,L,H,LLLLH
3,2000-01-06,2224.9,2360.0,2210.0,2288.2,190032,0.011318,0.048889,0.049881,0.040138,1.366643,H,H,H,H,H,HHHHH
4,2000-01-07,2300.0,2452.0,2282.25,2412.95,192582,0.033754,0.038983,0.032692,0.054519,0.013419,H,H,H,H,M,HHHHM
5,2000-01-10,2445.0,2499.0,2410.0,2431.25,86591,0.063043,0.019168,0.055975,0.007584,-0.550368,H,H,H,H,L,HHHHL


In [12]:
stock_data['movement'] = (stock_data['Open'].shift(-1) - stock_data['Open'])/stock_data['Open']
stock_data['movement_flag'] = stock_data['movement'].apply(lambda row: 0 if row<=0 else 1)
stock_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Open_Change,High_Change,Low_Change,Close_Change,Volume_Change,Open_Ind,High_Ind,Low_Ind,Close_Ind,Volume_Ind,predictor_flag,movement,movement_flag
1,2000-01-04,2381.0,2381.0,2250.0,2281.05,58514,0.035217,0.03183,-0.021739,-0.011484,1.349677,H,H,L,L,H,HHLLH,-0.076018,0
2,2000-01-05,2200.0,2250.0,2105.0,2199.9,80296,-0.076018,-0.055019,-0.064444,-0.035576,0.372253,L,L,L,L,H,LLLLH,0.011318,1
3,2000-01-06,2224.9,2360.0,2210.0,2288.2,190032,0.011318,0.048889,0.049881,0.040138,1.366643,H,H,H,H,H,HHHHH,0.033754,1
4,2000-01-07,2300.0,2452.0,2282.25,2412.95,192582,0.033754,0.038983,0.032692,0.054519,0.013419,H,H,H,H,M,HHHHM,0.063043,1
5,2000-01-10,2445.0,2499.0,2410.0,2431.25,86591,0.063043,0.019168,0.055975,0.007584,-0.550368,H,H,H,H,L,HHHHL,0.002045,1


In [13]:
stock_data.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Open_Change',
       'High_Change', 'Low_Change', 'Close_Change', 'Volume_Change',
       'Open_Ind', 'High_Ind', 'Low_Ind', 'Close_Ind', 'Volume_Ind',
       'predictor_flag', 'movement', 'movement_flag'],
      dtype='object')

In [14]:
stock_data = stock_data [['Date','predictor_flag','movement_flag']]
stock_data.head()

Unnamed: 0,Date,predictor_flag,movement_flag
1,2000-01-04,HHLLH,0
2,2000-01-05,LLLLH,1
3,2000-01-06,HHHHH,1
4,2000-01-07,HHHHM,1
5,2000-01-10,HHHHL,1


In [15]:
# Enumerating flags

unique_combination_list = stock_data['predictor_flag'].unique().tolist()
predictor_flag_to_idx = {}

for i in range(len(unique_combination_list)):
    predictor_flag_to_idx[unique_combination_list[i]] = i 

predictor_flag_to_idx

{'HHLLH': 0,
 'LLLLH': 1,
 'HHHHH': 2,
 'HHHHM': 3,
 'HHHHL': 4,
 'MHLHH': 5,
 'HLMLL': 6,
 'MLLLM': 7,
 'LLMLM': 8,
 'LMLLH': 9,
 'LLHMM': 10,
 'MHHHL': 11,
 'MHMHM': 12,
 'HHMMH': 13,
 'LLLLL': 14,
 'LHHHM': 15,
 'HLLHL': 16,
 'MMLLL': 17,
 'LLLLM': 18,
 'MMLHH': 19,
 'LLMLL': 20,
 'LLLMM': 21,
 'HHHMH': 22,
 'LLMHL': 23,
 'LHMHH': 24,
 'LLHHL': 25,
 'HLLLL': 26,
 'LLLHM': 27,
 'HMHLL': 28,
 'LLLMH': 29,
 'HHHMM': 30,
 'LLHLM': 31,
 'MHHHM': 32,
 'MHLLH': 33,
 'LLLML': 34,
 'HHMLL': 35,
 'LLLHH': 36,
 'HMLLH': 37,
 'LHHHH': 38,
 'LLLHL': 39,
 'MHHHH': 40,
 'MHHLL': 41,
 'MMLHM': 42,
 'MLHHM': 43,
 'LMLLL': 44,
 'HHLLL': 45,
 'HHHLL': 46,
 'LLMML': 47,
 'LMLHH': 48,
 'HMHML': 49,
 'MHHMM': 50,
 'MLLHM': 51,
 'MLHLL': 52,
 'LMLHL': 53,
 'LMLHM': 54,
 'HLMML': 55,
 'LHLHM': 56,
 'HLHLM': 57,
 'MLMML': 58,
 'MLMLH': 59,
 'HLHLL': 60,
 'LHMHM': 61,
 'HHHML': 62,
 'MMLLM': 63,
 'LLMHM': 64,
 'HHMHH': 65,
 'HLHHL': 66,
 'HLHML': 67,
 'MLMLL': 68,
 'MLLLH': 69,
 'MLMHL': 70,
 'MMMMM': 71,
 '

In [16]:
stock_data['predictor_flag_id'] = stock_data.apply(lambda row: predictor_flag_to_idx[row['predictor_flag']], axis=1)
stock_data['predictor_flag_id'] = stock_data['predictor_flag_id'].astype(str)
stock_data.head()

Unnamed: 0,Date,predictor_flag,movement_flag,predictor_flag_id
1,2000-01-04,HHLLH,0,0
2,2000-01-05,LLLLH,1,1
3,2000-01-06,HHHHH,1,2
4,2000-01-07,HHHHM,1,3
5,2000-01-10,HHHHL,1,4


In [17]:
# To create transition matrix, we need to generate random sequence of events that preceeds a upward or downward movement
simulation_data = pd.DataFrame()

for iter in tqdm(range(0, 100000)):
    seq_len = randint(5, 10) ## to observe the effect of maximum 10 lags
    row_start = randint(0, len(stock_data)-seq_len)
    stock_slice = stock_data.iloc[row_start:row_start+seq_len]
    target_date = stock_slice['Date'].max()
    label = stock_slice[stock_slice['Date']==target_date]['movement_flag'].values[0]
    stock_slice['label'] = label
    stock_slice['final_date'] = target_date
    stock_slice['run_id'] = iter
    simulation_data = simulation_data.append(stock_slice)
    
simulation_data.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == "":
100%|██████████| 100000/100000 [33:42<00:00, 49.45it/s]


Unnamed: 0,Date,predictor_flag,movement_flag,predictor_flag_id,label,final_date,run_id
3398,2013-08-07,LLLLH,0,1,1,2013-08-22,0
3399,2013-08-08,MLMML,0,58,1,2013-08-22,0
3400,2013-08-12,MHHHM,1,32,1,2013-08-22,0
3401,2013-08-13,HHHHL,1,4,1,2013-08-22,0
3402,2013-08-14,HLHLM,0,57,1,2013-08-22,0


In [18]:
# reduce the set
simulation_data
compressed_simulation_data = simulation_data.groupby(['run_id', 
                                     'final_date'])['predictor_flag_id'].apply(lambda x: "[%s]" % ',    '.join(x)).reset_index()

compressed_simulation_data.head()

Unnamed: 0,run_id,final_date,predictor_flag_id
0,0,2013-08-22,"[1, 58, 32, 4, 57, 69, 14, ..."
1,1,2015-07-02,"[49, 40, 6, 154, 116, 2, 49,..."
2,2,2019-11-20,"[137, 37, 14, 34, 71, 73, 99..."
3,3,2008-05-06,"[2, 138, 46, 48, 103, 47, 63..."
4,4,2001-01-22,"[85, 14, 7, 18, 36, 2, 6, ..."


In [19]:
compressed_simulation_data.shape

(100000, 3)

In [20]:
compressed_simulation_data = compressed_simulation_data.merge(simulation_data[['run_id', 
                                     'final_date','label']].drop_duplicates(), how='left', on = ['run_id', 'final_date'])

compressed_simulation_data.head()

Unnamed: 0,run_id,final_date,predictor_flag_id,label
0,0,2013-08-22,"[1, 58, 32, 4, 57, 69, 14, ...",1
1,1,2015-07-02,"[49, 40, 6, 154, 116, 2, 49,...",1
2,2,2019-11-20,"[137, 37, 14, 34, 71, 73, 99...",0
3,3,2008-05-06,"[2, 138, 46, 48, 103, 47, 63...",0
4,4,2001-01-22,"[85, 14, 7, 18, 36, 2, 6, ...",1


In [21]:
compressed_simulation_data.shape

(100000, 4)

In [22]:
print(f"Min date : {stock_data.Date.min()}")
print(f"max date : {stock_data.Date.max()}")

Min date : 2000-01-04
max date : 2021-04-30


In [23]:
train_data = compressed_simulation_data[compressed_simulation_data['final_date']<='2020-12-31']
test_data = compressed_simulation_data[compressed_simulation_data['final_date']>'2020-12-31']

In [24]:
train_data.reset_index(drop=True,inplace=True)
test_data.reset_index(drop=True,inplace=True)

In [25]:
train_data.head()

Unnamed: 0,run_id,final_date,predictor_flag_id,label
0,0,2013-08-22,"[1, 58, 32, 4, 57, 69, 14, ...",1
1,1,2015-07-02,"[49, 40, 6, 154, 116, 2, 49,...",1
2,2,2019-11-20,"[137, 37, 14, 34, 71, 73, 99...",0
3,3,2008-05-06,"[2, 138, 46, 48, 103, 47, 63...",0
4,4,2001-01-22,"[85, 14, 7, 18, 36, 2, 6, ...",1


In [26]:
test_data.head()

Unnamed: 0,run_id,final_date,predictor_flag_id,label
0,70,2021-02-04,"[1, 18, 14, 73, 80, 71]",1
1,199,2021-03-31,"[116, 4, 95, 18, 1, 146, 2, ...",1
2,212,2021-02-12,"[40, 85, 18, 23, 3, 78]",0
3,544,2021-03-03,"[91, 47, 166, 112, 9, 47, 2,...",0
4,551,2021-04-01,"[34, 116, 4, 95, 18, 1, 146,...",0


In [27]:
train_data_array = []
test_data_array = []


for i in range(len(train_data)):
    predictor_list = ast.literal_eval(train_data['predictor_flag_id'][i])
    train_data_array.append(predictor_list)


for i in range(len(test_data)):
    predictor_list = ast.literal_eval(test_data['predictor_flag_id'][i])
    test_data_array.append(predictor_list)

In [28]:
train_data_array

[[1, 58, 32, 4, 57, 69, 14, 29, 30, 34],
 [49, 40, 6, 154, 116, 2, 49, 75],
 [137, 37, 14, 34, 71, 73, 99, 74, 83, 99],
 [2, 138, 46, 48, 103, 47, 63, 113],
 [85, 14, 7, 18, 36, 2, 6, 34, 93],
 [29, 83, 34, 2, 62, 0, 18],
 [123, 40, 103, 24, 57, 123, 1, 18],
 [40, 202, 121, 144, 1, 77, 30, 40],
 [2, 3, 155, 12, 186, 32, 78, 21, 93, 63],
 [4, 2, 26, 18, 20, 18],
 [87, 49, 75, 73, 14, 62, 128],
 [121, 195, 80, 63, 1],
 [195, 68, 74, 32, 3, 28, 38],
 [34, 44, 14, 97, 2, 91, 47, 40, 62],
 [3, 3, 120, 141, 94],
 [163, 152, 185, 167, 2, 100, 77, 62, 65],
 [120, 34, 152, 99, 76, 49, 76, 4, 3, 49],
 [71, 74, 63, 118, 13, 1, 14, 18],
 [132, 11, 69, 156, 157, 54, 137, 13],
 [203, 47, 58, 40, 202, 121, 144, 1],
 [128, 70, 105, 58, 71],
 [62, 36, 62, 69, 87, 136, 82],
 [1, 104, 122, 85, 104],
 [18, 1, 71, 32, 28, 21],
 [7, 21, 191, 28, 104],
 [134, 58, 69, 135, 32, 3, 91],
 [32, 3, 62, 131, 2, 3, 49, 1],
 [118, 65, 4, 161, 40, 73, 34],
 [1, 40, 4, 3, 2, 123, 2, 14, 94],
 [14, 104, 17, 69, 38],
 [1

In [29]:
y_train = train_data['label']
y_test = test_data['label']

In [30]:
## Initialize transtion matrix (A) and intial distribution function (Pi)

v = len(unique_combination_list)

A0 = np.ones((v,v))
Pi0 = np.ones(v)

A1 = np.ones((v,v))
Pi1 = np.ones(v)

In [31]:
def compute_counts(predictor_list, A, pi):
    for pred in predictor_list:
        last_idx = None
        for idx in pred:
            if last_idx is None: # For first word
                pi[idx] +=1
            else:
                A[last_idx,idx] +=1 # Count transition

            idx = last_idx

In [32]:
compute_counts([t for t,y in zip(train_data_array, y_train) if y==0],A0,Pi0)
compute_counts([t for t,y in zip(train_data_array, y_train) if y==1],A1,Pi1)

In [33]:
A0

array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

In [34]:
## Normalize the A and Pi

A0 /= A0.sum(axis=1,keepdims=True)
Pi0 /= Pi0.sum()


A1 /= A1.sum(axis=1,keepdims=True)
Pi1 /= Pi1.sum()

In [35]:
A0

array([[0.00444444, 0.00444444, 0.00444444, ..., 0.00444444, 0.00444444,
        0.00444444],
       [0.00444444, 0.00444444, 0.00444444, ..., 0.00444444, 0.00444444,
        0.00444444],
       [0.00444444, 0.00444444, 0.00444444, ..., 0.00444444, 0.00444444,
        0.00444444],
       ...,
       [0.00444444, 0.00444444, 0.00444444, ..., 0.00444444, 0.00444444,
        0.00444444],
       [0.00444444, 0.00444444, 0.00444444, ..., 0.00444444, 0.00444444,
        0.00444444],
       [0.00444444, 0.00444444, 0.00444444, ..., 0.00444444, 0.00444444,
        0.00444444]])

In [36]:
## Compute Log Probabilities

logA0 = np.log10(A0)
logPi0 = np.log10(Pi0)


logA1 = np.log10(A1)
logPi1 = np.log10(Pi1)

In [37]:
## Compute Priors

count0 = sum(y==0 for y in y_train)
count1 = sum(y==1 for y in y_train)

p0 = count0/len(y_train)
p1 = count1/len(y_train)

logp0 = np.log10(p0)
logp1 = np.log(p1)

print(p0,p1)

0.5163802908440979 0.4836197091559022


In [38]:
## Classifier

class MarkovClassifier:
    def __init__(self, logAs, logPis,logpriors):
        self.logAs = logAs
        self.logPis = logPis
        self.logpriors = logpriors

        self.k = len(logpriors) # number of classes

    def compute_log_likelihood(self, input_,class_):
        logA = self.logAs[class_]
        logPi = self.logPis[class_]

        last_idx = None
        logprob = 0
        for idx in input_:
            if last_idx is None: # For first word
                logprob += logPi[idx]
            else:
                logprob += logA[last_idx,idx] # Count transition

            idx = last_idx

        return logprob

    def predict(self, inputs):
        predictions = np.zeros(len(inputs))
        for i, input_ in enumerate(inputs):
            posterior = [self.compute_log_likelihood(input_, c) + self.logpriors[c] for c in range(self.k)]

            pred = np.argmax(posterior)

            predictions[i] = pred

        return predictions

In [39]:
clf = MarkovClassifier([logA0,logA1],[logPi0,logPi1],[logp0,logp1])

In [40]:
Ptrain = clf.predict(train_data_array)

In [41]:
print(f"Accuracy {np.mean(Ptrain==y_train)}")

Accuracy 0.5592757332033471


In [42]:
Ptest = clf.predict(test_data_array)
print(f"Accuracy {np.mean(Ptest==y_test)}")

Accuracy 0.5314136125654451
