In [1]:
# pip install nsepy

In [46]:
import nsepy
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

In [3]:
# Stock list extract from Groww - One time

# NSE ticker symbols analysed in this notebook; each entry is matched against
# the "Symbol" column of the price/volume data further down.
stock_list = [
    "RELIANCE", "TCS", "HDFCBANK", "INFY", "ICICIBANK",
    "HINDUNILVR", "BAJFINANCE", "SBIN", "HDFC", "BHARTIARTL",
    "KOTAKBANK", "ITC", "WIPRO", "HCLTECH", "ASIANPAINT",
    "BAJAJFINSV", "LT", "AXISBANK", "MARUTI", "SUNPHARMA",
    "ONGC", "TITAN", "ULTRACEMCO", "JSWSTEEL", "ADANIPORTS",
    "NESTLEIND", "TATASTEEL", "POWERGRID", "TATAMOTORS", "NTPC",
    "TECHM", "HINDALCO", "IOC", "DIVISLAB", "HDFCLIFE",
    "GRASIM", "COALINDIA", "SBILIFE", "BAJAJ-AUTO", "M&M",
    "SHREECEM", "CIPLA", "BPCL", "BRITANNIA", "INDUSINDBK",
    "TATACONSUM", "DRREDDY", "EICHERMOT", "UPL", "HEROMOTOCO",
]


In [4]:
# stock_data = pd.DataFrame()

# for sym in stock_list:
#     d = nsepy.get_history(symbol=sym,start=datetime.date(2010,1,1),end=datetime.date(2021,12,31))
#     stock_data = pd.concat([stock_data, d])
    
# stock_data.to_csv('stock_data_technical.csv')

In [5]:
df = pd.read_csv("stock_data_technical.csv")

In [6]:
# Pearson correlation between the numeric columns of a single stock (SBIN),
# rendered as a colour-graded table with three decimals.
# - select_dtypes keeps string columns such as "Symbol" out of .corr(), which
#   newer pandas versions refuse to silently drop.
# - Styler.format replaces the deprecated Styler.set_precision.
(
    df[df["Symbol"] == "SBIN"]
    .select_dtypes(include="number")
    .corr(method="pearson")
    .style.background_gradient(cmap="coolwarm")
    .format("{:.3f}")
)

Unnamed: 0,Prev Close,Open,High,Low,Last,Close,VWAP,Volume,Turnover,Trades,Deliverable Volume,%Deliverble
Prev Close,1.0,0.999,0.999,0.999,0.998,0.998,0.999,-0.56,-0.227,-0.338,-0.579,0.033
Open,0.999,1.0,1.0,1.0,1.0,1.0,1.0,-0.56,-0.226,-0.338,-0.579,0.033
High,0.999,1.0,1.0,1.0,1.0,1.0,1.0,-0.559,-0.223,-0.335,-0.578,0.032
Low,0.999,1.0,1.0,1.0,1.0,1.0,1.0,-0.561,-0.229,-0.341,-0.58,0.033
Last,0.998,1.0,1.0,1.0,1.0,1.0,1.0,-0.559,-0.226,-0.338,-0.579,0.032
Close,0.998,1.0,1.0,1.0,1.0,1.0,1.0,-0.559,-0.226,-0.338,-0.579,0.032
VWAP,0.999,1.0,1.0,1.0,1.0,1.0,1.0,-0.56,-0.226,-0.338,-0.579,0.032
Volume,-0.56,-0.56,-0.559,-0.561,-0.559,-0.559,-0.56,1.0,0.84,0.903,0.898,-0.228
Turnover,-0.227,-0.226,-0.223,-0.229,-0.226,-0.226,-0.226,0.84,1.0,0.922,0.75,-0.2
Trades,-0.338,-0.338,-0.335,-0.341,-0.338,-0.338,-0.338,0.903,0.922,1.0,0.8,-0.205


In [7]:
# Variance inflation factor (VIF) for each candidate predictor, used as a
# multicollinearity check.

from statsmodels.stats.outliers_influence import variance_inflation_factor

X = df[["Prev Close", "Open", "High", "Low","Last","Close","VWAP", "Volume", "Turnover", "Deliverable Volume", "%Deliverble"]]

# One row per predictor: its name and the VIF obtained by regressing that
# column on all the other columns of X.
vif_data = pd.DataFrame(
    {
        "feature": X.columns,
        "VIF": [
            variance_inflation_factor(X.values, position)
            for position, _ in enumerate(X.columns)
        ],
    }
)
vif_data

Unnamed: 0,feature,VIF
0,Prev Close,2264.799898
1,Open,11873.542093
2,High,24482.363261
3,Low,23446.227267
4,Last,64342.008198
5,Close,91304.378264
6,VWAP,79044.67258
7,Volume,3.530856
8,Turnover,2.220427
9,Deliverable Volume,3.020041


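Exploratory data analysis (correlation and VIF): the price columns (Prev Close, Open, High, Low, Last, Close, VWAP) have extremely high VIF values, which indicates severe multicollinearity among them. Volume, Turnover and Deliverable Volume have low VIF values (roughly 2 to 4), so based on the VIF values we can go with Volume and Turnover as predictors.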

In [8]:
# Function to extract data for every script from dataframe built from nsepy (df)

# Input - dataframe extracted from nsepy (df),script(s), Number of timeperiods (n_timeperiod), 
# gap between timeperiods(gap_in_timeperiod). Eg. extract_script(df, "INFY", 3, 2) will extract volumes for 3 timeperiods, 
# where the gaps between the timeperiods is 2 days 


def extract_script(df, s, n_timeperiod, gap_in_timeperiod):
    """Build the modelling table for one symbol.

    The caller's frame is left untouched: the symbol's rows are copied before
    any column is added or converted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Date", "Symbol", "Volume", "Turnover",
        "Open", "Deliverable Volume" and "%Deliverble".
    s : str
        Value of the "Symbol" column to keep.
    n_timeperiod : int
        Number of volume observations per row, counting the row's own
        "Volume"; ``n_timeperiod - 1`` lagged volume columns are added.
    gap_in_timeperiod : int
        Spacing, in rows, between consecutive lagged volume observations.

    Returns
    -------
    pandas.DataFrame
        Rows for ``s`` ordered newest first, columns sorted alphabetically.
        Adds "Next_Open_{7,30,90}D" (the Open 7/30/90 rows later in time),
        "Label_{7,30,90}D" (1 when that later Open is >= the row's Open,
        else 0) and "Volume-<k>" (the Volume k rows earlier in time).
        Rows with any missing value are dropped.
    """
    # Filter first and copy, so the conversion below touches only this symbol's
    # rows and never writes back into the caller's DataFrame.
    stock = df.loc[df["Symbol"] == s].copy()
    stock["Date"] = pd.to_datetime(stock["Date"])
    stock = stock.sort_values(by=["Date"], ascending=False)

    # Rows are newest-first, so a positive shift pulls the Open from a row that
    # is `horizon` rows later in time.
    for horizon in (7, 30, 90):
        next_open_col = "Next_Open_" + str(horizon) + "D"
        stock[next_open_col] = stock["Open"].shift(horizon)
        # "Buy sign": 1 if the later Open is at least the current Open.
        stock["Label_" + str(horizon) + "D"] = np.where(
            stock[next_open_col] >= stock["Open"], 1, 0
        )

    stock = stock[["Date", "Symbol", "Volume", "Turnover", "Open", "Deliverable Volume", "%Deliverble",
                   "Next_Open_7D", "Next_Open_30D", "Next_Open_90D",
                   "Label_7D", "Label_30D", "Label_90D"]].copy()

    # A negative shift on the newest-first frame pulls the Volume from earlier rows.
    for i in range(1, n_timeperiod):
        offset = -i * gap_in_timeperiod
        stock["Volume" + str(offset)] = stock["Volume"].shift(offset)

    stock["Symbol"] = stock["Symbol"].astype("category")

    # Drops the rows at either end that lack a future Open or a lagged Volume.
    stock = stock.dropna()

    return stock[sorted(stock.columns)]

In [9]:
# Test extract_script

clean_df = extract_script(df, "INFY", 3, 2)
# clean_df = extract_script(df, "SBIN", 7)
clean_df

Unnamed: 0,%Deliverble,Date,Deliverable Volume,Label_30D,Label_7D,Label_90D,Next_Open_30D,Next_Open_7D,Next_Open_90D,Open,Symbol,Turnover,Volume,Volume-2,Volume-4
11833,0.6258,2021-08-23,3872899,0,0,1,1671.00,1709.50,1890.00,1735.75,INFY,1.077919e+15,6189051,6686090.0,4564109.0
11832,0.6256,2021-08-20,3883003,0,0,1,1672.50,1700.15,1884.50,1716.10,INFY,1.076408e+15,6206972,9510390.0,5612790.0
11831,0.6716,2021-08-18,4490519,0,0,1,1665.10,1703.05,1883.50,1729.00,INFY,1.162962e+15,6686090,4564109.0,5659579.0
11830,0.6641,2021-08-17,6316007,0,1,1,1703.00,1725.00,1880.70,1703.90,INFY,1.645334e+15,9510390,5612790.0,4899639.0
11829,0.7232,2021-08-16,3300912,0,1,1,1659.25,1734.00,1860.00,1707.70,INFY,7.776599e+14,4564109,5659579.0,6846517.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9326,0.4767,2011-07-11,548628,0,0,0,2213.90,2772.80,2720.00,2953.90,INFY,3.357122e+14,1150815,718934.0,802966.0
9325,0.5528,2011-07-08,391804,0,0,0,2274.00,2721.00,2731.05,3000.00,INFY,2.124590e+14,708724,430028.0,514324.0
9324,0.5489,2011-07-07,394655,0,0,0,2465.00,2735.00,2768.30,2955.20,INFY,2.138882e+14,718934,802966.0,800308.0
9323,0.5161,2011-07-06,221924,0,0,0,2396.00,2744.00,2795.00,2947.00,INFY,1.270130e+14,430028,514324.0,1417730.0


In [10]:
# LINEAR REGRESSION MODEL for predicting the Open price after "pred_ahead" days considering volumes for "n_timeperiod" previous time 
# periods with gap_in_timeperiod gaps in days between observations

def linear_reg(df, stocklist, n_timeperiod, gap_in_timeperiod = 2, pred_ahead = 7):
    """Fit a linear regression of a future Open price on current and lagged volumes.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data, passed unchanged to ``extract_script`` for every symbol.
    stocklist : iterable of str
        Symbols to pool into one training/test set.
    n_timeperiod : int
        Number of volume observations per row (see ``extract_script``).
    gap_in_timeperiod : int, default 2
        Row spacing between the lagged volume observations.
    pred_ahead : {7, 30, 90}, default 7
        Horizon whose "Next_Open_<pred_ahead>D" column is the target.

    Returns
    -------
    (pandas.DataFrame, float)
        The test rows (year > 2018) with their original index reset into an
        "index" column and a "Next_Open_<pred_ahead>D_Pred" column appended,
        and the adjusted R^2 computed on the test set.

    Raises
    ------
    ValueError
        If ``pred_ahead`` is not 7, 30 or 90, or ``stocklist`` is empty.
    """
    if pred_ahead not in (7, 30, 90):
        raise ValueError("pred_ahead must be one of 7, 30 or 90, got {!r}".format(pred_ahead))
    target_col = "Next_Open_" + str(pred_ahead) + "D"

    # Use the `stocklist` argument (not the notebook-level `stock_list`) and
    # concatenate once instead of growing a frame inside the loop.
    frames = [extract_script(df, stock, n_timeperiod, gap_in_timeperiod) for stock in stocklist]
    if not frames:
        raise ValueError("stocklist must contain at least one symbol")
    final_df = pd.concat(frames, axis = 0)

    # Year column drives the chronological split.
    final_df["Year"] = pd.DatetimeIndex(final_df["Date"]).year

    # Train on data up to and including 2018, test on everything after.
    df_train = final_df[final_df["Year"] < 2019]
    df_test = final_df[final_df["Year"] > 2018]

    # Predictors: every column whose name starts with "Volume"
    # (the current volume plus the lagged ones).
    col_list = [col for col in df_train.columns if col.startswith("Volume")]

    X_train = df_train[col_list]
    X_test = df_test[col_list]
    Y_train = df_train[target_col]
    Y_test = df_test[target_col]

    reg = LinearRegression().fit(X_train, Y_train)
    Y_pred = reg.predict(X_test)

    # Adjusted R^2 on the test set: penalises R^2 by the number of predictors.
    n_obs, n_features = X_test.shape
    Adj_R2_LR = 1 - (1 - reg.score(X_test, Y_test)) * (n_obs - 1) / (n_obs - n_features - 1)

    # Append the predictions positionally. Both frames get a fresh 0..n-1 index;
    # the second "index" column coming from Y_pred is kept so the returned
    # column layout stays the same as before.
    Y_pred = pd.DataFrame(Y_pred, columns = [target_col + "_Pred"]).reset_index()
    df_test = df_test.reset_index()
    df_test = pd.concat([df_test, Y_pred], axis = 1)

    return df_test, Adj_R2_LR

In [11]:
# Testing - 
df_test, Adj_R2_LR = linear_reg(df, stock_list, 4, 3, pred_ahead = 7)
Adj_R2_LR

-0.2127045783665289

In [12]:
df_test

Unnamed: 0,index,%Deliverble,Date,Deliverable Volume,Label_30D,Label_7D,Label_90D,Next_Open_30D,Next_Open_7D,Next_Open_90D,Open,Symbol,Turnover,Volume,Volume-3,Volume-6,Volume-9,Year,index.1,Next_Open_7D_Pred
0,2890,0.5139,2021-08-23,2336976,1,1,1,2555.10,2273.0,2373.00,2174.00,RELIANCE,9.809454e+14,4547802,5841743.0,3755507.0,3458546.0,2021,0,1294.468915
1,2889,0.4315,2021-08-20,1877339,1,1,1,2553.00,2276.9,2400.00,2143.00,RELIANCE,9.377503e+14,4350228,10123204.0,4238859.0,7670583.0,2021,1,901.717713
2,2888,0.4920,2021-08-18,2287876,1,1,1,2501.95,2250.0,2391.00,2174.00,RELIANCE,1.009141e+15,4650008,5898384.0,5500708.0,9807831.0,2021,2,941.794100
3,2887,0.5234,2021-08-17,3057597,1,1,1,2525.00,2237.0,2375.60,2168.85,RELIANCE,1.265999e+15,5841743,3755507.0,3458546.0,6077861.0,2021,3,1235.273997
4,2886,0.4134,2021-08-16,4185061,1,1,1,2570.00,2208.0,2361.55,2149.35,RELIANCE,2.204234e+15,10123204,4238859.0,7670583.0,5453682.0,2021,4,811.691870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32450,144205,0.3485,2019-01-07,126109,0,0,0,2690.00,2909.0,2725.00,3020.00,HEROMOTOCO,1.077233e+14,361881,597712.0,260717.0,655657.0,2019,32450,2049.001120
32451,144204,0.2512,2019-01-04,101891,0,0,0,2785.40,2879.7,2690.00,3030.00,HEROMOTOCO,1.220780e+14,405667,344239.0,357087.0,525895.0,2019,32451,2059.640355
32452,144203,0.2101,2019-01-03,77388,0,0,0,2755.20,2901.0,2523.00,3032.00,HEROMOTOCO,1.117172e+14,368413,279055.0,506895.0,388448.0,2019,32452,2063.107333
32453,144202,0.1938,2019-01-02,115842,0,0,0,2800.00,2923.0,2495.00,3115.00,HEROMOTOCO,1.831159e+14,597712,260717.0,655657.0,947007.0,2019,32453,2022.017366


In [13]:
# Sweep the number of volume observations and their spacing for the 7-row
# horizon, recording the adjusted R^2 of each linear regression.
pred_ahead = 7
n_timeperiod_list = [2, 3, 5, 10]
gap_in_timeperiod_list = [2, 3, 5, 10]

# Keyed by "<n_timeperiod> <gap_in_timeperiod>".
Adj_R2 = dict()

for t_p in n_timeperiod_list:
    for g in gap_in_timeperiod_list:
        df_test, Adj_R_squared_LR = linear_reg(
            df, stock_list, t_p, g, pred_ahead
        )
        k = "{} {}".format(t_p, g)
        Adj_R2[k] = Adj_R_squared_LR

Adj_R2

{'2 2': -0.1448733907056745,
 '2 3': -0.14995750908831273,
 '2 5': -0.15120977483947318,
 '2 10': -0.148301804964297,
 '3 2': -0.1810167820402997,
 '3 3': -0.1890327692765863,
 '3 5': -0.1882588866550161,
 '3 10': -0.18891585326632554,
 '5 2': -0.2220965736185645,
 '5 3': -0.22853486572975057,
 '5 5': -0.23121008742420135,
 '5 10': -0.23454601503559602,
 '10 2': -0.2633274731226978,
 '10 3': -0.27173704766654083,
 '10 5': -0.2790634382669419,
 '10 10': -0.2913304669463008}

In [14]:
# Sweep the number of volume observations and their spacing for the 30-row
# horizon, recording the adjusted R^2 of each linear regression.
pred_ahead = 30
n_timeperiod_list = [2, 3, 5, 10]
gap_in_timeperiod_list = [2, 3, 5, 10]

# Keyed by "<n_timeperiod> <gap_in_timeperiod>".
Adj_R2 = dict()

for t_p in n_timeperiod_list:
    for g in gap_in_timeperiod_list:
        df_test, Adj_R_squared_LR = linear_reg(
            df, stock_list, t_p, g, pred_ahead
        )
        k = "{} {}".format(t_p, g)
        Adj_R2[k] = Adj_R_squared_LR

Adj_R2

{'2 2': -0.14646581035937412,
 '2 3': -0.15153428275018643,
 '2 5': -0.1528074331136393,
 '2 10': -0.14991423060541864,
 '3 2': -0.18254270937382255,
 '3 3': -0.19054864798899707,
 '3 5': -0.1897997158765512,
 '3 10': -0.19058021485804733,
 '5 2': -0.2235672446477197,
 '5 3': -0.22999026067303108,
 '5 5': -0.23276407992406623,
 '5 10': -0.23635002346884693,
 '10 2': -0.2647659024457585,
 '10 3': -0.2733606885955988,
 '10 5': -0.2808057269623996,
 '10 10': -0.29289078346528163}

In [15]:
# Sweep the number of volume observations and their spacing for the 90-row
# horizon, recording the adjusted R^2 of each linear regression.
pred_ahead = 90
n_timeperiod_list = [2, 3, 5, 10]
gap_in_timeperiod_list = [2, 3, 5, 10]

# Keyed by "<n_timeperiod> <gap_in_timeperiod>".
Adj_R2 = dict()

for t_p in n_timeperiod_list:
    for g in gap_in_timeperiod_list:
        df_test, Adj_R_squared_LR = linear_reg(
            df, stock_list, t_p, g, pred_ahead = pred_ahead
        )
        k = "{} {}".format(t_p, g)
        Adj_R2[k] = Adj_R_squared_LR

Adj_R2

{'2 2': -0.15387917531537632,
 '2 3': -0.1590651187910801,
 '2 5': -0.16038391425630039,
 '2 10': -0.1575644721800784,
 '3 2': -0.19085927857917406,
 '3 3': -0.19903867335811976,
 '3 5': -0.19837126443494602,
 '3 10': -0.19924923637603076,
 '5 2': -0.2328574138277224,
 '5 3': -0.23949083552481154,
 '5 5': -0.24238567977341297,
 '5 10': -0.24595188724478212,
 '10 2': -0.2750902445930017,
 '10 3': -0.28378087036866506,
 '10 5': -0.29132018503162205,
 '10 10': -0.30442242651157003}

In [16]:
# Calculate accuracy of trend prediction after 7 days, using 7 previous time periods at gaps of 3 days

df_test, Adj_R2_LR = linear_reg(df, stock_list, 7, 3, pred_ahead = 7)

In [17]:
# Label for predicted stock price after 7 days

df_test["Label_7D_Pred"] = np.where(df_test['Next_Open_7D_Pred'] >= df_test["Open"], 1, 0)
df_test[["Symbol", "Open", "Next_Open_7D", "Label_7D", "Next_Open_7D_Pred",  "Label_7D_Pred"]]

Unnamed: 0,Symbol,Open,Next_Open_7D,Label_7D,Next_Open_7D_Pred,Label_7D_Pred
0,RELIANCE,2174.00,2273.0,1,1176.835303,0
1,RELIANCE,2143.00,2276.9,1,881.476932,0
2,RELIANCE,2174.00,2250.0,1,955.983151,0
3,RELIANCE,2168.85,2237.0,1,1195.947578,0
4,RELIANCE,2149.35,2208.0,1,890.467429,0
...,...,...,...,...,...,...
32441,HEROMOTOCO,3020.00,2909.0,0,2065.934251,0
32442,HEROMOTOCO,3030.00,2879.7,0,2056.808907,0
32443,HEROMOTOCO,3032.00,2901.0,0,2046.295043,0
32444,HEROMOTOCO,3115.00,2923.0,0,2056.215366,0


In [18]:
accuracy_score(df_test["Label_7D"], df_test["Label_7D_Pred"])

0.48788756703445724

In [19]:
# Per-symbol accuracy of the 7-row-ahead direction label, highest first.
stock_acc = {}
for s in stock_list:
    rows_for_symbol = df_test[df_test["Symbol"] == s]
    stock_acc[s] = accuracy_score(
        rows_for_symbol["Label_7D"], rows_for_symbol["Label_7D_Pred"]
    )

dict(sorted(stock_acc.items(), key=lambda item: item[1], reverse=True))

{'TATACONSUM': 0.6951566951566952,
 'TITAN': 0.6030534351145038,
 'TECHM': 0.5938931297709924,
 'KOTAKBANK': 0.5801526717557252,
 'SBILIFE': 0.5694656488549619,
 'GRASIM': 0.566412213740458,
 'ITC': 0.5618320610687023,
 'HCLTECH': 0.549618320610687,
 'HDFCLIFE': 0.5389312977099237,
 'HEROMOTOCO': 0.5267175572519084,
 'MARUTI': 0.5236641221374045,
 'UPL': 0.5221374045801527,
 'EICHERMOT': 0.5190839694656488,
 'NTPC': 0.5114503816793893,
 'COALINDIA': 0.5083969465648855,
 'BPCL': 0.5022900763358779,
 'IOC': 0.500763358778626,
 'SUNPHARMA': 0.49923664122137407,
 'HINDUNILVR': 0.4961832061068702,
 'POWERGRID': 0.4916030534351145,
 'SBIN': 0.4900763358778626,
 'ASIANPAINT': 0.48854961832061067,
 'DIVISLAB': 0.48854961832061067,
 'ADANIPORTS': 0.4870229007633588,
 'TATAMOTORS': 0.4870229007633588,
 'BAJAJ-AUTO': 0.4870229007633588,
 'BHARTIARTL': 0.4854961832061069,
 'CIPLA': 0.4854961832061069,
 'BRITANNIA': 0.48244274809160304,
 'ONGC': 0.4763358778625954,
 'INDUSINDBK': 0.4748091603053435

In [20]:
# Lets check how close were the predictions for the top 3 stocks

# Rows of the three symbols with the best 7-row label accuracy. The explicit
# copy lets us add a column without writing into a slice of df_test.
top_3 = df_test.loc[
    df_test["Symbol"].isin(["TATACONSUM", "TITAN", "TECHM"]),
    ["Symbol", "Next_Open_7D", "Next_Open_7D_Pred"],
].copy()

# Signed percentage error of the prediction relative to the actual price.
top_3["variation"] = (top_3["Next_Open_7D"] - top_3["Next_Open_7D_Pred"]) * 100 / top_3["Next_Open_7D"]

# Keep the sorted frame (the earlier sort result was discarded).
top_3 = top_3.sort_values(by = ["Symbol"])

# Mean signed error: positive and negative errors offset each other here.
top_3["variation"].mean()

# There is very high variation in the predicted stock price

-74.70144843836279

In [21]:
# Calculate accuracy of trend prediction after 30 days, using 7 previous time periods at gaps of 5 days

df_test, Adj_R2_LR = linear_reg(df, stock_list, 7, 5, pred_ahead = 30)

In [22]:
# Label for predicted stock price after 30 days

df_test["Label_30D_Pred"] = np.where(df_test['Next_Open_30D_Pred'] >= df_test["Open"], 1, 0)
df_test[["Symbol", "Open", "Next_Open_30D", "Label_30D", "Next_Open_30D_Pred",  "Label_30D_Pred"]]

Unnamed: 0,Symbol,Open,Next_Open_30D,Label_30D,Next_Open_30D_Pred,Label_30D_Pred
0,RELIANCE,2174.00,2555.10,1,1069.715810,0
1,RELIANCE,2143.00,2553.00,1,1098.108437,0
2,RELIANCE,2174.00,2501.95,1,1244.517315,0
3,RELIANCE,2168.85,2525.00,1,1137.897459,0
4,RELIANCE,2149.35,2570.00,1,997.712385,0
...,...,...,...,...,...,...
32429,HEROMOTOCO,3020.00,2690.00,0,2128.011967,0
32430,HEROMOTOCO,3030.00,2785.40,0,2109.152066,0
32431,HEROMOTOCO,3032.00,2755.20,0,2077.188120,0
32432,HEROMOTOCO,3115.00,2800.00,0,2102.377488,0


In [23]:
accuracy_score(df_test["Label_30D"], df_test["Label_30D_Pred"])

0.45927113522846397

In [24]:
# Per-symbol accuracy of the 30-row-ahead direction label, highest first.
stock_acc = {}
for s in stock_list:
    rows_for_symbol = df_test[df_test["Symbol"] == s]
    stock_acc[s] = accuracy_score(
        rows_for_symbol["Label_30D"], rows_for_symbol["Label_30D_Pred"]
    )

dict(sorted(stock_acc.items(), key=lambda item: item[1], reverse=True))

{'TATACONSUM': 0.8259587020648967,
 'SBILIFE': 0.7068702290076336,
 'TITAN': 0.6992366412213741,
 'GRASIM': 0.6946564885496184,
 'TECHM': 0.6778625954198473,
 'HDFCLIFE': 0.6290076335877862,
 'KOTAKBANK': 0.5969465648854961,
 'HCLTECH': 0.5679389312977099,
 'LT': 0.5419847328244275,
 'BPCL': 0.5419847328244275,
 'BRITANNIA': 0.5358778625954198,
 'ADANIPORTS': 0.5312977099236641,
 'HEROMOTOCO': 0.5267175572519084,
 'ITC': 0.5236641221374045,
 'IOC': 0.5022900763358779,
 'DIVISLAB': 0.5022900763358779,
 'EICHERMOT': 0.500763358778626,
 'NTPC': 0.4946564885496183,
 'INDUSINDBK': 0.4900763358778626,
 'MARUTI': 0.48396946564885496,
 'COALINDIA': 0.4778625954198473,
 'UPL': 0.4778625954198473,
 'M&M': 0.4763358778625954,
 'HINDUNILVR': 0.4732824427480916,
 'POWERGRID': 0.4687022900763359,
 'CIPLA': 0.44732824427480916,
 'BHARTIARTL': 0.44580152671755724,
 'JSWSTEEL': 0.4213740458015267,
 'BAJAJ-AUTO': 0.4183206106870229,
 'SUNPHARMA': 0.4152671755725191,
 'HDFC': 0.41374045801526715,
 'ASIAN

In [25]:
# Lets check how close were the predictions for the top 3 stocks

# Rows of the three symbols with the best 30-row label accuracy. The explicit
# copy lets us add a column without writing into a slice of df_test.
top_3 = df_test.loc[
    df_test["Symbol"].isin(["TATACONSUM", "TITAN", "SBILIFE"]),
    ["Symbol", "Next_Open_30D", "Next_Open_30D_Pred"],
].copy()

# Signed percentage error of the prediction relative to the actual price.
top_3["variation"] = (top_3["Next_Open_30D"] - top_3["Next_Open_30D_Pred"]) * 100 / top_3["Next_Open_30D"]

# Keep the sorted frame (the earlier sort result was discarded).
top_3 = top_3.sort_values(by = ["Symbol"])

# Mean signed error: positive and negative errors offset each other here.
top_3["variation"].mean()

# There is very high variation in the predicted stock price

-87.83138531676178

In [26]:
# Calculate accuracy of trend prediction after 90 days, using 10 previous time periods at gaps of 7 days

df_test, Adj_R2_LR = linear_reg(df, stock_list, 10, 7, pred_ahead = 90)

In [27]:
# Label for predicted stock price after 90 days

df_test["Label_90D_Pred"] = np.where(df_test['Next_Open_90D_Pred'] >= df_test["Open"], 1, 0)
df_test[["Symbol", "Open", "Next_Open_90D", "Label_90D", "Next_Open_90D_Pred",  "Label_90D_Pred"]]

Unnamed: 0,Symbol,Open,Next_Open_90D,Label_90D,Next_Open_90D_Pred,Label_90D_Pred
0,RELIANCE,2174.00,2373.00,1,996.433331,0
1,RELIANCE,2143.00,2400.00,1,836.489065,0
2,RELIANCE,2174.00,2391.00,1,200.294390,0
3,RELIANCE,2168.85,2375.60,1,292.298340,0
4,RELIANCE,2149.35,2361.55,1,149.786862,0
...,...,...,...,...,...,...
32396,HEROMOTOCO,3020.00,2725.00,0,2201.866857,0
32397,HEROMOTOCO,3030.00,2690.00,0,2217.126999,0
32398,HEROMOTOCO,3032.00,2523.00,0,2207.409735,0
32399,HEROMOTOCO,3115.00,2495.00,0,2151.228459,0


In [28]:
accuracy_score(df_test["Label_90D"], df_test["Label_90D_Pred"])

0.42832011357674143

In [29]:
# Per-symbol accuracy of the 90-row-ahead direction label, highest first.
stock_acc = {}
for s in stock_list:
    rows_for_symbol = df_test[df_test["Symbol"] == s]
    stock_acc[s] = accuracy_score(
        rows_for_symbol["Label_90D"], rows_for_symbol["Label_90D_Pred"]
    )

dict(sorted(stock_acc.items(), key=lambda item: item[1], reverse=True))

{'TATACONSUM': 0.9117647058823529,
 'SBILIFE': 0.8091603053435115,
 'TITAN': 0.732824427480916,
 'HDFCLIFE': 0.7221374045801526,
 'GRASIM': 0.6931297709923664,
 'HEROMOTOCO': 0.6885496183206107,
 'TECHM': 0.6595419847328244,
 'EICHERMOT': 0.601526717557252,
 'ITC': 0.5786259541984733,
 'BPCL': 0.5740458015267176,
 'ADANIPORTS': 0.5450381679389313,
 'KOTAKBANK': 0.5099236641221374,
 'LT': 0.4854961832061069,
 'HINDUNILVR': 0.48091603053435117,
 'M&M': 0.467175572519084,
 'BRITANNIA': 0.4488549618320611,
 'HDFCBANK': 0.4442748091603053,
 'CIPLA': 0.4396946564885496,
 'BHARTIARTL': 0.4351145038167939,
 'IOC': 0.4351145038167939,
 'ONGC': 0.4305343511450382,
 'NTPC': 0.42900763358778626,
 'MARUTI': 0.42748091603053434,
 'HCLTECH': 0.4259541984732824,
 'SUNPHARMA': 0.416793893129771,
 'DIVISLAB': 0.41374045801526715,
 'JSWSTEEL': 0.40610687022900765,
 'BAJAJ-AUTO': 0.3938931297709924,
 'COALINDIA': 0.39083969465648855,
 'UPL': 0.38778625954198476,
 'AXISBANK': 0.38625954198473283,
 'POWERGR

In [30]:
# Lets check how close were the predictions for the top 3 stocks

# Rows of the three symbols with the best 90-row label accuracy. The explicit
# copy lets us add a column without writing into a slice of df_test.
top_3 = df_test.loc[
    df_test["Symbol"].isin(["TATACONSUM", "TITAN", "SBILIFE"]),
    ["Symbol", "Next_Open_90D", "Next_Open_90D_Pred"],
].copy()

# Signed percentage error of the prediction relative to the actual price.
top_3["variation"] = (top_3["Next_Open_90D"] - top_3["Next_Open_90D_Pred"]) * 100 / top_3["Next_Open_90D"]

# Keep the sorted frame (the earlier sort result was discarded).
top_3 = top_3.sort_values(by = ["Symbol"])

# Mean signed error: positive and negative errors offset each other here.
top_3["variation"].mean()

# There is very high variation in the predicted stock price

-77.02462066626883

<b>Observations for Linear Regression</b>: Accuracy is poor overall. The best pooled label-prediction accuracy is about 49%, observed for the T+7 prediction; for T+30 and T+90 it is lower still (about 46% and 43%). At the individual stock level, label prediction is more accurate for a few stocks: for the top 3 stocks it ranges from roughly 60-70% at T+7 to roughly 73-91% at T+90.

In [31]:

# LOGISTIC REGRESSION MODEL for predicting the trend (1 (Up) or 0 (Down)) after "pred_ahead" days, considering volumes for
# "n_timeperiod" previous time periods with "gap_in_timeperiod" days between observations

def log_reg(df, stocklist, n_timeperiod, gap_in_timeperiod = 2, pred_ahead = 7):
    """Fit a logistic regression of the up/down label on current and lagged volumes.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data, passed unchanged to ``extract_script`` for every symbol.
    stocklist : iterable of str
        Symbols to pool into one training/test set.
    n_timeperiod : int
        Number of volume observations per row (see ``extract_script``).
    gap_in_timeperiod : int, default 2
        Row spacing between the lagged volume observations.
    pred_ahead : {7, 30, 90}, default 7
        Horizon whose "Label_<pred_ahead>D" column is the target.

    Returns
    -------
    (pandas.DataFrame, float)
        The test rows (year > 2018) with their original index reset into an
        "index" column and a "Label_<pred_ahead>D_Pred" column appended, and
        an adjusted score. The score applies the adjusted-R^2 formula to the
        classifier's ``score`` on the test set (its accuracy), so it is an
        adjusted accuracy rather than a true R^2.

    Raises
    ------
    ValueError
        If ``pred_ahead`` is not 7, 30 or 90, or ``stocklist`` is empty.
    """
    if pred_ahead not in (7, 30, 90):
        raise ValueError("pred_ahead must be one of 7, 30 or 90, got {!r}".format(pred_ahead))
    label_col = "Label_" + str(pred_ahead) + "D"

    # Use the `stocklist` argument (not the notebook-level `stock_list`) and
    # concatenate once instead of growing a frame inside the loop.
    frames = [extract_script(df, stock, n_timeperiod, gap_in_timeperiod) for stock in stocklist]
    if not frames:
        raise ValueError("stocklist must contain at least one symbol")
    final_df = pd.concat(frames, axis = 0)

    # Year column drives the chronological split.
    final_df["Year"] = pd.DatetimeIndex(final_df["Date"]).year

    # Train on data up to and including 2018, test on everything after.
    df_train = final_df[final_df["Year"] < 2019]
    df_test = final_df[final_df["Year"] > 2018]

    # Predictors: every column whose name starts with "Volume"
    # (the current volume plus the lagged ones).
    col_list = [col for col in df_train.columns if col.startswith("Volume")]

    X_train = df_train[col_list]
    X_test = df_test[col_list]
    Y_train = df_train[label_col]
    Y_test = df_test[label_col]

    log_LR = LogisticRegression(random_state=0).fit(X_train, Y_train)
    Y_pred = log_LR.predict(X_test)

    # Adjusted-R^2 formula applied to the classifier score (see docstring).
    n_obs, n_features = X_test.shape
    Adj_R2_LR = 1 - (1 - log_LR.score(X_test, Y_test)) * (n_obs - 1) / (n_obs - n_features - 1)

    # Append the predictions positionally. Both frames get a fresh 0..n-1 index;
    # the second "index" column coming from Y_pred is kept so the returned
    # column layout stays the same as before.
    Y_pred = pd.DataFrame(Y_pred, columns = [label_col + "_Pred"]).reset_index()
    df_test = df_test.reset_index()
    df_test = pd.concat([df_test, Y_pred], axis = 1)

    return df_test, Adj_R2_LR

In [32]:
# Logistic Regression for 7 days ahead prediction

# Logistic regression for the 7-row horizon: 7 volume observations, 3 rows apart.
df_test, Adj_R2_Log_R = log_reg(
    df,
    stock_list,
    n_timeperiod = 7,
    gap_in_timeperiod = 3,
    pred_ahead = 7,
)
acc = accuracy_score(df_test["Label_7D"], df_test["Label_7D_Pred"])
print("R-squared : {} Accuracy : {}".format(Adj_R2_Log_R, acc))

# Per-symbol accuracy of the predicted label, highest first.
stock_acc = {}
for s in stock_list:
    rows_for_symbol = df_test[df_test["Symbol"] == s]
    stock_acc[s] = accuracy_score(
        rows_for_symbol["Label_7D"], rows_for_symbol["Label_7D_Pred"]
    )

dict(sorted(stock_acc.items(), key=lambda item: item[1], reverse=True))



R-squared : 0.5392891242208993 Accuracy : 0.5393885224681009


{'TATACONSUM': 0.7008547008547008,
 'DIVISLAB': 0.6396946564885496,
 'INFY': 0.6244274809160305,
 'TITAN': 0.6137404580152672,
 'TATASTEEL': 0.6106870229007634,
 'TECHM': 0.6091603053435114,
 'HCLTECH': 0.601526717557252,
 'ASIANPAINT': 0.6,
 'BAJFINANCE': 0.5954198473282443,
 'BAJAJFINSV': 0.5938931297709924,
 'WIPRO': 0.5862595419847328,
 'NESTLEIND': 0.5816793893129771,
 'ICICIBANK': 0.5740458015267176,
 'JSWSTEEL': 0.5725190839694656,
 'ULTRACEMCO': 0.5709923664122137,
 'SBILIFE': 0.5709923664122137,
 'GRASIM': 0.5648854961832062,
 'UPL': 0.5618320610687023,
 'DRREDDY': 0.5541984732824428,
 'RELIANCE': 0.5511450381679389,
 'AXISBANK': 0.5465648854961832,
 'ADANIPORTS': 0.5465648854961832,
 'TCS': 0.5435114503816794,
 'HINDALCO': 0.5435114503816794,
 'HDFCLIFE': 0.5435114503816794,
 'SHREECEM': 0.5435114503816794,
 'BHARTIARTL': 0.5419847328244275,
 'HDFCBANK': 0.5389312977099237,
 'HDFC': 0.5282442748091603,
 'BAJAJ-AUTO': 0.5236641221374045,
 'INDUSINDBK': 0.5221374045801527,
 'SU

In [33]:
# Logistic Regression for 30 days ahead prediction

# Logistic regression for the 30-row horizon: 7 volume observations, 5 rows apart.
df_test, Adj_R2_Log_R = log_reg(
    df,
    stock_list,
    n_timeperiod = 7,
    gap_in_timeperiod = 5,
    pred_ahead = 30,
)
acc = accuracy_score(df_test["Label_30D"], df_test["Label_30D_Pred"])
print("R-squared : {} Accuracy : {}".format(Adj_R2_Log_R, acc))

# Per-symbol accuracy of the predicted label, highest first.
stock_acc = {}
for s in stock_list:
    rows_for_symbol = df_test[df_test["Symbol"] == s]
    stock_acc[s] = accuracy_score(
        rows_for_symbol["Label_30D"], rows_for_symbol["Label_30D_Pred"]
    )

dict(sorted(stock_acc.items(), key=lambda item: item[1], reverse=True))

R-squared : 0.6104789192934851 Accuracy : 0.6105629894555097


{'TATACONSUM': 0.8407079646017699,
 'INFY': 0.732824427480916,
 'BAJFINANCE': 0.7297709923664122,
 'DIVISLAB': 0.7175572519083969,
 'TECHM': 0.7129770992366412,
 'SBILIFE': 0.7129770992366412,
 'TITAN': 0.7038167938931298,
 'BAJAJFINSV': 0.6992366412213741,
 'GRASIM': 0.6931297709923664,
 'TCS': 0.6870229007633588,
 'ICICIBANK': 0.6854961832061068,
 'ASIANPAINT': 0.6824427480916031,
 'TATASTEEL': 0.6809160305343511,
 'HCLTECH': 0.6702290076335878,
 'ULTRACEMCO': 0.6702290076335878,
 'JSWSTEEL': 0.6595419847328244,
 'DRREDDY': 0.6580152671755726,
 'HDFCLIFE': 0.6564885496183206,
 'HINDALCO': 0.6534351145038167,
 'AXISBANK': 0.650381679389313,
 'NESTLEIND': 0.6473282442748092,
 'SHREECEM': 0.6458015267175573,
 'WIPRO': 0.6274809160305344,
 'BHARTIARTL': 0.6198473282442748,
 'SUNPHARMA': 0.6122137404580152,
 'ADANIPORTS': 0.601526717557252,
 'HINDUNILVR': 0.6,
 'BAJAJ-AUTO': 0.5954198473282443,
 'UPL': 0.5954198473282443,
 'TATAMOTORS': 0.5938931297709924,
 'HDFCBANK': 0.5923664122137404,

In [34]:
# Logistic Regression for 90 days ahead prediction

# Logistic regression for the 90-row horizon: 10 volume observations, 7 rows apart.
df_test, Adj_R2_Log_R = log_reg(
    df,
    stock_list,
    n_timeperiod = 10,
    gap_in_timeperiod = 7,
    pred_ahead = 90,
)
acc = accuracy_score(df_test["Label_90D"], df_test["Label_90D_Pred"])
print("R-squared : {} Accuracy : {}".format(Adj_R2_Log_R, acc))

# Per-symbol accuracy of the predicted label, highest first.
stock_acc = {}
for s in stock_list:
    rows_for_symbol = df_test[df_test["Symbol"] == s]
    stock_acc[s] = accuracy_score(
        rows_for_symbol["Label_90D"], rows_for_symbol["Label_90D_Pred"]
    )

dict(sorted(stock_acc.items(), key=lambda item: item[1], reverse=True))

R-squared : 0.6436047016146909 Accuracy : 0.6437147001635752


{'TATACONSUM': 0.9411764705882353,
 'DIVISLAB': 0.8610687022900764,
 'BAJFINANCE': 0.8381679389312977,
 'ICICIBANK': 0.8229007633587786,
 'ASIANPAINT': 0.8015267175572519,
 'NESTLEIND': 0.7893129770992366,
 'SBILIFE': 0.7893129770992366,
 'BAJAJFINSV': 0.7877862595419848,
 'BHARTIARTL': 0.7816793893129771,
 'SUNPHARMA': 0.7694656488549618,
 'HINDUNILVR': 0.7435114503816794,
 'TITAN': 0.7419847328244275,
 'TCS': 0.7297709923664122,
 'HDFCLIFE': 0.7251908396946565,
 'INFY': 0.716030534351145,
 'SBIN': 0.7114503816793893,
 'HDFC': 0.7007633587786259,
 'TECHM': 0.6992366412213741,
 'GRASIM': 0.6931297709923664,
 'KOTAKBANK': 0.6885496183206107,
 'CIPLA': 0.683969465648855,
 'ULTRACEMCO': 0.6793893129770993,
 'HINDALCO': 0.6763358778625954,
 'RELIANCE': 0.6732824427480916,
 'HCLTECH': 0.6534351145038167,
 'DRREDDY': 0.6458015267175573,
 'TATASTEEL': 0.6335877862595419,
 'TATAMOTORS': 0.6335877862595419,
 'BAJAJ-AUTO': 0.6244274809160305,
 'SHREECEM': 0.6229007633587786,
 'WIPRO': 0.61526717

<b>Observations for Logistic Regression: </b> Accuracy is better than with Linear Regression. Across the three prediction horizons, several of the top-5 stocks are the same ones that ranked highest in the Linear Regression output. We also see new entrants such as BAJFINANCE, INFY and DIVISLAB, which have higher label prediction accuracy.

In [35]:

# # Support Vector Machines (SVM) MODEL for prediction the trend (1(Up) or 0 (Down)) after pred_ahead days considering volumes for "n_timeperiod" previous time 
# # periods with gap_in_timeperiod gaps in days between observations

# def svm(df, stocklist, n_timeperiod, gap_in_timeperiod = 2, pred_ahead = 7):
    
#     final_df = pd.DataFrame()
#     for stock in stock_list:
#         clean_df = extract_script(df, stock, n_timeperiod, gap_in_timeperiod)
#         final_df = pd.concat([final_df, clean_df], axis = 0)

#         # Create a new column - Year for test train split

#     final_df["Year"] = pd.DatetimeIndex(final_df['Date']).year

#     # We will train on old data and test on new data. Train on data till 2018 and test on remaining data

#     df_train = final_df[final_df["Year"] < 2019]
#     df_test = final_df[final_df["Year"] > 2018]

#     # Collate all columns for model fitting. 
#     # Everything starting with Volume to be added in col_list 

#     col_list = [col for col in df_train.columns if col.startswith("Volume")]
#     # col_list.append("Symbol")
#     # col_list.append("Turnover")
#     # col_list.append("Deliverable Volume")
#     # col_list.append("%Deliverble")

#     # Test Train split

#     X_train = df_train[col_list]
#     X_test = df_test[col_list]

#     # Depending on pred_ahead, pick the corresponding y label for training and testing

#     if pred_ahead == 7:
#         Y_train = df_train["Label_7D"]
#         Y_test = df_test["Label_7D"]

#     if pred_ahead == 30:
#         Y_train = df_train["Label_30D"]
#         Y_test = df_test["Label_30D"]

#     if pred_ahead == 90:
#         Y_train = df_train["Label_90D"]
#         Y_test = df_test["Label_90D"]
    
# #     Try different kernels. Linear didnt yield good result.  
#     svm = make_pipeline(StandardScaler(), SVC(kernel = "rbf", gamma='auto'))
#     svm.fit(X_train, Y_train)
#     Y_pred = svm.predict(X_test)

#     # Calculate Adjusted R^2
#     Adj_R2_svm = 1 - ( 1-svm.score(X_test, Y_test) ) * ( len(Y_test) - 1 ) / ( len(Y_test) - X_test.shape[1] - 1 )

#     # Concatenate Y_pred to df_test
#     Y_pred = pd.DataFrame(Y_pred, columns = ['Label_' + str(pred_ahead) + "D_Pred"])
    
#     df_test.reset_index(inplace = True)
#     Y_pred.reset_index(inplace = True)
    
#     df_test = pd.concat([df_test, Y_pred], axis =1)
    
#     return df_test, Adj_R2_svm


In [36]:
# # SVM classification for 7 days ahead prediction

# df_test, Adj_R2_svm = svm(df, stock_list, n_timeperiod = 7, gap_in_timeperiod = 3, pred_ahead = 7)
# acc = accuracy_score(df_test["Label_7D"], df_test["Label_7D_Pred"])
# print("R-squared : {} Accuracy : {}".format(Adj_R2_svm,acc)) 

# # Checking if the accuracy is better for some stocks in the list
# stock_acc = {}
# for s in stock_list:
#     stock_acc[s] = accuracy_score(df_test[df_test["Symbol"] == s]["Label_7D"], df_test[df_test["Symbol"] == s]["Label_7D_Pred"])

# {k: v for k, v in sorted(stock_acc.items(), key=lambda item: item[1], reverse = True)}



In [40]:
# DECISION TREE for predicting the trend (1 (Up) or 0 (Down)) after "pred_ahead" days, considering volumes for
# "n_timeperiod" previous time periods with "gap_in_timeperiod" days between observations

def decision_tree(df, stocklist, n_timeperiod, gap_in_timeperiod = 2, pred_ahead = 7):
    """Fit a decision-tree classifier on the volume features and evaluate it on post-2018 rows.

    For every symbol in ``stocklist`` the feature frame is built with
    ``extract_script``; the frames are stacked, split by calendar year
    (train: years before 2019, test: 2019 onwards) and a
    ``DecisionTreeClassifier`` is fitted on every column whose name starts
    with ``"Volume"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data, forwarded unchanged to ``extract_script``.
    stocklist : iterable of str
        Symbols to include; each one is passed to ``extract_script``.
    n_timeperiod, gap_in_timeperiod : int
        Forwarded to ``extract_script`` (per the cell header: number of
        previous observations and the gap in days between them).
    pred_ahead : int
        Prediction horizon in days. Selects the target column
        ``"Label_<pred_ahead>D"`` (e.g. 7 -> ``"Label_7D"``).

    Returns
    -------
    (pandas.DataFrame, float)
        The test rows with their previous index kept in an ``"index"``
        column and the predictions in ``"Label_<pred_ahead>D_Pred"``, and the
        adjusted score described below.

    Raises
    ------
    ValueError
        If ``stocklist`` is empty or the label column for ``pred_ahead`` does
        not exist in the extracted data.

    Notes
    -----
    ``score`` on a scikit-learn classifier is mean accuracy, so the value
    returned as ``Adj_R2_DT`` is an accuracy with the adjusted-R^2
    degrees-of-freedom correction applied, not a regression R^2. The name is
    kept so existing callers keep working.
    """
    # Use the caller's list (not the notebook-level global) and concatenate once
    # instead of growing a frame inside the loop.
    frames = [extract_script(df, stock, n_timeperiod, gap_in_timeperiod) for stock in stocklist]
    if not frames:
        raise ValueError("stocklist must contain at least one symbol")
    final_df = pd.concat(frames, axis = 0)

    # Year column drives the chronological train/test split
    final_df["Year"] = pd.DatetimeIndex(final_df['Date']).year

    # Resolve the target column from the horizon and fail early with a clear message
    label_col = "Label_{}D".format(pred_ahead)
    if label_col not in final_df.columns:
        raise ValueError(
            "No label column '{}' for pred_ahead = {}".format(label_col, pred_ahead)
        )

    # Train on old data and test on new data: up to 2018 vs. 2019 onwards
    df_train = final_df[final_df["Year"] < 2019]
    df_test = final_df[final_df["Year"] > 2018]

    # Every column starting with "Volume" is a model feature
    col_list = [col for col in df_train.columns if col.startswith("Volume")]
    # Optional extra features, currently disabled:
    # col_list.append("Symbol")
    # col_list.append("Turnover")
    # col_list.append("Deliverable Volume")
    # col_list.append("%Deliverble")

    X_train = df_train[col_list]
    X_test = df_test[col_list]
    Y_train = df_train[label_col]
    Y_test = df_test[label_col]

    # Fixed seed so reruns give the same tree (matches random_forest's random_state)
    dtree = tree.DecisionTreeClassifier(random_state = 0)
    dtree.fit(X_train, Y_train)

    Y_pred = dtree.predict(X_test)

    # Adjusted score: 1 - (1 - accuracy) * (n - 1) / (n - p - 1)
    Adj_R2_DT = 1 - ( 1-dtree.score(X_test, Y_test) ) * ( len(Y_test) - 1 ) / ( len(Y_test) - X_test.shape[1] - 1 )

    # Attach predictions positionally. reset_index() returns a new frame (no
    # in-place mutation of a filtered slice) and the predictions are added as a
    # plain column, so the result no longer has two columns named "index".
    df_test = df_test.reset_index()
    df_test["Label_{}D_Pred".format(pred_ahead)] = Y_pred

    return df_test, Adj_R2_DT


In [41]:
# 7-day horizon: fit the decision tree, then report overall and per-stock accuracy

df_test, Adj_R2_DT = decision_tree(
    df,
    stock_list,
    n_timeperiod = 7,
    gap_in_timeperiod = 3,
    pred_ahead = 7,
)
acc = accuracy_score(df_test["Label_7D"], df_test["Label_7D_Pred"])
print("R-squared : {} Accuracy : {}".format(Adj_R2_DT, acc))

# Per-stock accuracy on the test rows, to see whether some symbols do better than others
stock_acc = {}
for s in stock_list:
    rows_for_stock = df_test[df_test["Symbol"] == s]
    stock_acc[s] = accuracy_score(rows_for_stock["Label_7D"], rows_for_stock["Label_7D_Pred"])

# Display the symbols from most to least accurate
dict(sorted(stock_acc.items(), key = lambda pair: pair[1], reverse = True))

R-squared : 0.5097875980836897 Accuracy : 0.5098933612771991


{'SBILIFE': 0.5526717557251909,
 'BAJAJ-AUTO': 0.5450381679389313,
 'CIPLA': 0.5312977099236641,
 'NESTLEIND': 0.5297709923664122,
 'INFY': 0.5282442748091603,
 'TATASTEEL': 0.5282442748091603,
 'ICICIBANK': 0.5267175572519084,
 'TITAN': 0.5251908396946565,
 'HINDALCO': 0.5251908396946565,
 'TCS': 0.5236641221374045,
 'ASIANPAINT': 0.5236641221374045,
 'LT': 0.5236641221374045,
 'JSWSTEEL': 0.5236641221374045,
 'ADANIPORTS': 0.5236641221374045,
 'HDFCBANK': 0.5221374045801527,
 'BPCL': 0.5206106870229008,
 'INDUSINDBK': 0.5206106870229008,
 'HCLTECH': 0.5190839694656488,
 'SUNPHARMA': 0.5190839694656488,
 'BAJFINANCE': 0.517557251908397,
 'POWERGRID': 0.516030534351145,
 'MARUTI': 0.5145038167938931,
 'KOTAKBANK': 0.5129770992366413,
 'M&M': 0.5129770992366413,
 'RELIANCE': 0.5099236641221374,
 'HEROMOTOCO': 0.5099236641221374,
 'COALINDIA': 0.5083969465648855,
 'HINDUNILVR': 0.5068702290076336,
 'TECHM': 0.5068702290076336,
 'DRREDDY': 0.5068702290076336,
 'ITC': 0.5053435114503817,
 

In [42]:
# 30-day horizon: fit the decision tree, then report overall and per-stock accuracy

df_test, Adj_R2_DT = decision_tree(
    df,
    stock_list,
    n_timeperiod = 7,
    gap_in_timeperiod = 5,
    pred_ahead = 30,
)
acc = accuracy_score(df_test["Label_30D"], df_test["Label_30D_Pred"])
print("R-squared : {} Accuracy : {}".format(Adj_R2_DT, acc))

# Per-stock accuracy on the test rows, to see whether some symbols do better than others
stock_acc = {}
for s in stock_list:
    rows_for_stock = df_test[df_test["Symbol"] == s]
    stock_acc[s] = accuracy_score(rows_for_stock["Label_30D"], rows_for_stock["Label_30D_Pred"])

# Display the symbols from most to least accurate
dict(sorted(stock_acc.items(), key = lambda pair: pair[1], reverse = True))

R-squared : 0.5087735496339104 Accuracy : 0.5088795708207436


{'ICICIBANK': 0.5694656488549619,
 'HCLTECH': 0.5572519083969466,
 'AXISBANK': 0.5541984732824428,
 'BAJAJFINSV': 0.5435114503816794,
 'NTPC': 0.5435114503816794,
 'BAJAJ-AUTO': 0.5419847328244275,
 'RELIANCE': 0.5389312977099237,
 'HDFCBANK': 0.5374045801526718,
 'ONGC': 0.5328244274809161,
 'DIVISLAB': 0.5328244274809161,
 'HINDUNILVR': 0.5297709923664122,
 'SHREECEM': 0.5297709923664122,
 'INDUSINDBK': 0.5297709923664122,
 'SBILIFE': 0.5236641221374045,
 'KOTAKBANK': 0.517557251908397,
 'IOC': 0.517557251908397,
 'TITAN': 0.516030534351145,
 'ULTRACEMCO': 0.516030534351145,
 'TATASTEEL': 0.516030534351145,
 'EICHERMOT': 0.516030534351145,
 'NESTLEIND': 0.5145038167938931,
 'COALINDIA': 0.5145038167938931,
 'POWERGRID': 0.5129770992366413,
 'SUNPHARMA': 0.5114503816793893,
 'ITC': 0.5083969465648855,
 'DRREDDY': 0.5083969465648855,
 'GRASIM': 0.5068702290076336,
 'BPCL': 0.5068702290076336,
 'WIPRO': 0.5053435114503817,
 'TECHM': 0.5053435114503817,
 'ASIANPAINT': 0.5038167938931297,

In [43]:
# 90-day horizon: fit the decision tree, then report overall and per-stock accuracy

df_test, Adj_R2_DT = decision_tree(
    df,
    stock_list,
    n_timeperiod = 10,
    gap_in_timeperiod = 7,
    pred_ahead = 90,
)
acc = accuracy_score(df_test["Label_90D"], df_test["Label_90D_Pred"])
print("R-squared : {} Accuracy : {}".format(Adj_R2_DT, acc))

# Per-stock accuracy on the test rows, to see whether some symbols do better than others
stock_acc = {}
for s in stock_list:
    rows_for_stock = df_test[df_test["Symbol"] == s]
    stock_acc[s] = accuracy_score(rows_for_stock["Label_90D"], rows_for_stock["Label_90D_Pred"])

# Display the symbols from most to least accurate
dict(sorted(stock_acc.items(), key = lambda pair: pair[1], reverse = True))

R-squared : 0.5194961517611787 Accuracy : 0.5196444554180426


{'ICICIBANK': 0.648854961832061,
 'NESTLEIND': 0.6106870229007634,
 'NTPC': 0.5954198473282443,
 'AXISBANK': 0.5908396946564886,
 'BAJAJFINSV': 0.5740458015267176,
 'ITC': 0.5694656488549619,
 'ULTRACEMCO': 0.5587786259541985,
 'TATASTEEL': 0.5572519083969466,
 'SHREECEM': 0.5557251908396946,
 'INDUSINDBK': 0.5480916030534351,
 'IOC': 0.5465648854961832,
 'DIVISLAB': 0.5465648854961832,
 'GRASIM': 0.5450381679389313,
 'TITAN': 0.5435114503816794,
 'HINDALCO': 0.5435114503816794,
 'ONGC': 0.5389312977099237,
 'HDFC': 0.5374045801526718,
 'BAJFINANCE': 0.5343511450381679,
 'JSWSTEEL': 0.5328244274809161,
 'BAJAJ-AUTO': 0.5312977099236641,
 'HDFCBANK': 0.5282442748091603,
 'TATACONSUM': 0.5261437908496732,
 'WIPRO': 0.5221374045801527,
 'SBILIFE': 0.5221374045801527,
 'POWERGRID': 0.5206106870229008,
 'HDFCLIFE': 0.5190839694656488,
 'CIPLA': 0.517557251908397,
 'DRREDDY': 0.516030534351145,
 'KOTAKBANK': 0.5114503816793893,
 'ADANIPORTS': 0.5114503816793893,
 'BHARTIARTL': 0.509923664122

In [58]:
# RANDOM FOREST for predicting the trend (1 (Up) or 0 (Down)) pred_ahead days ahead, using volumes from the "n_timeperiod" previous time
# periods, with gap_in_timeperiod days between observations

def random_forest(df, stocklist, n_timeperiod, gap_in_timeperiod = 2, pred_ahead = 7):
    """Fit a random-forest classifier on the volume features and evaluate it on post-2018 rows.

    For every symbol in ``stocklist`` the feature frame is built with
    ``extract_script``; the frames are stacked, split by calendar year
    (train: years before 2019, test: 2019 onwards) and a
    ``RandomForestClassifier`` (500 trees, depth 5, seed 0) is fitted on every
    column whose name starts with ``"Volume"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data, forwarded unchanged to ``extract_script``.
    stocklist : iterable of str
        Symbols to include; each one is passed to ``extract_script``.
    n_timeperiod, gap_in_timeperiod : int
        Forwarded to ``extract_script`` (per the cell header: number of
        previous observations and the gap in days between them).
    pred_ahead : int
        Prediction horizon in days. Selects the target column
        ``"Label_<pred_ahead>D"`` (e.g. 30 -> ``"Label_30D"``).

    Returns
    -------
    (pandas.DataFrame, float)
        The test rows with their previous index kept in an ``"index"``
        column and the predictions in ``"Label_<pred_ahead>D_Pred"``, and the
        adjusted score described below.

    Raises
    ------
    ValueError
        If ``stocklist`` is empty or the label column for ``pred_ahead`` does
        not exist in the extracted data.

    Notes
    -----
    ``score`` on a scikit-learn classifier is mean accuracy, so the value
    returned as ``Adj_R2_RF`` is an accuracy with the adjusted-R^2
    degrees-of-freedom correction applied, not a regression R^2. The name is
    kept so existing callers keep working.
    """
    # Use the caller's list (not the notebook-level global) and concatenate once
    # instead of growing a frame inside the loop.
    frames = [extract_script(df, stock, n_timeperiod, gap_in_timeperiod) for stock in stocklist]
    if not frames:
        raise ValueError("stocklist must contain at least one symbol")
    final_df = pd.concat(frames, axis = 0)

    # Year column drives the chronological train/test split
    final_df["Year"] = pd.DatetimeIndex(final_df['Date']).year

    # Resolve the target column from the horizon and fail early with a clear message
    label_col = "Label_{}D".format(pred_ahead)
    if label_col not in final_df.columns:
        raise ValueError(
            "No label column '{}' for pred_ahead = {}".format(label_col, pred_ahead)
        )

    # Train on old data and test on new data: up to 2018 vs. 2019 onwards
    df_train = final_df[final_df["Year"] < 2019]
    df_test = final_df[final_df["Year"] > 2018]

    # Every column starting with "Volume" is a model feature
    col_list = [col for col in df_train.columns if col.startswith("Volume")]
    # Optional extra features, currently disabled:
    # col_list.append("Symbol")
    # col_list.append("Turnover")
    # col_list.append("Deliverable Volume")
    # col_list.append("%Deliverble")

    X_train = df_train[col_list]
    X_test = df_test[col_list]
    Y_train = df_train[label_col]
    Y_test = df_test[label_col]

    rf = RandomForestClassifier(max_depth=5, random_state=0, n_estimators = 500)
    rf.fit(X_train, Y_train)

    Y_pred = rf.predict(X_test)

    # Adjusted score: 1 - (1 - accuracy) * (n - 1) / (n - p - 1)
    Adj_R2_RF = 1 - ( 1-rf.score(X_test, Y_test) ) * ( len(Y_test) - 1 ) / ( len(Y_test) - X_test.shape[1] - 1 )

    # Attach predictions positionally. reset_index() returns a new frame (no
    # in-place mutation of a filtered slice) and the predictions are added as a
    # plain column, so the result no longer has two columns named "index".
    df_test = df_test.reset_index()
    df_test["Label_{}D_Pred".format(pred_ahead)] = Y_pred

    return df_test, Adj_R2_RF


In [59]:
# 7-day horizon: fit the random forest, then report overall and per-stock accuracy

df_test, Adj_R2_RF = random_forest(
    df,
    stock_list,
    n_timeperiod = 7,
    gap_in_timeperiod = 3,
    pred_ahead = 7,
)
acc = accuracy_score(df_test["Label_7D"], df_test["Label_7D_Pred"])
print("R-squared : {} Accuracy : {}".format(Adj_R2_RF, acc))

# Per-stock accuracy on the test rows, to see whether some symbols do better than others
stock_acc = {}
for s in stock_list:
    rows_for_stock = df_test[df_test["Symbol"] == s]
    stock_acc[s] = accuracy_score(rows_for_stock["Label_7D"], rows_for_stock["Label_7D_Pred"])

# Display the symbols from most to least accurate
dict(sorted(stock_acc.items(), key = lambda pair: pair[1], reverse = True))

R-squared : 0.5389191991282698 Accuracy : 0.5390186771867103


{'TATACONSUM': 0.698005698005698,
 'DIVISLAB': 0.6396946564885496,
 'INFY': 0.6229007633587786,
 'TITAN': 0.6122137404580152,
 'TECHM': 0.6076335877862595,
 'TATASTEEL': 0.6061068702290077,
 'HCLTECH': 0.6,
 'ASIANPAINT': 0.6,
 'BAJFINANCE': 0.5954198473282443,
 'BAJAJFINSV': 0.5938931297709924,
 'WIPRO': 0.5877862595419847,
 'NESTLEIND': 0.5801526717557252,
 'SBILIFE': 0.5770992366412214,
 'ULTRACEMCO': 0.5725190839694656,
 'JSWSTEEL': 0.5725190839694656,
 'ICICIBANK': 0.5709923664122137,
 'GRASIM': 0.566412213740458,
 'UPL': 0.5587786259541985,
 'DRREDDY': 0.5557251908396946,
 'RELIANCE': 0.5541984732824428,
 'ADANIPORTS': 0.5480916030534351,
 'TCS': 0.5450381679389313,
 'HINDALCO': 0.5450381679389313,
 'SHREECEM': 0.5450381679389313,
 'HDFCLIFE': 0.5419847328244275,
 'BHARTIARTL': 0.5404580152671755,
 'HDFCBANK': 0.5389312977099237,
 'AXISBANK': 0.5389312977099237,
 'HDFC': 0.5282442748091603,
 'BAJAJ-AUTO': 0.5221374045801527,
 'SBIN': 0.517557251908397,
 'BRITANNIA': 0.51755725190

In [60]:
# 30-day horizon: fit the random forest, then report overall and per-stock accuracy

df_test, Adj_R2_RF = random_forest(
    df,
    stock_list,
    n_timeperiod = 7,
    gap_in_timeperiod = 5,
    pred_ahead = 30,
)
acc = accuracy_score(df_test["Label_30D"], df_test["Label_30D_Pred"])
print("R-squared : {} Accuracy : {}".format(Adj_R2_RF, acc))

# Per-stock accuracy on the test rows, to see whether some symbols do better than others
stock_acc = {}
for s in stock_list:
    rows_for_stock = df_test[df_test["Symbol"] == s]
    stock_acc[s] = accuracy_score(rows_for_stock["Label_30D"], rows_for_stock["Label_30D_Pred"])

# Display the symbols from most to least accurate
dict(sorted(stock_acc.items(), key = lambda pair: pair[1], reverse = True))

R-squared : 0.610263049800575 Accuracy : 0.6103471665536165


{'TATACONSUM': 0.8377581120943953,
 'BAJFINANCE': 0.7297709923664122,
 'INFY': 0.7282442748091603,
 'DIVISLAB': 0.7175572519083969,
 'TECHM': 0.7129770992366412,
 'SBILIFE': 0.7129770992366412,
 'TITAN': 0.7038167938931298,
 'BAJAJFINSV': 0.6992366412213741,
 'GRASIM': 0.6931297709923664,
 'TCS': 0.6870229007633588,
 'ICICIBANK': 0.683969465648855,
 'ASIANPAINT': 0.6824427480916031,
 'TATASTEEL': 0.6778625954198473,
 'HCLTECH': 0.6702290076335878,
 'ULTRACEMCO': 0.6702290076335878,
 'JSWSTEEL': 0.6580152671755726,
 'DRREDDY': 0.6580152671755726,
 'HDFCLIFE': 0.6564885496183206,
 'HINDALCO': 0.6519083969465649,
 'AXISBANK': 0.6473282442748092,
 'NESTLEIND': 0.6473282442748092,
 'SHREECEM': 0.6458015267175573,
 'BHARTIARTL': 0.6229007633587786,
 'WIPRO': 0.6183206106870229,
 'SUNPHARMA': 0.6106870229007634,
 'ADANIPORTS': 0.6061068702290077,
 'HINDUNILVR': 0.5984732824427481,
 'TATAMOTORS': 0.5954198473282443,
 'BAJAJ-AUTO': 0.5954198473282443,
 'HDFCBANK': 0.5923664122137404,
 'RELIANCE

In [61]:
# 90-day horizon: fit the random forest, then report overall and per-stock accuracy

df_test, Adj_R2_RF = random_forest(
    df,
    stock_list,
    n_timeperiod = 10,
    gap_in_timeperiod = 7,
    pred_ahead = 90,
)
acc = accuracy_score(df_test["Label_90D"], df_test["Label_90D_Pred"])
print("R-squared : {} Accuracy : {}".format(Adj_R2_RF, acc))

# Per-stock accuracy on the test rows, to see whether some symbols do better than others
stock_acc = {}
for s in stock_list:
    rows_for_stock = df_test[df_test["Symbol"] == s]
    stock_acc[s] = accuracy_score(rows_for_stock["Label_90D"], rows_for_stock["Label_90D_Pred"])

# Display the symbols from most to least accurate
dict(sorted(stock_acc.items(), key = lambda pair: pair[1], reverse = True))

R-squared : 0.6436047016146909 Accuracy : 0.6437147001635752


{'TATACONSUM': 0.9411764705882353,
 'DIVISLAB': 0.8610687022900764,
 'BAJFINANCE': 0.8381679389312977,
 'ICICIBANK': 0.8229007633587786,
 'ASIANPAINT': 0.8015267175572519,
 'NESTLEIND': 0.7893129770992366,
 'SBILIFE': 0.7893129770992366,
 'BAJAJFINSV': 0.7877862595419848,
 'BHARTIARTL': 0.7816793893129771,
 'SUNPHARMA': 0.7694656488549618,
 'HINDUNILVR': 0.7435114503816794,
 'TITAN': 0.7419847328244275,
 'TCS': 0.7297709923664122,
 'HDFCLIFE': 0.7251908396946565,
 'INFY': 0.716030534351145,
 'SBIN': 0.7114503816793893,
 'HDFC': 0.7007633587786259,
 'TECHM': 0.6992366412213741,
 'GRASIM': 0.6931297709923664,
 'KOTAKBANK': 0.6885496183206107,
 'CIPLA': 0.683969465648855,
 'ULTRACEMCO': 0.6793893129770993,
 'HINDALCO': 0.6763358778625954,
 'RELIANCE': 0.6732824427480916,
 'HCLTECH': 0.6534351145038167,
 'DRREDDY': 0.6458015267175573,
 'TATASTEEL': 0.6335877862595419,
 'TATAMOTORS': 0.6335877862595419,
 'BAJAJ-AUTO': 0.6244274809160305,
 'SHREECEM': 0.6229007633587786,
 'WIPRO': 0.61526717