In [123]:
from datetime import datetime as dt
import numpy as np
import pandas as pd
import talib


data_df = pd.read_csv("/Users/prateekagrawal/Downloads/reddy_historical1.csv", index_col=[0]).drop('co_code', axis=1)
data_df['date'] = data_df['date'].map(lambda x: dt.strptime(x, '%m/%d/%Y %I:%M:%S %p').date())
data_df['volume'] = data_df['volume'].astype(float)
data_df['target'] = np.append(data_df['close'][1:].values, [np.nan])
data_df = data_df.set_index('date')

# Simple Moving Average
data_df['sma_5'] = talib.SMA(data_df['close'].values, timeperiod=5)
data_df['sma_10'] = talib.SMA(data_df['close'].values, timeperiod=10)

# Exponential Moving Average
data_df['ema_20'] = talib.EMA(data_df['close'].values, timeperiod=20)

# Momentum 6 Month / Momentum 12 Month
data_df['mtm6_mtm12'] = talib.MOM(data_df['close'].values, timeperiod=126)/talib.MOM(data_df['close'].values, 
                                  timeperiod=252)

# Stochastic Relative Strength Index
data_df['fastk'], data_df['fastd'] = talib.STOCHRSI(data_df['close'].values, timeperiod=14, fastk_period=5,
                                                    fastd_period=3, fastd_matype=0)

# Rate Of Change
data_df['roc_10'] = talib.ROC(data_df['close'].values, timeperiod=10)

# Bollinger Bands
data_df['bband_upper'], data_df['bband_middle'], data_df['bband_lower'] = talib.BBANDS(data_df['close'].values,
                                                                                     timeperiod=5, nbdevup=2, nbdevdn=2,
                                                                                     matype=0)

# Moving Average Convergence Divergence
data_df['macd'], data_df['macdsignal'], data_df['macdhist'] = talib.MACD(data_df['close'].values, fastperiod=12, 
                                                                         slowperiod=26, signalperiod=9)

# On Balance Volume
data_df['obv'] = talib.OBV(data_df['close'].values, data_df['volume'].values)

# Commodity Channel Index
data_df['cci_14'] = talib.CCI(data_df['high'].values, data_df['low'].values, data_df['close'].values, timeperiod=14)


# Average True Range
data_df['atr_14'] = talib.ATR(data_df['high'].values, data_df['low'].values, data_df['close'].values, timeperiod=14)

data_df = data_df.dropna(axis=0, how='any')

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


target = data_df.pop('target')
X_train, X_test, y_train, y_test = train_test_split(data_df, target, test_size=0.20, shuffle=False)
lin_reg = Pipeline([('scaler', StandardScaler()), ('model', LinearRegression())])
lin_reg.fit(X_train, y_train)
lin_reg.score(X_train, y_train)
lin_reg.score(X_test, y_test)
target = pd.DataFrame(target)
target['pred'] = lin_reg.predict(data_df)
target = target.join(data_df[['open', 'high', 'low', 'close']])
target = target[['open', 'high', 'low', 'close', 'target', 'pred']]
target.loc[:, 'accuracy'] = np.nan
for i in range(len(target)):
    if (target.iloc[i, 4] > target.iloc[i, 3]) and (target.iloc[i, 5] > target.iloc[i, 3]):
        target.iloc[i, 6] = 1
    elif (target.iloc[i, 4] < target.iloc[i, 3]) and (target.iloc[i, 5] < target.iloc[i, 3]):
        target.iloc[i, 6] = 1
    else:
        target.iloc[i, 6] = 0

In [124]:
data_df.dropna()

Unnamed: 0_level_0,open,high,low,close,volume,target,sma_5,sma_10,ema_20,mtm6_mtm12,...,roc_10,bband_upper,bband_middle,bband_lower,macd,macdsignal,macdhist,obv,cci_14,atr_14
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-10-23,628.00,639.9,622.00,636.05,448779.0,630.05,619.46,622.390,630.875658,1.034286,...,-1.265135,638.500840,619.46,600.419160,-6.205090,-5.538564,-0.666526,3272105.0,18.946982,20.130722
2007-10-24,630.00,640.0,625.05,630.05,838067.0,612.80,623.82,621.675,630.797024,0.805113,...,-1.122097,640.424289,623.82,607.215711,-5.488580,-5.528567,0.039987,2434038.0,25.655037,19.760671
2007-10-25,629.70,638.0,605.00,612.80,1165187.0,617.55,623.61,619.850,629.083022,0.821262,...,-2.892005,640.731846,623.61,606.488154,-6.240732,-5.671000,-0.569732,1268851.0,-42.766457,20.706337
2007-10-26,617.00,625.0,604.60,617.55,425943.0,617.35,623.85,619.870,627.984639,0.913364,...,0.032397,640.587383,623.85,607.112617,-6.379988,-5.812797,-0.567190,1694794.0,-55.949621,20.684456
2007-10-29,619.10,623.8,606.10,617.35,199162.0,616.30,622.76,619.785,626.971816,0.896585,...,-0.137496,640.318633,622.76,605.201367,-6.432340,-5.936706,-0.495634,1495632.0,-50.401450,20.471280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-10-09,2624.75,2676.0,2600.30,2663.40,466444.0,2633.50,2652.81,2703.055,2689.949913,-2.100719,...,-3.752823,2713.436839,2652.81,2592.183161,4.651275,24.875188,-20.223913,62583777.0,-96.414502,69.801423
2019-10-10,2626.15,2666.9,2612.05,2633.50,288141.0,2651.40,2641.64,2687.880,2684.573731,-1.786486,...,-5.448344,2690.698145,2641.64,2592.581855,-0.079174,19.884316,-19.963490,62295636.0,-90.810327,68.733465
2019-10-11,2645.00,2659.0,2607.00,2651.40,546907.0,2651.90,2636.49,2675.685,2681.414328,-0.942443,...,-4.397209,2673.477155,2636.49,2599.502845,-2.356544,15.436144,-17.792688,62842543.0,-75.589192,67.538217
2019-10-14,2652.00,2682.6,2621.00,2651.90,301437.0,2684.65,2643.33,2663.390,2678.603440,-0.833566,...,-4.430870,2676.321369,2643.33,2610.338631,-4.074067,11.534102,-15.608168,63143980.0,-53.134820,67.114059


In [125]:
data_df["close_lr"] = np.log(data_df["close"])-np.log(data_df["close"].shift(1))

In [126]:
data_df.columns

Index(['open', 'high', 'low', 'close', 'volume', 'target', 'sma_5', 'sma_10',
       'ema_20', 'mtm6_mtm12', 'fastk', 'fastd', 'roc_10', 'bband_upper',
       'bband_middle', 'bband_lower', 'macd', 'macdsignal', 'macdhist', 'obv',
       'cci_14', 'atr_14', 'close_lr'],
      dtype='object')

In [127]:
data_df["open_lr"] = np.log(data_df["open"])-np.log(data_df["open"].shift(1))
data_df["high_lr"] = np.log(data_df["high"])-np.log(data_df["high"].shift(1))
data_df["low_lr"] = np.log(data_df["low"])-np.log(data_df["low"].shift(1))
data_df["target_lr"] = np.log(data_df["target"])-np.log(data_df["target"].shift(1))

data_df["volume_lr"] = np.log(data_df["volume"])-np.log(data_df["volume"].shift(1))
data_df["sma_5_lr"] = np.log(data_df["sma_5"])-np.log(data_df["sma_5"].shift(1))
data_df["sma_10_lr"] = np.log(data_df["sma_10"])-np.log(data_df["sma_10"].shift(1))
data_df["ema_20_lr"] = np.log(data_df["ema_20"])-np.log(data_df["ema_20"].shift(1))
#data_df["mtm6_mtm12_lr"] = np.log(data_df["mtm6_mtm12"])-np.log(data_df["mtm6_mtm12"].shift(1))
#data_df["fastk_lr"] = np.log(data_df["fastk"])-np.log(data_df["fastk"].shift(1))
#data_df["roc_10_lr"] = np.log(data_df["roc_10"])-np.log(data_df["roc_10"].shift(1))
data_df["bband_upper_lr"] = np.log(data_df["bband_upper"])-np.log(data_df["bband_upper"].shift(1))
data_df["bband_middle_lr"] = np.log(data_df["bband_middle"])-np.log(data_df["bband_middle"].shift(1))
data_df["bband_lower_lr"] = np.log(data_df["bband_lower"])-np.log(data_df["bband_lower"].shift(1))
#data_df["macd_lr"] = np.log(data_df["macd"])-np.log(data_df["macd"].shift(1))
#data_df["macdsignal_lr"] = np.log(data_df["macdsignal"])-np.log(data_df["macdsignal"].shift(1))
#data_df["macdhist_lr"] = np.log(data_df["macdhist"])-np.log(data_df["macdhist"].shift(1))
#data_df["obv_lr"] = np.log(data_df["obv"])-np.log(data_df["obv"].shift(1))
#data_df["cci_14_lr"] = np.log(data_df["cci_14"])-np.log(data_df["cci_14"].shift(1))
#data_df["atr_14_lr"] = np.log(data_df["atr_14"])-np.log(data_df["atr_14"].shift(1))

In [128]:
data_df.columns

Index(['open', 'high', 'low', 'close', 'volume', 'target', 'sma_5', 'sma_10',
       'ema_20', 'mtm6_mtm12', 'fastk', 'fastd', 'roc_10', 'bband_upper',
       'bband_middle', 'bband_lower', 'macd', 'macdsignal', 'macdhist', 'obv',
       'cci_14', 'atr_14', 'close_lr', 'open_lr', 'high_lr', 'low_lr',
       'target_lr', 'volume_lr', 'sma_5_lr', 'sma_10_lr', 'ema_20_lr',
       'bband_upper_lr', 'bband_middle_lr', 'bband_lower_lr'],
      dtype='object')

In [129]:
data_df2 = data_df[["open_lr", "high_lr", "low_lr", "close_lr", "volume_lr", "sma_5_lr", "sma_10_lr", "ema_20_lr", "bband_upper_lr", "bband_lower_lr", "target_lr", "mtm6_mtm12", "fastk", "fastd", "roc_10", "macd", "macdsignal", "macdhist", "obv", "cci_14", "atr_14",  ]]

In [130]:
data_df2 = data_df2.dropna(axis=0, how='any')

In [131]:
data_df2.describe()

Unnamed: 0,open_lr,high_lr,low_lr,close_lr,volume_lr,sma_5_lr,sma_10_lr,ema_20_lr,bband_upper_lr,bband_lower_lr,...,mtm6_mtm12,fastk,fastd,roc_10,macd,macdsignal,macdhist,obv,cci_14,atr_14
count,2962.0,2962.0,2962.0,2962.0,2962.0,2962.0,2962.0,2962.0,2962.0,2962.0,...,2962.0,2962.0,2962.0,2962.0,2962.0,2962.0,2962.0,2962.0,2962.0,2962.0
mean,0.000488,0.000486,0.00049,0.000486,-9.2e-05,0.000492,0.00049,0.000488,0.000486,0.000498,...,0.078555,51.414276,51.41572,0.661814,4.823288,4.804093,0.019195,43407940.0,15.474517,53.804862
std,0.021514,0.017675,0.022652,0.018444,0.676449,0.008413,0.005858,0.004152,0.013888,0.014459,...,15.912345,41.547449,32.8916,5.806697,43.220165,40.048585,14.538713,22594290.0,108.848667,21.948112
min,-0.176234,-0.182788,-0.334132,-0.157366,-5.813163,-0.051908,-0.029575,-0.019213,-0.156687,-0.15498,...,-614.25,0.0,-9.710751e-14,-25.794264,-248.245025,-221.360486,-111.03841,-6968550.0,-377.485789,16.30778
25%,-0.009526,-0.007807,-0.007624,-0.008769,-0.386658,-0.003699,-0.002594,-0.001897,-0.004305,-0.004615,...,0.190943,0.0,22.08896,-2.561646,-15.670577,-14.006542,-5.360874,33218560.0,-70.415342,37.049179
50%,0.000299,0.0,0.001408,0.000474,-0.019716,0.000687,0.000766,0.000674,0.000155,0.000215,...,0.514225,54.060592,52.03714,0.758726,6.219008,6.023783,0.58449,45641910.0,28.492504,49.359786
75%,0.010763,0.008517,0.009724,0.010031,0.354889,0.005291,0.004064,0.003006,0.005716,0.006329,...,0.912112,100.0,81.31005,4.120197,27.645282,26.789107,6.55457,62873630.0,97.669639,68.10699
max,0.233615,0.126212,0.306636,0.101481,6.951863,0.037975,0.021364,0.017954,0.093652,0.090063,...,75.625,100.0,100.0,24.562022,157.0993,143.57888,48.841265,78679060.0,333.781893,147.303422


In [132]:

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


target = data_df2.pop('target_lr')
X_train, X_test, y_train, y_test = train_test_split(data_df2, target, test_size=0.20, shuffle=False)
lin_reg = Pipeline([('scaler', StandardScaler()), ('model', LinearRegression())])
lin_reg.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('model',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)

In [133]:
print(lin_reg.score(X_train, y_train))
#lin_reg.score(X_test, y_test)
target = pd.DataFrame(target)
target['pred'] = lin_reg.predict(data_df2)
target = target.join(data_df2[['open_lr', 'high_lr', 'low_lr', 'close_lr']])
target = target[['open_lr', 'high_lr', 'low_lr', 'close_lr', 'target_lr', 'pred']]
target.loc[:, 'accuracy'] = np.nan
#for i in range(len(target)):
 #   if (target.iloc[i, 4] > target.iloc[i, 3]) and (target.iloc[i, 5] > target.iloc[i, 3]):
  #      target.iloc[i, 6] = 1
   ## elif (target.iloc[i, 4] < target.iloc[i, 3]) and (target.iloc[i, 5] < target.iloc[i, 3]):
     #   target.iloc[i, 6] = 1
    #else:
     #   target.iloc[i, 6] = 0
    
for i in range(len(target)):
    if (target.iloc[i,4]>0 and target.iloc[i,5]>0):
        target.iloc[i,6] =1
    elif(target.iloc[i,4]<0 and target.iloc[i,5]<0):
        target.iloc[i,6] =1

0.012567647851500552


In [136]:
target['accuracy'].sum()

1519.0

In [137]:
target.head(20)

Unnamed: 0_level_0,open_lr,high_lr,low_lr,close_lr,target_lr,pred,accuracy
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2007-10-24,0.00318,0.000156,0.004892,-0.009478,-0.027761,0.00073,
2007-10-25,-0.000476,-0.00313,-0.032603,-0.027761,0.007721,0.000234,1.0
2007-10-26,-0.020374,-0.020587,-0.000661,0.007721,-0.000324,-0.000387,1.0
2007-10-29,0.003398,-0.001922,0.002478,-0.000324,-0.001702,-0.00011,1.0
2007-10-30,-0.011044,-0.001524,0.009687,-0.001702,0.008563,0.00146,1.0
2007-10-31,0.009266,0.003446,0.0,0.008563,-0.011081,0.00072,
2007-11-01,-0.009756,0.0,-0.003191,-0.011081,-0.001139,-0.000816,1.0
2007-11-02,-0.009852,-0.005294,-0.006661,-0.001139,-0.017662,-0.000224,1.0
2007-11-05,0.008217,-0.010835,-0.008285,-0.017662,0.016766,0.001077,1.0
2007-11-06,-0.007722,0.002436,0.004979,0.016766,-0.015523,0.001932,
