In [1]:
import numpy as np
import scipy.stats as ss
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from scipy.optimize import minimize
import math
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("NVDA.csv",decimal =',') 
df['Date']= pd.to_datetime(df['Date']) 
df['Date']=df['Date'].dt.date
df.head()

Unnamed: 0,Date,Open,Close,High,Low,Volume
0,2016-01-04,32.290001,32.369999,32.580002,32.040001,8951900
1,2016-01-05,32.98,32.889999,33.439999,32.5,12256800
2,2016-01-06,32.349998,31.530001,32.5,31.16,11233600
3,2016-01-07,30.74,30.280001,30.950001,29.879999,16132600
4,2016-01-08,30.67,29.629999,30.700001,29.57,9961800


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 6 columns):
Date      878 non-null object
Open      879 non-null object
Close     879 non-null object
High      879 non-null object
Low       879 non-null object
Volume    879 non-null object
dtypes: object(6)
memory usage: 41.3+ KB


In [4]:
df['Open']=df['Open'].astype(float)
df['Close']=df['Close'].astype(float)
df['High']=df['High'].astype(float)
df['Low']=df['Low'].astype(float)
df['Volume']=df['Volume'].astype(float)

In [5]:
#Feature Engineering
df['OC']=df['Open']-df['Close']
df['AvgOC']=(df['Open']+df['Close'])/2
df['HL']=df['High']-df['Low']
df['AvgHL']=(df['High']+df['Low'])/2
df['LV']=np.log(df['Volume'])



In [6]:
#Selected Dataset - Variables used for training

X=df[['Close','HL','OC','LV']]

#Target Variable

Y1=df['Open'].as_matrix()
Y1=Y1[0:877]

#Target variable Y as next day Opening value

Y=df['Open'].shift(-1).as_matrix()
T=877

X=X[0:877]
X=np.column_stack([np.ones((T,1)),X])

Y=Y[0:877]
#Random Split into Train and test with pareto principle of 80:20
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [7]:
def linear(Xt,Yt):
    invXX = np.linalg.inv(Xt.transpose()@Xt)
    beta_hat = invXX@Xt.transpose()@Yt
    y_hat = Xt@beta_hat
    return y_hat


In [8]:
def GARCH(Y1):
    mu = param0[0]
    omega = param0[1]
    alpha = param0[2]
    beta = param0[3]
    T = Y1.shape[0]
    GARCH_Dens = np.zeros(T)
    sigma2 = np.zeros(T)
    F = np.zeros(T)
    v = np.zeros(T)
    sigma2[0] = omega/(1-alpha)
    
    for t in range(1,T):
        sigma2[t] = omega+alpha*((Y1[t-1]-mu)**2)+beta*(sigma2[t-1])
        F[t] = Y1[t] - mu-np.sqrt(sigma2[t])*np.random.normal(0,1,1)
        v[t] = sigma2[t];
        GARCH_Dens[t] = (1/2)*np.log(2*np.pi)+(1/2)*np.log(v[t])+\
                     (1/2)*(F[t]/v[t]);
        Likelihood = np.sum(GARCH_Dens[1:-1])
    return Likelihood

In [9]:
def GARCH_PROD(params,T,Y1):
    mu = param0[0]
    omega = param0[1]
    alpha = param0[2]
    beta = param0[3]
    #Y = np.zeros(T)
    sigma2 = np.zeros(T)
    sigma2[0]= omega/(1-alpha)
    for t in range(1,T):
        sigma2[t] = omega+alpha*((Y1[t-1]-mu)**2)+beta*(sigma2[t-1])
        Y1[t] = mu+np.sqrt(sigma2[t])*np.random.normal(0,1,1)
    return Y1
    

In [10]:
def GARCH_PROD_t(params, Y0, T):
    mu = params[0]
    omega = np.exp(params[1])
    alpha = 1/(1+np.exp(-params[2]))
    beta = params[3]
    nv = params[4]
    #Y = np.zeros(T)
    Y = (df['Open ']+df['Close'])/2
    #Y = (df['Volume'])
    sigma2 = np.zeros(T)
    Y[0] = Y0
    sigma2[0]= omega/(1-alpha)
    for t in range(1,T):
        sigma2[t] = omega+alpha*((Y[t-1]-mu)**2)+beta*(sigma2[t-1]);
        Y[t] = mu+np.sqrt(sigma2[t])*np.random.normal(nv,1)
    return Y

In [11]:
def Kalman_Filter(YK):

    S=YK.shape[0]
    z = 1
    Z = 1
    T = 0.5
    H = np.var(YK);
    Q = 0.5*np.var(YK)
    #Kalman Filter Starts
    u_predict = np.zeros(S)
    u_update = np.zeros(S)
    p_predict = np.zeros(S)
    p_update = np.zeros(S)
    v = np.zeros(S)
    F = np.zeros(S)
    KF_Dens = np.zeros(S)

    for s in range(1,S):
        if s ==1:
            p_update[s] =1000
            p_predict[s] = T*p_update[1]*np.transpose(T)+Q
        else:
            F[s] = z*p_predict[s-1]*np.transpose(z)+H
            v[s] = YK[s-1]-z*u_predict[s-1]

            u_update[s] =T*u_update[s-1]+p_predict[s-1]*np.transpose(Z)*(1/F[s])*v[s]
            u_predict[s] = T*u_update[s]

            p_update[s] = p_predict[s-1]-p_predict[s-1]*np.transpose(Z)*(1/F[s])*Z*p_predict[s-1]
            p_predict[s] = T*p_update[s]*np.transpose(T)+Q
            KF_Dens[s]=(1/2)*np.log(2*np.pi)+(1/2)*np.log(abs(F[s]))+(1/2)*np.transpose(v[s])*(1/F[s])
            Likelihood=sum(KF_Dens[1:-1])
    return Likelihood

In [12]:
def Kalman_Smoother(params,YKS):
    S=YKS.shape[0]
    Z = params[0]
    T = params[1]
    H = params[2]
    Q = params[3]
    #Kalman Filter Starts
    u_predict = np.zeros(S)
    u_update = np.zeros(S)
    p_predict = np.zeros(S)
    p_update = np.zeros(S)
    v = np.zeros(S)
    F = np.zeros(S)
    KF_Dens = np.zeros(S)

    for s in range(1,S):
        if s ==1:
            p_update[s] =1000
            p_predict[s] = T*p_update[1]*np.transpose(T)+Q
        else:
            F[s] = Z*p_predict[s-1]*np.transpose(Z)+H
            v[s] = YKS[s-1]-Z*u_predict[s-1]
            
            u_update[s] =T*u_update[s-1]+p_predict[s-1]*np.transpose(Z)*(1/F[s])*v[s]
            u_predict[s] = T*u_update[s]

            p_update[s] = p_predict[s-1]-p_predict[s-1]*np.transpose(Z)*(1/F[s])*Z*p_predict[s-1]
            p_predict[s] = T*p_update[s]*np.transpose(T)+Q
        
    u_smooth = np.zeros(S)
    p_smooth = np.zeros(S)
    u_smooth[S-1] = u_update[S-1]
    p_smooth[S-1] = p_update[S-1]
    for t in range (S-1,0,-1):
        u_smooth[t-1] =u_update[t]+p_update[t]*np.transpose(T)/p_predict[t]*(u_smooth[t]-T*u_update[t])
        p_smooth[t-1] = p_update[t]+p_update[t]*np.transpose(T)/p_predict[t]*(p_smooth[t]-p_predict[t])/p_predict[t]
    return u_smooth

In [13]:
def MovingAvg(x, y):
    preds = []
    for i in range(0,y.shape[0]):
        a = df['Open'][len(df)-301+i:].sum() + sum(preds)
        b = a/y.shape[0]
        preds.append(b)
    return preds

In [14]:
Y1Temp=Y1

In [15]:
val=MovingAvg(X,Y1)

In [16]:
df_yhatLinearTrain=pd.DataFrame()
df_yhatGarchTrain=pd.DataFrame()
df_yhatGarchTTrain=pd.DataFrame()
df_yhatKalman=pd.DataFrame()
df_yhatMA=pd.DataFrame()

df_yhatLinearTest=pd.DataFrame()
df_yhatGarchTest=pd.DataFrame()
df_yhatGarchTTest=pd.DataFrame()
Y1Temp=Y1

for x in range(0,100):
    #Linear for Train dataset
    df_yhatLinearTrain[x]=linear(X_train,y_train)
    
#     """Linear for Test dataset"""
#     df_yhatLinearTest[x]=linear(X_test,y_test)

    #Garch for Train dataset
    Y1=Y1Temp
    T = len(Y1)
    param0 = np.array([32,35,0.1,0.1])
    param_star = minimize(GARCH, param0, method ='BFGS', options ={'xtol':1e-8, 'disp':True})
    Y_GARCH = GARCH_PROD(param_star.x,T,Y1)
    df_yhatGarchTrain[x]=Y_GARCH
    
    Y1=Y1Temp
    param0 = np.array([1.3,0.3,8,9])
    param_star = minimize(Kalman_Filter, param0, method = 'BFGS', options ={'xtol':1e-8,'disp': True})
    Y_update = Kalman_Smoother(param_star.x,Y1)
    df_yhatKalman[x]=Y_update
    
    Y1=Y1Temp
    df_yhatMA[x]=MovingAvg(X,Y1)
#     """Garch for Test dataset"""
#     T = len(y_test)
#     mu = 35
#     sig =5
#     param0 = np.array([32,35,0.2,0.5])
#     param_star = minimize(GARCH, param0, method ='BFGS', options ={'xtol':1e-8, 'disp':True})
#     Y_GARCH = GARCH_PROD(param_star.x,T,y_test)
#     df_yhatGarchTest[x]=Y_GARCH


         Current function value: 4.904860
         Iterations: 1
         Function evaluations: 114
         Gradient evaluations: 17
         Current function value: -11109.886400
         Iterations: 53
         Function evaluations: 432
         Gradient evaluations: 70
         Current function value: 4.822935
         Iterations: 4
         Function evaluations: 239
         Gradient evaluations: 37
         Current function value: -11109.886400
         Iterations: 53
         Function evaluations: 432
         Gradient evaluations: 70
         Current function value: 4.989127
         Iterations: 3
         Function evaluations: 206
         Gradient evaluations: 31
         Current function value: -11109.886400
         Iterations: 53
         Function evaluations: 432
         Gradient evaluations: 70
         Current function value: 4.937998
         Iterations: 1
         Function evaluations: 131
         Gradient evaluations: 20
         Current function value: -11109.8864

         Current function value: 5.027315
         Iterations: 1
         Function evaluations: 167
         Gradient evaluations: 26
         Current function value: -11109.886400
         Iterations: 53
         Function evaluations: 432
         Gradient evaluations: 70
         Current function value: 4.941774
         Iterations: 2
         Function evaluations: 198
         Gradient evaluations: 31
         Current function value: -11109.886400
         Iterations: 53
         Function evaluations: 432
         Gradient evaluations: 70
         Current function value: 4.935259
         Iterations: 1
         Function evaluations: 138
         Gradient evaluations: 21
         Current function value: -11109.886400
         Iterations: 53
         Function evaluations: 432
         Gradient evaluations: 70
         Current function value: 4.930585
         Iterations: 1
         Function evaluations: 144
         Gradient evaluations: 22
         Current function value: -11109.8864

         Current function value: -11109.886400
         Iterations: 53
         Function evaluations: 432
         Gradient evaluations: 70
         Current function value: 5.012589
         Iterations: 1
         Function evaluations: 173
         Gradient evaluations: 27
         Current function value: -11109.886400
         Iterations: 53
         Function evaluations: 432
         Gradient evaluations: 70
Divide-by-zero encountered: rhok assumed large
         Current function value: 5.043458
         Iterations: 2
         Function evaluations: 174
         Gradient evaluations: 27
         Current function value: -11109.886400
         Iterations: 53
         Function evaluations: 432
         Gradient evaluations: 70
         Current function value: 4.887684
         Iterations: 2
         Function evaluations: 150
         Gradient evaluations: 23
         Current function value: -11109.886400
         Iterations: 53
         Function evaluations: 432
         Gradient evaluat

         Gradient evaluations: 32
         Current function value: -11109.886400
         Iterations: 53
         Function evaluations: 432
         Gradient evaluations: 70
         Current function value: 4.907183
         Iterations: 2
         Function evaluations: 132
         Gradient evaluations: 20
         Current function value: -11109.886400
         Iterations: 53
         Function evaluations: 432
         Gradient evaluations: 70
         Current function value: 4.922014
         Iterations: 1
         Function evaluations: 131
         Gradient evaluations: 20
         Current function value: -11109.886400
         Iterations: 53
         Function evaluations: 432
         Gradient evaluations: 70
         Current function value: 4.967254
         Iterations: 1
         Function evaluations: 144
         Gradient evaluations: 22
         Current function value: -11109.886400
         Iterations: 53
         Function evaluations: 432
         Gradient evaluations: 70
    

         Function evaluations: 156
         Gradient evaluations: 24
         Current function value: -11109.886400
         Iterations: 53
         Function evaluations: 432
         Gradient evaluations: 70
         Current function value: 4.855746
         Iterations: 3
         Function evaluations: 181
         Gradient evaluations: 28
         Current function value: -11109.886400
         Iterations: 53
         Function evaluations: 432
         Gradient evaluations: 70
Divide-by-zero encountered: rhok assumed large
         Current function value: 4.960071
         Iterations: 3
         Function evaluations: 197
         Gradient evaluations: 31
         Current function value: -11109.886400
         Iterations: 53
         Function evaluations: 432
         Gradient evaluations: 70
         Current function value: 4.975969
         Iterations: 4
         Function evaluations: 200
         Gradient evaluations: 31
         Current function value: -11109.886400
         Iterat

In [17]:
resdata={}
y_hatTrainLinear=df_yhatLinearTrain.mean(axis=1)
reslinT=y_train-y_hatTrainLinear

y_hatTrainGarch=df_yhatGarchTrain.mean(axis=1)
resG=Y1Temp-y_hatTrainGarch

y_hatKalman=df_yhatKalman.mean(axis=1)
resK=Y1Temp-y_hatKalman


y_hatMA=df_yhatMA.mean(axis=1)
resMA=Y1Temp-y_hatMA

resdata={'Linear':reslinT,'Garch':resG,'Kalman':resK,'Moving Average':resMA}


In [18]:
def randomForest(datadict):
    data={'Algo':'','RMSE':None}
    for key,val in datadict.items():
        rmse=np.sqrt(((val) ** 2).mean())
        if(data['RMSE']==None):
            data['Algo']=key
            data['RMSE']=rmse
        elif data['RMSE']>rmse:
            data['Algo']=key
            data['RMSE']=rmse
    return data

In [19]:

rmse=np.sqrt(((reslinT) ** 2).mean())
rmse

2.8080678276157753

In [22]:
ranndomforestoutput=randomForest(resdata)

In [23]:
ranndomforestoutput

{'Algo': 'Linear', 'RMSE': 2.8080678276157753}