Stock Prediction Model with VTI and Gold

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import plot

#for offline plotting
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [2]:
# Loading ETF historical pricing from research paper 10/5/18 to 10/4/21
VTI = pd.read_csv('VTI_updated_dataset_original_range.csv')
VTI.head()


Unnamed: 0,Date,VTI,Gold,WTI
0,10/5/2018,136.84,1202.45,74.26
1,10/8/2018,136.74,1187.62,74.27
2,10/9/2018,136.5,1189.17,74.95
3,10/10/2018,132.15,1194.36,73.18
4,10/11/2018,129.34,1223.73,70.97


In [3]:
#overview of the data
VTI.info()
VTI.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    782 non-null    object 
 1    VTI    754 non-null    float64
 2    Gold   780 non-null    float64
 3    WTI    749 non-null    float64
dtypes: float64(3), object(1)
memory usage: 24.6+ KB


Unnamed: 0,VTI,Gold,WTI
count,754.0,780.0,749.0
mean,162.033833,1610.008192,53.21231
std,31.654511,234.605129,13.311075
min,106.35,1187.62,-36.98
25%,137.7,1414.0175,45.15
50%,151.25,1672.08,55.63
75%,187.6775,1809.025,61.82
max,228.15,2063.19,77.68


In [4]:
print(VTI.columns)

Index(['Date', ' VTI ', ' Gold ', ' WTI '], dtype='object')


In [5]:
VTI[VTI.columns[1:]].corr()[' VTI '][:]

 VTI      1.000000
 Gold     0.706281
 WTI      0.460727
Name:  VTI , dtype: float64

In [6]:
#date range of the data
VTI['Date'] = pd.to_datetime(VTI['Date'])
print(f'Dataframe contains stock prices between {VTI.Date.min()} {VTI.Date.max()}') 
print(f'Total days = {(VTI.Date.max()  - VTI.Date.min()).days} days')

Dataframe contains stock prices between 2018-10-05 00:00:00 2021-10-04 00:00:00
Total days = 1095 days


In [7]:
# Plotting Historical performance of the ETF
layout = go.Layout(
    title='VTI ETF Price',
    xaxis=dict(
        title='Date',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='Price USD',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)

VTI_data = [{'x':VTI['Date'], 'y':VTI[' VTI ']}]
plot = go.Figure(data=VTI_data, layout=layout)


iplot(plot)

In [8]:
# Building the regression model
from sklearn.model_selection import train_test_split

#For preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

#For model evaluation
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_percentage_error as mape

In [9]:
#Split the data into train and test sets
#df.drop(['C', 'D'], axis=1)
VTI = VTI.dropna()
#X = VTI['.drop([ 'Date',' VTI '], axis=1)']  # Data Matrix containing all features excluding the target and Date
#X = VTI[' Gold ']
X = np.array(VTI[' Gold ']).reshape(-1,1)
Y = VTI[' VTI '] # 1D targer vector
print(X.shape)
print(Y.shape)
#X = np.array(VTI.index).reshape(-1,1)
#Y = VTI['Adj Close']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=101)

(748, 1)
(748,)


In [10]:
# Feature scaling
scaler = StandardScaler().fit(X_train)

In [11]:
# First Model - Linear Regression
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X_train, Y_train)

#Plot actual and predicted values for train dataset
trace0 = go.Scatter(
    x = X_train.T[0],
    y = Y_train,
    mode = 'markers',
    name = 'Actual'
)
trace1 = go.Scatter(
    x = X_train.T[0],
    y = lm.predict(X_train).T,
    mode = 'markers',
    name = 'Predicted'
)
VTI_data = [trace0,trace1]
layout.xaxis.title.text = 'Gold ($/oz)'
plot2 = go.Figure(data=VTI_data, layout=layout)

iplot(plot2)

In [12]:
#Calculate scores for model evaluation
import math 
scores = f'''
{'Metric'.ljust(10)}{'Train'.center(20)}{'Test'.center(20)}
{'RMSE'.ljust(10)}{np.sqrt(mse(Y_train, lm.predict(X_train)))}\t{np.sqrt(mse(Y_test, lm.predict(X_test)))}
{'MAPE'.ljust(10)}{mape(Y_train, lm.predict(X_train))}\t{mape(Y_test, lm.predict(X_test))}
'''
print(scores)


Metric           Train                Test        
RMSE      22.53976030786947	22.200341493523418
MAPE      0.1014005145134224	0.10163995270650904



In [13]:
# Second Model - Logistic Regression
from sklearn.linear_model import LogisticRegression

#To avoid "continuous" error for Logistic Regression and KNN, convert data
from sklearn import preprocessing
from sklearn import utils
lab_enc = preprocessing.LabelEncoder()
training_scores_encoded = lab_enc.fit_transform(Y_train)

clf = LogisticRegression()
clf.fit(X_train, training_scores_encoded)


#Plot actual and predicted values for train dataset
trace0 = go.Scatter(
    x = X_train.T[0],
    y = Y_train,
    mode = 'markers',
    name = 'Actual'
)
trace1 = go.Scatter(
    x = X_train.T[0],
    y = clf.predict(X_train).T,
    mode = 'markers',
    name = 'Predicted'
)
VTI_data = [trace0,trace1]
layout.xaxis.title.text = 'Gold ($/oz)'
plot2 = go.Figure(data=VTI_data, layout=layout)

iplot(plot2)




In [14]:
#Calculate scores for model evaluation
import math 
scores = f'''
{'Metric'.ljust(10)}{'Train'.center(20)}{'Test'.center(20)}
{'RMSE'.ljust(10)}{np.sqrt(mse(Y_train, clf.predict(X_train)))}\t{np.sqrt(mse(Y_test, clf.predict(X_test)))}
{'MAPE'.ljust(10)}{mape(Y_train, clf.predict(X_train))}\t{mape(Y_test, clf.predict(X_test))}
'''
print(scores)




Metric           Train                Test        
RMSE      204.21493987846534	197.0151667088941
MAPE      0.9710792373967044	0.9273634113464982



In [15]:
# Third Modal - KNN
from sklearn import datasets
from sklearn.preprocessing import StandardScaler, scale
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve, classification_report
#%%time

#warnings.filterwarnings('ignore')

# The param_grid tells Scikit-Learn to evaluate all combinations of the hyperparameter values
param_grid = {'n_neighbors': np.arange(1,50), 'p': [1, 2, 10, 50, 100, 500, 1000], 
              'weights': ["uniform", "distance"]}

knn_clf = KNeighborsClassifier()

knn_cv = GridSearchCV(knn_clf, param_grid, scoring='f1', cv=2, verbose=3, n_jobs=-1)
knn_cv.fit(X_train, training_scores_encoded)


params_optimal_knn = knn_cv.best_params_

knn = KNeighborsClassifier(**params_optimal_knn)

knn.fit(X_train, training_scores_encoded)

y_train_predicted = knn.predict(X_train)

#Plot actual and predicted values for train dataset
trace0 = go.Scatter(
    x = X_train.T[0],
    y = Y_train,
    mode = 'markers',
    name = 'Actual'
)
trace1 = go.Scatter(
    x = X_train.T[0],
    y = knn.predict(X_train).T,
    mode = 'markers',
    name = 'Predicted'
)
VTI_data = [trace0,trace1]
layout.xaxis.title.text = 'Gold ($/oz)'
plot2 = go.Figure(data=VTI_data, layout=layout)

iplot(plot2)



Fitting 2 folds for each of 686 candidates, totalling 1372 fits



The least populated class in y has only 1 members, which is less than n_splits=2.


One or more of the test scores are non-finite: [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan 

In [16]:
#Calculate scores for model evaluation
import math 
scores = f'''
{'Metric'.ljust(10)}{'Train'.center(20)}{'Test'.center(20)}
{'RMSE'.ljust(10)}{np.sqrt(mse(Y_train, knn.predict(X_train)))}\t{np.sqrt(mse(Y_test, knn.predict(X_test)))}
{'MAPE'.ljust(10)}{mape(Y_train, knn.predict(X_train))}\t{mape(Y_test, knn.predict(X_test))}
'''
print(scores)


Metric           Train                Test        
RMSE      176.8694358988685	183.3402641174782
MAPE      0.8312792936380314	0.8772784492969083



In [17]:
# Fourth Model - Gaussian NB

#%%time

#warnings.filterwarnings('ignore')
from sklearn.naive_bayes import GaussianNB, MultinomialNB
param_grid = {'var_smoothing': [0.001, 0.01, 0.1, 1.0]}

gnb = GaussianNB()

gnb = GridSearchCV(gnb, param_grid, scoring='accuracy', cv=2, verbose=1, n_jobs=-1)
gnb.fit(X_train, training_scores_encoded)

params_optimal = gnb.best_params_

gaussianNB_clf = GaussianNB(**params_optimal)

gaussianNB_clf.fit(X_train, training_scores_encoded) 

y_test_predicted = gaussianNB_clf.predict(X_test)

#Plot actual and predicted values for train dataset
trace0 = go.Scatter(
    x = X_train.T[0],
    y = Y_train,
    mode = 'markers',
    name = 'Actual'
)
trace1 = go.Scatter(
    x = X_train.T[0],
    y = gaussianNB_clf.predict(X_train).T,
    mode = 'markers',
    name = 'Predicted'
)
VTI_data = [trace0,trace1]
layout.xaxis.title.text = 'Gold ($/oz)'
plot2 = go.Figure(data=VTI_data, layout=layout)

iplot(plot2)

Fitting 2 folds for each of 4 candidates, totalling 8 fits



The least populated class in y has only 1 members, which is less than n_splits=2.



In [18]:
#Calculate scores for model evaluation
import math 
scores = f'''
{'Metric'.ljust(10)}{'Train'.center(20)}{'Test'.center(20)}
{'RMSE'.ljust(10)}{np.sqrt(mse(Y_train, gaussianNB_clf.predict(X_train)))}\t{np.sqrt(mse(Y_test, gaussianNB_clf.predict(X_test)))}
{'MAPE'.ljust(10)}{mape(Y_train, gaussianNB_clf.predict(X_train))}\t{mape(Y_test, gaussianNB_clf.predict(X_test))}
'''
print(scores)


Metric           Train                Test        
RMSE      174.8291583595512	172.9754739262188
MAPE      0.8144440684169445	0.8090186678195345

