In [None]:
# import packages
import numpy as np
import pandas as pd
import time
import datetime

In [None]:
# Download a rolling 12-month window of daily S&P 500 index (^GSPC) data
# from Yahoo Finance. period1/period2 are the window bounds as integer
# epoch seconds computed from the local clock.
window_start = datetime.datetime.now() - datetime.timedelta(days=365)
period1 = int(time.mktime(window_start.timetuple()))
window_end = datetime.datetime.now()
period2 = int(time.mktime(window_end.timetuple()))
interval = '1d'
ticker = '^GSPC'
query_string = (
    'https://query1.finance.yahoo.com/v7/finance/download/'
    f'{ticker}?period1={period1}&period2={period2}&interval={interval}'
    '&events=history&includeAdjustedClose=true'
)
# read_csv fetches the URL directly and parses the CSV response.
df = pd.read_csv(query_string)

# Data Processing

In [None]:
# Add col Diff = Close - Open (the day's open-to-close move).
df['Diff'] = df['Close'] - df['Open']

# Add col Up: 1 if Diff >= 0, -1 if Diff < 0.
# Assign through .loc with a boolean mask. The chained form
# df['Up'][mask] = ... writes to an intermediate object, triggers
# SettingWithCopyWarning, and is not guaranteed to modify df.
df['Up'] = df['Diff']
df.loc[df['Diff'] >= 0, 'Up'] = 1
df.loc[df['Diff'] < 0, 'Up'] = -1

# Volume Normalization (MinMaxScaler): rescale Volume to the [0, 1] range.
# NOTE(review): the scaler is fit on every row, including rows that are later
# used as the test set -- confirm whether that look-ahead is acceptable.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[['Volume']] = scaler.fit_transform(df[['Volume']])

df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Up'][df['Diff']>=0] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Up'][df['Diff']<0] = -1


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Diff,Up
0,2021-12-22,4650.359863,4697.669922,4645.529785,4696.560059,4696.560059,0.236731,46.200196,1.0
1,2021-12-23,4703.959961,4740.740234,4703.959961,4725.790039,4725.790039,0.177066,21.830078,1.0
2,2021-12-27,4733.990234,4791.490234,4733.990234,4791.189941,4791.189941,0.156118,57.199707,1.0
3,2021-12-28,4795.490234,4807.02002,4780.040039,4786.350098,4786.350098,0.146965,-9.140136,-1.0
4,2021-12-29,4788.640137,4804.060059,4778.080078,4793.060059,4793.060059,0.184444,4.419922,1.0


# Dimension Reduction (PCA)

In [None]:
# PCA: fit on all five candidate feature columns with every component kept,
# then report the variance captured by each principal component and the
# running total of the explained-variance ratios.
from sklearn.decomposition import PCA
x = df[['Volume', 'Open', 'Adj Close', 'Low', 'High']]
pca = PCA().fit(x)  # fit() returns the fitted estimator itself
component_variance = pca.explained_variance_
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
print('explained variance of each variable:', component_variance)
print('cumulative explained variance ratio:', cumulative_ratio)

explained variance of each variable: [3.76550809e+05 1.37439820e+03 4.83707153e+02 1.03594630e+02
 1.27872029e-02]
cumulative explained variance ratio: [0.99481731 0.99844836 0.99972628 0.99999997 1.        ]


In [None]:
# The cumulative explained variance ratio of the first three principal
# components has reached 0.9997, so x is projected onto those three components.
# Each component is a linear combination of all five input columns; it is not
# one of the original columns (Volume, Open, Adj Close, ...).
pca = PCA(n_components=3)
# fit_transform fits the model and projects x in a single step, so a separate
# pca.fit(x) call beforehand would only repeat the same fit.
x_new = pca.fit_transform(x)
print('new x after transformation:', x_new)

new x after transformation: [[ 1.09803616e+03 -3.61422168e+01 -7.17999239e+00]
 [ 1.19024827e+03 -1.99030405e+01 -2.05788788e+01]
 [ 1.27828143e+03 -4.34226620e+01 -2.19186506e+00]
 [ 1.33752645e+03  6.18052951e-01 -2.88259384e+01]
 [ 1.33498999e+03 -9.10043467e+00 -2.81889801e+01]
 [ 1.33167279e+03  5.80138314e+00 -2.50714111e+01]
 [ 1.30007259e+03  2.34852687e-01 -3.33994955e+01]
 [ 1.31779392e+03 -1.84085196e+01 -1.72681384e+01]
 [ 1.34851145e+03  3.06732454e+00 -1.62193591e+01]
 [ 1.24582519e+03  6.16239704e+01  1.42457788e+01]
 [ 1.14585774e+03 -5.22962635e+00 -9.93746860e+00]
 [ 1.12573859e+03  1.01017816e+01 -1.71960023e+01]
 [ 1.04338933e+03 -1.27673281e+01  2.09131266e+01]
 [ 1.12023960e+03 -3.32865379e+01  1.07510129e+01]
 [ 1.20827758e+03 -2.98380579e+00 -1.78819077e+01]
 [ 1.14647881e+03  5.21108930e+01  1.22272760e+01]
 [ 1.04336753e+03 -2.17589445e+01 -9.30365642e+00]
 [ 9.58181955e+02  3.58245769e+01 -7.44101696e+00]
 [ 8.84249624e+02  3.82671201e+01  3.06955635e+00]
 [ 

In [None]:
# Chronological split: roughly the first 80% of rows train, the rest test.
# Labels are shifted one row ahead of the features, so the feature row at
# position t is paired with the Up value at position t + 1. The last feature
# row is dropped from the test set because it has no following label.
n_rows = df.shape[0]
cut = int(n_rows * 0.8)
y = np.array(df['Up'])

first_test_row = cut + 1
train_x, test_x = x_new[:first_test_row, :], x_new[first_test_row:-1, :]
train_y, test_y = y[1:first_test_row + 1], y[first_test_row + 1:]

print(train_x.shape, train_y.shape, test_x.shape, test_y.shape)

(202, 3) (202,) (49, 3) (49,)


# Define and Train Different SVM Models

In [None]:
from sklearn import svm
n = len(test_y)  # number of test samples; reused by the later kernel cells

# SVM with a linear kernel: fit on the training split, predict the test split,
# and report the fraction of predictions that equal the true label.
lsvm = svm.SVC(kernel = 'linear')
lsvm.fit(train_x, train_y)
l_p = lsvm.predict(test_x)
count = sum(1 for i in range(n) if l_p[i] == test_y[i])
print('accuracy:', count/n)

accuracy: 0.5510204081632653


In [None]:
# SVM with a sigmoid kernel: fit on the training split, predict the test split,
# and report the fraction of predictions that equal the true label.
ssvm = svm.SVC(kernel = 'sigmoid')
ssvm.fit(train_x, train_y)
# Predict with the sigmoid model itself. Using lsvm here would silently
# re-report the linear model's accuracy under the sigmoid heading.
s_p = ssvm.predict(test_x)
count = 0
for i in range(n):
    if s_p[i] == test_y[i]:
        count += 1
print('accuracy:', count/n)

accuracy: 0.5510204081632653


In [None]:
# SVM with a polynomial kernel: fit on the training split, predict the test
# split, and report the fraction of predictions that equal the true label.
psvm = svm.SVC(kernel = 'poly')
psvm.fit(train_x, train_y)
p_p = psvm.predict(test_x)
hit_positions = [i for i in range(n) if p_p[i] == test_y[i]]
count = len(hit_positions)
print('accuracy:', count/n)

accuracy: 0.5306122448979592


In [None]:
# SVM with an RBF kernel: fit on the training split, predict the test split,
# and report the fraction of predictions that equal the true label.
rsvm = svm.SVC(kernel = 'rbf')
rsvm.fit(train_x, train_y)
r_p = rsvm.predict(test_x)
count = 0
for idx in range(n):
    # int(...) turns the comparison result into 1 (match) or 0 (miss).
    count += int(r_p[idx] == test_y[idx])
print('accuracy:', count/n)

accuracy: 0.5714285714285714
