In [1]:
from functions import *
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, \
QuadraticDiscriminantAnalysis
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns

#### GME

In [2]:
stock = 'gme'

In [3]:
stock_data = data_loading_processing(stock)[2]

In [4]:
# Train on the 2020 weeks and evaluate on the 2021 weeks.
in_2020 = stock_data.Year == 2020
in_2021 = stock_data.Year == 2021
feature_columns = ['mean_return', 'volatility']

train_X = stock_data.loc[in_2020, feature_columns]
test_X = stock_data.loc[in_2021, feature_columns]
# Flatten the single-column label frames into 1-D arrays for scikit-learn.
train_y = stock_data.loc[in_2020, ['Label']].values.ravel()
test_y = stock_data.loc[in_2021, ['Label']].values.ravel()

In [5]:
qda = QuadraticDiscriminantAnalysis(store_covariance=True)
lda = LinearDiscriminantAnalysis(store_covariance=True)

In [6]:
lda.fit(train_X,train_y)
lda.intercept_, lda.coef_, lda.means_

(array([0.43428967]),
 array([[-0.6935597 , -0.02651876]]),
 array([[ 3.10585179,  7.251883  ],
        [-1.986014  ,  4.76083117]]))

###### Verification of the meaning of sklearn discriminant analysis attributes.  

In [7]:
# Split the 2020 (training) weeks by their label.
is_2020 = stock_data.Year == 2020
red_train = stock_data.loc[(stock_data.Label == 'Red') & is_2020]
green_train = stock_data.loc[(stock_data.Label == 'Green') & is_2020]


In [8]:
features = ['mean_return','volatility']

In [9]:
r_cov = red_train.loc[:,['mean_return','volatility']].cov().values

In [10]:
g_cov = green_train.loc[:,['mean_return','volatility']].cov().values

In [11]:
train_X.cov()

Unnamed: 0,mean_return,volatility
mean_return,13.672057,6.194118
volatility,6.194118,15.964541


qda.covariance_ corresponds to the unbiased sample covariance matrix of each class (Green first, then Red).

In [12]:
(g_cov,r_cov)

(array([[ 9.05321199,  6.30429963],
        [ 6.30429963, 23.2240902 ]]),
 array([[ 5.16987056, -0.65199185],
        [-0.65199185,  5.04784942]]))

lda.covariance_ corresponds to the prior-weighted sum of the biased (divide-by-n) per-class covariance estimates.

In [13]:
# g_cov / r_cov come from pandas' .cov(); the (n - 1) / n factor rescales each
# of them to the biased estimate before weighting by the class prior.
n_green = green_train.shape[0]
n_red = red_train.shape[0]
green_term = lda.priors_[0] * g_cov * (n_green - 1) / n_green
red_term = lda.priors_[1] * r_cov * (n_red - 1) / n_red
np.allclose(green_term + red_term, lda.covariance_)

True

lda.priors_ corresponds to the prior probability of each class.

In [14]:
lda.priors_ 

array([0.52830189, 0.47169811])

In [15]:
def decision_function_explicit(X, means, cov, priors, labels=('Green', 'Red')):
    """Classify samples with hand-computed Gaussian discriminant scores.

    For each class k the score of a sample x is

        -1/2 (x - mu_k)^T Sigma_k^-1 (x - mu_k) + log(pi_k) - 1/2 log|Sigma_k|

    and the sample is assigned to the class with the largest score.
    cov entries will be the same for lda; per-class matrices give the
    quadratic rule.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like, one sample per row.
    means : sequence of per-class mean vectors, ordered like ``labels``.
    cov : sequence of per-class covariance matrices, ordered like ``labels``.
    priors : sequence of per-class prior probabilities, ordered like ``labels``.
    labels : class names used for the predictions.  Only the first
        ``len(labels)`` entries of ``means``, ``cov`` and ``priors`` are used.

    Returns
    -------
    preds : list
        Predicted label for every row of ``X``.
    posteriors : list of tuple
        Per-row tuple of class scores, in ``labels`` order.
    """
    # np.asarray accepts DataFrames as well as plain arrays / nested lists,
    # so callers no longer need to wrap arrays in a DataFrame.
    samples = np.asarray(X)
    n_classes = len(labels)

    # These terms depend only on the class, not on the sample: compute them
    # once instead of inverting the covariance for every row.
    precisions = [np.linalg.inv(cov[k]) for k in range(n_classes)]
    log_priors = [np.log(priors[k]) for k in range(n_classes)]
    half_log_dets = [1/2*np.log(np.linalg.det(cov[k])) for k in range(n_classes)]

    preds = []
    posteriors = []
    for x in samples:
        scores = []
        for k in range(n_classes):
            centred = x - means[k]
            scores.append(-1/2*(centred @ precisions[k] @ centred)
                          + log_priors[k] - half_log_dets[k])

        # Keep the earlier class unless a later one scores strictly higher,
        # i.e. ties go to the first label ('Green' by default).
        best = 0
        for k in range(1, n_classes):
            if not scores[best] >= scores[k]:
                best = k

        preds.append(labels[best])
        posteriors.append(tuple(scores))
    return preds, posteriors

In [16]:
cov_lda = [lda.covariance_,lda.covariance_]

In [17]:
# `preds` holds the (labels, posteriors) pair returned by the explicit rule;
# preds[0] is expected to equal lda.predict(test_X).
preds = decision_function_explicit(test_X,lda.means_,cov_lda,
                                 lda.priors_)

In [18]:
lda.predict(test_X) == preds[0]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

In [19]:
# 1000 random 2-D points with each coordinate in (-10, 10): a magnitude
# r*10 times a signed factor (r'*2 - 1).  The generator is seeded so the
# agreement rates computed from these points are reproducible on re-run.
rng = np.random.default_rng(0)
r1, r2, r3, r4 = rng.random((4, 1000))
x_experimental = np.column_stack([r1*10*(r2*2-1), r3*10*(r4*2-1)])

In [20]:
qda.fit(train_X,train_y)

In [21]:
manual_lda_labels = decision_function_explicit(
    pd.DataFrame(x_experimental), lda.means_, cov_lda, lda.priors_
)[0]
# Fraction of the random points on which the explicit rule agrees with sklearn.
np.mean(manual_lda_labels == lda.predict(x_experimental))



0.999

Out of 1000 random points, roughly one or two of the predictions computed from the lda attributes differ from the predictions returned by lda.predict, depending on the random seed.

In [22]:
manual_qda_labels = decision_function_explicit(
    pd.DataFrame(x_experimental), qda.means_, qda.covariance_, qda.priors_
)[0]
# Fraction of the random points on which the explicit rule agrees with sklearn.
np.mean(manual_qda_labels == qda.predict(x_experimental))



1.0

In [23]:
qda.covariance_

[array([[ 9.05321199,  6.30429963],
        [ 6.30429963, 23.2240902 ]]),
 array([[ 5.16987056, -0.65199185],
        [-0.65199185,  5.04784942]])]

In [24]:
# Select the points (if any) where sklearn's QDA and the explicit rule disagree.
explicit_qda_labels = decision_function_explicit(
    pd.DataFrame(x_experimental), qda.means_, qda.covariance_, qda.priors_
)[0]
disagreement_mask = qda.predict(x_experimental) != explicit_qda_labels
x_experimental[disagreement_mask]



array([], shape=(0, 2), dtype=float64)

Below we see that lda.coef_ @ features + lda.intercept_ gives the log odds that the features correspond to class 'Red'.

In [25]:
# Linear score coef_ . x + intercept_ for every row at once; a positive
# score is mapped to 'Red', anything else to 'Green'.
log_odds_red = (x_experimental @ lda.coef_.T + lda.intercept_).ravel()
np.mean(np.where(log_odds_red > 0, 'Red', 'Green') == lda.predict(x_experimental))



1.0

lda.means_ corresponds to the means of the features for the respective classes

In [26]:
lda.means_

array([[ 3.10585179,  7.251883  ],
       [-1.986014  ,  4.76083117]])

In [27]:
red_train.mean_return.mean(), red_train.volatility.mean()

(-1.9860140000000002, 4.760831174875219)

In [28]:
green_train.mean_return.mean(), green_train.volatility.mean()

(3.105851785714285, 7.2518829997933265)

##### Accuracy

In [29]:
preds_linear = lda.predict(test_X)
preds_quad = qda.predict(test_X)

In [30]:
accuracy_score(test_y,preds_linear)

0.9038461538461539

In [31]:
accuracy_score(test_y,preds_quad)

0.8076923076923077

LDA gives better accuracy than QDA on the 2021 test set (about 90.4% vs. 80.8%).

##### Confusion Matrix

In [32]:
cm_linear = confusion_matrix(test_y,preds_linear)
cm_quad = confusion_matrix(test_y,preds_quad)

In [33]:
(cm_linear, cm_quad)

(array([[17,  1],
        [ 4, 30]]),
 array([[16,  2],
        [ 8, 26]]))

In [34]:
tpr(cm_linear), tnr(cm_linear)

(0.88, 0.94)

In [35]:
tpr(cm_quad), tnr(cm_quad)

(0.76, 0.89)

In [36]:
# Store the 0/1 encoding under its own name: overwriting `preds_linear`
# would make a second run of this cell compare integers with 'Green' and
# no longer find any 'Green' entries.
signals_linear = np.where(preds_linear == 'Green', 1, 0)
trade_labels(stock_data,2021,signals_linear)

49767.72

In [37]:
# Store the 0/1 encoding under its own name: overwriting `preds_quad`
# would make a second run of this cell compare integers with 'Green' and
# no longer find any 'Green' entries.
signals_quad = np.where(preds_quad == 'Green', 1, 0)
trade_labels(stock_data,2021,signals_quad)

8032.21

In [38]:
stock_data

Unnamed: 0,Year,Year_Week,Begin_Price,Adj Close,Return,Label,Week_Number,mean_return,volatility
0,2017,2017-01,6.36,4.99,0.784591,NoLabel,1,-0.82525,2.466974
1,2017,2017-02,4.99,4.62,0.925852,NoLabel,2,-1.44700,3.705994
2,2017,2017-03,4.62,4.79,1.036797,NoLabel,3,0.92975,1.424463
3,2017,2017-04,4.79,4.94,1.031315,NoLabel,4,0.61380,0.731913
4,2017,2017-05,4.94,5.03,1.018219,NoLabel,5,0.35540,1.023493
...,...,...,...,...,...,...,...,...,...
258,2021,2021-48,49.93,43.1,0.863208,Red,48,-2.83240,4.045923
259,2021,2021-49,43.1,39.75,0.922274,Red,49,-1.44300,6.241354
260,2021,2021-50,39.75,38.91,0.978868,Red,50,-0.09220,8.972470
261,2021,2021-51,38.91,38.03,0.977384,Red,51,-0.55650,1.666374


In [39]:
buy_and_hold(stock_data,2021)

807.43

#### SPY

In [40]:
stock = 'spy'

In [41]:
stock_data = data_loading_processing(stock)[2]

In [42]:
# Train on the 2020 weeks and evaluate on the 2021 weeks.
in_2020 = stock_data.Year == 2020
in_2021 = stock_data.Year == 2021
feature_columns = ['mean_return', 'volatility']

train_X = stock_data.loc[in_2020, feature_columns]
test_X = stock_data.loc[in_2021, feature_columns]
# Flatten the single-column label frames into 1-D arrays for scikit-learn.
train_y = stock_data.loc[in_2020, ['Label']].values.ravel()
test_y = stock_data.loc[in_2021, ['Label']].values.ravel()

In [43]:
head_tail(stock_data)

Unnamed: 0,Year,Year_Week,Begin_Price,Adj Close,Return,Label,Week_Number,mean_return,volatility
0,2017,2017-01,225.04,202.38,0.899307,NoLabel,1,0.2185,0.314887
1,2017,2017-02,202.38,202.24,0.999308,NoLabel,2,-0.0136,0.275665
261,2021,2021-51,447.56,458.0,1.023326,Green,51,0.58375,1.198328
262,2021,2021-52,458.0,463.41,1.011812,Green,52,0.29625,0.763861


In [44]:
stock

'spy'

In [53]:
qda = QuadraticDiscriminantAnalysis(store_covariance=True)
lda = LinearDiscriminantAnalysis(store_covariance=True)

In [54]:
# Fit both discriminant models on the same training features and labels.
lda.fit(train_X,train_y)
qda.fit(train_X,train_y)
# Leftover inspection snippet (not executed):
# (lda.coef_, qda.decision_function())

In [55]:
preds_linear = lda.predict(test_X)
preds_quad = qda.predict(test_X)

In [58]:
qda.priors_, qda.means_,qda.covariance_

(array([0.54716981, 0.45283019]),
 array([[ 0.64366897,  1.15382074],
        [-0.55140208,  1.89994944]]),
 [array([[0.37190062, 0.37929828],
         [0.37929828, 1.2288957 ]]),
  array([[ 0.58649164, -0.96430721],
         [-0.96430721,  3.18295322]])])

[array([[0.37190062, 0.37929828],
        [0.37929828, 1.2288957 ]]),
 array([[ 0.58649164, -0.96430721],
        [-0.96430721,  3.18295322]])]

##### Accuracy

In [51]:
accuracy_score(test_y,preds_linear)

0.9038461538461539

In [52]:
accuracy_score(test_y,preds_quad)

0.8076923076923077

LDA gives better accuracy than QDA on the 2021 test set (about 90.4% vs. 80.8%).

##### Confusion Matrix

In [53]:
cm_linear = confusion_matrix(test_y,preds_linear)
cm_quad = confusion_matrix(test_y,preds_quad)

In [54]:
(cm_linear, cm_quad)

(array([[31,  0],
        [ 5, 16]]),
 array([[31,  0],
        [10, 11]]))

In [55]:
tpr(cm_linear), tnr(cm_linear)

(0.76, 1.0)

In [56]:
tpr(cm_quad), tnr(cm_quad)

(0.52, 1.0)

In [57]:
# Store the 0/1 encoding under its own name: overwriting `preds_linear`
# would make a second run of this cell compare integers with 'Green' and
# no longer find any 'Green' entries.
signals_linear = np.where(preds_linear == 'Green', 1, 0)
trade_labels(stock_data,2021,signals_linear)

163.88

In [58]:
# Store the 0/1 encoding under its own name: overwriting `preds_quad`
# would make a second run of this cell compare integers with 'Green' and
# no longer find any 'Green' entries.
signals_quad = np.where(preds_quad == 'Green', 1, 0)
trade_labels(stock_data,2021,signals_quad)

158.74

In [59]:
stock_data

Unnamed: 0,Year,Year_Week,Begin_Price,Adj Close,Return,Label,Week_Number,mean_return,volatility
0,2017,2017-01,225.04,202.38,0.899307,NoLabel,1,0.21850,0.314887
1,2017,2017-02,202.38,202.24,0.999308,NoLabel,2,-0.01360,0.275665
2,2017,2017-03,202.24,201.97,0.998665,NoLabel,3,-0.03350,0.382903
3,2017,2017-04,201.97,203.95,1.009803,NoLabel,4,0.19700,0.517220
4,2017,2017-05,203.95,204.28,1.001618,NoLabel,5,0.03320,0.463586
...,...,...,...,...,...,...,...,...,...
258,2021,2021-48,445.12,439.74,0.987913,Red,48,-0.23360,1.529530
259,2021,2021-49,439.74,456.54,1.038204,Green,49,0.75660,1.028056
260,2021,2021-50,456.54,447.56,0.98033,Red,50,-0.39140,1.100609
261,2021,2021-51,447.56,458.0,1.023326,Green,51,0.58375,1.198328


In [60]:
buy_and_hold(stock_data,2021)

127.54