In [44]:
import numpy as np
import pandas as pd
import sklearn.linear_model as skl
from sklearn.model_selection import train_test_split, cross_val_score
import statsmodels.api as sm
import scipy.stats as ss
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
# import cupy as cp

In [45]:
df = pd.read_csv('./preprocessed_stock_data.csv').dropna().set_index('Date')
pd.set_option('display.width', 1000)  # Set maximum display width
pd.set_option('display.max_columns', None)  # Show all columns

describe_output = df.describe().transpose()
print(describe_output.to_string())


KeyError: "None of ['Date'] are in the columns"

### z-score normalization

In [None]:
from scipy.stats import zscore
import pandas as pd

In [None]:
df_nor = zscore(df)

In [None]:
y_train = df['NVDA']
x = df_nor.drop(['NVDA'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y_train, test_size=0.2, random_state=0)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

# Add constant for intercept
x_with_const = add_constant(x_train)

# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data["Feature"] = x_with_const.columns
vif_data["VIF"] = [variance_inflation_factor(x_with_const.values, i) for i in range(x_with_const.shape[1])]

print(vif_data.transpose().to_string())

# Drop features that are likely causing multicollinearity
x_train_reduced = x_train.drop(['SMA', 'EMA', 'NASDAQCOM', 'SP500'], axis=1)
x_test_reduced = x_test.drop(['SMA', 'EMA', 'NASDAQCOM', 'SP500'], axis=1)

# Add constant for intercept
x_with_const = add_constant(x_train_reduced)

# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data["Feature"] = x_with_const.columns
vif_data["VIF"] = [variance_inflation_factor(x_with_const.values, i) for i in range(x_with_const.shape[1])]

print(vif_data.transpose().to_string())

# OLS
x_train_reduced = sm.add_constant(x_train_reduced)
benchmark_prep = sm.OLS(y_train,x_train_reduced).fit()
benchmark_prep.summary()
benchmark_select = x_train_reduced.columns[np.abs(benchmark_prep.tvalues)>=1.96]
x = x_train_reduced[benchmark_select]
benchmark = sm.OLS(y_train,x).fit()
print(benchmark.summary())
y_hat_benchmark1 = benchmark.predict(x)
corr_benchmark1 = ss.pearsonr(y_hat_benchmark1, y_train)[0]
print('benchmark: corr (Y, Y_pred) = '+str(corr_benchmark1))
print('Hard Thresholding selected ' +str(len(benchmark_select)) +' features: ', benchmark_select.values)

(200, 19) (51, 19) (200,) (51,)
               0           1            2          3          4          5          6          7          8         9         10        11        12        13        14         15         16           17           18         19
Feature     const         SMA          EMA       AAPL        AMD       AVGO       INTC       QCOM  ADS_Index    Mkt-RF       SMB       HML       RMW       CMA        RF   CBBTCUSD   CBETHUSD        SP500    NASDAQCOM       DJIA
VIF      1.023709  683.267089  1087.815919  19.090883  20.204097  58.490461  12.955896  39.307177   2.620873  1.468869  2.531297  1.803322  1.966615  1.238401  2.439675  61.290274  54.343314  1772.999897  1313.153114  136.78701
               0         1          2          3         4          5          6         7         8        9         10        11        12         13         14         15
Feature     const      AAPL        AMD       AVGO      INTC       QCOM  ADS_Index    Mkt-RF       SMB      HML

In [None]:
# Ridge
a = 0.5
ridge_prep = skl.Ridge(alpha=a, fit_intercept=False).fit(x_train_reduced, y_train)
ridge_select = x_train_reduced.columns[np.abs(ridge_prep.coef_)>=0.001]
x = x_train_reduced[ridge_select]
ridge = sm.OLS(y_train,x).fit()
print(ridge.summary())
y_pred_ridge = ridge.predict(x)
corr_ridge = ss.pearsonr(y_pred_ridge, y_train)[0]
print('model 2 Ridge Regression: corr (Y, Y_pred) = '+str(corr_ridge))
print('Ridge Regression selected ' +str(len(ridge_select)) +' features: ', ridge_select.values)

                            OLS Regression Results                            
Dep. Variable:                   NVDA   R-squared:                       0.982
Model:                            OLS   Adj. R-squared:                  0.980
Method:                 Least Squares   F-statistic:                     659.1
Date:                Tue, 10 Dec 2024   Prob (F-statistic):          4.31e-151
Time:                        11:54:57   Log-Likelihood:                -569.14
No. Observations:                 200   AIC:                             1170.
Df Residuals:                     184   BIC:                             1223.
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         79.3022      0.310    255.977      0.0

In [None]:
# Lasso
a = 0.5
lasso_prep = skl.Lasso(alpha=a, fit_intercept=False).fit(x_train_reduced, y_train)
lasso_select = x_train_reduced.columns[np.abs(lasso_prep.coef_)!=0.0]
x = x_train_reduced[lasso_select]
lasso = sm.OLS(y_train,x).fit()
print(lasso.summary())
y_pred_lasso = lasso.predict(x)
corr_lasso = ss.pearsonr(y_pred_lasso, y_train)[0]
print('model 3 LASSO: corr (Y, Y_pred) = '+str(corr_lasso))
print('LASSO selected ' +str(len(lasso_select)) +' features: ', lasso_select.values)

                            OLS Regression Results                            
Dep. Variable:                   NVDA   R-squared:                       0.979
Model:                            OLS   Adj. R-squared:                  0.978
Method:                 Least Squares   F-statistic:                     1485.
Date:                Tue, 10 Dec 2024   Prob (F-statistic):          1.46e-158
Time:                        11:54:57   Log-Likelihood:                -584.04
No. Observations:                 200   AIC:                             1182.
Df Residuals:                     193   BIC:                             1205.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         79.3110      0.324    244.947      0.0

In [None]:
# Elastic Net
a = 0.5
elastic_prep = skl.ElasticNet(alpha=a, fit_intercept=False).fit(x_train_reduced, y_train)
elastic_select = x_train_reduced.columns[np.abs(elastic_prep.coef_)!=0.0]
x = x_train_reduced[elastic_select]
elastic = sm.OLS(y_train,x).fit()
print(elastic.summary())
y_pred_elastic = elastic.predict(x)
corr_elastic = ss.pearsonr(y_pred_elastic, y_train)[0]
print('model 4 Elastic Net: corr (Y, Y_pred) = '+str(corr_elastic))
print('ElasticNet selected ' +str(len(elastic_select)) +' features: ', elastic_select.values)

                            OLS Regression Results                            
Dep. Variable:                   NVDA   R-squared:                       0.982
Model:                            OLS   Adj. R-squared:                  0.980
Method:                 Least Squares   F-statistic:                     767.6
Date:                Tue, 10 Dec 2024   Prob (F-statistic):          6.43e-154
Time:                        11:54:57   Log-Likelihood:                -569.29
No. Observations:                 200   AIC:                             1167.
Df Residuals:                     186   BIC:                             1213.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         79.3105      0.308    257.549      0.0

In [None]:
# Least Angle
leastAngle_prep = skl.Lars().fit(x_train_reduced, y_train)
leastAngle_select = x_train_reduced.columns[leastAngle_prep.coef_>=0.001]
x = x_train_reduced[leastAngle_select]
leastAngle = sm.OLS(y_train,x).fit()
print(leastAngle.summary())
y_pred_leastAngle = leastAngle.predict(x)
corr_leastAngle = ss.pearsonr(y_pred_leastAngle, y_train)[0]
print('model 1 LARS: corr (Y, Y_pred) = '+str(corr_leastAngle))
print('LARS selected ' +str(len(leastAngle_select)) +' features: ', leastAngle_select.values + '\n')

                                 OLS Regression Results                                
Dep. Variable:                   NVDA   R-squared (uncentered):                   0.113
Model:                            OLS   Adj. R-squared (uncentered):              0.086
Method:                 Least Squares   F-statistic:                              4.129
Date:                Tue, 10 Dec 2024   Prob (F-statistic):                    0.000632
Time:                        11:54:57   Log-Likelihood:                         -1160.0
No. Observations:                 200   AIC:                                      2332.
Df Residuals:                     194   BIC:                                      2352.
Df Model:                           6                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [None]:
print(f"OLS: {benchmark_select.values}")
print(f"ridge: {ridge_select.values}")
print(f"lasso: {lasso_select.values}")
print(f"elastic net: {elastic_select.values}")
print(f"LAS: {leastAngle_select.values}")


OLS: ['const' 'AAPL' 'AVGO' 'INTC' 'QCOM' 'RF' 'CBBTCUSD' 'CBETHUSD']
ridge: ['const' 'AAPL' 'AMD' 'AVGO' 'INTC' 'QCOM' 'ADS_Index' 'Mkt-RF' 'SMB'
 'HML' 'RMW' 'CMA' 'RF' 'CBBTCUSD' 'CBETHUSD' 'DJIA']
lasso: ['const' 'AAPL' 'AVGO' 'INTC' 'QCOM' 'RF' 'CBETHUSD']
elastic net: ['const' 'AAPL' 'AMD' 'AVGO' 'INTC' 'QCOM' 'ADS_Index' 'Mkt-RF' 'HML'
 'RMW' 'RF' 'CBBTCUSD' 'CBETHUSD' 'DJIA']
LAS: ['AVGO' 'Mkt-RF' 'HML' 'CMA' 'CBETHUSD' 'DJIA']


OLS: ['const' 'AAPL' 'AVGO' 'INTC' 'QCOM' 'RF' 'CBBTCUSD' 'CBETHUSD']
ridge: ['const' 'AAPL' 'AMD' 'AVGO' 'INTC' 'QCOM' 'ADS_Index' 'Mkt-RF' 'SMB'
 'HML' 'RMW' 'CMA' 'RF' 'CBBTCUSD' 'CBETHUSD' 'DJIA']
lasso: ['const' 'AAPL' 'AVGO' 'INTC' 'QCOM' 'RF' 'CBETHUSD']
elastic net: ['const' 'AAPL' 'AMD' 'AVGO' 'INTC' 'QCOM' 'ADS_Index' 'Mkt-RF' 'HML'
 'RMW' 'RF' 'CBBTCUSD' 'CBETHUSD' 'DJIA']
LAS: ['AVGO' 'Mkt-RF' 'HML' 'CMA' 'CBETHUSD' 'DJIA']

### Without scaling

In [None]:
y_train = df['NVDA']
x = df.drop(['NVDA'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y_train, test_size=0.2, random_state=0)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(200, 19) (51, 19) (200,) (51,)


In [None]:
# Add constant for intercept
x_with_const = add_constant(x_train)

# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data["Feature"] = x_with_const.columns
vif_data["VIF"] = [variance_inflation_factor(x_with_const.values, i) for i in range(x_with_const.shape[1])]

print(vif_data.transpose().to_string())


                  0           1            2          3          4          5          6          7          8         9         10        11        12        13        14         15         16           17           18         19
Feature        const         SMA          EMA       AAPL        AMD       AVGO       INTC       QCOM  ADS_Index    Mkt-RF       SMB       HML       RMW       CMA        RF   CBBTCUSD   CBETHUSD        SP500    NASDAQCOM       DJIA
VIF      6554.654195  683.267089  1087.815919  19.090883  20.204097  58.490461  12.955896  39.307177   2.620873  1.468869  2.531297  1.803322  1.966615  1.238401  2.439675  61.290274  54.343314  1772.999897  1313.153114  136.78701


In [None]:
# Drop features that are likely causing multicollinearity
x_train_reduced = x_train.drop(['SMA', 'EMA', 'NASDAQCOM', 'SP500'], axis=1)
x_test_reduced = x_test.drop(['SMA', 'EMA', 'NASDAQCOM', 'SP500'], axis=1)

In [None]:
# Add constant for intercept
x_with_const = add_constant(x_train_reduced)

# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data["Feature"] = x_with_const.columns
vif_data["VIF"] = [variance_inflation_factor(x_with_const.values, i) for i in range(x_with_const.shape[1])]

print(vif_data.transpose().to_string())

                  0         1          2          3         4          5          6         7         8        9         10        11        12         13         14         15
Feature        const      AAPL        AMD       AVGO      INTC       QCOM  ADS_Index    Mkt-RF       SMB      HML       RMW       CMA        RF   CBBTCUSD   CBETHUSD       DJIA
VIF      3847.248542  7.640395  13.727902  34.905385  5.273824  16.222789   1.732388  1.363715  2.516719  1.67376  1.912893  1.182506  1.695654  45.678119  39.125587  17.221943


In [None]:
# OLS
x_train_reduced = sm.add_constant(x_train_reduced)
benchmark_prep = sm.OLS(y_train,x_train_reduced).fit()
benchmark_prep.summary()
benchmark_select = x_train_reduced.columns[np.abs(benchmark_prep.tvalues)>=1.96]
x = x_train_reduced[benchmark_select]
benchmark = sm.OLS(y_train,x).fit()
print(benchmark.summary())
y_hat_benchmark1 = benchmark.predict(x)
corr_benchmark1 = ss.pearsonr(y_hat_benchmark1, y_train)[0]
print('benchmark: corr (Y, Y_pred) = '+str(corr_benchmark1))
print('Hard Thresholding selected ' +str(len(benchmark_select)) +' features: ', benchmark_select.values)

                                 OLS Regression Results                                
Dep. Variable:                   NVDA   R-squared (uncentered):                   0.997
Model:                            OLS   Adj. R-squared (uncentered):              0.997
Method:                 Least Squares   F-statistic:                          1.011e+04
Date:                Tue, 10 Dec 2024   Prob (F-statistic):                   8.01e-244
Time:                        11:54:57   Log-Likelihood:                         -581.34
No. Observations:                 200   AIC:                                      1177.
Df Residuals:                     193   BIC:                                      1200.
Df Model:                           7                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [None]:
# Ridge
a = 0.5
ridge_prep = skl.Ridge(alpha=a, fit_intercept=False).fit(x_train_reduced, y_train)
ridge_select = x_train_reduced.columns[np.abs(ridge_prep.coef_)>=0.001]
x = x_train_reduced[ridge_select]
ridge = sm.OLS(y_train,x).fit()
print(ridge.summary())
y_pred_ridge = ridge.predict(x)
corr_ridge = ss.pearsonr(y_pred_ridge, y_train)[0]
print('model 2 Ridge Regression: corr (Y, Y_pred) = '+str(corr_ridge))
print('Ridge Regression selected ' +str(len(ridge_select)) +' features: ', ridge_select.values)

                            OLS Regression Results                            
Dep. Variable:                   NVDA   R-squared:                       0.979
Model:                            OLS   Adj. R-squared:                  0.978
Method:                 Least Squares   F-statistic:                     672.4
Date:                Tue, 10 Dec 2024   Prob (F-statistic):          1.12e-148
Time:                        11:54:57   Log-Likelihood:                -582.28
No. Observations:                 200   AIC:                             1193.
Df Residuals:                     186   BIC:                             1239.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -69.8806     15.443     -4.525      0.0

In [None]:
# Lasso
a = 0.5
lasso_prep = skl.Lasso(alpha=a, fit_intercept=False).fit(x_train_reduced, y_train)
lasso_select = x_train_reduced.columns[np.abs(lasso_prep.coef_)!=0.0]
x = x_train_reduced[lasso_select]
lasso = sm.OLS(y_train,x).fit()
print(lasso.summary())
y_pred_lasso = lasso.predict(x)
corr_lasso = ss.pearsonr(y_pred_lasso, y_train)[0]
print('model 3 LASSO: corr (Y, Y_pred) = '+str(corr_lasso))
print('LASSO selected ' +str(len(lasso_select)) +' features: ', lasso_select.values)

                                 OLS Regression Results                                
Dep. Variable:                   NVDA   R-squared (uncentered):                   0.997
Model:                            OLS   Adj. R-squared (uncentered):              0.997
Method:                 Least Squares   F-statistic:                              9253.
Date:                Tue, 10 Dec 2024   Prob (F-statistic):                   6.63e-244
Time:                        11:54:57   Log-Likelihood:                         -576.31
No. Observations:                 200   AIC:                                      1169.
Df Residuals:                     192   BIC:                                      1195.
Df Model:                           8                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

  model = cd_fast.enet_coordinate_descent(


In [None]:
# Elastic Net
a = 0.5
elastic_prep = skl.ElasticNet(alpha=a, fit_intercept=False).fit(x_train_reduced, y_train)
elastic_select = x_train_reduced.columns[np.abs(elastic_prep.coef_)!=0.0]
x = x_train_reduced[elastic_select]
elastic = sm.OLS(y_train,x).fit()
print(elastic.summary())
y_pred_elastic = elastic.predict(x)
corr_elastic = ss.pearsonr(y_pred_elastic, y_train)[0]
print('model 4 Elastic Net: corr (Y, Y_pred) = '+str(corr_elastic))
print('ElasticNet selected ' +str(len(elastic_select)) +' features: ', elastic_select.values)

                                 OLS Regression Results                                
Dep. Variable:                   NVDA   R-squared (uncentered):                   0.997
Model:                            OLS   Adj. R-squared (uncentered):              0.997
Method:                 Least Squares   F-statistic:                              8226.
Date:                Tue, 10 Dec 2024   Prob (F-statistic):                   3.93e-242
Time:                        11:54:57   Log-Likelihood:                         -575.77
No. Observations:                 200   AIC:                                      1170.
Df Residuals:                     191   BIC:                                      1199.
Df Model:                           9                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

  model = cd_fast.enet_coordinate_descent(


In [None]:
# Least Angle
leastAngle_prep = skl.Lars().fit(x_train_reduced, y_train)
leastAngle_select = x_train_reduced.columns[leastAngle_prep.coef_>=0.001]
x = x_train_reduced[leastAngle_select]
leastAngle = sm.OLS(y_train,x).fit()
print(leastAngle.summary())
y_pred_leastAngle = leastAngle.predict(x)
corr_leastAngle = ss.pearsonr(y_pred_leastAngle, y_train)[0]
print('model 1 LARS: corr (Y, Y_pred) = '+str(corr_leastAngle))
print('LARS selected ' +str(len(leastAngle_select)) +' features: ', leastAngle_select.values + '\n')

                                 OLS Regression Results                                
Dep. Variable:                   NVDA   R-squared (uncentered):                   0.994
Model:                            OLS   Adj. R-squared (uncentered):              0.994
Method:                 Least Squares   F-statistic:                              3879.
Date:                Tue, 10 Dec 2024   Prob (F-statistic):                   8.30e-208
Time:                        11:54:57   Log-Likelihood:                         -662.90
No. Observations:                 200   AIC:                                      1342.
Df Residuals:                     192   BIC:                                      1368.
Df Model:                           8                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [None]:
print(f"OLS: {benchmark_select.values}")
print(f"ridge: {ridge_select.values}")
print(f"lasso: {lasso_select.values}")
print(f"elastic net: {elastic_select.values}")
print(f"LAS: {leastAngle_select.values}")

OLS: ['AAPL' 'AVGO' 'INTC' 'QCOM' 'RF' 'CBBTCUSD' 'CBETHUSD']
ridge: ['const' 'AAPL' 'AMD' 'AVGO' 'INTC' 'QCOM' 'ADS_Index' 'Mkt-RF' 'SMB'
 'HML' 'RMW' 'CMA' 'RF' 'CBETHUSD']
lasso: ['AAPL' 'AMD' 'AVGO' 'INTC' 'QCOM' 'CBBTCUSD' 'CBETHUSD' 'DJIA']
elastic net: ['AAPL' 'AMD' 'AVGO' 'INTC' 'QCOM' 'Mkt-RF' 'CBBTCUSD' 'CBETHUSD' 'DJIA']
LAS: ['AAPL' 'AMD' 'AVGO' 'Mkt-RF' 'HML' 'CMA' 'RF' 'CBETHUSD']
