In [36]:
import math
import numpy as np
import seaborn as sns
import pandas as pd
import sklearn as sk
import sklearn.linear_model as skl
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as ss
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
# import cupy as cp

In [37]:
df = pd.read_csv('./featurePrepared.csv').dropna().set_index('Date')
pd.set_option('display.width', 1000)  # Set maximum display width
pd.set_option('display.max_columns', None)  # Show all columns

describe_output = df.describe().transpose()
print(describe_output.to_string())


           count          mean           std           min           25%           50%           75%           max
NVDA       251.0     79.092747     30.492500     40.313805     48.141552     79.504669    104.851032    135.568405
SMA        251.0     74.841786     28.926433     43.254546     45.824969     70.279184     93.270627    126.049420
EMA        251.0     74.814103     28.429192     43.178471     46.657931     70.991475     96.907743    122.835461
AAPL       251.0    189.827509     18.559048    164.405121    174.560921    185.949036    195.181770    234.290756
AMD        251.0    147.360677     28.342445     93.669998    122.160000    152.270004    167.779999    211.380005
AVGO       251.0    122.638521     27.476383     79.609314     95.772259    125.089081    139.818703    181.710449
INTC       251.0     36.445328      7.405938     18.990000     30.898989     35.935158     42.780405     50.089161
QCOM       251.0    154.915609     32.302531    102.638420    127.521931    156.

In [38]:
y = df['NVDA']
x = df.drop(['NVDA'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(200, 19) (51, 19) (200,) (51,)


In [39]:
# Add constant for intercept
x_with_const = add_constant(x_train)

# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data["Feature"] = x_with_const.columns
vif_data["VIF"] = [variance_inflation_factor(x_with_const.values, i) for i in range(x_with_const.shape[1])]

print(vif_data)


      Feature          VIF
0       const  6554.654015
1         SMA   683.267099
2         EMA  1087.815929
3        AAPL    19.090882
4         AMD    20.204095
5        AVGO    58.490480
6        INTC    12.955897
7        QCOM    39.307189
8   ADS_Index     2.620873
9      Mkt-RF     1.468869
10        SMB     2.531297
11        HML     1.803322
12        RMW     1.966615
13        CMA     1.238401
14         RF     2.439675
15   CBBTCUSD    61.290283
16   CBETHUSD    54.343321
17      SP500  1773.000010
18  NASDAQCOM  1313.153465
19       DJIA   136.786990


In [40]:
# Drop features that are likely causing multicollinearity
x_train_reduced = x_train.drop(['SMA', 'EMA', 'NASDAQCOM', 'SP500'], axis=1)
x_test_reduced = x_test.drop(['SMA', 'EMA', 'NASDAQCOM', 'SP500'], axis=1)

In [41]:
# Refit Ridge Regression with reduced feature set
ridge_model_reduced = Ridge(alpha=0.5, fit_intercept=False).fit(x_train_reduced, y_train)

# Get coefficients and selected features
ridge_coefficients_reduced = ridge_model_reduced.coef_
selected_features_reduced = x_train_reduced.columns[np.abs(ridge_coefficients_reduced) >= 0.001]

print(f"Ridge Regression selected {len(selected_features_reduced)} features: {selected_features_reduced}")


Ridge Regression selected 13 features: Index(['AAPL', 'AMD', 'AVGO', 'INTC', 'QCOM', 'ADS_Index', 'Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF', 'CBETHUSD'], dtype='object')


In [42]:
# OLS Regression with reduced features
x_train_ols = x_train_reduced[selected_features_reduced]
ols_model_reduced = sm.OLS(y_train, x_train_ols).fit()

# Print OLS summary
print(ols_model_reduced.summary())


                                 OLS Regression Results                                
Dep. Variable:                   NVDA   R-squared (uncentered):                   0.997
Model:                            OLS   Adj. R-squared (uncentered):              0.997
Method:                 Least Squares   F-statistic:                              4704.
Date:                Mon, 25 Nov 2024   Prob (F-statistic):                   1.66e-227
Time:                        12:42:45   Log-Likelihood:                         -592.73
No. Observations:                 200   AIC:                                      1211.
Df Residuals:                     187   BIC:                                      1254.
Df Model:                          13                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [43]:
# Add constant for intercept
x_with_const_reduced = add_constant(x_train_ols)

# Calculate VIF
vif_data_reduced = pd.DataFrame()
vif_data_reduced["Feature"] = x_with_const_reduced.columns
vif_data_reduced["VIF"] = [variance_inflation_factor(x_with_const_reduced.values, i) for i in range(x_with_const_reduced.shape[1])]

print(vif_data_reduced)


      Feature          VIF
0       const  2241.925774
1        AAPL     6.546828
2         AMD    11.301273
3        AVGO    31.026097
4        INTC     4.568019
5        QCOM    15.626900
6   ADS_Index     1.681876
7      Mkt-RF     1.293624
8         SMB     2.423989
9         HML     1.650510
10        RMW     1.844393
11        CMA     1.179238
12         RF     1.677907
13   CBETHUSD    10.650696
