In [58]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy.stats as ss
import sklearn.linear_model as skl
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools import add_constant
from sklearn.model_selection import train_test_split

In [59]:
# Load the preprocessed dataset
output_file_path = './DataStandardized.csv'
df = pd.read_csv(output_file_path, index_col=0)

In [60]:
target_column = 'NVDA'
y = df[target_column]
X = df.drop(columns=[target_column])

# Split into train and test
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(175, 23) (76, 23) (175,) (76,)


In [61]:
# Add constant for intercept for VIF calculation
x_with_const = add_constant(x_train)

# Calculate initial VIF
vif_data = pd.DataFrame()
vif_data["Feature"] = x_with_const.columns
vif_data["VIF"] = [variance_inflation_factor(x_with_const.values, i) for i in range(x_with_const.shape[1])]
print("\nInitial VIF:")
print(vif_data.transpose().to_string())



Initial VIF:
               0            1            2          3          4          5          6         7          8         9         10        11        12        13        14         15         16           17           18         19         20           21          22        23
Feature     const          SMA          EMA       AAPL        AMD       AVGO       INTC      QCOM  ADS_Index    Mkt-RF       SMB       HML       RMW       CMA        RF   CBBTCUSD   CBETHUSD        SP500    NASDAQCOM       DJIA       MACD  MACD_Signal         OBV       RSI
VIF      1.046403  5432.125096  6320.607299  34.820568  24.053884  61.313146  30.185955  56.05525   3.433977  1.671425  2.652426  1.884839  2.063098  1.400054  2.997564  76.168897  65.006473  2494.355882  1949.765768  195.04729  88.655368    48.712691  133.554203  8.028027


In [62]:
features_to_remove = ['SMA', 'EMA', 'NASDAQCOM', 'SP500', 'CBETHUSD', 'OBV', 'AVGO', 'QCOM', 'MACD', 'AAPL', 'INTC']
x_train_reduced = x_train.drop(features_to_remove, axis=1, errors='ignore')
x_test_reduced = x_test.drop(features_to_remove, axis=1, errors='ignore')

# Recalculate VIF
x_with_const = add_constant(x_train_reduced)
vif_data = pd.DataFrame()
vif_data["Feature"] = x_with_const.columns
vif_data["VIF"] = [variance_inflation_factor(x_with_const.values, i) for i in range(x_with_const.shape[1])]
print("\nVIF after removing correlated features:")
print(vif_data.transpose().to_string())


VIF after removing correlated features:
               0         1          2         3         4         5         6         7         8         9         10           11        12
Feature     const       AMD  ADS_Index    Mkt-RF       SMB       HML       RMW       CMA        RF  CBBTCUSD      DJIA  MACD_Signal       RSI
VIF      1.020608  8.629611   1.676913  1.465456  2.407137  1.665049  1.761309  1.244787  1.686304  6.699282  6.813083     3.098141  3.078017


In [63]:
# Benchmark OLS Model
x_train_reduced_const = sm.add_constant(x_train_reduced)
benchmark_prep = sm.OLS(y_train, x_train_reduced_const).fit()
benchmark_select = x_train_reduced_const.columns[np.abs(benchmark_prep.tvalues)>=1.96]  # Hard thresholding based on t-values
x_bench = x_train_reduced_const[benchmark_select]
benchmark = sm.OLS(y_train, x_bench).fit()
print("\nBenchmark OLS Summary:")
print(benchmark.summary())
y_hat_benchmark1 = benchmark.predict(x_bench)
corr_benchmark1 = ss.pearsonr(y_hat_benchmark1, y_train)[0]
print('Benchmark: corr (Y, Y_pred) = '+str(corr_benchmark1))
print('Hard Thresholding selected ' +str(len(benchmark_select)) +' features: ', benchmark_select.values)


Benchmark OLS Summary:
                                 OLS Regression Results                                
Dep. Variable:                   NVDA   R-squared (uncentered):                   0.850
Model:                            OLS   Adj. R-squared (uncentered):              0.844
Method:                 Least Squares   F-statistic:                              136.0
Date:                Tue, 10 Dec 2024   Prob (F-statistic):                    8.69e-66
Time:                        23:03:55   Log-Likelihood:                         -83.686
No. Observations:                 175   AIC:                                      181.4
Df Residuals:                     168   BIC:                                      203.5
Df Model:                           7                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------

In [64]:
# Ridge Feature Selection
a = 0.1
ridge_prep = skl.Ridge(alpha=a, fit_intercept=False).fit(x_train_reduced, y_train)
ridge_select = x_train_reduced.columns[np.abs(ridge_prep.coef_)>=0.001]
x_ridge = x_train_reduced[ridge_select]
ridge = sm.OLS(y_train, x_ridge).fit()
print("\nRidge Model Summary:")
print(ridge.summary())
y_pred_ridge = ridge.predict(x_ridge)
corr_ridge = ss.pearsonr(y_pred_ridge, y_train)[0]
print('Ridge Regression: corr (Y, Y_pred) = '+str(corr_ridge))
print('Ridge selected ' +str(len(ridge_select)) +' features: ', ridge_select.values)


Ridge Model Summary:
                                 OLS Regression Results                                
Dep. Variable:                   NVDA   R-squared (uncentered):                   0.855
Model:                            OLS   Adj. R-squared (uncentered):              0.844
Method:                 Least Squares   F-statistic:                              80.16
Date:                Tue, 10 Dec 2024   Prob (F-statistic):                    7.05e-62
Time:                        23:03:55   Log-Likelihood:                         -80.638
No. Observations:                 175   AIC:                                      185.3
Df Residuals:                     163   BIC:                                      223.3
Df Model:                          12                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
------------------

In [65]:
# Lasso Feature Selection
a = 0.1 # 0.5 is toooo high
lasso_prep = skl.Lasso(alpha=a, fit_intercept=False).fit(x_train_reduced, y_train)
lasso_select = x_train_reduced.columns[np.abs(lasso_prep.coef_)!=0.0]
x_lasso = x_train_reduced[lasso_select]
print("x_lasso shape:", x_lasso.shape)
print("y_train shape:", y_train.shape)

lasso = sm.OLS(y_train, x_lasso).fit()
print("\nLasso Model Summary:")
print(lasso.summary())
y_pred_lasso = lasso.predict(x_lasso)
corr_lasso = ss.pearsonr(y_pred_lasso, y_train)[0]
print('LASSO: corr (Y, Y_pred) = '+str(corr_lasso))
print('LASSO selected ' +str(len(lasso_select)) +' features: ', lasso_select.values)



x_lasso shape: (175, 4)
y_train shape: (175,)

Lasso Model Summary:
                                 OLS Regression Results                                
Dep. Variable:                   NVDA   R-squared (uncentered):                   0.784
Model:                            OLS   Adj. R-squared (uncentered):              0.779
Method:                 Least Squares   F-statistic:                              154.9
Date:                Tue, 10 Dec 2024   Prob (F-statistic):                    9.39e-56
Time:                        23:03:55   Log-Likelihood:                         -115.67
No. Observations:                 175   AIC:                                      239.3
Df Residuals:                     171   BIC:                                      252.0
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>

In [66]:
# Elastic Net Feature Selection
a = 0.1
elastic_prep = skl.ElasticNet(alpha=a, fit_intercept=False).fit(x_train_reduced, y_train)
elastic_select = x_train_reduced.columns[np.abs(elastic_prep.coef_)!=0.0]
x_elastic = x_train_reduced[elastic_select]
elastic = sm.OLS(y_train, x_elastic).fit()
print("\nElastic Net Model Summary:")
print(elastic.summary())
y_pred_elastic = elastic.predict(x_elastic)
corr_elastic = ss.pearsonr(y_pred_elastic, y_train)[0]
print('Elastic Net: corr (Y, Y_pred) = '+str(corr_elastic))
print('ElasticNet selected ' +str(len(elastic_select)) +' features: ', elastic_select.values)



Elastic Net Model Summary:
                                 OLS Regression Results                                
Dep. Variable:                   NVDA   R-squared (uncentered):                   0.830
Model:                            OLS   Adj. R-squared (uncentered):              0.824
Method:                 Least Squares   F-statistic:                              137.4
Date:                Tue, 10 Dec 2024   Prob (F-statistic):                    2.55e-62
Time:                        23:03:55   Log-Likelihood:                         -94.674
No. Observations:                 175   AIC:                                      201.3
Df Residuals:                     169   BIC:                                      220.3
Df Model:                           6                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
------------

In [67]:

# Least Angle Regression (LARS)
leastAngle_prep = skl.Lars().fit(x_train_reduced, y_train)
coef_lars = leastAngle_prep.coef_
# For LARS, filter features with a threshold
leastAngle_select = x_train_reduced.columns[coef_lars>=0.001]
x_lars = x_train_reduced[leastAngle_select]
leastAngle = sm.OLS(y_train, x_lars).fit()
print("\nLARS Model Summary:")
print(leastAngle.summary())
y_pred_leastAngle = leastAngle.predict(x_lars)
corr_leastAngle = ss.pearsonr(y_pred_leastAngle, y_train)[0]
print('LARS: corr (Y, Y_pred) = '+str(corr_leastAngle))
print('LARS selected ' +str(len(leastAngle_select)) +' features: ', leastAngle_select.values)



LARS Model Summary:
                                 OLS Regression Results                                
Dep. Variable:                   NVDA   R-squared (uncentered):                   0.803
Model:                            OLS   Adj. R-squared (uncentered):              0.793
Method:                 Least Squares   F-statistic:                              84.96
Date:                Tue, 10 Dec 2024   Prob (F-statistic):                    7.35e-55
Time:                        23:03:55   Log-Likelihood:                         -107.62
No. Observations:                 175   AIC:                                      231.2
Df Residuals:                     167   BIC:                                      256.5
Df Model:                           8                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------

In [68]:

# Print a summary of all selected features
print("\nFeature Selection Summary:")
print(f"Benchmark OLS: {benchmark_select.values}")
print(f"Ridge: {ridge_select.values}")
print(f"Lasso: {lasso_select.values}")
print(f"Elastic Net: {elastic_select.values}")
print(f"LARS: {leastAngle_select.values}")



Feature Selection Summary:
Benchmark OLS: ['AMD' 'ADS_Index' 'RF' 'CBBTCUSD' 'DJIA' 'MACD_Signal' 'RSI']
Ridge: ['AMD' 'ADS_Index' 'Mkt-RF' 'SMB' 'HML' 'RMW' 'CMA' 'RF' 'CBBTCUSD' 'DJIA'
 'MACD_Signal' 'RSI']
Lasso: ['RF' 'CBBTCUSD' 'DJIA' 'MACD_Signal']
Elastic Net: ['ADS_Index' 'RF' 'CBBTCUSD' 'DJIA' 'MACD_Signal' 'RSI']
LARS: ['ADS_Index' 'Mkt-RF' 'SMB' 'RMW' 'RF' 'CBBTCUSD' 'DJIA' 'MACD_Signal']


In [71]:

selected_features = df[['NVDA', 'AMD', 'RSI', 'RF', 'CBBTCUSD', 'DJIA', 'MACD_Signal', 'ADS_Index']]
output_csv_path = './DataSelected.csv'
selected_features.to_csv(output_csv_path, index=False)
