In [28]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy.stats as ss
import sklearn.linear_model as skl
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools import add_constant
from sklearn.model_selection import train_test_split

In [29]:
# Load the preprocessed dataset
output_file_path = './DataStandardized.csv'
df = pd.read_csv(output_file_path, index_col=0)

In [30]:
target_column = 'NVDA'
y = df[target_column]
X = df.drop(columns=[target_column])

# Split into train and test
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(175, 23) (76, 23) (175,) (76,)


In [31]:
# Add constant for intercept for VIF calculation
x_with_const = add_constant(x_train)

# Calculate initial VIF
vif_data = pd.DataFrame()
vif_data["Feature"] = x_with_const.columns
vif_data["VIF"] = [variance_inflation_factor(x_with_const.values, i) for i in range(x_with_const.shape[1])]
print("\nInitial VIF:")
print(vif_data.transpose().to_string())



Initial VIF:
               0            1            2          3          4          5          6         7          8         9         10        11        12        13        14         15         16           17           18         19         20           21          22        23
Feature     const          SMA          EMA       AAPL        AMD       AVGO       INTC      QCOM  ADS_Index    Mkt-RF       SMB       HML       RMW       CMA        RF   CBBTCUSD   CBETHUSD        SP500    NASDAQCOM       DJIA       MACD  MACD_Signal         OBV       RSI
VIF      1.046403  5432.125096  6320.607299  34.820568  24.053884  61.313146  30.185955  56.05525   3.433977  1.671425  2.652426  1.884839  2.063098  1.400054  2.997564  76.168897  65.006473  2494.355882  1949.765768  195.04729  88.655368    48.712691  133.554203  8.028027


In [32]:
features_to_remove = ['SMA', 'EMA', 'NASDAQCOM_Log_Return', 'SP500_Log_Return', 'MACD']
x_train_reduced = x_train.drop(features_to_remove, axis=1, errors='ignore')
x_test_reduced = x_test.drop(features_to_remove, axis=1, errors='ignore')

# Recalculate VIF
x_with_const = add_constant(x_train_reduced)
vif_data = pd.DataFrame()
vif_data["Feature"] = x_with_const.columns
vif_data["VIF"] = [variance_inflation_factor(x_with_const.values, i) for i in range(x_with_const.shape[1])]
print("\nVIF after removing correlated features:")
print(vif_data.transpose().to_string())


VIF after removing correlated features:
               0          1          2          3          4          5          6         7         8         9         10       11        12        13         14           15           16          17           18         19        20
Feature     const       AAPL        AMD       AVGO       INTC       QCOM  ADS_Index    Mkt-RF       SMB       HML       RMW      CMA        RF  CBBTCUSD   CBETHUSD        SP500    NASDAQCOM        DJIA  MACD_Signal        OBV       RSI
VIF      1.041561  19.363915  19.007057  60.501871  16.026936  34.871863   2.758956  1.565388  2.598219  1.799522  1.966865  1.34398  2.901962  74.50455  51.647211  1452.940125  1347.826124  107.232353     6.900547  88.402556  4.713933


In [33]:
# Benchmark OLS Model
x_train_reduced_const = sm.add_constant(x_train_reduced)
benchmark_prep = sm.OLS(y_train, x_train_reduced_const).fit()
benchmark_select = x_train_reduced_const.columns[np.abs(benchmark_prep.tvalues)>=1.96]  # Hard thresholding based on t-values
x_bench = x_train_reduced_const[benchmark_select]
benchmark = sm.OLS(y_train, x_bench).fit()
print("\nBenchmark OLS Summary:")
print(benchmark.summary())
y_hat_benchmark1 = benchmark.predict(x_bench)
corr_benchmark1 = ss.pearsonr(y_hat_benchmark1, y_train)[0]
print('Benchmark: corr (Y, Y_pred) = '+str(corr_benchmark1))
print('Hard Thresholding selected ' +str(len(benchmark_select)) +' features: ', benchmark_select.values)


Benchmark OLS Summary:
                                 OLS Regression Results                                
Dep. Variable:                   NVDA   R-squared (uncentered):                   0.995
Model:                            OLS   Adj. R-squared (uncentered):              0.995
Method:                 Least Squares   F-statistic:                              3875.
Date:                Tue, 10 Dec 2024   Prob (F-statistic):                   5.64e-188
Time:                        18:28:25   Log-Likelihood:                          218.67
No. Observations:                 175   AIC:                                     -419.3
Df Residuals:                     166   BIC:                                     -390.9
Df Model:                           9                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------

In [34]:
# Ridge Feature Selection
a = 0.1
ridge_prep = skl.Ridge(alpha=a, fit_intercept=False).fit(x_train_reduced, y_train)
ridge_select = x_train_reduced.columns[np.abs(ridge_prep.coef_)>=0.001]
x_ridge = x_train_reduced[ridge_select]
ridge = sm.OLS(y_train, x_ridge).fit()
print("\nRidge Model Summary:")
print(ridge.summary())
y_pred_ridge = ridge.predict(x_ridge)
corr_ridge = ss.pearsonr(y_pred_ridge, y_train)[0]
print('Ridge Regression: corr (Y, Y_pred) = '+str(corr_ridge))
print('Ridge selected ' +str(len(ridge_select)) +' features: ', ridge_select.values)


Ridge Model Summary:
                                 OLS Regression Results                                
Dep. Variable:                   NVDA   R-squared (uncentered):                   0.996
Model:                            OLS   Adj. R-squared (uncentered):              0.995
Method:                 Least Squares   F-statistic:                              1813.
Date:                Tue, 10 Dec 2024   Prob (F-statistic):                   8.29e-173
Time:                        18:28:26   Log-Likelihood:                          228.04
No. Observations:                 175   AIC:                                     -416.1
Df Residuals:                     155   BIC:                                     -352.8
Df Model:                          20                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
------------------

In [35]:
# Lasso Feature Selection
a = 0.1 # 0.5 is toooo high
lasso_prep = skl.Lasso(alpha=a, fit_intercept=False).fit(x_train_reduced, y_train)
lasso_select = x_train_reduced.columns[np.abs(lasso_prep.coef_)!=0.0]
x_lasso = x_train_reduced[lasso_select]
print("x_lasso shape:", x_lasso.shape)
print("y_train shape:", y_train.shape)

lasso = sm.OLS(y_train, x_lasso).fit()
print("\nLasso Model Summary:")
print(lasso.summary())
y_pred_lasso = lasso.predict(x_lasso)
corr_lasso = ss.pearsonr(y_pred_lasso, y_train)[0]
print('LASSO: corr (Y, Y_pred) = '+str(corr_lasso))
print('LASSO selected ' +str(len(lasso_select)) +' features: ', lasso_select.values)



x_lasso shape: (175, 3)
y_train shape: (175,)

Lasso Model Summary:
                                 OLS Regression Results                                
Dep. Variable:                   NVDA   R-squared (uncentered):                   0.977
Model:                            OLS   Adj. R-squared (uncentered):              0.977
Method:                 Least Squares   F-statistic:                              2480.
Date:                Tue, 10 Dec 2024   Prob (F-statistic):                   2.90e-141
Time:                        18:28:26   Log-Likelihood:                          81.967
No. Observations:                 175   AIC:                                     -157.9
Df Residuals:                     172   BIC:                                     -148.4
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|

In [36]:
# Elastic Net Feature Selection
a = 0.1
elastic_prep = skl.ElasticNet(alpha=a, fit_intercept=False).fit(x_train_reduced, y_train)
elastic_select = x_train_reduced.columns[np.abs(elastic_prep.coef_)!=0.0]
x_elastic = x_train_reduced[elastic_select]
elastic = sm.OLS(y_train, x_elastic).fit()
print("\nElastic Net Model Summary:")
print(elastic.summary())
y_pred_elastic = elastic.predict(x_elastic)
corr_elastic = ss.pearsonr(y_pred_elastic, y_train)[0]
print('Elastic Net: corr (Y, Y_pred) = '+str(corr_elastic))
print('ElasticNet selected ' +str(len(elastic_select)) +' features: ', elastic_select.values)



Elastic Net Model Summary:
                                 OLS Regression Results                                
Dep. Variable:                   NVDA   R-squared (uncentered):                   0.992
Model:                            OLS   Adj. R-squared (uncentered):              0.991
Method:                 Least Squares   F-statistic:                              2181.
Date:                Tue, 10 Dec 2024   Prob (F-statistic):                   2.14e-167
Time:                        18:28:26   Log-Likelihood:                          168.70
No. Observations:                 175   AIC:                                     -319.4
Df Residuals:                     166   BIC:                                     -290.9
Df Model:                           9                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
------------

In [37]:

# Least Angle Regression (LARS)
leastAngle_prep = skl.Lars().fit(x_train_reduced, y_train)
coef_lars = leastAngle_prep.coef_
# For LARS, filter features with a threshold
leastAngle_select = x_train_reduced.columns[coef_lars>=0.001]
x_lars = x_train_reduced[leastAngle_select]
leastAngle = sm.OLS(y_train, x_lars).fit()
print("\nLARS Model Summary:")
print(leastAngle.summary())
y_pred_leastAngle = leastAngle.predict(x_lars)
corr_leastAngle = ss.pearsonr(y_pred_leastAngle, y_train)[0]
print('LARS: corr (Y, Y_pred) = '+str(corr_leastAngle))
print('LARS selected ' +str(len(leastAngle_select)) +' features: ', leastAngle_select.values)



LARS Model Summary:
                                 OLS Regression Results                                
Dep. Variable:                   NVDA   R-squared (uncentered):                   0.981
Model:                            OLS   Adj. R-squared (uncentered):              0.980
Method:                 Least Squares   F-statistic:                              790.5
Date:                Tue, 10 Dec 2024   Prob (F-statistic):                   7.10e-136
Time:                        18:28:26   Log-Likelihood:                          99.410
No. Observations:                 175   AIC:                                     -176.8
Df Residuals:                     164   BIC:                                     -142.0
Df Model:                          11                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
--------------------

In [38]:

# Print a summary of all selected features
print("\nFeature Selection Summary:")
print(f"Benchmark OLS: {benchmark_select.values}")
print(f"Ridge: {ridge_select.values}")
print(f"Lasso: {lasso_select.values}")
print(f"Elastic Net: {elastic_select.values}")
print(f"LARS: {leastAngle_select.values}")



Feature Selection Summary:
Benchmark OLS: ['AAPL' 'INTC' 'QCOM' 'CBBTCUSD' 'CBETHUSD' 'SP500' 'DJIA' 'MACD_Signal'
 'OBV']
Ridge: ['AAPL' 'AMD' 'AVGO' 'INTC' 'QCOM' 'ADS_Index' 'Mkt-RF' 'SMB' 'HML' 'RMW'
 'CMA' 'RF' 'CBBTCUSD' 'CBETHUSD' 'SP500' 'NASDAQCOM' 'DJIA' 'MACD_Signal'
 'OBV' 'RSI']
Lasso: ['INTC' 'NASDAQCOM' 'OBV']
Elastic Net: ['AAPL' 'AVGO' 'INTC' 'QCOM' 'CBETHUSD' 'SP500' 'NASDAQCOM' 'MACD_Signal'
 'OBV']
LARS: ['AAPL' 'AMD' 'AVGO' 'INTC' 'QCOM' 'Mkt-RF' 'SMB' 'CBETHUSD' 'SP500' 'OBV'
 'RSI']


In [39]:
selected_features = [
    'NVDA_Log_Return',
    'AMD_Log_Return',
    'RSI',
    'AVGO_Log_Return',
    'QCOM_Log_Return'
]

df_selected = df[selected_features]

# Export to a new CSV file
df_selected.to_csv('DataSelected.csv', index=False)

print("DataSelected.csv created with features:", selected_features)


KeyError: "['NVDA_Log_Return', 'AMD_Log_Return', 'AVGO_Log_Return', 'QCOM_Log_Return'] not in index"