In [15]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy.stats as ss
import sklearn.linear_model as skl
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools import add_constant
from sklearn.model_selection import train_test_split

In [16]:
# Load the preprocessed dataset
output_file_path = './DataPreprocessed.csv'
df = pd.read_csv(output_file_path, index_col=0)

In [17]:
target_column = 'NVDA_Log_Return'
y = df[target_column]
X = df.drop(columns=[target_column])

# Split into train and test
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(93, 14) (40, 14) (93,) (40,)


In [18]:
# Add constant for intercept for VIF calculation
x_with_const = add_constant(x_train)

# Calculate initial VIF
vif_data = pd.DataFrame()
vif_data["Feature"] = x_with_const.columns
vif_data["VIF"] = [variance_inflation_factor(x_with_const.values, i) for i in range(x_with_const.shape[1])]
print("\nInitial VIF:")
print(vif_data.transpose().to_string())



Initial VIF:
               0                1               2                3                4                5                 6                     7                8          9          10        11           12        13        14
Feature     const  AAPL_Log_Return  AMD_Log_Return  AVGO_Log_Return  INTC_Log_Return  QCOM_Log_Return  SP500_Log_Return  NASDAQCOM_Log_Return  DJIA_Log_Return        SMA        EMA      MACD  MACD_Signal       RSI       OBV
VIF      1.297008         1.732272        1.857909         1.910992         1.891852          2.48248         25.219462             20.260106          4.84439  13.753856  18.035165  5.556204     4.582626  1.515687  1.937125


In [19]:
features_to_remove = ['SMA', 'EMA', 'NASDAQCOM_Log_Return', 'SP500_Log_Return']
x_train_reduced = x_train.drop(features_to_remove, axis=1, errors='ignore')
x_test_reduced = x_test.drop(features_to_remove, axis=1, errors='ignore')

# Recalculate VIF
x_with_const = add_constant(x_train_reduced)
vif_data = pd.DataFrame()
vif_data["Feature"] = x_with_const.columns
vif_data["VIF"] = [variance_inflation_factor(x_with_const.values, i) for i in range(x_with_const.shape[1])]
print("\nVIF after removing correlated features:")
print(vif_data.transpose().to_string())


VIF after removing correlated features:
               0                1               2                3                4                5                6         7            8         9         10
Feature     const  AAPL_Log_Return  AMD_Log_Return  AVGO_Log_Return  INTC_Log_Return  QCOM_Log_Return  DJIA_Log_Return      MACD  MACD_Signal       RSI       OBV
VIF      1.127426         1.278656        1.497183          1.79619         1.885039         2.295646         1.202971  4.191628     3.599398  1.329044  1.756022


In [20]:
# Benchmark OLS Model
x_train_reduced_const = sm.add_constant(x_train_reduced)
benchmark_prep = sm.OLS(y_train, x_train_reduced_const).fit()
benchmark_select = x_train_reduced_const.columns[np.abs(benchmark_prep.tvalues)>=1.96]  # Hard thresholding based on t-values
x_bench = x_train_reduced_const[benchmark_select]
benchmark = sm.OLS(y_train, x_bench).fit()
print("\nBenchmark OLS Summary:")
print(benchmark.summary())
y_hat_benchmark1 = benchmark.predict(x_bench)
corr_benchmark1 = ss.pearsonr(y_hat_benchmark1, y_train)[0]
print('Benchmark: corr (Y, Y_pred) = '+str(corr_benchmark1))
print('Hard Thresholding selected ' +str(len(benchmark_select)) +' features: ', benchmark_select.values)


Benchmark OLS Summary:
                                 OLS Regression Results                                
Dep. Variable:        NVDA_Log_Return   R-squared (uncentered):                   0.576
Model:                            OLS   Adj. R-squared (uncentered):              0.552
Method:                 Least Squares   F-statistic:                              23.87
Date:                Tue, 10 Dec 2024   Prob (F-statistic):                    4.28e-15
Time:                        16:39:10   Log-Likelihood:                         -54.942
No. Observations:                  93   AIC:                                      119.9
Df Residuals:                      88   BIC:                                      132.5
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                     coef    std err          t      P>|t|      [0.025      0.975]
-------------

In [21]:
# Ridge Feature Selection
a = 0.1
ridge_prep = skl.Ridge(alpha=a, fit_intercept=False).fit(x_train_reduced, y_train)
ridge_select = x_train_reduced.columns[np.abs(ridge_prep.coef_)>=0.001]
x_ridge = x_train_reduced[ridge_select]
ridge = sm.OLS(y_train, x_ridge).fit()
print("\nRidge Model Summary:")
print(ridge.summary())
y_pred_ridge = ridge.predict(x_ridge)
corr_ridge = ss.pearsonr(y_pred_ridge, y_train)[0]
print('Ridge Regression: corr (Y, Y_pred) = '+str(corr_ridge))
print('Ridge selected ' +str(len(ridge_select)) +' features: ', ridge_select.values)


Ridge Model Summary:
                                 OLS Regression Results                                
Dep. Variable:        NVDA_Log_Return   R-squared (uncentered):                   0.604
Model:                            OLS   Adj. R-squared (uncentered):              0.556
Method:                 Least Squares   F-statistic:                              12.66
Date:                Tue, 10 Dec 2024   Prob (F-statistic):                    4.44e-13
Time:                        16:39:10   Log-Likelihood:                         -51.721
No. Observations:                  93   AIC:                                      123.4
Df Residuals:                      83   BIC:                                      148.8
Df Model:                          10                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
--------------

In [22]:
# Lasso Feature Selection
a = 0.1 # 0.5 is toooo high
lasso_prep = skl.Lasso(alpha=a, fit_intercept=False).fit(x_train_reduced, y_train)
lasso_select = x_train_reduced.columns[np.abs(lasso_prep.coef_)!=0.0]
x_lasso = x_train_reduced[lasso_select]
print("x_lasso shape:", x_lasso.shape)
print("y_train shape:", y_train.shape)

lasso = sm.OLS(y_train, x_lasso).fit()
print("\nLasso Model Summary:")
print(lasso.summary())
y_pred_lasso = lasso.predict(x_lasso)
corr_lasso = ss.pearsonr(y_pred_lasso, y_train)[0]
print('LASSO: corr (Y, Y_pred) = '+str(corr_lasso))
print('LASSO selected ' +str(len(lasso_select)) +' features: ', lasso_select.values)



x_lasso shape: (93, 3)
y_train shape: (93,)

Lasso Model Summary:
                                 OLS Regression Results                                
Dep. Variable:        NVDA_Log_Return   R-squared (uncentered):                   0.334
Model:                            OLS   Adj. R-squared (uncentered):              0.312
Method:                 Least Squares   F-statistic:                              15.08
Date:                Tue, 10 Dec 2024   Prob (F-statistic):                    4.98e-08
Time:                        16:39:10   Log-Likelihood:                         -75.868
No. Observations:                  93   AIC:                                      157.7
Df Residuals:                      90   BIC:                                      165.3
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      

In [23]:
# Elastic Net Feature Selection
a = 0.1
elastic_prep = skl.ElasticNet(alpha=a, fit_intercept=False).fit(x_train_reduced, y_train)
elastic_select = x_train_reduced.columns[np.abs(elastic_prep.coef_)!=0.0]
x_elastic = x_train_reduced[elastic_select]
elastic = sm.OLS(y_train, x_elastic).fit()
print("\nElastic Net Model Summary:")
print(elastic.summary())
y_pred_elastic = elastic.predict(x_elastic)
corr_elastic = ss.pearsonr(y_pred_elastic, y_train)[0]
print('Elastic Net: corr (Y, Y_pred) = '+str(corr_elastic))
print('ElasticNet selected ' +str(len(elastic_select)) +' features: ', elastic_select.values)



Elastic Net Model Summary:
                                 OLS Regression Results                                
Dep. Variable:        NVDA_Log_Return   R-squared (uncentered):                   0.553
Model:                            OLS   Adj. R-squared (uncentered):              0.523
Method:                 Least Squares   F-statistic:                              17.97
Date:                Tue, 10 Dec 2024   Prob (F-statistic):                    1.91e-13
Time:                        16:39:10   Log-Likelihood:                         -57.320
No. Observations:                  93   AIC:                                      126.6
Df Residuals:                      87   BIC:                                      141.8
Df Model:                           6                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
--------

In [24]:

# Least Angle Regression (LARS)
leastAngle_prep = skl.Lars().fit(x_train_reduced, y_train)
coef_lars = leastAngle_prep.coef_
# For LARS, filter features with a threshold
leastAngle_select = x_train_reduced.columns[coef_lars>=0.001]
x_lars = x_train_reduced[leastAngle_select]
leastAngle = sm.OLS(y_train, x_lars).fit()
print("\nLARS Model Summary:")
print(leastAngle.summary())
y_pred_leastAngle = leastAngle.predict(x_lars)
corr_leastAngle = ss.pearsonr(y_pred_leastAngle, y_train)[0]
print('LARS: corr (Y, Y_pred) = '+str(corr_leastAngle))
print('LARS selected ' +str(len(leastAngle_select)) +' features: ', leastAngle_select.values)



LARS Model Summary:
                                 OLS Regression Results                                
Dep. Variable:        NVDA_Log_Return   R-squared (uncentered):                   0.551
Model:                            OLS   Adj. R-squared (uncentered):              0.520
Method:                 Least Squares   F-statistic:                              17.78
Date:                Tue, 10 Dec 2024   Prob (F-statistic):                    2.42e-13
Time:                        16:39:10   Log-Likelihood:                         -57.586
No. Observations:                  93   AIC:                                      127.2
Df Residuals:                      87   BIC:                                      142.4
Df Model:                           6                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
---------------

In [25]:

# Print a summary of all selected features
print("\nFeature Selection Summary:")
print(f"Benchmark OLS: {benchmark_select.values}")
print(f"Ridge: {ridge_select.values}")
print(f"Lasso: {lasso_select.values}")
print(f"Elastic Net: {elastic_select.values}")
print(f"LARS: {leastAngle_select.values}")



Feature Selection Summary:
Benchmark OLS: ['AMD_Log_Return' 'MACD' 'MACD_Signal' 'RSI' 'OBV']
Ridge: ['AAPL_Log_Return' 'AMD_Log_Return' 'AVGO_Log_Return' 'INTC_Log_Return'
 'QCOM_Log_Return' 'DJIA_Log_Return' 'MACD' 'MACD_Signal' 'RSI' 'OBV']
Lasso: ['AMD_Log_Return' 'AVGO_Log_Return' 'RSI']
Elastic Net: ['AMD_Log_Return' 'AVGO_Log_Return' 'QCOM_Log_Return' 'MACD' 'RSI' 'OBV']
LARS: ['AAPL_Log_Return' 'AMD_Log_Return' 'QCOM_Log_Return' 'MACD' 'RSI' 'OBV']


In [26]:
selected_features = [
    'NVDA_Log_Return',
    'AMD_Log_Return',
    'RSI',
    'MACD',
    'OBV',
    'AVGO_Log_Return',
    'QCOM_Log_Return'
]

df_selected = df[selected_features]

# Export to a new CSV file
df_selected.to_csv('DataSelected.csv', index=False)

print("DataSelected.csv created with features:", selected_features)


DataSelected.csv created with features: ['NVDA_Log_Return', 'AMD_Log_Return', 'RSI', 'MACD', 'OBV', 'AVGO_Log_Return', 'QCOM_Log_Return']
