In [2]:
# Global tools
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import GridSearchCV, train_test_split


Feature Selection Summary:

Benchmark OLS: ['AMD' 'ADS_Index' 'RF' 'CBBTCUSD' 'DJIA' 'MACD_Signal' 'RSI']

Ridge: ['AMD' 'ADS_Index' 'Mkt-RF' 'SMB' 'HML' 'RMW' 'CMA' 'RF' 'CBBTCUSD' 'DJIA'
 'MACD_Signal' 'RSI']

Lasso: ['RF' 'CBBTCUSD' 'DJIA' 'MACD_Signal']

Elastic Net: ['ADS_Index' 'RF' 'CBBTCUSD' 'DJIA' 'MACD_Signal' 'RSI']

LARS: ['ADS_Index' 'Mkt-RF' 'SMB' 'RMW' 'RF' 'CBBTCUSD' 'DJIA' 'MACD_Signal']


In [3]:
df = pd.read_csv('./DataSelected.csv')
target_column = 'NVDA' 
y = df[target_column]
X = df.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [21]:
# Simple Linear Models + Grid Search 

In [4]:
# Lasso 
param_grid = {
    'alpha': np.linspace(0.0001, 0.1, 1000)
    }
scorer = make_scorer(mean_squared_error, greater_is_better=False)
lasso = Lasso()
grid_search_lasso = GridSearchCV(
    estimator=lasso,
    param_grid=param_grid,
    scoring=scorer,
    cv=10,              
    n_jobs=-1       
)

grid_search_lasso.fit(X_train, y_train)

print("Lasso Best Params:", grid_search_lasso.best_params_)
print("Lasso Best Score:", grid_search_lasso.best_score_)

best_lasso = grid_search_lasso.best_estimator_
test_predictions = best_lasso.predict(X_test)
test_mse = mean_squared_error(y_test, test_predictions)
print("Test MSE with Best Found Parameters:", test_mse)

Lasso Best Params: {'alpha': np.float64(0.0017000000000000001)}
Lasso Best Score: -0.16834712238655897
Test MSE with Best Found Parameters: 0.1699278032812201


In [5]:
# Ridge
param_grid_ridge = {
    #'alpha': [0.01, 0.1, 1.0, 10.0, 100.0] # Ridge Best Params: {'alpha': 0.01}
    'alpha': np.linspace(0.0001, 10, 1000)
}
scorer = make_scorer(mean_squared_error, greater_is_better=False)
ridge = Ridge()
grid_search_ridge = GridSearchCV(
    estimator=ridge, 
    param_grid=param_grid_ridge, 
    scoring=scorer, 
    cv=10, 
    n_jobs=-1)
grid_search_ridge.fit(X_train, y_train)

print("Ridge Best Params:", grid_search_ridge.best_params_)
print("Ridge Best Score:", grid_search_ridge.best_score_)

best_ridge = grid_search_ridge.best_estimator_
test_predictions = best_ridge.predict(X_test)
test_mse = mean_squared_error(y_test, test_predictions)
print("Test MSE with Best Found Parameters:", test_mse)

Ridge Best Params: {'alpha': np.float64(1.4815666666666667)}
Ridge Best Score: -0.16806571344177768
Test MSE with Best Found Parameters: 0.1704669120292344


In [6]:
# Elastic Net
param_grid_elastic = {
    'alpha': np.logspace(-3, -1, 50), 
    'l1_ratio': np.linspace(0.1, 1.0, 10)  
}
scorer = make_scorer(mean_squared_error, greater_is_better=False)
elastic = ElasticNet()
grid_search_elastic = GridSearchCV(
    estimator=elastic, 
    param_grid=param_grid_elastic, 
    scoring=scorer, 
    cv=10, 
    n_jobs=-1)
grid_search_elastic.fit(X_train, y_train)
print("Elastic Net Best Params:", grid_search_elastic.best_params_)
print("Elastic Net Best Score:", grid_search_elastic.best_score_)

best_elastic = grid_search_elastic.best_estimator_
test_predictions = best_elastic.predict(X_test)
test_mse = mean_squared_error(y_test, test_predictions)
print("Test MSE with Best Found Parameters:", test_mse)


Elastic Net Best Params: {'alpha': np.float64(0.006551285568595509), 'l1_ratio': np.float64(0.1)}
Elastic Net Best Score: -0.16812036079581824
Test MSE with Best Found Parameters: 0.1705288136810767


In [7]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define XGBoost parameters
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate': 0.1,
    'max_depth': 5,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'alpha': 0.1,
    'lambda': 1.0
}

# Train the XGBoost model
evals = [(dtrain, 'train'), (dtest, 'test')]
model = xgb.train(params, dtrain, num_boost_round=200, evals=evals, early_stopping_rounds=20, verbose_eval=10)

# Make predictions on the test set
y_pred = model.predict(dtest)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Test Mean Squared Error: {mse:.4f}")
print(f"Test RMSE: {mse ** 0.5:.4f}")

# Feature importance
importance = model.get_score(importance_type='weight')
importance_df = pd.DataFrame(list(importance.items()), columns=['Feature', 'Importance']).sort_values(by='Importance', ascending=False)

# Save the feature importance to a CSV file
importance_csv_path = './FeatureImportance.csv'
importance_df.to_csv(importance_csv_path, index=False)
print(f"Feature importance saved to {importance_csv_path}")

# Display top features
print("\nTop Features:")
print(importance_df)


[0]	train-rmse:0.91975	test-rmse:0.86793
[10]	train-rmse:0.36919	test-rmse:0.41270
[20]	train-rmse:0.17254	test-rmse:0.27891
[30]	train-rmse:0.09723	test-rmse:0.23665
[40]	train-rmse:0.06638	test-rmse:0.22133
[50]	train-rmse:0.05153	test-rmse:0.21325
[60]	train-rmse:0.04248	test-rmse:0.20941
[70]	train-rmse:0.03636	test-rmse:0.20727
[80]	train-rmse:0.03216	test-rmse:0.20570
[90]	train-rmse:0.02858	test-rmse:0.20499
[100]	train-rmse:0.02602	test-rmse:0.20400
[110]	train-rmse:0.02395	test-rmse:0.20318
[120]	train-rmse:0.02230	test-rmse:0.20265
[130]	train-rmse:0.02068	test-rmse:0.20237
[140]	train-rmse:0.01945	test-rmse:0.20173
[150]	train-rmse:0.01869	test-rmse:0.20128
[160]	train-rmse:0.01796	test-rmse:0.20103
[170]	train-rmse:0.01725	test-rmse:0.20079
[180]	train-rmse:0.01662	test-rmse:0.20051
[190]	train-rmse:0.01600	test-rmse:0.20029
[199]	train-rmse:0.01563	test-rmse:0.20011
Test Mean Squared Error: 0.0400
Test RMSE: 0.2001
Feature importance saved to ./FeatureImportance.csv

Top F

In [26]:
from itertools import product

# Convert to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'alpha': [0, 0.1, 0.5],
    'lambda': [1, 1.5, 2]
}

# Generate all combinations of hyperparameters
param_combinations = list(product(
    param_grid['learning_rate'],
    param_grid['max_depth'],
    param_grid['subsample'],
    param_grid['colsample_bytree'],
    param_grid['alpha'],
    param_grid['lambda']
))

# Cross-validation to find the best parameters
best_params = None
best_rmse = float('inf')

for params in param_combinations:
    learning_rate, max_depth, subsample, colsample_bytree, alpha, lambda_ = params
    cv_params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'alpha': alpha,
        'lambda': lambda_,
    }
    cv_results = xgb.cv(
        cv_params,
        dtrain,
        num_boost_round=200,
        nfold=3,
        early_stopping_rounds=20,
        verbose_eval=False
    )
    mean_rmse = cv_results['test-rmse-mean'].min()
    if mean_rmse < best_rmse:
        best_rmse = mean_rmse
        best_params = cv_params

print("Best Parameters:", best_params)
print("Best RMSE from CV:", best_rmse)

# Train the model with the best parameters
final_model = xgb.train(best_params, dtrain, num_boost_round=200)

# Make predictions on the test set
y_pred = final_model.predict(dtest)

# Evaluate the model
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = test_mse ** 0.5

print(f"Test MSE with Best Parameters: {test_mse:.4f}")
print(f"Test RMSE with Best Parameters: {test_rmse:.4f}")

# Feature importance
importance = final_model.get_score(importance_type='weight')
importance_df = pd.DataFrame(list(importance.items()), columns=['Feature', 'Importance']).sort_values(by='Importance', ascending=False)

# Save feature importance
importance_csv_path = './FeatureImportance_ManualTuning.csv'
importance_df.to_csv(importance_csv_path, index=False)
print(f"Feature importance saved to {importance_csv_path}")

# Display top features
print("\nTop Features:")
print(importance_df)

Best Parameters: {'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'learning_rate': 0.05, 'max_depth': 7, 'subsample': 0.8, 'colsample_bytree': 0.8, 'alpha': 0, 'lambda': 1}
Best RMSE from CV: 0.2015352477010817
Test MSE with Best Parameters: 0.0369
Test RMSE with Best Parameters: 0.1920
Feature importance saved to ./FeatureImportance_ManualTuning.csv

Top Features:
       Feature  Importance
0          AMD      1636.0
1          RSI      1171.0
3     CBBTCUSD      1097.0
4         DJIA      1040.0
5  MACD_Signal       734.0
6    ADS_Index       622.0
2           RF       107.0
