In [15]:
# Global tools
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import GridSearchCV, train_test_split
import optuna


In [16]:
df = pd.read_csv('./DataSelected.csv')
target_column = 'NVDA' 
y = df[target_column]
X = df.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


XGB without Hyperparameter Tuning

In [17]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate': 0.05,  
    'max_depth': 3,     
    'subsample': 0.75,      
    'colsample_bytree': 0.75, 
    'alpha': 1.0,          
    'lambda': 2.0          
}

# Cross-validation step
cv_results = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    nfold=5,  
    metrics='rmse',
    early_stopping_rounds=20,  
    seed=42
)

best_num_boost_round = cv_results['test-rmse-mean'].idxmin() + 1
print(f"Best number of rounds from CV: {best_num_boost_round}")
print(f"Best RMSE from CV: {cv_results['test-rmse-mean'].min():.4f}")

evals = [(dtrain, 'train'), (dtest, 'test')]
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=best_num_boost_round,
    evals=evals,
    early_stopping_rounds=10,
    verbose_eval=10
)

y_train_pred = model.predict(dtrain)
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)

y_pred = model.predict(dtest)
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)

print(f"Train Mean Squared Error: {train_mse:.4f}")
print(f"Train RMSE: {train_rmse:.4f}")
print(f"Test Mean Squared Error: {test_mse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")

importance = model.get_score(importance_type='weight')
importance_df = pd.DataFrame(list(importance.items()), columns=['Feature', 'Importance']).sort_values(by='Importance', ascending=False)

print("\nTop Features:")
print(importance_df)


Best number of rounds from CV: 609
Best RMSE from CV: 0.2189
[0]	train-rmse:0.97029	test-rmse:0.91045
[10]	train-rmse:0.65343	test-rmse:0.64451
[20]	train-rmse:0.45975	test-rmse:0.48019
[30]	train-rmse:0.34517	test-rmse:0.38729
[40]	train-rmse:0.27420	test-rmse:0.32829
[50]	train-rmse:0.22907	test-rmse:0.29544
[60]	train-rmse:0.20048	test-rmse:0.27604
[70]	train-rmse:0.18107	test-rmse:0.26415
[80]	train-rmse:0.16508	test-rmse:0.25454
[90]	train-rmse:0.15054	test-rmse:0.24596
[100]	train-rmse:0.14244	test-rmse:0.24177
[110]	train-rmse:0.13505	test-rmse:0.23761
[120]	train-rmse:0.12948	test-rmse:0.23568
[130]	train-rmse:0.12466	test-rmse:0.23256
[140]	train-rmse:0.12076	test-rmse:0.22982
[150]	train-rmse:0.11755	test-rmse:0.22827
[160]	train-rmse:0.11405	test-rmse:0.22667
[170]	train-rmse:0.11162	test-rmse:0.22559
[180]	train-rmse:0.10976	test-rmse:0.22377
[190]	train-rmse:0.10742	test-rmse:0.22219
[200]	train-rmse:0.10615	test-rmse:0.22198
[210]	train-rmse:0.10484	test-rmse:0.22146
[220

XGB with Hyperparameter Tuning

In [18]:
def objective(trial):
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.05),
        'max_depth': trial.suggest_int('max_depth', 3, 6),
        'subsample': trial.suggest_float('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.9),
        'alpha': trial.suggest_float('alpha', 0.1, 2),
        'lambda': trial.suggest_float('lambda', 1, 4),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 15),
        'gamma': trial.suggest_float('gamma', 0, 5),
    }
    cv_results = xgb.cv(
        params=params,
        dtrain=dtrain,
        num_boost_round=1000,
        nfold=5,
        metrics='rmse',
        early_stopping_rounds=20,
        seed=42
    )
    return cv_results['test-rmse-mean'].min()

In [19]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50) 

print(f"Best Parameters: {study.best_params}")
print(f"Best RMSE: {study.best_value:.4f}")

best_params = study.best_params
best_params['objective'] = 'reg:squarederror'
best_params['eval_metric'] = 'rmse'

evals = [(dtrain, 'train'), (dtest, 'test')]
model = xgb.train(
    params=best_params,
    dtrain=dtrain,
    num_boost_round=1000,
    evals=evals,
    early_stopping_rounds=50,
    verbose_eval=10
)

y_train_pred = model.predict(dtrain)
y_test_pred = model.predict(dtest)

[I 2024-12-12 06:35:08,607] A new study created in memory with name: no-name-a406cde0-e80e-4724-a6a8-b5e4b0af00ad
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.05),
[I 2024-12-12 06:35:09,107] Trial 0 finished with value: 0.3243035368928985 and parameters: {'learning_rate': 0.024233303834983774, 'max_depth': 4, 'subsample': 0.804949857244417, 'colsample_bytree': 0.7836206033745312, 'alpha': 1.0218308713956725, 'lambda': 2.905443756450847, 'min_child_weight': 10, 'gamma': 0.9109670737625897}. Best is trial 0 with value: 0.3243035368928985.
[I 2024-12-12 06:35:10,538] Trial 1 finished with value: 0.30942950738569325 and parameters: {'learning_rate': 0.005856685147426442, 'max_depth': 4, 'subsample': 0.684038993617935, 'colsample_bytree': 0.7559272554802121, 'alpha': 1.5195858962181608, 'lambda': 1.4700002907882253, 'min_child_weight': 5, 'gamma': 0.4552817249745761}. Best is trial 1 with value: 0.30942950738569325.
[I 2024-12-12 06:35:12,014] Trial 2 finished with

Best Parameters: {'learning_rate': 0.03687261090804782, 'max_depth': 5, 'subsample': 0.7595272156981417, 'colsample_bytree': 0.7234501182128238, 'alpha': 1.5189502614295103, 'lambda': 2.45821167946695, 'min_child_weight': 7, 'gamma': 0.0011468131332102866}
Best RMSE: 0.2362
[0]	train-rmse:0.98252	test-rmse:0.92120
[10]	train-rmse:0.73355	test-rmse:0.70956
[20]	train-rmse:0.56014	test-rmse:0.56388
[30]	train-rmse:0.44100	test-rmse:0.46350
[40]	train-rmse:0.35648	test-rmse:0.39418
[50]	train-rmse:0.29712	test-rmse:0.35044
[60]	train-rmse:0.25574	test-rmse:0.32163
[70]	train-rmse:0.22717	test-rmse:0.30278
[80]	train-rmse:0.20757	test-rmse:0.29018
[90]	train-rmse:0.19229	test-rmse:0.28050
[100]	train-rmse:0.18048	test-rmse:0.27254
[110]	train-rmse:0.17075	test-rmse:0.26703
[120]	train-rmse:0.16395	test-rmse:0.26376
[130]	train-rmse:0.15872	test-rmse:0.26093
[140]	train-rmse:0.15339	test-rmse:0.25751
[150]	train-rmse:0.14916	test-rmse:0.25460
[160]	train-rmse:0.14502	test-rmse:0.25206
[170]

In [20]:
best_params = study.best_params
print("Best Parameters:", best_params)

Best Parameters: {'learning_rate': 0.03687261090804782, 'max_depth': 5, 'subsample': 0.7595272156981417, 'colsample_bytree': 0.7234501182128238, 'alpha': 1.5189502614295103, 'lambda': 2.45821167946695, 'min_child_weight': 7, 'gamma': 0.0011468131332102866}


In [21]:
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)

test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)

print(f"Train RMSE: {train_rmse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")

importance = model.get_score(importance_type='weight')
importance_df = pd.DataFrame(list(importance.items()), columns=['Feature', 'Importance']).sort_values(by='Importance', ascending=False)

print("\nTop Features:")
print(importance_df)


Train RMSE: 0.1096
Test RMSE: 0.2353

Top Features:
       Feature  Importance
3     CBBTCUSD       393.0
0          AMD       358.0
1          RSI       332.0
6    ADS_Index       310.0
4         DJIA       297.0
5  MACD_Signal       271.0
2           RF       102.0
