In [1]:
# Global tools
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import GridSearchCV, train_test_split
import optuna


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('./DataSelected.csv')
target_column = 'NVDA' 
y = df[target_column]
X = df.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


XGB without Hyperparameter Tuning

In [3]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate': 0.05,  
    'max_depth': 3,     
    'subsample': 0.75,      
    'colsample_bytree': 0.75, 
    'alpha': 1.0,          
    'lambda': 2.0          
}

# Cross-validation step
cv_results = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    nfold=5,  
    metrics='rmse',
    early_stopping_rounds=20,  
    seed=42
)

best_num_boost_round = cv_results['test-rmse-mean'].idxmin() + 1
print(f"Best number of rounds from CV: {best_num_boost_round}")
print(f"Best RMSE from CV: {cv_results['test-rmse-mean'].min():.4f}")

evals = [(dtrain, 'train'), (dtest, 'test')]
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=best_num_boost_round,
    evals=evals,
    early_stopping_rounds=10,
    verbose_eval=10
)

y_train_pred = model.predict(dtrain)
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)

y_pred = model.predict(dtest)
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)

print(f"Train Mean Squared Error: {train_mse:.4f}")
print(f"Train RMSE: {train_rmse:.4f}")
print(f"Test Mean Squared Error: {test_mse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")

importance = model.get_score(importance_type='weight')
importance_df = pd.DataFrame(list(importance.items()), columns=['Feature', 'Importance']).sort_values(by='Importance', ascending=False)

print("\nTop Features:")
print(importance_df)


Best number of rounds from CV: 609
Best RMSE from CV: 0.2189
[0]	train-rmse:0.97029	test-rmse:0.91045
[10]	train-rmse:0.65343	test-rmse:0.64451
[20]	train-rmse:0.45975	test-rmse:0.48019
[30]	train-rmse:0.34517	test-rmse:0.38729
[40]	train-rmse:0.27420	test-rmse:0.32829
[50]	train-rmse:0.22907	test-rmse:0.29544
[60]	train-rmse:0.20048	test-rmse:0.27604
[70]	train-rmse:0.18107	test-rmse:0.26415
[80]	train-rmse:0.16508	test-rmse:0.25454
[90]	train-rmse:0.15054	test-rmse:0.24596
[100]	train-rmse:0.14244	test-rmse:0.24177
[110]	train-rmse:0.13505	test-rmse:0.23761
[120]	train-rmse:0.12948	test-rmse:0.23568
[130]	train-rmse:0.12466	test-rmse:0.23256
[140]	train-rmse:0.12076	test-rmse:0.22982
[150]	train-rmse:0.11755	test-rmse:0.22827
[160]	train-rmse:0.11405	test-rmse:0.22667
[170]	train-rmse:0.11162	test-rmse:0.22559
[180]	train-rmse:0.10976	test-rmse:0.22377
[190]	train-rmse:0.10742	test-rmse:0.22219
[200]	train-rmse:0.10615	test-rmse:0.22198
[210]	train-rmse:0.10484	test-rmse:0.22146
[220

XGB with Hyperparameter Tuning

In [4]:
def objective(trial):
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.05),
        'max_depth': trial.suggest_int('max_depth', 3, 6),
        'subsample': trial.suggest_float('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.9),
        'alpha': trial.suggest_float('alpha', 0.1, 2),
        'lambda': trial.suggest_float('lambda', 1, 4),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 15),
        'gamma': trial.suggest_float('gamma', 0, 5),
    }
    cv_results = xgb.cv(
        params=params,
        dtrain=dtrain,
        num_boost_round=1000,
        nfold=5,
        metrics='rmse',
        early_stopping_rounds=20,
        seed=42
    )
    return cv_results['test-rmse-mean'].min()

In [5]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50) 

print(f"Best Parameters: {study.best_params}")
print(f"Best RMSE: {study.best_value:.4f}")

best_params = study.best_params
best_params['objective'] = 'reg:squarederror'
best_params['eval_metric'] = 'rmse'

evals = [(dtrain, 'train'), (dtest, 'test')]
model = xgb.train(
    params=best_params,
    dtrain=dtrain,
    num_boost_round=1000,
    evals=evals,
    early_stopping_rounds=50,
    verbose_eval=10
)

y_train_pred = model.predict(dtrain)
y_test_pred = model.predict(dtest)

[I 2024-12-12 06:10:14,168] A new study created in memory with name: no-name-a98879fc-a308-4c87-9ed2-ade19df521ae
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.05),
[I 2024-12-12 06:10:14,623] Trial 0 finished with value: 0.29622000469186105 and parameters: {'learning_rate': 0.027951337740373578, 'max_depth': 5, 'subsample': 0.7054303180383934, 'colsample_bytree': 0.7329496261687154, 'alpha': 1.2357931449743182, 'lambda': 3.250336688655829, 'min_child_weight': 13, 'gamma': 0.2991151387178026}. Best is trial 0 with value: 0.29622000469186105.
[I 2024-12-12 06:10:14,758] Trial 1 finished with value: 0.4077180939093436 and parameters: {'learning_rate': 0.046639102689162854, 'max_depth': 5, 'subsample': 0.763092192872227, 'colsample_bytree': 0.6394869382048595, 'alpha': 1.1515737007580393, 'lambda': 2.9587684803232515, 'min_child_weight': 12, 'gamma': 3.6451018049850132}. Best is trial 0 with value: 0.29622000469186105.
[I 2024-12-12 06:10:15,011] Trial 2 finished w

Best Parameters: {'learning_rate': 0.013545237254777324, 'max_depth': 6, 'subsample': 0.8142037203721846, 'colsample_bytree': 0.8485517312175224, 'alpha': 1.4032197505570951, 'lambda': 3.110205625018549, 'min_child_weight': 7, 'gamma': 0.004201071053323878}
Best RMSE: 0.2393
[0]	train-rmse:1.00158	test-rmse:0.93962
[10]	train-rmse:0.89845	test-rmse:0.84992
[20]	train-rmse:0.80693	test-rmse:0.77308
[30]	train-rmse:0.72792	test-rmse:0.70428
[40]	train-rmse:0.65871	test-rmse:0.64563
[50]	train-rmse:0.59622	test-rmse:0.59352
[60]	train-rmse:0.54275	test-rmse:0.54817
[70]	train-rmse:0.49496	test-rmse:0.50946
[80]	train-rmse:0.45274	test-rmse:0.47517
[90]	train-rmse:0.41651	test-rmse:0.44508
[100]	train-rmse:0.38409	test-rmse:0.41824
[110]	train-rmse:0.35641	test-rmse:0.39633
[120]	train-rmse:0.33200	test-rmse:0.37804
[130]	train-rmse:0.30964	test-rmse:0.36123
[140]	train-rmse:0.29019	test-rmse:0.34720
[150]	train-rmse:0.27345	test-rmse:0.33475
[160]	train-rmse:0.25856	test-rmse:0.32433
[170

In [6]:
best_params = study.best_params
print("Best Parameters:", best_params)

Best Parameters: {'learning_rate': 0.013545237254777324, 'max_depth': 6, 'subsample': 0.8142037203721846, 'colsample_bytree': 0.8485517312175224, 'alpha': 1.4032197505570951, 'lambda': 3.110205625018549, 'min_child_weight': 7, 'gamma': 0.004201071053323878}


In [7]:
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)

test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)

print(f"Train RMSE: {train_rmse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")

importance = model.get_score(importance_type='weight')
importance_df = pd.DataFrame(list(importance.items()), columns=['Feature', 'Importance']).sort_values(by='Importance', ascending=False)

print("\nTop Features:")
print(importance_df)


Train RMSE: 0.1191
Test RMSE: 0.2425

Top Features:
       Feature  Importance
3     CBBTCUSD       681.0
0          AMD       632.0
1          RSI       538.0
4         DJIA       510.0
6    ADS_Index       463.0
5  MACD_Signal       406.0
2           RF       228.0
