In [1]:
# Global tools
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import GridSearchCV, train_test_split
import optuna


ModuleNotFoundError: No module named 'secrets'

In [68]:
df = pd.read_csv('./DataSelected.csv')
target_column = 'NVDA' 
y = df[target_column]
X = df.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


: 

XGB without Hyperparameter Tuning

In [69]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate': 0.05,  
    'max_depth': 3,     
    'subsample': 0.75,      
    'colsample_bytree': 0.75, 
    'alpha': 1.0,          
    'lambda': 2.0          
}

# Cross-validation step
cv_results = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    nfold=5,  
    metrics='rmse',
    early_stopping_rounds=20,  
    seed=42
)

best_num_boost_round = cv_results['test-rmse-mean'].idxmin() + 1
print(f"Best number of rounds from CV: {best_num_boost_round}")
print(f"Best RMSE from CV: {cv_results['test-rmse-mean'].min():.4f}")

evals = [(dtrain, 'train'), (dtest, 'test')]
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=best_num_boost_round,
    evals=evals,
    early_stopping_rounds=10,
    verbose_eval=10
)

y_train_pred = model.predict(dtrain)
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)

y_pred = model.predict(dtest)
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)

print(f"Train Mean Squared Error: {train_mse:.4f}")
print(f"Train RMSE: {train_rmse:.4f}")
print(f"Test Mean Squared Error: {test_mse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")

importance = model.get_score(importance_type='weight')
importance_df = pd.DataFrame(list(importance.items()), columns=['Feature', 'Importance']).sort_values(by='Importance', ascending=False)

print("\nTop Features:")
print(importance_df)


Best number of rounds from CV: 609
Best RMSE from CV: 0.2189
[0]	train-rmse:0.97029	test-rmse:0.91045
[10]	train-rmse:0.65343	test-rmse:0.64451
[20]	train-rmse:0.45975	test-rmse:0.48019
[30]	train-rmse:0.34517	test-rmse:0.38729
[40]	train-rmse:0.27420	test-rmse:0.32829
[50]	train-rmse:0.22907	test-rmse:0.29544
[60]	train-rmse:0.20048	test-rmse:0.27604
[70]	train-rmse:0.18107	test-rmse:0.26415
[80]	train-rmse:0.16508	test-rmse:0.25454
[90]	train-rmse:0.15054	test-rmse:0.24596
[100]	train-rmse:0.14244	test-rmse:0.24177
[110]	train-rmse:0.13505	test-rmse:0.23761
[120]	train-rmse:0.12948	test-rmse:0.23568
[130]	train-rmse:0.12466	test-rmse:0.23256
[140]	train-rmse:0.12076	test-rmse:0.22982
[150]	train-rmse:0.11755	test-rmse:0.22827
[160]	train-rmse:0.11405	test-rmse:0.22667
[170]	train-rmse:0.11162	test-rmse:0.22559
[180]	train-rmse:0.10976	test-rmse:0.22377
[190]	train-rmse:0.10742	test-rmse:0.22219
[200]	train-rmse:0.10615	test-rmse:0.22198
[210]	train-rmse:0.10484	test-rmse:0.22146
[220

: 

XGB with Hyperparameter Tuning

: 

In [71]:
def objective(trial):
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.05),
        'max_depth': trial.suggest_int('max_depth', 3, 6),
        'subsample': trial.suggest_float('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.9),
        'alpha': trial.suggest_float('alpha', 0.1, 2),
        'lambda': trial.suggest_float('lambda', 1, 4),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 15),
        'gamma': trial.suggest_float('gamma', 0, 5),
    }
    cv_results = xgb.cv(
        params=params,
        dtrain=dtrain,
        num_boost_round=1000,
        nfold=5,
        metrics='rmse',
        early_stopping_rounds=20,
        seed=42
    )
    return cv_results['test-rmse-mean'].min()

: 

In [72]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50) 

print(f"Best Parameters: {study.best_params}")
print(f"Best RMSE: {study.best_value:.4f}")

best_params = study.best_params
best_params['objective'] = 'reg:squarederror'
best_params['eval_metric'] = 'rmse'

evals = [(dtrain, 'train'), (dtest, 'test')]
model = xgb.train(
    params=best_params,
    dtrain=dtrain,
    num_boost_round=1000,
    evals=evals,
    early_stopping_rounds=50,
    verbose_eval=10
)

y_train_pred = model.predict(dtrain)
y_test_pred = model.predict(dtest)

[I 2024-12-12 04:58:31,810] A new study created in memory with name: no-name-7f791ffc-6bc4-4b50-b168-f3d7645b39b0
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.05),
[I 2024-12-12 04:58:32,646] Trial 0 finished with value: 0.3547032971599965 and parameters: {'learning_rate': 0.009820754637849045, 'max_depth': 3, 'subsample': 0.7990706205892896, 'colsample_bytree': 0.7155842614969763, 'alpha': 1.5115306522984433, 'lambda': 2.4748589195951727, 'min_child_weight': 5, 'gamma': 1.474869746726057}. Best is trial 0 with value: 0.3547032971599965.
[I 2024-12-12 04:58:32,937] Trial 1 finished with value: 0.3566320197718559 and parameters: {'learning_rate': 0.03183426514801631, 'max_depth': 3, 'subsample': 0.6492953010577511, 'colsample_bytree': 0.8709844037728705, 'alpha': 0.7913624620655691, 'lambda': 3.2584677241537854, 'min_child_weight': 15, 'gamma': 1.6193424501544378}. Best is trial 0 with value: 0.3547032971599965.
[I 2024-12-12 04:58:34,259] Trial 2 finished with 

Best Parameters: {'learning_rate': 0.027691358267528898, 'max_depth': 5, 'subsample': 0.6145994358584134, 'colsample_bytree': 0.6035486493276264, 'alpha': 0.11036140287043233, 'lambda': 1.4116871086661242, 'min_child_weight': 9, 'gamma': 0.049636928324008345}
Best RMSE: 0.2255
[0]	train-rmse:0.98896	test-rmse:0.92653
[10]	train-rmse:0.78519	test-rmse:0.74702
[20]	train-rmse:0.62918	test-rmse:0.61567
[30]	train-rmse:0.51207	test-rmse:0.51838
[40]	train-rmse:0.42536	test-rmse:0.44028
[50]	train-rmse:0.35071	test-rmse:0.37805
[60]	train-rmse:0.29697	test-rmse:0.33898
[70]	train-rmse:0.25319	test-rmse:0.30346
[80]	train-rmse:0.22213	test-rmse:0.27957
[90]	train-rmse:0.19831	test-rmse:0.26044
[100]	train-rmse:0.17976	test-rmse:0.24754
[110]	train-rmse:0.16407	test-rmse:0.23881
[120]	train-rmse:0.15377	test-rmse:0.23110
[130]	train-rmse:0.14606	test-rmse:0.22621
[140]	train-rmse:0.14037	test-rmse:0.22089
[150]	train-rmse:0.13465	test-rmse:0.21747
[160]	train-rmse:0.13096	test-rmse:0.21449
[1

: 

In [73]:
best_params = study.best_params
print("Best Parameters:", best_params)

Best Parameters: {'learning_rate': 0.027691358267528898, 'max_depth': 5, 'subsample': 0.6145994358584134, 'colsample_bytree': 0.6035486493276264, 'alpha': 0.11036140287043233, 'lambda': 1.4116871086661242, 'min_child_weight': 9, 'gamma': 0.049636928324008345}


: 

In [74]:
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)

test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)

print(f"Train RMSE: {train_rmse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")

importance = model.get_score(importance_type='weight')
importance_df = pd.DataFrame(list(importance.items()), columns=['Feature', 'Importance']).sort_values(by='Importance', ascending=False)

print("\nTop Features:")
print(importance_df)


Train RMSE: 0.1085
Test RMSE: 0.2044

Top Features:
       Feature  Importance
3     CBBTCUSD       217.0
4         DJIA       207.0
5  MACD_Signal       177.0
0          AMD       176.0
1          RSI       141.0
6    ADS_Index       130.0
2           RF        52.0


: 