In [34]:
import pandas as pd

df = pd.read_csv("./Data/NVDA_cleaned.csv")

# features = ['Close', 'High', 'Low', 'Open', 'Volume', 'Price_200EMA_diff', 'MA_CO_signal_-1', 'MA_CO_signal_0', 'MA_CO_signal_1', 
#             'Close_1', 'High_1', 'Low_1', 'Open_1', 'Volume_1', 'Price_200EMA_diff_1', 
#             'Close_2', 'High_2', 'Low_2', 'Open_2', 'Volume_2', 'Price_200EMA_diff_2',
#             'Close_3', 'High_3', 'Low_3', 'Open_3', 'Volume_3', 'Price_200EMA_diff_3']
features = ['Close', 'High', 'Low', 'Open', 'Volume', 'Price_200EMA_diff', 'MA_CO_signal_-1', 'MA_CO_signal_0', 'MA_CO_signal_1']

df_features = df[features]
df_target = df['T_reg']

print(df_features.shape)
print(df_target.shape)

(1557, 9)
(1557,)


In [35]:
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import MinMaxScaler

X = df_features
y = df_target

train_size = int(len(df) * 0.8)

X_train = X.iloc[:train_size]
y_train = y.iloc[:train_size]

X_val = X.iloc[train_size:]
y_val = y.iloc[train_size:]

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.fit_transform(X_val)


model = XGBRegressor(
    # use_label_encoder = False,
    # eval_metric = 'logloss',
    random_state = 42
)

param_grid = {
    'n_estimators': [100, 150, 200, 300, 500],
    'learning_rate': [0.01, 0.1, 0.001],
    'max_depth': [2, 3, 5, 7],
    'lambda': [0.01, 0.1, 1, 2, 5, 10]
}

tscv = TimeSeriesSplit(n_splits=5)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error', 
    cv=tscv,
    n_jobs=-1,            
    verbose=1
)


grid_search.fit(X_train_scaled, y_train)
print("Best parameters from GridSearchCV:", grid_search.best_params_)
print("Best CV Score (nmse):", grid_search.best_score_)


# model.fit(X_train_scaled, y_train)


Fitting 5 folds for each of 360 candidates, totalling 1800 fits
Best parameters from GridSearchCV: {'lambda': 10, 'learning_rate': 0.001, 'max_depth': 2, 'n_estimators': 100}
Best CV Score (nmse): -0.00013874746351350733


In [36]:
from sklearn.metrics import mean_squared_error

best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled,y_train)


y_val_pred = best_model.predict(X_val_scaled)

mse = mean_squared_error(y_val, y_val_pred)

print("Validation MSE:", mse)

Validation MSE: 9.84137694558986e-05


In [37]:
importances = best_model.feature_importances_
feature_importance_df = pd.DataFrame({
    'feature': features,
    'importance': importances
}).sort_values('importance', ascending=False)

print(feature_importance_df)

             feature  importance
2                Low    0.651564
5  Price_200EMA_diff    0.250363
3               Open    0.098073
0              Close    0.000000
1               High    0.000000
4             Volume    0.000000
6    MA_CO_signal_-1    0.000000
7     MA_CO_signal_0    0.000000
8     MA_CO_signal_1    0.000000


In [38]:
best_model.save_model('xgb_regressor.model')

