In [16]:
import pandas as pd

df = pd.read_csv("NVDA_cleaned.csv")

def add_future_price(df, shift_days=1):
    df['Next Close'] = df['Close'].shift(-shift_days)
    df['T_reg'] = (df['Next Close'] - df['Close']) / df['Close']
    df['T_cla'] = df['T_reg'].apply(lambda x: 1 if x > 0 else 0)
    return df

# def add_hist_price(df, shift_days=-1):
#     df['Open 1'] = df['Open'].shift(-shift_days)
#     df['High 1'] = df['High'].shift(-shift_days)
#     df['Close 1'] = df['Close'].shift(-shift_days)
#     df['Low 1'] = df['Low'].shift(-shift_days)
#     df['Vol 1'] = df['Volume'].shift(-shift_days)
#     df['Low 1'] = df['Low'].shift(-shift_days)
#     df['Price_200EMA_diff 1'] = df['Price_200EMA_diff'].shift(-shift_days)
    return df


df = add_future_price(df, 1)
# df = add_hist_price(df, -1)
df = df.dropna()

# features = ['Close', 'High', 'Low', 'Open', 'Volume', 'Price_200EMA_diff', 'MA_CO_signal_-1', 'MA_CO_signal_0', 'MA_CO_signal_1', 
#             'Open 1', 'High 1', 'Low 1', 'Open 1', 'Vol 1', 'Price_200EMA_diff 1']
features = ['Close', 'High', 'Low', 'Open', 'Volume', 'Price_200EMA_diff', 'MA_CO_signal_-1', 'MA_CO_signal_0', 'MA_CO_signal_1',]
df_features = df[features]
df_target = df['T_cla']

print(df_features.shape)
print(df_target.shape)

(1548, 9)
(1548,)


In [17]:
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


X = df_features
y = df_target

train_size = int(len(df) * 0.8)

X_train = X.iloc[:train_size]
y_train = y.iloc[:train_size]

X_val = X.iloc[train_size:]
y_val = y.iloc[train_size:]

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.fit_transform(X_val)


model = XGBClassifier(
    # use_label_encoder = False,
    eval_metric = 'logloss',
    random_state = 42
)

param_grid = {
    'n_estimators': [2, 5, 10, 50, 100],
    'learning_rate': [0.01, 0.1, 0.001],
    'max_depth': [2, 3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 10, 100]
}

tscv = TimeSeriesSplit(n_splits=5)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy', 
    cv=tscv,
    n_jobs=-1,            
    verbose=1
)


grid_search.fit(X_train_scaled, y_train)
print("Best parameters from GridSearchCV:", grid_search.best_params_)
print("Best CV Score (acc):", grid_search.best_score_)


# model.fit(X_train_scaled, y_train)


Fitting 5 folds for each of 6480 candidates, totalling 32400 fits


KeyboardInterrupt: 

In [15]:
# y_val_pred = model.predict(X_val_scaled)
y_val_pred = grid_search.best_estimator_.predict(X_val_scaled)
acc = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", acc)

print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))

Validation Accuracy: 0.5258064516129032
Confusion Matrix:
[[  3 145]
 [  2 160]]


In [11]:
importances = grid_search.best_estimator_.feature_importances_
feature_importance_df = pd.DataFrame({
    'feature': features,
    'importance': importances
}).sort_values('importance', ascending=False)

print(feature_importance_df)

             feature  importance
7     MA_CO_signal_0    0.233745
8     MA_CO_signal_1    0.165614
1               High    0.129517
4             Volume    0.103284
2                Low    0.097064
5  Price_200EMA_diff    0.082233
3               Open    0.079908
0              Close    0.069363
6    MA_CO_signal_-1    0.039273
