In [1]:
import numpy as np
import pandas as pd
import pickle as pk
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from xgboost import XGBClassifier, DMatrix
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm

In [2]:
with open("X_for_dl_2000.pkl", "rb") as f:
    X = pk.load(f)
with open("y_for_dl_2000.pkl", "rb") as f:
    y = pk.load(f)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=30
)

In [4]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((63968, 20, 87, 1), (15992, 20, 87, 1), (63968,), (15992,))

In [5]:
X_train_flat = X_train.reshape((X_train.shape[0], -1))
X_test_flat = X_test.reshape((X_test.shape[0], -1))

In [6]:
X_train_flat.shape, X_test_flat.shape, y_train.shape, y_test.shape

((63968, 1740), (15992, 1740), (63968,), (15992,))

In [None]:
rfc = RandomForestClassifier(n_estimators=100, random_state=30)
rfc.fit(X_train_flat, y_train)

In [38]:
with open(r"model\rfc_model.pkl", "wb") as f:
    pk.dump(rfc, f)

In [14]:
y_pred_rfc = rfc.predict(X_test_flat)

In [15]:
print("Random Forest Classifier Report:\n", classification_report(y_test, y_pred_rfc))

Random Forest Classifier Report:
               precision    recall  f1-score   support

           0       0.78      0.70      0.74      7920
           1       0.73      0.81      0.77      8072

    accuracy                           0.75     15992
   macro avg       0.76      0.75      0.75     15992
weighted avg       0.76      0.75      0.75     15992



In [16]:
svc = SVC(kernel="rbf")
svc.fit(X_train_flat, y_train)

In [37]:
with open(r"model\svc_model.pkl", "wb") as f:
    pk.dump(svc, f)

In [17]:
y_pred_svc = svc.predict(X_test_flat)

In [18]:
print("Support Vector Classifier Report:\n", classification_report(y_test, y_pred_svc))

Support Vector Classifier Report:
               precision    recall  f1-score   support

           0       0.68      0.63      0.66      7920
           1       0.66      0.71      0.69      8072

    accuracy                           0.67     15992
   macro avg       0.67      0.67      0.67     15992
weighted avg       0.67      0.67      0.67     15992



In [19]:
gbc = GradientBoostingClassifier(n_estimators=100, random_state=30)
gbc.fit(X_train_flat, y_train)

In [36]:
with open(r"model\gbc_model.pkl", "wb") as f:
    pk.dump(gbc, f)

In [20]:
y_pred_gbc = gbc.predict(X_test_flat)

In [21]:
print("Gradient Boosting Classifier Report:\n", classification_report(y_test, y_pred_gbc))

Gradient Boosting Classifier Report:
               precision    recall  f1-score   support

           0       0.70      0.67      0.69      7920
           1       0.69      0.72      0.71      8072

    accuracy                           0.70     15992
   macro avg       0.70      0.70      0.70     15992
weighted avg       0.70      0.70      0.70     15992



In [7]:
k = list(range(1, 101))
best_vals = {}
best_k = 1
score = 0

# Create progress bar
pbar = tqdm(k, desc="Testing n_estimators", unit="model")

# Run the search with progress bar
for i in pbar:
    # Train model
    xgb = XGBClassifier(
        n_estimators=i, use_label_encoder=False, eval_metric="logloss", max_depth=10
    )
    xgb.fit(X_train_flat, y_train)

    # Make predictions
    y_pred_gbc = xgb.predict(X_test_flat)
    f1 = f1_score(y_test, y_pred_gbc)
    best_vals[i] = f1

    # Update best score
    if f1 > score:
        score = f1
        best_k = i

    # Update progress bar description with current best
    pbar.set_postfix({"Best F1": f"{score:.4f}", "Best k": best_k})

# Close progress bar
pbar.close()

print(f"\nFinal Results:")
print(f"Best k: {best_k}")
print(f"Best score: {score:.4f}")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode


Final Results:
Best k: 34
Best score: 0.7661





In [8]:
print("Best values:\n", best_vals)

Best values:
 {1: 0.6753840741175091, 2: 0.7018343264400047, 3: 0.7204838897289753, 4: 0.7284575551288764, 5: 0.7373513639543017, 6: 0.742048234882908, 7: 0.7455180442374855, 8: 0.7483705772811918, 9: 0.7518744551002615, 10: 0.7566751799396332, 11: 0.7554702048417132, 12: 0.7601162790697674, 13: 0.7609971526526818, 14: 0.7603815284401535, 15: 0.7625138073367828, 16: 0.7636152281313572, 17: 0.7635178394738373, 18: 0.7629310344827587, 19: 0.7638419396200024, 20: 0.7633098139833226, 21: 0.7633374147280042, 22: 0.7627168136424692, 23: 0.7633177570093458, 24: 0.763757448300035, 25: 0.7619883040935672, 26: 0.7608682932537593, 27: 0.7609356725146199, 28: 0.7631563576893873, 29: 0.7630348375029226, 30: 0.763180993621628, 31: 0.7650292397660818, 32: 0.7649640161488502, 33: 0.7652357000818809, 34: 0.7661266740745073, 35: 0.7653323970037453, 36: 0.7643580586616708, 37: 0.7645818692902319, 38: 0.764895424453688, 39: 0.7651470760576585, 40: 0.7648127527398465, 41: 0.7646368649536548, 42: 0.76421237

In [None]:
X_train_flat_gpu = DMatrix(X_train_flat, label=y_train, nthread=-1)

# Initialize XGBClassifier with GPU-optimized settings
xgb = XGBClassifier(n_estimators=50, eval_metric="logloss", max_depth=5, device="cuda")

# Define parameter grid for hyperparameter tuning
param_grid = {
    "n_estimators": [50, 100, 150],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "max_depth": [4, 5, 6],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "gamma": [0, 0.1, 0.3],
    "min_child_weight": [1, 5, 10],
}

# Calculate the total number of combinations for the progress bar
total_combinations = np.prod([len(v) for v in param_grid.values()])


# Custom class to wrap GridSearchCV with a progress bar
class GridSearchCVWithProgress(GridSearchCV):
    def fit(self, X, y=None, **fit_params):
        with tqdm(
            total=total_combinations, desc="Hyperparameter Search Progress"
        ) as pbar:

            def update(*args):
                pbar.update()

            self._fit = update
            return super().fit(X, y, **fit_params)


# Run GridSearchCV with the progress bar
grid_search = GridSearchCVWithProgress(
    estimator=xgb, param_grid=param_grid, cv=3, scoring="accuracy", n_jobs=-1
)
grid_search.fit(
    X_train_flat, y_train
)  # Pass data as normal; DMatrix not required here as it's handled within fit

# Output the best parameters and corresponding cross-validation accuracy
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

Hyperparameter Search Progress:   0%|          | 0/2916 [00:00<?, ?it/s]

In [9]:
xgb = XGBClassifier(n_estimators=34, use_label_encoder=False, eval_metric="logloss", max_depth=5)
xgb.fit(X_train_flat, y_train)

Parameters: { "use_label_encoder" } are not used.



In [10]:
with open("xgb_model.pkl", "wb") as f:
    pk.dump(xgb, f)

In [11]:
y_pred_xgb = xgb.predict(X_test_flat)

In [12]:
print("XGBoost Classifier Report:\n", classification_report(y_test, y_pred_xgb))

XGBoost Classifier Report:
               precision    recall  f1-score   support

           0       0.72      0.67      0.70      7920
           1       0.70      0.74      0.72      8072

    accuracy                           0.71     15992
   macro avg       0.71      0.71      0.71     15992
weighted avg       0.71      0.71      0.71     15992

