In [1]:
import time
import os
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, matthews_corrcoef, roc_curve, confusion_matrix
import numpy as np
import pandas as pd
import optuna

import config

In [4]:
def load_numpy_data(csv_path, feature_names: list):
    """
    Load a CSV file and return ``(X, y)`` as NumPy arrays.

    Parameters
    ----------
    csv_path : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.
    feature_names : list (or tuple) of str
        Names of the columns to use as features. Columns are selected by
        name, in the order given, so every other column (for example a
        leading FASTA-header column) is ignored.

    Returns
    -------
    X : numpy.ndarray
        The selected feature columns, one row per CSV row.
    y : numpy.ndarray
        The values of the LAST column of the file, which is taken as the label.

    Raises
    ------
    KeyError
        If a requested feature name is not a column of the CSV.
    """
    df = pd.read_csv(csv_path)

    # pandas treats a tuple key as ONE (MultiIndex-style) column label, so a
    # tuple of names would raise KeyError; convert it to a list of labels.
    if isinstance(feature_names, tuple):
        feature_names = list(feature_names)

    # .to_numpy() is the recommended accessor and always yields an ndarray.
    X = df[feature_names].to_numpy()
    y = df.iloc[:, -1].to_numpy()
    return X, y

### Load the training data

In [6]:
print("Loading the data...")
X, y = load_numpy_data(os.path.join(config.data_dir, 'train_data.csv'), config.feature_names)

# Stratified train/validation split.
# The training ratio must be passed as `train_size`; passing it as `test_size`
# would hand that fraction of the rows to the validation set instead.
# NOTE(review): assumes config.train_ratio is the fraction of rows meant for
# training, as its name says -- confirm against config.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=config.train_ratio,
    stratify=y, random_state=config.seed
    )

Loading the data...


### Train the XGBoost classifier and optimize the hyperparameters with Optuna

In [None]:
# Objective function for Optuna
def objective(trial):
    """
    Score one Optuna trial.

    Draws a set of XGBoost hyperparameters from ``trial``, fits an
    ``XGBClassifier`` on ``X_train`` / ``y_train`` and returns the ROC AUC
    obtained on ``X_val`` / ``y_val`` (higher is better).
    """
    # Hyperparameters explored by the study (sampled in this order).
    search_space = dict(
        n_estimators=trial.suggest_int("n_estimators", 1500, 6000, step=250),
        max_depth=trial.suggest_int("max_depth", 3, 30),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        subsample=trial.suggest_float("subsample", 0.5, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.7, 1.0),
        reg_alpha=trial.suggest_float("reg_alpha", 0.0, 1.0),
        reg_lambda=trial.suggest_float("reg_lambda", 0.0, 1.0),
    )

    # Settings that are identical for every trial.
    fixed_settings = dict(
        random_state=config.seed,
        use_label_encoder=False,
        eval_metric="logloss",
        n_jobs=-1,
    )

    classifier = XGBClassifier(**search_space, **fixed_settings)
    classifier.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the score for the ROC AUC.
    val_scores = classifier.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Create Optuna study.
# The sampler is seeded so that the sequence of suggested hyperparameters is
# repeatable from one run of the notebook to the next.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=config.seed),
)
number_of_trials = 50
study.optimize(objective, n_trials=number_of_trials)

# Print best hyperparameters
print("Best hyperparameters:", study.best_params)
print("Best AUC ROC:", study.best_value)

# Train the best model with the best hyperparameters.
# study.best_params only contains the values drawn through trial.suggest_*;
# the fixed settings used inside `objective` (seed, eval metric, threads) are
# not part of it and must be re-applied, otherwise the refit model is not the
# configuration that produced study.best_value.
best_params = study.best_params
fixed_model_params = {
    "random_state": config.seed,
    "use_label_encoder": False,
    "eval_metric": "logloss",
    "n_jobs": -1,
}
best_model = XGBClassifier(**best_params, **fixed_model_params)
best_model.fit(X_train, y_train)

# Predict probabilities and labels for the validation set
val_preds_proba = best_model.predict_proba(X_val)[:, 1]
val_preds_labels = (val_preds_proba > 0.5).astype(int)

# Compute metrics.
# NOTE: the validation set was also used to select the hyperparameters, so
# these numbers are optimistic.
auc = roc_auc_score(y_val, val_preds_proba)
acc = accuracy_score(y_val, val_preds_labels)
mcc = matthews_corrcoef(y_val, val_preds_labels)

# Print metrics
print("Best model metrics on the validation set:")
print(f"  AUC ROC: {auc:.4f}")
print(f"  Accuracy: {acc:.4f}")
print(f"  MCC: {mcc:.4f}")

### Load the test data

In [7]:
# Read the test CSV, selecting the feature columns listed in config.
# (Re-importing config is a no-op when an earlier cell already imported it.)
import config
X_test, y_test = load_numpy_data(
    os.path.join(config.data_dir, 'test_data.csv'),
    config.feature_names,
)

### Check performance on the test set

In [None]:
# Score the test set: column 1 of predict_proba, thresholded at 0.5 for labels.
test_preds_proba = best_model.predict_proba(X_test)[:, 1]
test_preds_labels = (test_preds_proba > 0.5).astype(int)

# ROC AUC uses the probabilities; accuracy and MCC use the hard labels.
auc = roc_auc_score(y_test, test_preds_proba)
acc = accuracy_score(y_test, test_preds_labels)
mcc = matthews_corrcoef(y_test, test_preds_labels)

# Report all metrics in a single print call.
print(
    "Best Model Metrics on the Test Set:\n"
    f"  AUC ROC: {auc:.4f}\n"
    f"  Accuracy: {acc:.4f}\n"
    f"  MCC: {mcc:.4f}"
)