In [3]:
# ==============================================================================
# 1. SETUP AND IMPORTS
# ==============================================================================
import pandas as pd
import numpy as np
import os
import joblib
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
import warnings

# NOTE: this filter carries no `module` or `message` pattern, so it hides
# every UserWarning raised in this process, not only those from XGBoost.
warnings.filterwarnings('ignore', category=UserWarning) # Intended to quiet XGBoost; applies to all UserWarnings
# Progress marker so the notebook output shows the setup section completed.
print("Libraries imported.")

# ==============================================================================
# 2. LOAD THE PROCESSED LUNG CANCER DATA
# ==============================================================================
# Load the preprocessed training features and labels written by the
# preprocessing notebook. Only the two loads sit inside the `try`, so an
# unrelated failure (e.g. in the shape report) is not mistaken for a missing
# file.
CANCER_PROCESSED_DIR = os.path.join("..", "data", "processed", "lung_cancer")
try:
    X_train = joblib.load(os.path.join(CANCER_PROCESSED_DIR, "X_train.joblib"))
    y_train = joblib.load(os.path.join(CANCER_PROCESSED_DIR, "y_train.joblib"))
except FileNotFoundError as err:
    print("ERROR: Processed data not found. Please run the '05_lung_cancer_eda_and_preprocessing.ipynb' notebook first.")
    # Stop here with a non-zero status. The interactive `exit()` helper is not
    # meant for programs: it reports success (status 0) and, inside a notebook
    # kernel, does not reliably halt the rest of the cell.
    raise SystemExit(1) from err
else:
    print("Lung cancer training data loaded successfully.")
    print(f"Training data shape: {X_train.shape}")

# ==============================================================================
# 3. CONFIGURE THE HYPERPARAMETER SEARCH
# ==============================================================================
# Weight the positive class by the negative/positive ratio to offset class
# imbalance. The class counts are computed once and read with `.get(..., 0)`
# so a split that is missing either class falls back to a neutral weight
# instead of raising KeyError (missing negatives) or dividing by zero.
# Assumes labels are encoded as 0 (negative) and 1 (positive) - TODO confirm
# against the preprocessing notebook.
class_counts = y_train.value_counts()
n_negative = class_counts.get(0, 0)
n_positive = class_counts.get(1, 0)

if n_negative > 0 and n_positive > 0:
    scale_pos_weight = n_negative / n_positive
else:
    scale_pos_weight = 1 # Neutral default when a class is absent

print(f"\nUsing scale_pos_weight: {scale_pos_weight:.2f}")

# Hyperparameter distributions sampled by the randomized search.
# scipy conventions: randint(low, high) draws integers in [low, high);
# uniform(loc, scale) draws floats in [loc, loc + scale].
param_dist = {
    'n_estimators': randint(100, 1000),     # 100 .. 999
    'max_depth': randint(3, 10),            # 3 .. 9
    'learning_rate': uniform(0.01, 0.3),    # 0.01 .. 0.31
    'subsample': uniform(0.6, 0.4),         # 0.6 .. 1.0
    'colsample_bytree': uniform(0.6, 0.4),  # 0.6 .. 1.0
    'gamma': uniform(0, 0.5)                # 0.0 .. 0.5
}

# Base XGBoost model. It is pinned to a single thread because the search below
# already runs one fit per CPU core; letting every fit also start a full
# XGBoost thread pool oversubscribes the machine and slows the whole search.
# The best estimator returned by the search keeps n_jobs=1; call
# `set_params(n_jobs=...)` on it later if multi-threaded prediction is wanted.
xgb = XGBClassifier(
    eval_metric='logloss',
    random_state=42,
    scale_pos_weight=scale_pos_weight,
    n_jobs=1,
)

# Randomized search with cross-validation over the distributions above.
rand_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=100,         # Number of parameter settings that are sampled.
    cv=5,               # 5-fold cross-validation.
    scoring='roc_auc',  # The metric to optimize.
    n_jobs=-1,          # Parallelise across fits using all available CPU cores.
    random_state=42,    # Makes the sampled candidates reproducible.
    verbose=2           # Show detailed progress.
)

# ==============================================================================
# 4. RUN THE SEARCH
# ==============================================================================
# Announce the run, fit every sampled candidate on the training split, then
# report completion.
print("\nStarting hyperparameter search for XGBoost...")
rand_search.fit(X=X_train, y=y_train)
print("\nSearch complete.")

# ==============================================================================
# 5. DISPLAY RESULTS AND SAVE THE BEST MODEL
# ==============================================================================
# Report the winning score and parameter set between two 50-dash rules.
rule = "-" * 50
print(
    rule,
    f"Best cross-validated AUC Score from search: {rand_search.best_score_:.4f}",
    "Best parameters found:",
    rand_search.best_params_,
    rule,
    sep="\n",
)

# Estimator the search exposes for the best-scoring candidate.
best_xgb_tuned = rand_search.best_estimator_

# Make sure the output directory exists, then build the model path inside it.
CANCER_MODELS_DIR = os.path.join("..", "models", "lung_cancer")
os.makedirs(CANCER_MODELS_DIR, exist_ok=True)
MODEL_PATH = os.path.join(CANCER_MODELS_DIR, "best_lung_cancer_classifier.joblib")

# Persist the tuned model with joblib and confirm where it was written.
joblib.dump(best_xgb_tuned, MODEL_PATH)
print(f"\nTUNED best lung cancer model saved successfully to: {MODEL_PATH}")

Libraries imported.
Lung cancer training data loaded successfully.
Training data shape: (411, 10)

Using scale_pos_weight: 1.76

Starting hyperparameter search for XGBoost...
Fitting 5 folds for each of 100 candidates, totalling 500 fits

Search complete.
--------------------------------------------------
Best cross-validated AUC Score from search: 0.4742
Best parameters found:
{'colsample_bytree': np.float64(0.9570235993959911), 'gamma': np.float64(0.26967112095782536), 'learning_rate': np.float64(0.25223204654921877), 'max_depth': 6, 'n_estimators': 572, 'subsample': np.float64(0.7272013899887455)}
--------------------------------------------------

TUNED best lung cancer model saved successfully to: ..\models\lung_cancer\best_lung_cancer_classifier.joblib
