In [7]:
# ==============================================
# XGBoost Regression Hyperparameter Tuning for F1 (Refined)
# ==============================================
import numpy as np
import pandas as pd
import joblib
import json
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBRegressor

# ----------------------------------------------
# 1) Copy helper functions from training notebook
# ----------------------------------------------
LMH_MAP = {'Low': 3, 'Medium': 5, 'High': 8}

def normalize_inputs(df: pd.DataFrame) -> pd.DataFrame:
    """
    - Map 'Low/Medium/High' to numeric for key columns
    - Strip thousand separators and coerce numerics for budget/count/time cols
    """
    df = df.copy()

    # Columns that may appear as L/M/H but we need numeric for FE
    lmh_cols = ['Integration_Complexity', 'Requirement_Stability', 'Market_Volatility']
    for c in lmh_cols:
        if c in df.columns:
            if df[c].dtype == 'object':
                s = df[c].astype(str).str.strip()
                mapped = s.map(LMH_MAP)
                numeric = pd.to_numeric(s.str.replace(',', ''), errors='coerce')
                df[c] = mapped.fillna(numeric)

    # Columns that should be numeric (may have commas)
    numeric_cols = ['Project_Budget_USD', 'Team_Size', 'Estimated_Timeline_Months', 'Stakeholder_Count']
    for c in numeric_cols:
        if c in df.columns and df[c].dtype == 'object':
            df[c] = pd.to_numeric(df[c].astype(str).str.replace(',', ''), errors='coerce')

    return df

def feature_engineer(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if {'Project_Budget_USD','Team_Size'}.issubset(df.columns):
        df['Budget_Per_TeamMember'] = df['Project_Budget_USD'] / (df['Team_Size'] + 1e-5)
    if {'Estimated_Timeline_Months','Stakeholder_Count'}.issubset(df.columns):
        df['Schedule_Pressure_Index'] = df['Estimated_Timeline_Months'] / (df['Stakeholder_Count'] + 1e-5)
    if {'Integration_Complexity','Requirement_Stability','Market_Volatility'}.issubset(df.columns):
        df['Complexity_Index'] = (
            df['Integration_Complexity'].astype(float)
            + df['Requirement_Stability'].astype(float)
            + df['Market_Volatility'].astype(float)
        ) / 3.0
    return df

# ----------------------------------------------
# 2) Load and prepare data (same as training)
# ----------------------------------------------
print("üìÇ Loading data...")
df = pd.read_csv("../data/project_risk_raw_dataset.csv")

# Normalize and engineer features
df_clean = normalize_inputs(df)
df_clean = feature_engineer(df_clean)

# Separate features and target
X = df_clean.drop(columns=["Risk_Level"])
y_labels = df_clean["Risk_Level"]

# Map labels to regression scores and class ints
label_to_score = {'Low': 0.25, 'Medium': 0.50, 'High': 0.75, 'Critical': 1.00}
label_to_class = {'Low': 0, 'Medium': 1, 'High': 2, 'Critical': 3}

y_regression = y_labels.map(label_to_score)
y_classification = y_labels.map(label_to_class)

# Train/test split (stratify by original label)
X_train_raw, X_test_raw, y_train_reg, y_test_reg, y_train_class, y_test_class = train_test_split(
    X, y_regression, y_classification, test_size=0.2, random_state=42, stratify=y_labels
)

# One-hot encode (align test to train)
X_train = pd.get_dummies(X_train_raw, drop_first=True)
X_test = pd.get_dummies(X_test_raw, drop_first=True)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

print(f"‚úÖ Data prepared: {X_train.shape}")

# ----------------------------------------------
# 3) Load or compute global thresholds
# ----------------------------------------------
try:
    with open("../notebooks/models/global_thresholds.json", "r") as f:
        T = json.load(f)
    t25 = T["low_medium"]
    t50 = T["medium_high"]
    t75 = T["high_critical"]
    print(f"‚úÖ Loaded thresholds: {t25:.3f}, {t50:.3f}, {t75:.3f}")
except FileNotFoundError:
    print("‚ö†Ô∏è Thresholds file not found, using default quartiles")
    t25, t50, t75 = 0.25, 0.50, 0.75

# ----------------------------------------------
# 4) Helper: Convert regression scores ‚Üí classes
# ----------------------------------------------
def convert_to_class(arr_like):
    """Convert continuous array-like predictions to risk classes (0..3)."""
    arr = np.asarray(arr_like, dtype=float)
    # if 1D with shape (n,1) flatten
    if arr.ndim > 1:
        arr = arr.ravel()
    labels = np.zeros_like(arr, dtype=int)
    labels[arr >= t75] = 3
    labels[(arr >= t50) & (arr < t75)] = 2
    labels[(arr >= t25) & (arr < t50)] = 1
    labels[arr < t25] = 0
    return labels

# Custom F1 scorer for regression model (robust)
def f1_from_regression(y_true, y_pred):
    """
    y_true: could be floats (0.25,0.5,...) or ints (0..3) depending on how RandomizedSearchCV passes them.
    y_pred: regression predictions (floats).
    We'll convert both to class ints via thresholds before computing F1.
    """
    try:
        # ensure numpy arrays & flatten
        y_pred = np.asarray(y_pred)
        if y_pred.ndim > 1:
            y_pred = y_pred.ravel()
        # replace NaN preds with mid-value
        y_pred = np.nan_to_num(y_pred, nan=(t25 + t50) / 2.0)

        # Convert predictions to classes
        y_pred_class = convert_to_class(y_pred)

        # Convert truth to classes if needed
        y_true = np.asarray(y_true)
        # If truth values appear to be floats in label range, convert using thresholds
        if np.issubdtype(y_true.dtype, np.floating):
            y_true_class = convert_to_class(y_true)
        else:
            # try to coerce to int for safety
            y_true_class = y_true.astype(int)

        # compute F1 (weighted)
        return f1_score(y_true_class, y_pred_class, average="weighted")
    except Exception as e:
        # If anything goes wrong, return 0.0 (prevents CV from producing NaN)
        # optionally you can print(e) for debugging but it floods logs in CV
        return 0.0

# wrap as sklearn scorer
f1_scorer = make_scorer(f1_from_regression, greater_is_better=True)

# ----------------------------------------------
# 5) Hyperparameter search space
# ----------------------------------------------
param_grid = {
    "n_estimators": [200, 300, 500, 800, 1200],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "max_depth": [3, 4, 5, 6, 8, 10],
    "subsample": [0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.6, 0.7, 0.8, 0.9, 1.0],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0, 0.1, 0.3, 0.5, 1.0],
    "reg_alpha": [0, 0.01, 0.1, 1],
    "reg_lambda": [0.1, 1, 2, 5]
}

# ----------------------------------------------
# 6) Randomized search with F1 optimization
# ----------------------------------------------
xgb_model = XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    tree_method="hist"    # safe & faster; change to "gpu_hist" if you have GPU
)

search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=40,  # Increase to 80-100 for better results if you have time
    scoring=f1_scorer,
    cv=5,
    verbose=2,
    n_jobs=-1,
    random_state=42,
    return_train_score=False
)

print("\n‚è≥ Starting hyperparameter tuning (this may take 15-60 minutes depending on n_iter)...\n")
# IMPORTANT: train with regression targets (continuous); scorer converts truths inside
search.fit(X_train, y_train_reg)

# ----------------------------------------------
# 7) Results and save best model
# ----------------------------------------------
print("\n" + "="*60)
print("üèÜ BEST PARAMETERS FOUND:")
print("="*60)
for param, value in search.best_params_.items():
    print(f"  {param:20s}: {value}")
print("="*60)
# Note: search.best_score_ is F1 computed by our scorer (should be finite now)
print(f"\nüìä Best Cross-Validation F1 Score: {search.best_score_:.4f}")

# Get best model and evaluate on test set
best_model = search.best_estimator_

# Predict on test set (regression predictions)
y_pred_reg = best_model.predict(X_test)
# Convert to classes using thresholds
y_pred_class = convert_to_class(y_pred_reg)

# Calculate test set F1 using the integer test labels we saved earlier
test_f1 = f1_score(y_test_class, y_pred_class, average="weighted")
print(f"üìà Test Set F1 Score: {test_f1:.4f}")

# ----------------------------------------------
# 8) Save the tuned model and params
# ----------------------------------------------
MODEL_DIR = "../notebooks/models"

# Save as JSON (most stable)
best_model.save_model(f"{MODEL_DIR}/xgb_reg_finetuned.json")
print(f"\nüíæ Saved fine-tuned model: {MODEL_DIR}/xgb_reg_finetuned.json")

# Also save as pickle for compatibility
joblib.dump(best_model, f"{MODEL_DIR}/xgb_reg_finetuned.pkl")
print(f"üíæ Saved fine-tuned model (pkl): {MODEL_DIR}/xgb_reg_finetuned.pkl")

# Save best parameters
with open(f"{MODEL_DIR}/best_params.json", "w") as f:
    json.dump(search.best_params_, f, indent=2)
print(f"üíæ Saved best parameters: {MODEL_DIR}/best_params.json")

print("\n‚úÖ Hyperparameter tuning complete!")


üìÇ Loading data...
‚úÖ Data prepared: (3200, 3297)
‚ö†Ô∏è Thresholds file not found, using default quartiles

‚è≥ Starting hyperparameter tuning (this may take 15-60 minutes depending on n_iter)...

Fitting 5 folds for each of 40 candidates, totalling 200 fits

üèÜ BEST PARAMETERS FOUND:
  subsample           : 0.9
  reg_lambda          : 0.1
  reg_alpha           : 1
  n_estimators        : 300
  min_child_weight    : 7
  max_depth           : 5
  learning_rate       : 0.1
  gamma               : 0
  colsample_bytree    : 1.0

üìä Best Cross-Validation F1 Score: 0.6241
üìà Test Set F1 Score: 0.4482

üíæ Saved fine-tuned model: ../notebooks/models/xgb_reg_finetuned.json
üíæ Saved fine-tuned model (pkl): ../notebooks/models/xgb_reg_finetuned.pkl
üíæ Saved best parameters: ../notebooks/models/best_params.json

‚úÖ Hyperparameter tuning complete!
