
# 🧬 Gout Disease Prediction (Google Colab Notebook)

This Colab notebook trains and evaluates ML models to predict **gout** from a structured dataset (`dataset_gout_balanced.csv`).  
It includes data loading (file upload or Google Drive), EDA, preprocessing, model training (Logistic Regression, Random Forest, XGBoost), evaluation, feature importance, and saving the best model.

> **Expected target column:** `gout` (0/1).  
> **Expected input:** numeric features, with any categorical variables already encoded as numeric codes (as in NHANES-style data). Non-numeric columns are ignored.

---


In [None]:

# ============================================================
# 📦 Setup: Install packages (only needed once per runtime)
# ============================================================
!pip -q install xgboost==1.7.6


In [None]:

# ============================================================
# 📚 Imports & Config
# ============================================================
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, average_precision_score,
    classification_report, confusion_matrix, RocCurveDisplay, PrecisionRecallDisplay
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import joblib

# Widen pandas' console/notebook rendering so wide tables are not truncated.
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)
# Single seed reused for the train/test split, every model, and the CV splitter.
RANDOM_STATE = 42


In [None]:

# ============================================================
# 📂 Data Loading
# Options:
#  1) Upload from your computer (choose a file dialog)
#  2) Mount Google Drive and point to a path
#  3) Use a file already in the Colab runtime (same folder)
# ============================================================

USE_UPLOAD = True   #@param {type:"boolean"}
USE_DRIVE  = False  #@param {type:"boolean"}

csv_path = "dataset_gout_balanced.csv"  #@param {type:"string"}

# Upload takes precedence: Drive is only mounted when upload is switched off.
if USE_UPLOAD:
    from google.colab import files
    uploaded = files.upload()  # pick your CSV
    if uploaded:
        # files.upload() returns a mapping keyed by file name; use the first one.
        csv_path = next(iter(uploaded))
elif USE_DRIVE:
    from google.colab import drive
    drive.mount('/content/drive')
    # Example: csv_path = "/content/drive/MyDrive/your_folder/dataset_gout_balanced.csv"

# Validate with an explicit exception instead of `assert`, which is skipped
# entirely when Python runs with optimizations enabled.
if not os.path.exists(csv_path):
    raise FileNotFoundError(
        f"CSV not found at: {csv_path}. Upload or set the correct path."
    )

df = pd.read_csv(csv_path)
print("✅ Loaded:", csv_path)
print("Shape:", df.shape)
display(df.head())


In [None]:

# ============================================================
# 🔍 Quick EDA
# ============================================================
target_col = "gout"
# Fail with an explicit exception rather than `assert`, which is stripped
# when Python runs with optimizations enabled.
if target_col not in df.columns:
    raise ValueError(
        f"Target column '{target_col}' not found. Found: {df.columns.tolist()[:10]}..."
    )

# Structural overview: column count, dtype mix, and the most incomplete columns.
print("\nColumns:", len(df.columns))
print("Dtypes:\n", df.dtypes.value_counts())
print("\nMissing values (top 20):\n", df.isna().sum().sort_values(ascending=False).head(20))

# Target distribution (NaN labels are counted too, so they are visible if present)
value_counts = df[target_col].value_counts(dropna=False).sort_index()
print("\nTarget distribution:\n", value_counts)

plt.figure()
value_counts.plot(kind='bar')
plt.title("Target distribution (gout)")
plt.xlabel("Class")
plt.ylabel("Count")
plt.show()


In [None]:

# ============================================================
# ✂️ Train / Test split & preprocessing
# ============================================================
# Keep only numeric columns (assumes pre-encoded categories)
numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and c != target_col]
X_raw = df[numeric_cols].copy()
y = df[target_col].astype(int).copy()

print(f"Using {len(numeric_cols)} numeric features:", numeric_cols[:12], "...")

# Split BEFORE imputing so that the held-out rows never influence the fill
# values. The split depends only on the row order, the seed and y, so the same
# rows land in train/test as when splitting an already-imputed frame.
X_train, X_test, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Simple imputation: fill missing values with the TRAINING median of each column.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Full feature matrix, imputed with the same training medians, kept under the
# name `X` for later cells that operate on the whole dataset.
X = X_raw.fillna(train_medians)

# Standardize every column; columns are addressed by position.
scaler = StandardScaler()
preprocess = ColumnTransformer(
    transformers=[("num", scaler, list(range(X_train.shape[1])))],
    remainder="drop"
)


In [None]:

# ============================================================
# 🤖 Define candidate models
# ============================================================
# Every candidate is a two-step Pipeline ("prep" -> "clf"), so all of them
# expose the same fit / predict_proba interface to the cells that follow.

# Linear baseline: the only candidate that runs the scaling transformer.
logreg_pipeline = Pipeline(steps=[
    ("prep", preprocess),
    ("clf", LogisticRegression(
        max_iter=200,
        class_weight="balanced",
        random_state=RANDOM_STATE,
    )),
])

# Bagged trees: features are passed through unscaled.
forest_pipeline = Pipeline(steps=[
    ("prep", "passthrough"),
    ("clf", RandomForestClassifier(
        n_estimators=400,
        max_depth=None,
        min_samples_split=2,
        n_jobs=-1,
        class_weight="balanced",
        random_state=RANDOM_STATE,
    )),
])

# Gradient-boosted trees: features are passed through unscaled as well.
xgb_params = dict(
    n_estimators=500,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    reg_alpha=0.0,
    objective="binary:logistic",
    eval_metric="logloss",
    tree_method="hist",
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
xgb_pipeline = Pipeline(steps=[
    ("prep", "passthrough"),
    ("clf", XGBClassifier(**xgb_params)),
])

models = {
    "LogReg": logreg_pipeline,
    "RandomForest": forest_pipeline,
    "XGBoost": xgb_pipeline,
}
# Last expression of the cell: shows the candidate names in the notebook output.
list(models)


In [None]:

# ============================================================
# 🧪 Train & Evaluate all models
# ============================================================
results, fitted = [], {}

# Fit each candidate on the training split and score it on the held-out split.
for name, pipe in models.items():
    print(f"\n=== Training {name} ===")
    pipe.fit(X_train, y_train)
    fitted[name] = pipe

    # Positive-class probability, thresholded at 0.5 for the label-based metrics.
    test_scores = pipe.predict_proba(X_test)[:, 1]
    test_labels = (test_scores >= 0.5).astype(int)

    metrics = {
        "model": name,
        "accuracy": accuracy_score(y_test, test_labels),
        "f1": f1_score(y_test, test_labels, zero_division=0),
        "roc_auc": roc_auc_score(y_test, test_scores),
        "pr_auc": average_precision_score(y_test, test_scores),
    }

    print(
        "Accuracy: {accuracy:.4f} | F1: {f1:.4f} | ROC-AUC: {roc_auc:.4f} | PR-AUC: {pr_auc:.4f}".format(**metrics)
    )
    report_text = classification_report(y_test, test_labels, zero_division=0)
    print("\nClassification report:\n", report_text)
    print("Confusion matrix:\n", confusion_matrix(y_test, test_labels))

    results.append(metrics)

# Rank the candidates by ROC-AUC, best first.
res_df = pd.DataFrame(results).sort_values("roc_auc", ascending=False)
print("\n🏁 Summary (sorted by ROC-AUC):")
display(res_df.reset_index(drop=True))

# The top row after sorting is the winner; keep its already-fitted pipeline.
best_name = res_df["model"].iloc[0]
best_model = fitted[best_name]
print(f"\nBest model: {best_name}")


In [None]:

# ============================================================
# 📈 ROC & Precision-Recall curves for the best model
# ============================================================
# Positive-class probabilities of the selected model on the held-out split.
proba_best = best_model.predict_proba(X_test)[:, 1]
pred_best = (proba_best >= 0.5).astype(int)

# The display helpers create their own figure when no axes are supplied, which
# would leave a preceding plt.figure() empty. Create the axes explicitly and
# hand them over so exactly one figure is produced per curve.
_, roc_ax = plt.subplots()
RocCurveDisplay.from_predictions(y_test, proba_best, ax=roc_ax)
roc_ax.set_title(f"ROC Curve – {best_name}")
plt.show()

_, pr_ax = plt.subplots()
PrecisionRecallDisplay.from_predictions(y_test, proba_best, ax=pr_ax)
pr_ax.set_title(f"Precision-Recall Curve – {best_name}")
plt.show()


In [None]:

# ============================================================
# 🧩 Feature importance / coefficients
# ============================================================
def plot_top_importances(names, importances, top_k=20, title="Top Features"):
    """Draw a horizontal bar chart of the ``top_k`` largest importances.

    Args:
        names: Sequence of feature labels, aligned with ``importances``.
        importances: Sequence of numeric scores, one per feature.
        top_k: Maximum number of features to display.
        title: Title placed above the chart.

    The largest value is drawn at the top of the chart. Nothing is returned;
    the figure is shown via ``plt.show()``.
    """
    names = np.asarray(names)
    importances = np.asarray(importances)

    # Indices of the largest scores, in descending order of importance.
    idx = np.argsort(importances)[::-1][:top_k]
    top_names = names[idx]
    top_vals = importances[idx]

    # Reversed positions put the first (largest) entry on the top row.
    positions = np.arange(len(top_names))[::-1]

    # Size the figure by the number of bars actually drawn, not by top_k,
    # which may exceed the number of available features.
    plt.figure(figsize=(8, max(4, len(top_names) // 2)))
    # Each bar gets its own value (previously every bar used the top value).
    plt.barh(positions, top_vals, align='center')  # keep default colors
    plt.yticks(positions, top_names)
    plt.xlabel("Importance")
    plt.title(title)
    plt.show()

# Pull the fitted estimator out of the winning pipeline once, then pick the
# kind of "importance" that estimator can provide.
final_estimator = best_model.named_steps["clf"]

if best_name == "LogReg":
    # Coefficients apply to the scaled inputs, hence the z(...) labels.
    abs_coefs = np.abs(final_estimator.coef_.ravel())
    scaled_labels = [f"z({col})" for col in X_train.columns]
    plot_top_importances(
        scaled_labels,
        abs_coefs,
        top_k=20,
        title="Top absolute coefficients (LogReg)",
    )
elif hasattr(final_estimator, "feature_importances_"):
    # RandomForest or XGBoost
    plot_top_importances(
        X_train.columns,
        final_estimator.feature_importances_,
        top_k=20,
        title=f"Top importances ({best_name})",
    )
else:
    print("No tree-based importances available for this model.")


In [None]:

# ============================================================
# 🔁 Cross-Validation (optional): ROC-AUC with 5-fold CV
# ============================================================
from sklearn.base import clone

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Cross-validate the exact configuration that was selected instead of
# re-declaring each pipeline by hand, where hyperparameters could drift from
# the ones that were evaluated. clone() returns an unfitted copy with
# identical parameters, so the fitted best_model is left untouched.
model_for_cv = clone(best_model)

cv_scores = cross_val_score(model_for_cv, X, y, scoring="roc_auc", cv=cv, n_jobs=-1)
print("CV ROC-AUC scores:", cv_scores)
print("CV ROC-AUC mean ± std:", np.mean(cv_scores).round(4), "±", np.std(cv_scores).round(4))


In [None]:

# ============================================================
# 💾 Save the best model and example inference
# ============================================================
# Persist the winning pipeline under a name that records which model it is.
model_path = f"best_gout_model_{best_name}.joblib"
joblib.dump(best_model, model_path)
print("Saved model to:", model_path)

# Example: score one held-out row, kept as a one-row DataFrame.
sample = X_test.iloc[[0]].copy()
sample_probs = best_model.predict_proba(sample)
sample_pred_proba = sample_probs[:, 1][0]
sample_pred = int(sample_pred_proba >= 0.5)
print("\nExample inference on one test row:")
print("Pred prob gout:", round(sample_pred_proba, 4), "| Pred label:", sample_pred)

# Template: a one-row frame of training medians, used as placeholder values
# to show the expected input columns (replace with real values).
train_medians = X_train.median(numeric_only=True)
template = train_medians.to_frame().T
template_probs = best_model.predict_proba(template)
template_pred = template_probs[:, 1][0]
print("\nTemplate input shape:", template.shape, "| Pred prob gout:", round(template_pred, 4))
template.to_csv("template_input.csv", index=False)
print("A CSV template (template_input.csv) with expected feature columns has been saved.")
