# Step 1: Local XGBoost Training at Each Hospital
This notebook trains a separate XGBoost model on each hospital's local data, using the same hyperparameters for every hospital, to simulate the local-training step of federated learning.

In [2]:
!pip install xgboost scikit-learn pandas matplotlib
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
import os

# Directory holding one sub-folder per hospital (relative to the working directory).
base_path = '../data/'

# Hospital identifiers; each one is used as a folder name under base_path.
hospitals = [
    'hosp_1',
    'hosp_2',
    'hosp_3',
]


Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   - -------------------------------------- 7.1/150.0 MB 39.9 MB/s eta 0:00:04
   --- ------------------------------------ 14.2/150.0 MB 35.6 MB/s eta 0:00:04
   ----- ---------------------------------- 21.2/150.0 MB 35.4 MB/s eta 0:00:04
   -------- ------------------------------- 32.2/150.0 MB 39.4 MB/s eta 0:00:03
   ---------- ----------------------------- 41.2/150.0 MB 40.3 MB/s eta 0:00:03
   ------------- -------------------------- 49.5/150.0 MB 40.5 MB/s eta 0:00:03
   --------------- ------------------------ 59.8/150.0 MB 41.4 MB/s eta 0:00:03
   ------------------ --------------------- 69.2/150.0 MB 42.0 MB/s eta 0:00:02
   --------------------- ------------------ 80.2/150.0 MB 43.0 MB/s eta 0:00:02
   ----------------------- ---------------- 88.9/150.0 MB 43.0

In [3]:
# Hyperparameters shared by every hospital, so all local models are configured identically.
params = {
    'objective': 'binary:logistic',  # binary classification; predictions are probabilities
    'eval_metric': 'logloss',
    'max_depth': 3,
    'learning_rate': 0.1,
    'verbosity': 0
}

# Experiment settings, named here so they can be tuned in one place.
TEST_SIZE = 0.2            # fraction of each hospital's rows held out for evaluation
RANDOM_STATE = 42          # fixed seed so the train/test split is reproducible
NUM_BOOST_ROUND = 20       # number of boosting rounds per local model
DECISION_THRESHOLD = 0.5   # predicted probability above which a sample is labelled 1

# Per-hospital evaluation metrics: {hospital: {'accuracy': ..., 'auc': ...}}
results = {}

for hospital in hospitals:
    # Each hospital's CSV is read from its own folder under base_path.
    path = os.path.join(base_path, hospital, 'breast_cancer.csv')
    df = pd.read_csv(path)

    # The label is the 'target' column; every other column is used as a feature.
    X = df.drop(columns=['target'])
    y = df['target']

    # NOTE(review): the split is not stratified; consider stratify=y if class balance
    # differs between hospitals — doing so would change the reported metrics.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # `model` is rebound on every iteration, so after the loop only the last
    # hospital's booster remains in scope.
    model = xgb.train(params, dtrain, num_boost_round=NUM_BOOST_ROUND)

    # Accuracy is computed on thresholded labels; AUC is computed on the raw predictions.
    preds = model.predict(dtest)
    pred_labels = (preds > DECISION_THRESHOLD).astype(int)

    acc = accuracy_score(y_test, pred_labels)
    auc = roc_auc_score(y_test, preds)

    results[hospital] = {'accuracy': acc, 'auc': auc}
    print(f"{hospital} - Accuracy: {acc:.4f}, AUC: {auc:.4f}")


hosp_1 - Accuracy: 0.8947, AUC: 0.9318
hosp_2 - Accuracy: 0.9211, AUC: 0.9792
hosp_3 - Accuracy: 0.9474, AUC: 1.0000
