## Adult Income Dataset

## Import Requisite Libraries

In [1]:
import model_tuner

In [2]:
# Print the package-level help text for model_tuner (long output follows).
help(model_tuner)

Help on package model_tuner:

NAME
    model_tuner

DESCRIPTION
    $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
    $      __  __           _      _   _____                          $ 
    $     |  \/  | ___   __| | ___| | |_   _|   _ _ __   ___ _ __     $
    $     | |\/| |/ _ \ / _` |/ _ \ |   | || | | | '_ \ / _ \ '__|    $
    $     | |  | | (_) | (_| |  __/ |   | || |_| | | | |  __/ |       $
    $     |_|  |_|\___/ \__,_|\___|_|   |_| \__,_|_| |_|\___|_|       $
    $                                                                 $
    $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
                                                                   
    The `model_tuner` library is a versatile and powerful tool designed to 
    facilitate the training, evaluation, and tuning of machine learning models. 
    It supports various functionalities such as handling imbalanced data, applying 
    different scaling and imputation techniques, c

In [69]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from model_tuner import Model

In [70]:
# Download dataset id 2 from the UCI ML repository through ucimlrepo
adult = fetch_ucirepo(id=2)

# Feature matrix and target frame (both pandas DataFrames)
X, y = adult.data.features, adult.data.targets

In [25]:
# Banner: an 80-dash rule above and below the label "X"
print("\n".join(["-" * 80, "X", "-" * 80]))

# Preview the first 5 rows of the feature matrix
print(X.head())

--------------------------------------------------------------------------------
X
--------------------------------------------------------------------------------
   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  c

In [71]:
print("-" * 80)
print("y = Outcome = Income")
print("-" * 80)

# Derive everything from the fetched targets frame instead of mutating it or
# reading back from `y`: `y` is rebound to a Series at the end of this cell,
# so reading y["income"] here would fail the second time the cell is run.
targets_raw = adult.data.targets

print(f"\n{targets_raw.head()}")  # inspect first 5 rows of the targets

# Remove trailing periods so every label matches one of the two keys mapped
# below; this builds a new Series and leaves the fetched frame untouched.
income_labels = targets_raw["income"].str.rstrip(".")

print("\n Income Value Counts: \n")
# Check the cleaned value counts
print(income_labels.value_counts())

# Encode the outcome as 0 (<=50K) / 1 (>50K)
y = income_labels.map({"<=50K": 0, ">50K": 1})

# Series.map returns NaN for any label not in the dict; fail here, loudly,
# rather than handing NaN targets to the model later on.
assert y.notna().all(), "Unmapped income label(s) found after cleaning"

outcome = ["y"]

--------------------------------------------------------------------------------
y = Outcome = Income
--------------------------------------------------------------------------------

  income
0  <=50K
1  <=50K
2  <=50K
3  <=50K
4  <=50K

 Income Value Counts: 

income
<=50K    37155
>50K     11687
Name: count, dtype: int64


In [27]:
# >2 categories
categorical_features = [
    # String-valued columns routed to the categorical (impute + one-hot)
    # branch of the preprocessor. Each column must appear exactly once:
    # a repeated name would be selected and encoded twice.
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

In [28]:
# Names of every numeric-dtype column in X (continuous or binary)
numerical_features = list(X.select_dtypes(include=np.number).columns)

In [31]:
# Estimator plus the short name used to prefix its hyperparameters
xgb_name = "xgb"
xgb = XGBClassifier(objective="binary:logistic", random_state=222)
xgbearly = True

# Search grid; every key is "<estimator name>__<parameter>".
# Only max_depth has more than one candidate value.
tuned_parameters_xgb = {
    f"{xgb_name}__{param}": candidates
    for param, candidates in {
        "max_depth": [3, 10, 20, 200, 500],
        "learning_rate": [1e-4],
        "n_estimators": [1000],
        "early_stopping_rounds": [100],
        "verbose": [0],
        "eval_metric": ["logloss"],
    }.items()
}

# Bundle the estimator, its grid and the search settings in one record
xgb_definition = dict(
    clc=xgb,
    estimator_name=xgb_name,
    tuned_parameters=tuned_parameters_xgb,
    randomized_grid=False,
    n_iter=5,
    early=xgbearly,
)

# Registry of model definitions keyed by estimator name
model_definitions = {xgb_name: xgb_definition}

# Per-dtype preprocessing branches

# Numeric branch: standardise, then fill remaining gaps with the column mean.
# NOTE(review): the scaler runs before the imputer here; the more common
# order is impute-then-scale. Confirm this ordering is intentional.
numerical_transformer = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("imputer", SimpleImputer(strategy="mean")),
    ]
)

# Categorical branch: fill gaps with the literal "missing", then one-hot
# encode, ignoring categories not seen during fit.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its branch; columns in neither list pass through
preprocessor = ColumnTransformer(
    [
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="passthrough",
)

In [32]:
model_type = "xgb"

# Unpack the XGBoost definition record into standalone variables
clc, estimator_name, tuned_parameters, n_iter, rand_grid, early_stop = (
    xgb_definition[key]
    for key in (
        "clc",
        "estimator_name",
        "tuned_parameters",
        "n_iter",
        "randomized_grid",
        "early",
    )
)

# Switches passed to the Model constructor as kfold= and calibrate=
kfold = False
calibrate = True

In [33]:
# Build the tuner around the XGBoost estimator and the preprocessing step.
# The name now reflects this notebook's dataset (Adult Income); the previous
# "AIDS_Clinical_" prefix was left over from a different example.
model_xgb = Model(
    name=f"Adult_Income_{model_type}",
    estimator_name=estimator_name,
    calibrate=calibrate,
    estimator=clc,
    model_type="classification",
    kfold=kfold,
    pipeline_steps=[("ColumnTransformer", preprocessor)],
    stratify_y=True,
    stratify_cols=["race", "sex"],
    grid=tuned_parameters,
    randomized_grid=rand_grid,
    boost_early=early_stop,
    scoring=["roc_auc"],
    random_state=222,
    n_jobs=2,
)

In [34]:
# Run the hyperparameter search on the full X / y.
# NOTE(review): f1_beta_tune=True presumably enables the threshold tuning
# reported in the output below — confirm against the model_tuner docs.
model_xgb.grid_search_param_tuning(X, y, f1_beta_tune=True)


Pipeline Steps:

┌───────────────────────────────────────────────────────────┐
│ Step 1: preprocess_column_transformer_ColumnTransformer   │
│ ColumnTransformer                                         │
└───────────────────────────────────────────────────────────┘
                             │
                             ▼
┌───────────────────────────────────────────────────────────┐
│ Step 2: xgb                                               │
│ XGBClassifier                                             │
└───────────────────────────────────────────────────────────┘



100%|██████████| 5/5 [03:06<00:00, 37.38s/it]


Fitting model with best params and tuning for best threshold ...


100%|██████████| 2/2 [00:00<00:00,  2.08it/s]

Best score/param set found on validation set:
{'params': {'xgb__early_stopping_rounds': 100,
            'xgb__eval_metric': 'logloss',
            'xgb__learning_rate': 0.0001,
            'xgb__max_depth': 10,
            'xgb__n_estimators': 999},
 'score': 0.9054592506968374}
Best roc_auc: 0.905 






In [35]:
# Retrieve the train / test / validation partitions of X and y from the model
X_train, y_train = model_xgb.get_train_data(X, y)
X_test, y_test = model_xgb.get_test_data(X, y)
X_valid, y_valid = model_xgb.get_valid_data(X, y)

In [36]:
# Fit on the training split, supplying the validation split as validation_data
# (presumably used for early stopping, since boost_early is enabled — confirm).
model_xgb.fit(X_train, y_train, validation_data=[X_valid, y_valid])

## Return Metrics (Optional)

In [41]:
# Report metrics on the validation split.
# NOTE(review): optimal_threshold=True presumably applies the tuned decision
# threshold rather than the default — confirm.
print("Validation Metrics")
model_xgb.return_metrics(X_valid, y_valid, optimal_threshold=True)

Validation Metrics
Confusion matrix on set provided: 
--------------------------------------------------------------------------------
          Predicted:
              Pos    Neg
--------------------------------------------------------------------------------
Actual: Pos 2262 (tp)    76 (fn)
        Neg 3266 (fp)  4164 (tn)
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
{'AUC ROC': 0.9054600566219992,
 'Average Precision': 0.7748017565196692,
 'Brier Score': 0.16708507764273153,
 'Precision/PPV': 0.40918958031837915,
 'Sensitivity': 0.9674935842600513,
 'Specificity': 0.5604306864064603}
--------------------------------------------------------------------------------

              precision    recall  f1-score   support

           0       0.98      0.56      0.71      7430
           1       0.41      0.97      0.58      2338

    accuracy                           0.6

{'Classification Report': {'0': {'precision': 0.9820754716981132,
   'recall': 0.5604306864064603,
   'f1-score': 0.7136246786632391,
   'support': 7430.0},
  '1': {'precision': 0.40918958031837915,
   'recall': 0.9674935842600513,
   'f1-score': 0.5751334858886347,
   'support': 2338.0},
  'accuracy': 0.6578624078624079,
  'macro avg': {'precision': 0.6956325260082461,
   'recall': 0.7639621353332557,
   'f1-score': 0.6443790822759369,
   'support': 9768.0},
  'weighted avg': {'precision': 0.8449535210382217,
   'recall': 0.6578624078624079,
   'f1-score': 0.6804763976735764,
   'support': 9768.0}},
 'Confusion Matrix': array([[4164, 3266],
        [  76, 2262]], dtype=int64)}

In [42]:
# Report the same metrics on the held-out test split.
# NOTE(review): optimal_threshold=True presumably applies the tuned decision
# threshold rather than the default — confirm.
print("Test Metrics")
model_xgb.return_metrics(X_test, y_test, optimal_threshold=True)

Test Metrics
Confusion matrix on set provided: 
--------------------------------------------------------------------------------
          Predicted:
              Pos    Neg
--------------------------------------------------------------------------------
Actual: Pos 2269 (tp)    68 (fn)
        Neg 3339 (fp)  4093 (tn)
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
{'AUC ROC': 0.9060802538652546,
 'Average Precision': 0.7813781976062162,
 'Brier Score': 0.1670615123206092,
 'Precision/PPV': 0.4046005706134094,
 'Sensitivity': 0.9709028669234061,
 'Specificity': 0.5507265877287406}
--------------------------------------------------------------------------------

              precision    recall  f1-score   support

           0       0.98      0.55      0.71      7432
           1       0.40      0.97      0.57      2337

    accuracy                           0.65      9

{'Classification Report': {'0': {'precision': 0.9836577745734199,
   'recall': 0.5507265877287406,
   'f1-score': 0.7061157595100491,
   'support': 7432.0},
  '1': {'precision': 0.4046005706134094,
   'recall': 0.9709028669234061,
   'f1-score': 0.571176840780365,
   'support': 2337.0},
  'accuracy': 0.6512437301668543,
  'macro avg': {'precision': 0.6941291725934147,
   'recall': 0.7608147273260734,
   'f1-score': 0.6386463001452071,
   'support': 9769.0},
  'weighted avg': {'precision': 0.845132164413266,
   'recall': 0.6512437301668543,
   'f1-score': 0.6738348450795781,
   'support': 9769.0}},
 'Confusion Matrix': array([[4093, 3339],
        [  68, 2269]], dtype=int64)}

## Extract Predicted Probabilities

In [57]:
# Predicted class probabilities for the test split
y_prob = model_xgb.predict_proba(X_test)

In [67]:
# Wrap the probabilities in a DataFrame for display (one column per class:
# 0 and 1 in the output below). Note this rebinds y_prob to a DataFrame.
y_prob = pd.DataFrame(y_prob)
y_prob

Unnamed: 0,0,1
0,0.732727,0.267273
1,0.758547,0.241453
2,0.671430,0.328570
3,0.695792,0.304208
4,0.763772,0.236228
...,...,...
9764,0.763772,0.236228
9765,0.720243,0.279757
9766,0.763772,0.236228
9767,0.763772,0.236228


In [44]:
# Predicted class labels for the test split.
# NOTE(review): optimal_threshold=True presumably applies the tuned decision
# threshold — confirm.
y_pred = model_xgb.predict(X_test, optimal_threshold=True)

In [48]:
# Cast predictions into a DataFrame (rebinds y_pred to the DataFrame)
y_pred = pd.DataFrame(y_pred)

In [56]:
# Display the predicted labels
y_pred

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,0
...,...
9764,0
9765,1
9766,0
9767,0
