## Adult Income Dataset

## Import Requisite Libraries

In [1]:
import model_tuner

In [2]:
# Show the package-level help text for model_tuner (banner and overview).
help(model_tuner)

  

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
$      __  __           _      _   _____                          $ 
$     |  \/  | ___   __| | ___| | |_   _|   _ _ __   ___ _ __     $
$     | |\/| |/ _ \ / _` |/ _ \ |   | || | | | '_ \ / _ \ '__|    $
$     | |  | | (_) | (_| |  __/ |   | || |_| | | | |  __/ |       $
$     |_|  |_|\___/ \__,_|\___|_|   |_| \__,_|_| |_|\___|_|       $
$                                                                 $
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$


                                                               
The `model_tuner` library is a versatile and powerful tool designed to 
facilitate the training, tuning, and evaluation of machine learning models. 
It supports various functionalities such as handling imbalanced data, applying 
different scaling and imputation techniques, calibrating models, and conducting 
cross-validation. This library is particularly useful for hyperparameter tu

In [3]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from model_tuner import Model

In [4]:
from eda_toolkit import add_ids

In [5]:
# Load the Adult Income data from a local parquet snapshot.
# The snapshot was presumably produced by the UCI fetch kept below for
# provenance (TODO confirm: the fetch names the index "Adult_ID", whereas the
# loaded frame's index is "census_id").
#
# adult = fetch_ucirepo(id=2)
# adult = adult.data.features.join(adult.data.targets, how="inner")
# adult = add_ids(df=adult, id_colname="Adult_ID", num_digits=9).set_index(
#     "Adult_ID",
# )
adult = pd.read_parquet("df.parquet")

# Features: every column whose name does not contain "income".
feature_cols = [col for col in adult.columns if "income" not in col]
X = adult[feature_cols].copy()

# Outcome: kept as a one-column DataFrame. It is copied so that later in-place
# cleaning of `y` neither touches `adult` nor raises SettingWithCopyWarning.
y = adult[["income"]].copy()

In [6]:
# Banner followed by the first five rows of the feature frame.
banner = "-" * 80
print(banner, "X", banner, sep="\n")
print(X.head())

--------------------------------------------------------------------------------
X
--------------------------------------------------------------------------------
           age         workclass  fnlwgt  education  education-num  \
census_id                                                            
582248222   39         State-gov   77516  Bachelors             13   
561810758   50  Self-emp-not-inc   83311  Bachelors             13   
598098459   38           Private  215646    HS-grad              9   
776705221   53           Private  234721       11th              7   
479262902   28           Private  338409  Bachelors             13   

               marital-status         occupation   relationship   race  \
census_id                                                                
582248222       Never-married       Adm-clerical  Not-in-family  White   
561810758  Married-civ-spouse    Exec-managerial        Husband  White   
598098459            Divorced  Handlers-cleaners 

In [7]:
print("-" * 80)
print("y = Outcome = Income")
print("-" * 80)

# Always start from `adult` so this cell can be re-run safely: `y` is rebound
# to a Series at the end, so a second pass that indexed `y["income"]` would fail.
income_raw = adult[["income"]]
print(f"\n{income_raw.head()}")  # inspect first 5 rows of the outcome

# Remove trailing periods so that e.g. "<=50K." and "<=50K" collapse into one
# label. This builds a new Series instead of writing into a slice of `adult`.
income_clean = income_raw["income"].str.rstrip(".")

print("\n Income Value Counts: \n")
# Check the updated value counts
print(income_clean.value_counts())

# Encode the outcome as 0/1. Fail loudly on an unexpected label, because
# Series.map would otherwise turn it into NaN without any warning.
income_codes = {"<=50K": 0, ">50K": 1}
unexpected_labels = set(income_clean.dropna().unique()) - set(income_codes)
if unexpected_labels:
    raise ValueError(f"Unexpected income labels: {sorted(unexpected_labels)}")

y = income_clean.map(income_codes)

outcome = ["y"]

--------------------------------------------------------------------------------
y = Outcome = Income
--------------------------------------------------------------------------------

          income
census_id       
582248222  <=50K
561810758  <=50K
598098459  <=50K
776705221  <=50K
479262902  <=50K

 Income Value Counts: 

income
<=50K    37155
>50K     11687
Name: count, dtype: int64


In [8]:
# >2 categories
# Columns routed to the categorical (impute + one-hot) branch of the
# preprocessor. Each column appears exactly once: a repeated name would be
# selected, and therefore encoded, twice.
categorical_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

In [9]:
# Numeric-dtype columns of X (continuous or binary), selected by dtype.
numerical_features = list(X.select_dtypes(include=np.number).columns)

In [10]:
# Estimator label; it is also the prefix of every grid key ("xgb__<param>").
xgb_name = "xgb"

# Base XGBoost binary classifier.
# NOTE(review): device="cuda" is hard-coded, so this cell presumably needs a
# CUDA-capable GPU to run — confirm before running elsewhere.
xgb = XGBClassifier(
    random_state=222,
    objective="binary:logistic",
    device="cuda",
    tree_method="hist",
)

# Flag forwarded to the model definition under the "early" key.
xgbearly = True

# Candidate values per hyperparameter; the estimator prefix is added below.
xgb_grid_values = {
    "max_depth": [3, 10, 20, 200, 500],
    "learning_rate": [1e-4],
    "n_estimators": [1000],
    "early_stopping_rounds": [100],
    "verbose": [0],
    "eval_metric": ["logloss"],
}
tuned_parameters_xgb = {
    f"{xgb_name}__{param}": values for param, values in xgb_grid_values.items()
}

# Everything needed to build and tune this estimator, bundled in one mapping.
xgb_definition = dict(
    clc=xgb,
    estimator_name=xgb_name,
    tuned_parameters=tuned_parameters_xgb,
    randomized_grid=False,
    n_iter=5,
    early=xgbearly,
)

# Registry of model definitions keyed by estimator label.
model_definitions = {xgb_name: xgb_definition}

# Numeric branch. Note the order: scaling runs first, then mean imputation.
numeric_steps = [
    ("scaler", StandardScaler()),
    ("imputer", SimpleImputer(strategy="mean")),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the literal "missing", then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its branch; any column not listed in either
# group is passed through unchanged.
column_branches = [
    ("num", numerical_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(
    transformers=column_branches,
    remainder="passthrough",
)

In [11]:
# Run-level switches.
model_type = "xgb"
kfold, calibrate = False, True

# Unpack the pieces of the XGBoost definition used to configure the Model.
clc, estimator_name, tuned_parameters, n_iter, rand_grid, early_stop = (
    xgb_definition[key]
    for key in (
        "clc",
        "estimator_name",
        "tuned_parameters",
        "n_iter",
        "randomized_grid",
        "early",
    )
)

In [12]:
# Build the model_tuner Model wrapper around the XGBoost estimator.
# The run name now matches the dataset used in this notebook (Adult Income);
# the previous "AIDS_Clinical" label was left over from another notebook.
model_xgb = Model(
    name=f"Adult_Income_{model_type}",
    estimator_name=estimator_name,
    calibrate=calibrate,
    estimator=clc,
    model_type="classification",
    kfold=kfold,
    # Preprocessing runs as a pipeline step ahead of the estimator.
    pipeline_steps=[("ColumnTransformer", preprocessor)],
    # Stratification is requested on the outcome and on these two columns.
    stratify_y=True,
    stratify_cols=["race", "sex"],
    grid=tuned_parameters,
    randomized_grid=rand_grid,
    boost_early=early_stop,
    scoring=["roc_auc"],
    random_state=222,
    n_jobs=2,
)

In [13]:
# Run the hyperparameter search over the configured grid on X and y.
# f1_beta_tune=True presumably also tunes the decision threshold (the run log
# reports "tuning for best threshold") — confirm against the model_tuner docs.
model_xgb.grid_search_param_tuning(X, y, f1_beta_tune=True)


Pipeline Steps:

┌───────────────────────────────────────────────────────────┐
│ Step 1: preprocess_column_transformer_ColumnTransformer   │
│ ColumnTransformer                                         │
└───────────────────────────────────────────────────────────┘
                             │
                             ▼
┌───────────────────────────────────────────────────────────┐
│ Step 2: xgb                                               │
│ XGBClassifier                                             │
└───────────────────────────────────────────────────────────┘



Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


100%|██████████| 5/5 [00:43<00:00,  8.61s/it]


Fitting model with best params and tuning for best threshold ...


100%|██████████| 2/2 [00:00<00:00,  3.71it/s]

Best score/param set found on validation set:
{'params': {'xgb__early_stopping_rounds': 100,
            'xgb__eval_metric': 'logloss',
            'xgb__learning_rate': 0.0001,
            'xgb__max_depth': 10,
            'xgb__n_estimators': 1000},
 'score': 0.9053881278013094}
Best roc_auc: 0.905 






In [14]:
# Retrieve the train / test / validation partitions of X and y from the model object.
X_train, y_train = model_xgb.get_train_data(X, y)
X_test, y_test = model_xgb.get_test_data(X, y)
X_valid, y_valid = model_xgb.get_valid_data(X, y)

In [15]:
# Fit on the training partition, passing the validation partition as
# validation_data (presumably used for early stopping, since boost_early is
# enabled on the model — confirm).
model_xgb.fit(X_train, y_train, validation_data=[X_valid, y_valid])

## Return Metrics (Optional)

In [16]:
# Report the confusion matrix and classification report on the validation
# partition. optimal_threshold=True presumably applies the tuned decision
# threshold instead of the default cut-off — confirm.
print("Validation Metrics")
model_xgb.return_metrics(X_valid, y_valid, optimal_threshold=True)

Validation Metrics
Confusion matrix on set provided: 
--------------------------------------------------------------------------------
          Predicted:
              Pos    Neg
--------------------------------------------------------------------------------
Actual: Pos 2265 (tp)    73 (fn)
        Neg 3267 (fp)  4163 (tn)
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------

              precision    recall  f1-score   support

           0       0.98      0.56      0.71      7430
           1       0.41      0.97      0.58      2338

    accuracy                           0.66      9768
   macro avg       0.70      0.76      0.64      9768
weighted avg       0.85      0.66      0.68      9768

--------------------------------------------------------------------------------


In [17]:
# Report the confusion matrix and classification report on the held-out test
# partition. optimal_threshold=True presumably applies the tuned decision
# threshold instead of the default cut-off — confirm.
print("Test Metrics")
model_xgb.return_metrics(X_test, y_test, optimal_threshold=True)

Test Metrics
Confusion matrix on set provided: 
--------------------------------------------------------------------------------
          Predicted:
              Pos    Neg
--------------------------------------------------------------------------------
Actual: Pos 2269 (tp)    68 (fn)
        Neg 3336 (fp)  4096 (tn)
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------

              precision    recall  f1-score   support

           0       0.98      0.55      0.71      7432
           1       0.40      0.97      0.57      2337

    accuracy                           0.65      9769
   macro avg       0.69      0.76      0.64      9769
weighted avg       0.85      0.65      0.67      9769

--------------------------------------------------------------------------------


## Extract Predicted Probabilities and Class Predictions

In [18]:
# Predicted probabilities for the test partition.
y_prob = model_xgb.predict_proba(X_test)

In [19]:
# Wrap the probabilities in a DataFrame for display. Columns 0 and 1
# presumably correspond to classes 0 and 1 — confirm.
y_prob = pd.DataFrame(y_prob)
y_prob

Unnamed: 0,0,1
0,0.732720,0.267280
1,0.757513,0.242487
2,0.671364,0.328636
3,0.695807,0.304193
4,0.763796,0.236204
...,...,...
9764,0.763796,0.236204
9765,0.720224,0.279776
9766,0.763796,0.236204
9767,0.763796,0.236204


In [20]:
# Class predictions for the test partition; optimal_threshold=True presumably
# applies the tuned decision threshold instead of the default cut-off — confirm.
y_pred = model_xgb.predict(X_test, optimal_threshold=True)

In [21]:
# Cast predictions into a DataFrame (a single column labelled 0) for display
y_pred = pd.DataFrame(y_pred)

In [22]:
# Display the predicted class labels.
y_pred

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,0
...,...
9764,0
9765,1
9766,0
9767,0
