## Adult Income Dataset

## Import Requisite Libraries

In [1]:
import model_tuner

In [2]:
# Print the package-level help text for model_tuner (banner plus overview).
help(model_tuner)

  

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
$      __  __           _      _   _____                          $ 
$     |  \/  | ___   __| | ___| | |_   _|   _ _ __   ___ _ __     $
$     | |\/| |/ _ \ / _` |/ _ \ |   | || | | | '_ \ / _ \ '__|    $
$     | |  | | (_) | (_| |  __/ |   | || |_| | | | |  __/ |       $
$     |_|  |_|\___/ \__,_|\___|_|   |_| \__,_|_| |_|\___|_|       $
$                                                                 $
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$


                                                               
The `model_tuner` library is a versatile and powerful tool designed to 
facilitate the training, tuning, and evaluation of machine learning models. 
It supports various functionalities such as handling imbalanced data, applying 
different scaling and imputation techniques, calibrating models, and conducting 
cross-validation. This library is particularly useful for hyperparameter tu

In [3]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from model_tuner import Model

In [4]:
from eda_toolkit import add_ids

In [5]:
# Fetch the Adult dataset from the UCI repository (dataset id=2).
# The repository result is kept under its own name so that `adult` only ever
# refers to the combined DataFrame.
adult_repo = fetch_ucirepo(id=2)

# Join features and targets into a single frame (inner join on the row index).
adult = adult_repo.data.features.join(adult_repo.data.targets, how="inner")

# Request 9-digit IDs in a new "Adult_ID" column and promote it to the index.
adult = add_ids(df=adult, id_colname="Adult_ID", num_digits=9).set_index(
    "Adult_ID",
)

# Predictors: every column whose name does not contain "income".
X = adult[[col for col in adult.columns if "income" not in col]]

# Outcome: kept as a one-column DataFrame. `.copy()` makes it independent of
# `adult`, so later in-place edits of `y` do not trigger SettingWithCopyWarning.
y = adult[["income"]].copy()

ConnectionError: Error connecting to server

In [None]:
# Banner for the feature matrix, followed by a short preview.
rule = "-" * 80
print(rule, "X", rule, sep="\n")

print(X.head())  # first five rows of the predictors

In [None]:
print("-" * 80)
print("y = Outcome = Income")
print("-" * 80)

# Always start from the untouched column in `adult` rather than from `y`:
# `y` is rebound to a Series at the end of this cell, so reading it here
# would make the cell fail when it is run a second time.
income_raw = adult["income"]

print(f"\n{income_raw.to_frame().head()}")  # inspect first 5 rows of y

# Remove trailing periods so that e.g. ">50K." and ">50K" become one label.
income_clean = income_raw.str.rstrip(".")

print("\n Income Value Counts: \n")
# Check the updated value counts
print(income_clean.value_counts())

# Encode the outcome as 0/1.
y = income_clean.map({"<=50K": 0, ">50K": 1})

# `.map` turns any label missing from the dict into NaN; surface that here
# instead of letting NaN targets flow into model fitting.
if y.isna().any():
    unmapped = income_clean[y.isna()].unique().tolist()
    raise ValueError(f"Unmapped income labels: {unmapped}")

outcome = ["y"]

In [None]:
# Categorical columns, i.e. the ones routed to the one-hot encoding branch of
# the preprocessor. Each column must appear exactly once: a repeated name
# would be selected (and encoded) twice by the ColumnTransformer.
categorical_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

In [None]:
# continuous or binary: every column of X that has a numeric dtype
numerical_features = X.select_dtypes(include=np.number).columns.tolist()

In [None]:
# Short label for the estimator; it prefixes every key of the search grid.
xgb_name = "xgb"

# NOTE(review): device="cuda" requests GPU execution; confirm how this behaves
# on machines without a CUDA device.
xgb = XGBClassifier(
    objective="binary:logistic", random_state=222, tree_method="hist", device="cuda"
)

# Flag stored under the "early" key of the definition below.
xgbearly = True

# Search grid, keyed as "<estimator_name>__<parameter>".
tuned_parameters_xgb = {
    f"{xgb_name}__{param}": candidates
    for param, candidates in {
        "max_depth": [3, 10, 20, 200, 500],
        "learning_rate": [1e-4],
        "n_estimators": [1000],
        "early_stopping_rounds": [100],
        "verbose": [0],
        "eval_metric": ["logloss"],
    }.items()
}

# Everything needed to configure this estimator, bundled in one dict.
xgb_definition = dict(
    clc=xgb,
    estimator_name=xgb_name,
    tuned_parameters=tuned_parameters_xgb,
    randomized_grid=False,
    n_iter=5,
    early=xgbearly,
)

# Registry of model definitions, keyed by estimator label.
model_definitions = {xgb_name: xgb_definition}

# Numeric branch: standardise first, then mean-impute whatever is still missing.
numerical_transformer = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("imputer", SimpleImputer(strategy="mean")),
    ]
)

# Categorical branch: fill gaps with the literal "missing", then one-hot encode
# (categories not seen during fit are ignored at transform time).
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its branch; columns in neither list pass through
# unchanged.
preprocessor = ColumnTransformer(
    [
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="passthrough",
)

In [None]:
# Label used when naming the model object.
model_type = "xgb"

# Pull the estimator and its search settings out of the definition dict.
clc, estimator_name, tuned_parameters, n_iter, rand_grid, early_stop = (
    xgb_definition[key]
    for key in (
        "clc",
        "estimator_name",
        "tuned_parameters",
        "n_iter",
        "randomized_grid",
        "early",
    )
)

# Run-level switches, passed to Model as kfold= and calibrate=.
kfold, calibrate = False, True

In [None]:
# Build the model_tuner wrapper around the XGBoost estimator.
# The name identifies this notebook's dataset (Adult Income); the previous
# "AIDS_Clinical_" prefix was a leftover from a different notebook.
# NOTE(review): `n_iter` is unpacked from the definition but not forwarded
# here; that only matters if randomized_grid is switched to True.
model_xgb = Model(
    name=f"Adult_Income_{model_type}",
    estimator_name=estimator_name,
    calibrate=calibrate,
    estimator=clc,
    model_type="classification",
    kfold=kfold,
    # Preprocessing runs as a pipeline step ahead of the estimator.
    pipeline_steps=[("ColumnTransformer", preprocessor)],
    # Stratify the data split on the outcome and on these two columns of X.
    stratify_y=True,
    stratify_cols=["race", "sex"],
    grid=tuned_parameters,
    randomized_grid=rand_grid,
    boost_early=early_stop,
    scoring=["roc_auc"],
    random_state=222,
    n_jobs=2,
)

In [None]:
# Run the hyperparameter search configured via `grid=` on the full X / y.
# NOTE(review): f1_beta_tune=True presumably also tunes the decision threshold
# used later by optimal_threshold=True; confirm against the model_tuner docs.
model_xgb.grid_search_param_tuning(X, y, f1_beta_tune=True)

In [None]:
# Ask the model for its train / test / validation partitions of X and y.
# NOTE(review): presumably these reproduce the split used during tuning; confirm.
X_train, y_train = model_xgb.get_train_data(X, y)
X_test, y_test = model_xgb.get_test_data(X, y)
X_valid, y_valid = model_xgb.get_valid_data(X, y)

In [None]:
# Fit on the training partition, handing over the validation partition as well.
# NOTE(review): presumably the validation data drives early stopping
# (boost_early was enabled on the model); confirm.
model_xgb.fit(X_train, y_train, validation_data=[X_valid, y_valid])

## Return Metrics (Optional)

In [None]:
# Report metrics on the validation partition.
# NOTE(review): optimal_threshold=True presumably applies the tuned decision
# threshold rather than the default; confirm.
print("Validation Metrics")
model_xgb.return_metrics(X_valid, y_valid, optimal_threshold=True)

In [None]:
# Report the same metrics on the held-out test partition.
print("Test Metrics")
model_xgb.return_metrics(X_test, y_test, optimal_threshold=True)

## Extract Predicted Probabilities

In [None]:
# Predicted probabilities for the test partition.
y_prob = model_xgb.predict_proba(X_test)

In [None]:
# Wrap the probabilities in a DataFrame for display. This rebinds `y_prob`
# from predict_proba's raw output to a DataFrame.
y_prob = pd.DataFrame(y_prob)
y_prob

In [None]:
# Predictions for the test partition.
# NOTE(review): optimal_threshold=True presumably uses the tuned threshold; confirm.
y_pred = model_xgb.predict(X_test, optimal_threshold=True)

In [None]:
# Cast predictions into a DataFrame. This rebinds `y_pred` from predict's raw
# output to a DataFrame.
y_pred = pd.DataFrame(y_pred)

In [None]:
# Display the prediction frame as the cell's output.
y_pred