Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle
import time
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import make_scorer, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBClassifier

Data loading

In [2]:
# Loading the dataset and showing the five first records.
income = pd.read_csv("https://github.com/breno-madruga/machine_learning_exercises/raw/master/XGBoost%20with%20GPU%20support/income.csv")
income.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Verifying if there are missing values.
income.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  high_income     32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


Data preprocessing

In [4]:
# The only step necessary to be done outside of pipeline.
# Converting the target column to categorical.
col = pd.Categorical(income.high_income)
income["high_income"] = col.codes

In [5]:
# Custom Transformer that extracts columns passed as argument to its constructor.
class FeatureSelector(BaseEstimator, TransformerMixin):
    # Class Constructor.
    def __init__( self, feature_names ):
        self.feature_names = feature_names

    # Returns self nothing else to do here.
    def fit( self, X, y = None):
        return self

    # Method that describes what we need this transformer to do.
    def transform( self, X, y = None):
        return X[self.feature_names]

Categorical pipeline

In [6]:
# Converts certain features to categorical.
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    # Class constructor method that takes a boolean as its argument.
    def __init__(self, new_features=True):
        self.new_features = new_features

    # Returns self nothing else to do here.
    def fit( self, X, y = None):
        return self

    # Transformer method we wrote for this transformer.
    def transform(self, X , y = None):
        df = X.copy()
        if self.new_features:
            # Treats ? workclass as unknown.
            df['workclass']= df['workclass'].replace('?','Unknown')
            # Two many category level, convert just US and Non-US.
            df.loc[df['native_country'] != ' United-States', 'native_country'] = 'non_usa'

        # Converts columns to categorical.
        for name in df.columns.to_list():
            col = pd.Categorical(df[name])
            df[name] = col.codes

        # Returns numpy array.
        return df

Modeling(Train and test)

In [7]:
# Global varibles.
seed = 42
num_folds = 10
scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}

Train/test split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(income.drop(labels="high_income",axis=1),
                                                    income["high_income"],
                                                    test_size=0.20,
                                                    random_state=seed,
                                                    shuffle=True,
                                                    stratify=income["high_income"])

In [9]:
# Categrical features to pass down the categorical pipeline.
categorical_features = income.select_dtypes("object").columns.to_list()

# Numerical features to pass down the numerical pipeline.
numerical_features = income.select_dtypes("int64").columns.to_list()

# Defining the steps in the categorical pipeline.
categorical_pipeline = Pipeline(steps = [('cat_selector', FeatureSelector(categorical_features)),
                                         ('cat_transformer', CategoricalTransformer())])

# Defining the steps in the numerical pipeline.
numerical_pipeline = Pipeline(steps = [('num_selector', FeatureSelector(numerical_features)),
                                       ('std_scaler', MinMaxScaler())])

# Combining numerical and categorical piepline into one full big pipeline horizontally using FeatureUnion.
full_pipeline_preprocessing = FeatureUnion(transformer_list = [('categorical_pipeline', categorical_pipeline),
                                                               ('numerical_pipeline', numerical_pipeline)])

Tuning the algorithm

In [10]:
# The full pipeline as a step in another pipeline with an estimator as the final step.
pipe = Pipeline(steps = [("full_pipeline", full_pipeline_preprocessing),
                         ("fs", SelectKBest()),
                         ("clf", XGBClassifier())])

# Creating a dictionary with the hyperparameters.
search_space = [
                {"clf": [RandomForestClassifier()],
                 "clf__n_estimators": [800],
                 "clf__criterion": ["gini", "entropy"],
                 "clf__max_leaf_nodes": [300],
                 "clf__random_state": [seed],
                 "clf__oob_score": [True],
                 "fs__score_func": [chi2],
                 "fs__k": [10]},
                {"clf": [XGBClassifier()],
                 "clf__n_estimators": [300],
                 "clf__max_depth": [4],
                 "clf__learning_rate": [0.1],
                 "clf__random_state": [seed],
                 "clf__subsample": [1],
                 "clf__colsample_bytree": [1],
#                  "clf__tree_method": ["gpu_hist"],  # For using the GPU.
                 "fs__score_func":[chi2],
                 "fs__k":[13]}
]

# Defining StratifiedKFold object.
kfold = StratifiedKFold(n_splits=num_folds, random_state=seed, shuffle=True)

#############################################################################
# return_train_score=True
# official documentation: "computing the scores on the training set can be
# computationally expensive and is not strictly required to
# select the parameters that yield the best generalization performance".
#############################################################################

# Creating the GridSearchCV object.
grid = GridSearchCV(estimator=pipe, 
                    param_grid=search_space,
                    cv=kfold,
                    scoring=scoring,
                    return_train_score=True,
                    n_jobs=-1,
                    refit="AUC")

# Getting the time start.
tmp = time.time()

# Fitting the GridSearchCV object.
best_model = grid.fit(X_train, y_train)

# Printing the time spent.
print("CPU Training Time: %s seconds" % (str(time.time() - tmp)))  # 311.7510848045349 seconds
# print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))  # 301.1867105960846 seconds

CPU Training Time: 153.99099349975586 seconds


In [12]:
print("Best: %f using %s" % (best_model.best_score_,best_model.best_params_))  # 0.920415 (GPU) | 0.920279 (CPU)

Best: 0.923599 using {'clf': XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=None, n_estimators=300, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1), 'clf__colsample_bytree': 1, 'clf__learning_rate': 0.1, 'clf__max_depth': 4, 'clf__n_estimators': 300, 'clf__random_state': 42, 'clf__subsample': 1, 'fs__k': 13, 'fs__score_func': <function chi2 at 0x7f03623efb70>}


In [13]:
# Analyzing the results of cross-validation process.
result = pd.DataFrame(best_model.cv_results_)
result.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_clf__criterion,param_clf__max_leaf_nodes,param_clf__n_estimators,param_clf__oob_score,param_clf__random_state,param_fs__k,param_fs__score_func,param_clf__colsample_bytree,param_clf__learning_rate,param_clf__max_depth,param_clf__subsample,params,split0_test_AUC,split1_test_AUC,split2_test_AUC,split3_test_AUC,split4_test_AUC,split5_test_AUC,split6_test_AUC,split7_test_AUC,split8_test_AUC,split9_test_AUC,mean_test_AUC,std_test_AUC,rank_test_AUC,split0_train_AUC,split1_train_AUC,split2_train_AUC,split3_train_AUC,split4_train_AUC,split5_train_AUC,split6_train_AUC,split7_train_AUC,split8_train_AUC,split9_train_AUC,mean_train_AUC,std_train_AUC,split0_test_Accuracy,split1_test_Accuracy,split2_test_Accuracy,split3_test_Accuracy,split4_test_Accuracy,split5_test_Accuracy,split6_test_Accuracy,split7_test_Accuracy,split8_test_Accuracy,split9_test_Accuracy,mean_test_Accuracy,std_test_Accuracy,rank_test_Accuracy,split0_train_Accuracy,split1_train_Accuracy,split2_train_Accuracy,split3_train_Accuracy,split4_train_Accuracy,split5_train_Accuracy,split6_train_Accuracy,split7_train_Accuracy,split8_train_Accuracy,split9_train_Accuracy,mean_train_Accuracy,std_train_Accuracy
0,18.406798,0.078259,0.882006,0.008493,"RandomForestClassifier(bootstrap=True, ccp_alp...",gini,300.0,800,True,42,10,<function chi2 at 0x7f03623efb70>,,,,,"{'clf': RandomForestClassifier(bootstrap=True,...",0.909508,0.913253,0.919141,0.918046,0.91899,0.916836,0.90872,0.914634,0.914643,0.915146,0.914892,0.003443,3,0.940532,0.939881,0.939487,0.93993,0.939755,0.939492,0.940409,0.93962,0.939633,0.939696,0.939843,0.000343,0.857198,0.857582,0.859501,0.872553,0.864875,0.857198,0.855662,0.860653,0.857143,0.855223,0.859759,0.005028,2,0.884486,0.886747,0.885893,0.885254,0.88568,0.885296,0.884528,0.885126,0.885472,0.884533,0.885301,0.000671
1,20.812484,0.086407,0.93819,0.010944,"RandomForestClassifier(bootstrap=True, ccp_alp...",entropy,300.0,800,True,42,10,<function chi2 at 0x7f03623efb70>,,,,,"{'clf': RandomForestClassifier(bootstrap=True,...",0.909143,0.913503,0.921321,0.919094,0.919162,0.9178,0.90916,0.915895,0.914637,0.915794,0.915551,0.003887,2,0.941352,0.940948,0.940702,0.940749,0.940924,0.940845,0.941429,0.940772,0.94079,0.940961,0.940947,0.000237,0.857965,0.855662,0.859501,0.873704,0.862956,0.856046,0.855662,0.859117,0.856759,0.856759,0.859413,0.005221,3,0.878215,0.87941,0.880049,0.877874,0.879111,0.878983,0.878514,0.877959,0.879202,0.878007,0.878732,0.000692
2,6.074407,0.657095,0.107407,0.013742,"XGBClassifier(base_score=0.5, booster='gbtree'...",,,300,,42,13,<function chi2 at 0x7f03623efb70>,1.0,0.1,4.0,1.0,"{'clf': XGBClassifier(base_score=0.5, booster=...",0.923436,0.926076,0.933418,0.929958,0.927379,0.92717,0.916369,0.916896,0.917993,0.917296,0.923599,0.005817,1,0.942508,0.942237,0.941621,0.941728,0.941654,0.942172,0.942529,0.941495,0.941915,0.941105,0.941896,0.000438,0.867179,0.869098,0.876008,0.878695,0.866027,0.869866,0.856046,0.85643,0.853303,0.852919,0.864557,0.008885,1,0.884486,0.885552,0.885595,0.88295,0.884614,0.886363,0.884656,0.884017,0.885386,0.884064,0.884768,0.000933


In [14]:
# ROC of best model.
result[result.rank_test_AUC == 1][['mean_train_AUC', 'std_train_AUC','mean_test_AUC', 'std_test_AUC']]

Unnamed: 0,mean_train_AUC,std_train_AUC,mean_test_AUC,std_test_AUC
2,0.941896,0.000438,0.923599,0.005817


In [15]:
# Training score much higher than test score.
# The standard deviation of the test score is large.
result_auc = result[['mean_train_AUC', 'std_train_AUC','mean_test_AUC', 'std_test_AUC']]
result_auc

Unnamed: 0,mean_train_AUC,std_train_AUC,mean_test_AUC,std_test_AUC
0,0.939843,0.000343,0.914892,0.003443
1,0.940947,0.000237,0.915551,0.003887
2,0.941896,0.000438,0.923599,0.005817


In [16]:
result_acc = result[['mean_train_Accuracy', 'std_train_Accuracy','mean_test_Accuracy', 'std_test_Accuracy']]
result_acc

Unnamed: 0,mean_train_Accuracy,std_train_Accuracy,mean_test_Accuracy,std_test_Accuracy
0,0.885301,0.000671,0.859759,0.005028
1,0.878732,0.000692,0.859413,0.005221
2,0.884768,0.000933,0.864557,0.008885


Final model

In [17]:
# Serializing the best model.
with open('pipe.pkl', 'wb') as file:
    pickle.dump(best_model, file)

In [18]:
# Restoring the best model.
with open("pipe.pkl", "rb") as file:
    best_model = pickle.load(file)

In [19]:
# Testing final model.
predict = best_model.predict(X_test)
print("Accuracy of testing: ", accuracy_score(y_test, predict), "\n")  # 0.8765545831414094 (GPU) | 0.8747121142330723 (CPU)
print("Confusion Matrix:\n", confusion_matrix(y_test,predict), "\n")
print("Classification report:\n", classification_report(y_test,predict))

Accuracy of testing:  0.8747121142330723 

Confusion Matrix:
 [[4652  293]
 [ 523 1045]] 

Classification report:
               precision    recall  f1-score   support

           0       0.90      0.94      0.92      4945
           1       0.78      0.67      0.72      1568

    accuracy                           0.87      6513
   macro avg       0.84      0.80      0.82      6513
weighted avg       0.87      0.87      0.87      6513

