<a href="https://colab.research.google.com/github/xanasa14/MLImplementations/blob/master/XGBoostIncome.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle
import time
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import make_scorer, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBClassifier

In [29]:
# Loading the dataset and showing the five first records.
income = pd.read_csv("/content/drive/MyDrive/XGBoost/train.csv")
income.head(10)
len(income)

43957

In [57]:
income[income.isnull().any(axis=1)]
income['workclass'].isnull().sum()

income.dropna(subset = ["workclass"], inplace=True)
income.dropna(subset = ["native_country"], inplace=True)
income.dropna(subset = ["occupation"], inplace=True)


income[income.isnull().any(axis=1)]


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,high_income


In [58]:
# Verifying if there are missing values.
income.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40727 entries, 0 to 43956
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             40727 non-null  int64 
 1   workclass       40727 non-null  object
 2   fnlwgt          40727 non-null  int64 
 3   education       40727 non-null  object
 4   education_num   40727 non-null  int64 
 5   marital_status  40727 non-null  object
 6   occupation      40727 non-null  object
 7   relationship    40727 non-null  object
 8   race            40727 non-null  object
 9   gender          40727 non-null  object
 10  capital_gain    40727 non-null  int64 
 11  capital_loss    40727 non-null  int64 
 12  hours_per_week  40727 non-null  int64 
 13  native_country  40727 non-null  object
 14  high_income     40727 non-null  int8  
dtypes: int64(6), int8(1), object(8)
memory usage: 4.7+ MB


In [59]:
income.rename(
    columns={"income_>50K":"high_income","native-country":"native_country",
             "capital-gain":"capital_gain","capital-loss": "capital_loss",
             "educational-num":"education_num","marital-status":"marital_status",
             "hours-per-week":"hours_per_week"}
          ,inplace=True)


In [60]:
income.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40727 entries, 0 to 43956
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             40727 non-null  int64 
 1   workclass       40727 non-null  object
 2   fnlwgt          40727 non-null  int64 
 3   education       40727 non-null  object
 4   education_num   40727 non-null  int64 
 5   marital_status  40727 non-null  object
 6   occupation      40727 non-null  object
 7   relationship    40727 non-null  object
 8   race            40727 non-null  object
 9   gender          40727 non-null  object
 10  capital_gain    40727 non-null  int64 
 11  capital_loss    40727 non-null  int64 
 12  hours_per_week  40727 non-null  int64 
 13  native_country  40727 non-null  object
 14  high_income     40727 non-null  int8  
dtypes: int64(6), int8(1), object(8)
memory usage: 4.7+ MB


In [61]:
# The only step necessary to be done outside of pipeline.
# Converting the target column to categorical.
col = pd.Categorical(income.high_income)
income["high_income"] = col.codes
income["high_income"]

0        1
1        0
2        1
3        0
4        0
        ..
43952    1
43953    0
43954    0
43955    0
43956    0
Name: high_income, Length: 40727, dtype: int8

In [62]:
# Custom Transformer that extracts columns passed as argument to its constructor.
class FeatureSelector(BaseEstimator, TransformerMixin):
    # Class Constructor.
    def __init__( self, feature_names ):
        self.feature_names = feature_names

    # Returns self nothing else to do here.
    def fit( self, X, y = None):
        return self

    # Method that describes what we need this transformer to do.
    def transform( self, X, y = None):
        return X[self.feature_names]

In [63]:
# Converts certain features to categorical.
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    # Class constructor method that takes a boolean as its argument.
    def __init__(self, new_features=True):
        self.new_features = new_features

    # Returns self nothing else to do here.
    def fit( self, X, y = None):
        return self

    # Transformer method we wrote for this transformer.
    def transform(self, X , y = None):
        df = X.copy()
        if self.new_features:
            # Treats ? workclass as unknown.
            df['workclass']= df['workclass'].replace('?','Unknown')
            # Two many category level, convert just US and Non-US.
            df.loc[df['native_country'] != ' United-States', 'native_country'] = 'non_usa'

        # Converts columns to categorical.
        for name in df.columns.to_list():
            col = pd.Categorical(df[name])
            df[name] = col.codes

        # Returns numpy array.
        return df

In [64]:
# Global varibles.
seed = 42
num_folds = 10
scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}

In [65]:
# Split-out train/validation and test dataset.
X_train, X_test, y_train, y_test = train_test_split(income.drop(labels="high_income",axis=1),
                                                    income["high_income"],
                                                    test_size=0.20,
                                                    random_state=seed,
                                                    shuffle=True,
                                                    stratify=income["high_income"])

In [66]:
# Categrical features to pass down the categorical pipeline.
categorical_features = income.select_dtypes("object").columns.to_list()

# Numerical features to pass down the numerical pipeline.
numerical_features = income.select_dtypes("int64").columns.to_list()

# Defining the steps in the categorical pipeline.
categorical_pipeline = Pipeline(steps = [('cat_selector', FeatureSelector(categorical_features)),
                                         ('cat_transformer', CategoricalTransformer())])

# Defining the steps in the numerical pipeline.
numerical_pipeline = Pipeline(steps = [('num_selector', FeatureSelector(numerical_features)),
                                       ('std_scaler', MinMaxScaler())])

# Combining numerical and categorical piepline into one full big pipeline horizontally using FeatureUnion.
full_pipeline_preprocessing = FeatureUnion(transformer_list = [('categorical_pipeline', categorical_pipeline),
                                                               ('numerical_pipeline', numerical_pipeline)])

In [67]:
income.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,high_income
0,67,Private,366425,Doctorate,16,Divorced,Exec-managerial,Not-in-family,White,Male,99999,0,60,United-States,1
1,17,Private,244602,12th,8,Never-married,Other-service,Own-child,White,Male,0,0,15,United-States,0
2,31,Private,174201,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
3,58,State-gov,110199,7th-8th,4,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,0
4,25,State-gov,149248,Some-college,10,Never-married,Other-service,Not-in-family,Black,Male,0,0,40,United-States,0


In [68]:
# The full pipeline as a step in another pipeline with an estimator as the final step.
pipe = Pipeline(steps = [("full_pipeline", full_pipeline_preprocessing),
                         ("fs", SelectKBest()),
                         ("clf", XGBClassifier())])

# Creating a dictionary with the hyperparameters.
search_space = [
                {"clf": [RandomForestClassifier()],
                 "clf__n_estimators": [800],
                 "clf__criterion": ["gini", "entropy"],
                 "clf__max_leaf_nodes": [300],
                 "clf__random_state": [seed],
                 "clf__oob_score": [True],
                 "fs__score_func": [chi2],
                 "fs__k": [10]},
                {"clf": [XGBClassifier()],
                 "clf__n_estimators": [300],
                 "clf__max_depth": [4],
                 "clf__learning_rate": [0.1],
                 "clf__random_state": [seed],
                 "clf__subsample": [1],
                 "clf__colsample_bytree": [1],
#                  "clf__tree_method": ["gpu_hist"],  # For using the GPU.
                 "fs__score_func":[chi2],
                 "fs__k":[13]}
]

# Defining StratifiedKFold object.
kfold = StratifiedKFold(n_splits=num_folds, random_state=seed, shuffle=True)

#############################################################################
# return_train_score=True
# official documentation: "computing the scores on the training set can be
# computationally expensive and is not strictly required to
# select the parameters that yield the best generalization performance".
#############################################################################

# Creating the GridSearchCV object.
grid = GridSearchCV(estimator=pipe, 
                    param_grid=search_space,
                    cv=kfold,
                    scoring=scoring,
                    return_train_score=True,
                    n_jobs=-1,
                    refit="AUC")

# Getting the time start.
tmp = time.time()

# Fitting the GridSearchCV object.
best_model = grid.fit(X_train, y_train)

# Printing the time spent.
print("CPU Training Time: %s seconds" % (str(time.time() - tmp)))  # 311.7510848045349 seconds
# print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))  # 301.1867105960846 seconds

CPU Training Time: 360.24457144737244 seconds


In [69]:
print("Best: %f using %s" % (best_model.best_score_,best_model.best_params_))  # 0.920415 (GPU) | 0.920279 (CPU)


Best: 0.926018 using {'clf': XGBClassifier(max_depth=4, n_estimators=300, random_state=42), 'clf__colsample_bytree': 1, 'clf__learning_rate': 0.1, 'clf__max_depth': 4, 'clf__n_estimators': 300, 'clf__random_state': 42, 'clf__subsample': 1, 'fs__k': 13, 'fs__score_func': <function chi2 at 0x7f1b0ad18dd0>}


In [70]:
# Analyzing the results of cross-validation process.
result = pd.DataFrame(best_model.cv_results_)
result.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_clf__criterion,param_clf__max_leaf_nodes,param_clf__n_estimators,param_clf__oob_score,param_clf__random_state,param_fs__k,param_fs__score_func,param_clf__colsample_bytree,param_clf__learning_rate,param_clf__max_depth,param_clf__subsample,params,split0_test_AUC,split1_test_AUC,split2_test_AUC,split3_test_AUC,split4_test_AUC,split5_test_AUC,split6_test_AUC,split7_test_AUC,split8_test_AUC,split9_test_AUC,mean_test_AUC,std_test_AUC,rank_test_AUC,split0_train_AUC,split1_train_AUC,split2_train_AUC,split3_train_AUC,split4_train_AUC,split5_train_AUC,split6_train_AUC,split7_train_AUC,split8_train_AUC,split9_train_AUC,mean_train_AUC,std_train_AUC,split0_test_Accuracy,split1_test_Accuracy,split2_test_Accuracy,split3_test_Accuracy,split4_test_Accuracy,split5_test_Accuracy,split6_test_Accuracy,split7_test_Accuracy,split8_test_Accuracy,split9_test_Accuracy,mean_test_Accuracy,std_test_Accuracy,rank_test_Accuracy,split0_train_Accuracy,split1_train_Accuracy,split2_train_Accuracy,split3_train_Accuracy,split4_train_Accuracy,split5_train_Accuracy,split6_train_Accuracy,split7_train_Accuracy,split8_train_Accuracy,split9_train_Accuracy,mean_train_Accuracy,std_train_Accuracy
0,21.845763,0.548558,1.119966,0.021196,RandomForestClassifier(),gini,300.0,800,True,42,10,<function chi2 at 0x7f1b0ad18dd0>,,,,,"{'clf': RandomForestClassifier(), 'clf__criter...",0.914593,0.91786,0.914709,0.913009,0.917757,0.916189,0.917938,0.915477,0.918561,0.910549,0.915664,0.002416,3,0.936031,0.935632,0.936102,0.936055,0.935714,0.936064,0.935844,0.935788,0.935955,0.936043,0.935923,0.000158,0.860387,0.853591,0.861265,0.860651,0.864334,0.856967,0.860344,0.856967,0.861572,0.855433,0.859151,0.003109,2,0.877873,0.879378,0.878764,0.878184,0.877639,0.878559,0.878423,0.878764,0.87798,0.878218,0.878378,0.000485
1,23.949065,0.253911,1.216153,0.023131,RandomForestClassifier(),entropy,300.0,800,True,42,10,<function chi2 at 0x7f1b0ad18dd0>,,,,,"{'clf': RandomForestClassifier(), 'clf__criter...",0.914707,0.918127,0.916191,0.913413,0.918103,0.917032,0.918466,0.916041,0.918639,0.911546,0.916226,0.002254,2,0.936851,0.936459,0.936768,0.937005,0.936627,0.936722,0.936669,0.936728,0.936635,0.936979,0.936744,0.000158,0.858546,0.857888,0.860344,0.859423,0.865562,0.855433,0.859423,0.857274,0.861572,0.855433,0.85909,0.002857,3,0.872451,0.87283,0.872626,0.872626,0.872762,0.872591,0.873205,0.873376,0.872148,0.872762,0.872738,0.000333
2,8.194945,0.351466,0.13537,0.008217,"XGBClassifier(max_depth=4, n_estimators=300, r...",,,300,,42,13,<function chi2 at 0x7f1b0ad18dd0>,1.0,0.1,4.0,1.0,"{'clf': XGBClassifier(max_depth=4, n_estimator...",0.925652,0.921054,0.929211,0.921785,0.927289,0.931629,0.929918,0.927117,0.931171,0.915351,0.926018,0.004927,1,0.943112,0.942666,0.943038,0.943481,0.942463,0.942314,0.942745,0.942638,0.942105,0.942745,0.942731,0.000381,0.867751,0.854512,0.872007,0.863106,0.871087,0.873542,0.872928,0.869245,0.878146,0.857581,0.867991,0.007068,1,0.885137,0.885482,0.8848,0.885823,0.884766,0.883675,0.88538,0.884903,0.883675,0.884493,0.884814,0.000678


In [71]:
# ROC of best model.
result[result.rank_test_AUC == 1][['mean_train_AUC', 'std_train_AUC','mean_test_AUC', 'std_test_AUC']]

Unnamed: 0,mean_train_AUC,std_train_AUC,mean_test_AUC,std_test_AUC
2,0.942731,0.000381,0.926018,0.004927


In [72]:
# Training score much higher than test score.
# The standard deviation of the test score is large.
result_auc = result[['mean_train_AUC', 'std_train_AUC','mean_test_AUC', 'std_test_AUC']]
result_auc

Unnamed: 0,mean_train_AUC,std_train_AUC,mean_test_AUC,std_test_AUC
0,0.935923,0.000158,0.915664,0.002416
1,0.936744,0.000158,0.916226,0.002254
2,0.942731,0.000381,0.926018,0.004927


In [73]:
result_acc = result[['mean_train_Accuracy', 'std_train_Accuracy','mean_test_Accuracy', 'std_test_Accuracy']]
result_acc

Unnamed: 0,mean_train_Accuracy,std_train_Accuracy,mean_test_Accuracy,std_test_Accuracy
0,0.878378,0.000485,0.859151,0.003109
1,0.872738,0.000333,0.85909,0.002857
2,0.884814,0.000678,0.867991,0.007068


In [74]:
# Serializing the best model.
with open('pipe.pkl', 'wb') as file:
    pickle.dump(best_model, file)

In [75]:
# Restoring the best model.
with open("pipe.pkl", "rb") as file:
    best_model = pickle.load(file)

In [76]:
# Testing final model.
predict = best_model.predict(X_test)
print("Accuracy of testing: ", accuracy_score(y_test, predict), "\n")  # 0.8765545831414094 (GPU) | 0.8747121142330723 (CPU)
print("Confusion Matrix:\n", confusion_matrix(y_test,predict), "\n")
print("Classification report:\n", classification_report(y_test,predict))

Accuracy of testing:  0.8685244291676897 

Confusion Matrix:
 [[5755  372]
 [ 699 1320]] 

Classification report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.91      6127
           1       0.78      0.65      0.71      2019

    accuracy                           0.87      8146
   macro avg       0.84      0.80      0.81      8146
weighted avg       0.86      0.87      0.86      8146

