# Login

In [None]:
# !pip install scikit-plot 
# !pip install datacleaner
# !pip install shap
# !pip install --upgrade scikit-learn
# !pip install xgboost
# !pip install pandas
# !pip install numpy
# !pip install  wandb
# !pip install seaborn

In [None]:
# Mount Google Drive and expose the shared data folder under /content/data.
import os, sys
from google.colab import drive

drive.mount('/content/drive')
data_path = '/content/data'

# os.symlink raises FileExistsError when the link is already present, which
# made this cell fail on every re-run; only create the link when it is missing.
if not os.path.lexists(data_path):
    os.symlink('/content/drive/MyDrive/Colab Notebooks/data', data_path)

# Re-running the cell should not stack duplicate entries on sys.path.
if data_path not in sys.path:
    sys.path.insert(0, data_path)

Mounted at /content/drive


# Setup

In [None]:
# Essentials
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scikitplot as skplt

# Data preprocessing
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer

# Tests
import scipy as sp
import sklearn

# Experimental 
from sklearn.experimental import enable_halving_search_cv # noqa

# Models
from xgboost import XGBClassifier

# Performance indicators and model selection
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, learning_curve
from sklearn.model_selection import HalvingRandomSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score
from sklearn.metrics import r2_score
from sklearn.feature_selection import RFECV
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_objective

# Interpretation
import shap

# Utilities
from sklearn import set_config
from sklearn.model_selection import ParameterGrid
import  warnings
set_config(display='diagram')
pd.set_option('display.max_columns', None)
import winsound
# import wandb
# wandb.init()

In [None]:
# Useful Functions
def model_performance(X0, X1, Y0, Y1, model):
    """Print ROC AUC and classification reports for two data splits.

    Parameters
    ----------
    X0, Y0 : features and labels reported under the "Train" heading.
    X1, Y1 : features and labels reported under the "Test" heading.
    model : fitted classifier providing ``predict`` and ``predict_proba``.

    Returns
    -------
    None. Everything is written to stdout.
    """
    rule = '-------------------------'

    # Hard class labels feed the classification reports.
    labels_first = model.predict(X0)
    labels_second = model.predict(X1)

    # Column 1 of predict_proba is the score handed to roc_auc_score.
    scores_first = model.predict_proba(X0)[:, 1]
    scores_second = model.predict_proba(X1)[:, 1]

    print('\nROC AUC Train', roc_auc_score(Y0, scores_first).round(3))
    print('ROC AUC Test', roc_auc_score(Y1, scores_second).round(3))
    print(rule)

    sections = (('Train', Y0, labels_first), ('Test', Y1, labels_second))
    for heading, truth, predicted in sections:
        print('\nClassification Report ' + heading)
        print(classification_report(truth, predicted))
        print(rule)

In [None]:
class OutlierWinsorization(BaseEstimator,TransformerMixin):
    """Winsorize features: clip each column to quantile bounds learned in ``fit``.

    Parameters
    ----------
    bound_threshold : float, default=0.95
        Quantile used as the upper cap; the lower floor is the
        ``1 - bound_threshold`` quantile.

    Attributes
    ----------
    lower_bound, upper_bound
        Quantiles computed by ``fit`` along axis 0 of the training data.
    """
    def __init__(self, bound_threshold=0.95):
        self.bound_threshold = bound_threshold

    def outlier_Winsor(self, X, y=None):
        """Return a clipped copy of ``X``; the caller's data is not modified.

        Values below ``lower_bound`` are raised to it first, then values above
        ``upper_bound`` are lowered to it. A DataFrame input comes back as a
        DataFrame with the same index and columns; anything else is returned
        as a NumPy array.
        """
        values = np.asarray(X)
        # np.where builds new arrays, so the input is never written to.
        floored = np.where(values < self.lower_bound, self.lower_bound, values)
        clipped = np.where(floored > self.upper_bound, self.upper_bound, floored)
        if isinstance(X, pd.DataFrame):
            return pd.DataFrame(clipped, index=X.index, columns=X.columns)
        return clipped

    def fit(self, X, y=None):
        """Learn the per-column lower and upper quantile bounds from ``X``."""
        self.lower_bound = np.quantile(X, (1 - self.bound_threshold), axis=0)
        self.upper_bound = np.quantile(X, self.bound_threshold, axis=0)
        return self

    def transform(self, X, y=None):
        """Clip ``X`` to the bounds learned by ``fit``."""
        return self.outlier_Winsor(X)

class LowFreqCombiner(BaseEstimator, TransformerMixin):
    """Replace infrequent values in each column with a single fill value.

    Parameters
    ----------
    threshold : float, default=0.01
        Values whose relative frequency in a column is less than or equal to
        this threshold are replaced.
    fill_value : default=-1
        Replacement written in place of the infrequent values.

    Notes
    -----
    ``fit`` stores nothing: frequencies are computed from whatever data is
    passed to ``transform``.
    """
    def __init__(self, threshold=0.01, fill_value=-1):
        self.threshold = threshold
        self.fill_value = fill_value

    def combine_low_freq(self, col, y=None):
        """Return ``col`` with its low-frequency values replaced by ``fill_value``."""
        col = np.array(col)
        values, counts = np.unique(col, return_counts=True)
        rare_values = values[(counts / len(col)) <= self.threshold]
        # np.isin is the supported replacement for the deprecated np.in1d.
        is_rare = np.isin(col, rare_values)
        return np.where(is_rare, self.fill_value, col)

    def fit(self, X, y=None):
        """No-op fit; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Apply ``combine_low_freq`` independently to every column of ``X``."""
        return np.apply_along_axis(self.combine_low_freq, 0, X)

class AddColumnNames(BaseEstimator, TransformerMixin):
    """Wrap transformed data in a DataFrame labelled via ``get_feature_names``.

    Parameters
    ----------
    preprocessor
        Object passed to ``get_feature_names`` to obtain the column labels.
    """

    def __init__(self, preprocessor):
        self.preprocessor = preprocessor

    def fit(self, X, y=None):
        """No-op fit; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` as a DataFrame whose columns are the feature names."""
        column_labels = get_feature_names(self.preprocessor)
        labelled = pd.DataFrame(X, columns=column_labels)
        return labelled

In [None]:
def get_feature_names(column_transformer):
    """Get feature names from all transformers.

    Each name is built as ``<transformer name>__<feature>``. Transformers that
    expose ``get_feature_names`` supply their own feature labels; for the
    others a warning is emitted and the input column labels are used instead.
    Nested pipelines are handled recursively.

    Parameters
    ----------
    column_transformer : ColumnTransformer or Pipeline
        Object whose private ``_iter`` method yields its transformers.
        NOTE(review): ``_iter`` is private scikit-learn API; its signature may
        differ between releases -- confirm against the installed version.

    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.
    """

    def _prefixed(name, labels):
        # Formatting (instead of ``+``) keeps integer / non-string column
        # labels from raising TypeError.
        return ["%s__%s" % (name, label) for label in labels]

    def get_names(name, trans, column):
        """Names contributed by a single (non-pipeline) transformer."""
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []
        if trans == 'passthrough':
            if hasattr(column_transformer, '_df_columns'):
                if ((not isinstance(column, slice))
                        and all(isinstance(col, str) for col in column)):
                    return column
                else:
                    return column_transformer._df_columns[column]
            else:
                indices = np.arange(column_transformer._n_features)
                return ['x%d' % i for i in indices[column]]
        if not hasattr(trans, 'get_feature_names'):
            # No naming method available: warn and fall back to the input
            # column labels when there are any.
            warnings.warn("Transformer %s (type %s) does not "
                                 "provide get_feature_names. "
                                 "Will return input column names if available"
                                 % (str(name), type(trans).__name__))
            if column is None:
                return []
            return _prefixed(name, column)

        return _prefixed(name, trans.get_feature_names())

    feature_names = []

    # Pipelines yield (index, name, transformer) and carry no column
    # information, so pad each entry to the 4-tuple layout used below.
    if isinstance(column_transformer, sklearn.pipeline.Pipeline):
        l_transformers = [(name, trans, None, None)
                          for _, name, trans in column_transformer._iter()]
    else:
        # For column transformers, follow the original method
        l_transformers = list(column_transformer._iter(fitted=True))

    for name, trans, column, _ in l_transformers:
        if isinstance(trans, sklearn.pipeline.Pipeline):
            # Recursive call on pipeline
            _names = get_feature_names(trans)
            # If no step in the pipeline produced names, fall back to the
            # input columns -- unless there are none (pipeline nested in a
            # pipeline), where iterating over None used to raise TypeError.
            if len(_names) == 0 and column is not None:
                _names = _prefixed(name, column)
            feature_names.extend(_names)
        else:
            feature_names.extend(get_names(name, trans, column))

    return feature_names

In [None]:
class Generalizer():
    """Pick the parameter set with the smallest train/test ROC AUC gap.

    The data is split once (stratified) in ``__init__``. ``generalize`` then
    evaluates every combination of a parameter grid and keeps the one whose
    train/test AUC gap is smallest, among those whose 5-fold CV AUC stays
    within a tolerance of a reference score.

    Parameters
    ----------
    model : estimator with ``set_params``, ``fit`` and ``predict_proba``.
    X, y : full feature table and labels.
    fit_params : dict
        Extra keyword arguments forwarded to ``model.fit`` and to
        ``cross_val_score``.
    test_size : float, default=0.25
    random_state : int, default=42
    """
    def __init__(self, model, X, y, fit_params, test_size=0.25, random_state=42):
        self.model = model
        self.fit_params = fit_params
        self.X = X
        self.y = y
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=test_size, stratify=y, 
                                                                                random_state=random_state)

    def generalize(self, model_parameter_search, base_cv_score, cv_diff_threshold=0.005):
        """Evaluate every parameter combination and remember the best one.

        Parameters
        ----------
        model_parameter_search : dict
            Grid passed to ``ParameterGrid``.
        base_cv_score : float
            Reference CV score; a candidate qualifies only when its own CV
            score differs from it by less than ``cv_diff_threshold``.
        cv_diff_threshold : float, default=0.005

        Returns
        -------
        self, with these attributes set:
            ``best_params``  - qualifying combination with the smallest gap
                               (the first grid entry when none qualifies);
            ``lowest_diff``  - that gap (``inf`` when none qualifies);
            ``differences``  - one ``{'diff', 'cv_diff'}`` dict per combination.
        """
        param_grid = list(ParameterGrid(model_parameter_search))
        self.differences = []
        # Start from +inf rather than base_cv_score: the latter is a CV score,
        # not a train/test gap, so using it as the initial minimum rejected
        # valid candidates whose gap exceeded it and reported a misleading
        # "lowest difference" when nothing qualified.
        self.lowest_diff = float('inf')
        self.best_params = param_grid[0]
        for params in param_grid:
            self.model.set_params(**params)
            self.model.fit(self.X_train, self.y_train, **self.fit_params)
            y_train_pred = self.model.predict_proba(self.X_train)[:,1]
            y_test_pred = self.model.predict_proba(self.X_test)[:,1]
            train_score = roc_auc_score(self.y_train, y_train_pred)
            test_score = roc_auc_score(self.y_test, y_test_pred)
            # Train/test gap: the quantity being minimised.
            diff = abs(train_score - test_score)
            cv_score = cross_val_score(self.model, self.X, self.y, cv=5, scoring='roc_auc', 
                                       fit_params=self.fit_params).mean()
            # Distance from the reference CV score: the acceptance criterion.
            cv_diff = abs(base_cv_score - cv_score)
            if cv_diff < cv_diff_threshold and diff < self.lowest_diff:
                self.lowest_diff = diff
                self.best_params = params
            self.differences.append({'diff':diff, 'cv_diff':cv_diff})
        return self

# Alternative models with pipelines

In [None]:
# Load the data; the last row of the file is discarded.
dataset = pd.read_csv('./data/email_marketing.csv')
dataset = dataset.drop(index=dataset.index[-1:])

# Separate the target column from the features.
target = 'RESPONSE_FLAG'
y = dataset[target]
X = dataset.drop(columns=target)

# Stratified 75/25 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Numeric columns: constant imputation followed by robust scaling.
numeric_transformer = Pipeline([
    ('NumericImputer', SimpleImputer(strategy='constant')),
    ('RobustScaler', RobustScaler(quantile_range=(5.0, 95.0))),
])

# Object columns: imputation followed by ordinal encoding; categories unseen
# at fit time are encoded as -1.
categorical_transformer = Pipeline([
    ('CategImputer', SimpleImputer(strategy='most_frequent', fill_value='Missing')),
    ('OrdinalEncoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Route columns to the two sub-pipelines by dtype; anything left over is
# passed through untouched.
preprocessor = ColumnTransformer(
    [
        ('numeric', numeric_transformer, selector(dtype_exclude="object")),
        ('categorical', categorical_transformer, selector(dtype_include="object")),
    ],
    remainder='passthrough',
    n_jobs=-1,
)

In [None]:
# Preprocessing options explored by the search (keys address pipeline steps).
preprocess_params = {
    'preprocessor__numeric__NumericImputer__fill_value': [-9999, -1, 0, 9999],
    'preprocessor__numeric__RobustScaler__quantile_range': [
        (25.0, 75.0), (10.0, 90.0), (5.0, 95.0), (1.0, 99.0), (0.1, 99.9),
    ],
    'preprocessor__categorical__CategImputer__strategy': ['most_frequent', 'constant'],
}

# XGBoost options: the first six entries are searched, the single-value
# lists pin the remaining settings.
pxgb_params = {
    'max_depth': range(2, 10),
    **{fraction_name: np.arange(0.1, 1.1, 0.1)
       for fraction_name in ('colsample_bytree', 'colsample_bylevel', 'subsample')},
    'n_estimators': [75, 100, 150, 200, 250],
    'learning_rate': [0.1, 0.05, 0.025, 0.001],
    'min_child_weight': [1],
    'reg_lambda': [1],
    'gamma': [1.0],
    'use_label_encoder': [False],
    'random_state': [42],
    'n_jobs': [-1],
    'tree_method': ['gpu_hist'],
    'gpu_id': [0],
}

# Prefix classifier options with the pipeline step name, then append the
# preprocessing options.
pxgb_gs_params = {f'classifier__{key}': val for key, val in pxgb_params.items()}
pxgb_gs_params.update(preprocess_params)

PXBG = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier()),
])

In [None]:
%%time
# Keyword arguments forwarded to XGBClassifier.fit.
# NOTE(review): the early-stopping eval_set is built from X_test / y_test, so
# the held-out split takes part in fitting -- confirm this is intended.
xgb_fit_params = dict(
    eval_metric='auc',
    eval_set=[(PXBG.named_steps['preprocessor'].fit(X_train).transform(X_test), y_test)],
    early_stopping_rounds=10,
    verbose=False,
)

# Route the fit arguments to the 'classifier' step of the pipeline.
pxgb_fit_params = {f'classifier__{key}': val for key, val in xgb_fit_params.items()}

# Successive-halving random search over the combined search space.
pxgb_hrs = HalvingRandomSearchCV(
    estimator=PXBG,
    param_distributions=pxgb_gs_params,
    scoring="roc_auc",
    factor=2,
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=0,
    resource='n_samples',
    aggressive_elimination=True,
    min_resources=50,
    error_score='raise',
)

pxgb_hrs.fit(X_train, y_train, **pxgb_fit_params)
print('Best Params:', pxgb_hrs.best_params_)
print('AUC Score:', round(pxgb_hrs.best_score_, 4))

Best Params: {'preprocessor__numeric__RobustScaler__quantile_range': (5.0, 95.0), 'preprocessor__numeric__NumericImputer__fill_value': 9999, 'preprocessor__categorical__CategImputer__strategy': 'constant', 'classifier__use_label_encoder': False, 'classifier__tree_method': 'gpu_hist', 'classifier__subsample': 0.7000000000000001, 'classifier__reg_lambda': 1, 'classifier__random_state': 42, 'classifier__n_jobs': -1, 'classifier__n_estimators': 250, 'classifier__min_child_weight': 1, 'classifier__max_depth': 3, 'classifier__learning_rate': 0.1, 'classifier__gpu_id': 0, 'classifier__gamma': 1.0, 'classifier__colsample_bytree': 0.5, 'classifier__colsample_bylevel': 1.0}
AUC Score: 0.7737
Wall time: 10min 32s


In [None]:
# Render the fitted search object, then report its train/test metrics.
display(pxgb_hrs)
model_performance(X0=X_train, X1=X_test, Y0=y_train, Y1=y_test, model=pxgb_hrs)


ROC AUC Train 0.824
ROC AUC Test 0.767
-------------------------

Classification Report Train
              precision    recall  f1-score   support

           0       0.86      0.99      0.92     17137
           1       0.73      0.18      0.29      3413

    accuracy                           0.85     20550
   macro avg       0.79      0.58      0.60     20550
weighted avg       0.84      0.85      0.81     20550

-------------------------

Classification Report Test
              precision    recall  f1-score   support

           0       0.85      0.98      0.91      5713
           1       0.59      0.13      0.21      1137

    accuracy                           0.84      6850
   macro avg       0.72      0.55      0.56      6850
weighted avg       0.81      0.84      0.79      6850

-------------------------


In [None]:
%%time
# Trimming dataset: recursive feature elimination on the best pipeline's
# classifier, using the features produced by its fitted preprocessor.
clf = pxgb_hrs.best_estimator_.named_steps['classifier']
prep = pxgb_hrs.best_estimator_.named_steps['preprocessor']

# Deliberately not named `selector`: that name is the imported
# make_column_selector alias used to build the ColumnTransformer, and
# rebinding it here would break that cell on any later re-run.
rfecv_selector = RFECV(clf, step=1, cv=5, min_features_to_select=10, n_jobs=-1, verbose=0)
selector_results = rfecv_selector.fit(prep.transform(X_train), y_train)

# Feature names look like '<transformer>__<column>'. Split only on the first
# '__' so column names that themselves contain '__' are kept whole.
best_features = [f.split('__', 1)[1] for f in np.array(get_feature_names(prep))[selector_results.support_]]
Xt_train = X_train[best_features]
Xt_test = X_test[best_features]







Wall time: 9min 18s


Transformer NumericImputer (type SimpleImputer) does not provide get_feature_names. Will return input column names if available
Transformer RobustScaler (type RobustScaler) does not provide get_feature_names. Will return input column names if available
Transformer CategImputer (type SimpleImputer) does not provide get_feature_names. Will return input column names if available
Transformer OrdinalEncoder (type OrdinalEncoder) does not provide get_feature_names. Will return input column names if available


In [None]:
%%time

# Pull both steps out of the current best pipeline.
clf, prep = (pxgb_hrs.best_estimator_.named_steps[step_name]
             for step_name in ('classifier', 'preprocessor'))

# Keyword arguments forwarded to XGBClassifier.fit; the preprocessor is refit
# on the trimmed training columns to build the early-stopping set.
# NOTE(review): that set comes from Xt_test / y_test -- confirm this is
# intended.
xgb_fit_params = dict(
    eval_metric='auc',
    eval_set=[(prep.fit(Xt_train).transform(Xt_test), y_test)],
    early_stopping_rounds=10,
    verbose=False,
)
pxgb_fit_params = {f'classifier__{key}': val for key, val in xgb_fit_params.items()}

# Re-run the same search object on the trimmed feature set.
pxgb_hrs.fit(Xt_train, y_train, **pxgb_fit_params)
print('Best Params:', pxgb_hrs.best_params_)
print('AUC Score:', round(pxgb_hrs.best_score_, 4))
model_performance(X0=Xt_train, X1=Xt_test, Y0=y_train, Y1=y_test, model=pxgb_hrs)

# Audible signal that the cell has finished.
winsound.Beep(frequency=2500, duration=1500)

Best Params: {'preprocessor__numeric__RobustScaler__quantile_range': (5.0, 95.0), 'preprocessor__numeric__NumericImputer__fill_value': 9999, 'preprocessor__categorical__CategImputer__strategy': 'constant', 'classifier__use_label_encoder': False, 'classifier__tree_method': 'gpu_hist', 'classifier__subsample': 0.7000000000000001, 'classifier__reg_lambda': 1, 'classifier__random_state': 42, 'classifier__n_jobs': -1, 'classifier__n_estimators': 250, 'classifier__min_child_weight': 1, 'classifier__max_depth': 3, 'classifier__learning_rate': 0.1, 'classifier__gpu_id': 0, 'classifier__gamma': 1.0, 'classifier__colsample_bytree': 0.5, 'classifier__colsample_bylevel': 1.0}
AUC Score: 0.7771

ROC AUC Train 0.818
ROC AUC Test 0.769
-------------------------

Classification Report Train
              precision    recall  f1-score   support

           0       0.86      0.99      0.92     17137
           1       0.71      0.19      0.29      3413

    accuracy                           0.85     20

In [None]:
# Generalizing the model to remove overfitting
# Candidate classifier settings: only learning_rate has more than one value.
model_params = dict(
    max_depth=[3], colsample_bytree=[0.7],
    colsample_bylevel=[0.7], use_label_encoder=[False],
    subsample=[0.7], random_state=[42], gpu_id=[0],
    tree_method=['gpu_hist'], n_jobs=[-1],
    n_estimators=[120], learning_rate=[0.15, 0.1, 0.05, 0.01],
    min_child_weight=[6],
    reg_lambda=[9],
    gamma=[19],
)

# Preprocessing settings, each fixed to a single value.
prep_params = {
    'preprocessor__numeric__NumericImputer__fill_value': [9999],
    'preprocessor__numeric__RobustScaler__quantile_range': [(5.0, 95.0)],
    'preprocessor__categorical__CategImputer__strategy': ['constant'],
}

# Keyword arguments forwarded to XGBClassifier.fit.
fit_params = dict(
    eval_metric='auc',
    eval_set=[(prep.fit(Xt_train).transform(Xt_test), y_test)],
    early_stopping_rounds=10,
    verbose=False,
)

# Address classifier options and fit arguments through the pipeline step name.
parameter_combinations = {f'classifier__{key}': val for key, val in model_params.items()}
parameter_combinations.update(prep_params)
gfit_params = {f'classifier__{key}': val for key, val in fit_params.items()}

gneral_PXBG = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier()),
])

# NOTE(review): base_cv_score is hard-coded here -- confirm it still matches
# the CV score of the search it is meant to reference.
general_search = Generalizer(gneral_PXBG, X[best_features], y, gfit_params)
general_search.generalize(
    parameter_combinations,
    base_cv_score=0.7787,
    cv_diff_threshold=0.01,
)
print('Best params:', general_search.best_params)
print('Lowest Difference:', round(general_search.lowest_diff, 3))

# Audible signal that the cell has finished.
winsound.Beep(frequency=2500, duration=2000)

Best params: {'classifier__colsample_bylevel': 0.7, 'classifier__colsample_bytree': 0.7, 'classifier__gamma': 19, 'classifier__gpu_id': 0, 'classifier__learning_rate': 0.05, 'classifier__max_depth': 3, 'classifier__min_child_weight': 6, 'classifier__n_estimators': 120, 'classifier__n_jobs': -1, 'classifier__random_state': 42, 'classifier__reg_lambda': 9, 'classifier__subsample': 0.7, 'classifier__tree_method': 'gpu_hist', 'classifier__use_label_encoder': False, 'preprocessor__categorical__CategImputer__strategy': 'constant', 'preprocessor__numeric__NumericImputer__fill_value': 9999, 'preprocessor__numeric__RobustScaler__quantile_range': (5.0, 95.0)}
Lowest Difference: 0.033


In [None]:
# Manual tuning: final, hand-picked configuration.
final_preprocess_params = {
    'preprocessor__numeric__NumericImputer__fill_value': 9999,
    'preprocessor__numeric__RobustScaler__quantile_range': (5.0, 95.0),
    'preprocessor__categorical__CategImputer__strategy': 'constant',
}

final_pxgb_params = dict(
    max_depth=3, colsample_bytree=0.7,
    colsample_bylevel=0.7,
    subsample=0.7,
    n_estimators=120,
    learning_rate=0.05,
    min_child_weight=6, reg_lambda=9, gamma=19,
    use_label_encoder=False, random_state=42,
    n_jobs=-1, tree_method='gpu_hist', gpu_id=0,
)

# Classifier options are addressed through the 'classifier' pipeline step.
final_params = {f'classifier__{key}': val for key, val in final_pxgb_params.items()}
final_params.update(final_preprocess_params)

# No eval_set / early stopping for the final fit.
final_xgb_fit_params = dict(eval_metric='auc', verbose=False)
final_fit_params = {f'classifier__{key}': val
                    for key, val in final_xgb_fit_params.items()}

final_PXBG = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier()),
])

final_PXBG.set_params(**final_params)
final_PXBG.fit(Xt_train, y_train, **final_fit_params)
model_performance(X0=Xt_train, X1=Xt_test, Y0=y_train, Y1=y_test, model=final_PXBG)

# 5-fold cross-validated AUC on the full table restricted to the kept features.
print(
    'AUC 5-fold CV score:',
    round(
        cross_val_score(
            final_PXBG, X[best_features], y,
            cv=5, scoring='roc_auc', fit_params=final_fit_params,
        ).mean(),
        4,
    ),
)


ROC AUC Train 0.788
ROC AUC Test 0.755
-------------------------

Classification Report Train
              precision    recall  f1-score   support

           0       0.85      0.99      0.91     17137
           1       0.71      0.10      0.18      3413

    accuracy                           0.84     20550
   macro avg       0.78      0.55      0.55     20550
weighted avg       0.82      0.84      0.79     20550

-------------------------

Classification Report Test
              precision    recall  f1-score   support

           0       0.84      0.99      0.91      5713
           1       0.64      0.08      0.15      1137

    accuracy                           0.84      6850
   macro avg       0.74      0.54      0.53      6850
weighted avg       0.81      0.84      0.79      6850

-------------------------
AUC 5-fold CV score: 0.7704
