# IMPORT DATA

In [1]:
########################################
# importing packages
########################################

# all libraries
import pandas as pd                                     # data science essentials
import matplotlib.pyplot as plt                         # data visualization
import seaborn as sns                                   # enhanced data visualization
import numpy as np                                      # contruct arrays

# classficiation packages
import statsmodels.formula.api as smf                   # logistic regression
from sklearn.linear_model import LogisticRegression     # logistic regression
from sklearn.model_selection import train_test_split    # train-test split
from sklearn.tree import DecisionTreeClassifier         # decision trees
from sklearn.ensemble import RandomForestClassifier     # random forest  
from sklearn.ensemble import GradientBoostingClassifier # gbm 

# tuning packages  
from sklearn.model_selection import RandomizedSearchCV  # hyperparameter tuning
from sklearn.metrics import make_scorer                 # customizable scorer

# evaluation metrics packages 
from sklearn.metrics import confusion_matrix            # confusion matrix
from sklearn.metrics import roc_auc_score               # auc score


########################################
# setting display options and loading data
########################################

# setting pandas print options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100)

# specify the path and file name
file = './datasets/Apprentice_Chef_Dataset.xlsx'

# read the file into Python
df = pd.read_excel(io=file)

# DEFINE FUNCTIONS

In [2]:
def text_split_feature(col, df, sep=' ', new_col_name='NUMBER_OF_NAMES'):
    """
Splits values in a string Series (as part of a DataFrame) and sums the number
of resulting items. Automatically appends summed column to original DataFrame.

PARAMETERS
----------
col          : column to split
df           : DataFrame where column is located
sep          : string sequence to split by, default ' '
new_col_name : name of new column after summing split, default
               'number_of_names'
"""
    
    df[new_col_name] = 0
    
    
    for index, val in df.iterrows():
        df.loc[index, new_col_name] = len(df.loc[index, col].split(sep = ' '))

# ENGINEER FEATURES

**Flag Trend-based Features:**   
Flagging features containing many zeroes and creating dummy variables for them.

In [3]:
# make new columns and assign 0 as placeholder
df['HAS_CANCELLATIONS_BEFORE_NOON'] = 0
df['HAS_CANCELLATIONS_AFTER_NOON']  = 0
df['HAS_WEEKLY_PLAN']               = 0
df['HAS_EARLY_DELIVERIES']          = 0
df['HAS_LATE_DELIVERIES']           = 0
df['HAS_TOTAL_PHOTOS_VIEWED']       = 0
df['HAS_MASTER_CLASSES_ATTENDED']   = 0

# iterate over each original column to change values in the new columns:
for index, value in df.iterrows():   
    
    # HAS_CANCELLATIONS_BEFORE_NOON
    if df.loc[index, 'CANCELLATIONS_BEFORE_NOON'] > 0:
        df.loc[index, 'HAS_CANCELLATIONS_BEFORE_NOON'] = 1
        
    # HAS_CANCELLATIONS_AFTER_NOON
    if df.loc[index, 'CANCELLATIONS_AFTER_NOON'] > 0:
        df.loc[index, 'HAS_CANCELLATIONS_AFTER_NOON'] = 1
        
    # HAS_WEEKLY_PLAN
    if df.loc[index, 'WEEKLY_PLAN'] > 0:
        df.loc[index, 'HAS_WEEKLY_PLAN'] = 1 
        
    # HAS_EARLY_DELIVERIES
    if df.loc[index, 'EARLY_DELIVERIES'] > 0:
        df.loc[index, 'HAS_EARLY_DELIVERIES'] = 1   
        
    # HAS_LATE_DELIVERIES
    if df.loc[index, 'LATE_DELIVERIES'] > 0:
        df.loc[index, 'HAS_LATE_DELIVERIES'] = 1
        
    # HAS_TOTAL_PHOTOS_VIEWED
    if df.loc[index, 'TOTAL_PHOTOS_VIEWED'] > 0:
        df.loc[index, 'HAS_TOTAL_PHOTOS_VIEWED'] = 1   
    
    # HAS_MASTER_CLASSES_ATTENDED
    if df.loc[index, 'MASTER_CLASSES_ATTENDED'] > 0:
        df.loc[index, 'HAS_MASTER_CLASSES_ATTENDED'] = 1   

**Split Features:**   
Splitting feature 'EMAIL' to show username and domain, and create dummy variable for domain groups.

In [4]:
# Step 1: split EMAIL

# create an empty placeholder list
placeholder_lst = []

# loop over each email address
for index, col in df.iterrows():
    
    # split email domain at '@'
    split_email = df.loc[index, 'EMAIL'].split(sep = '@')
    
    # append placeholder_lst with the results
    placeholder_lst.append(split_email)
    
# convert placeholder_lst into a DataFrame 
email_df = pd.DataFrame(placeholder_lst)

In [5]:
# Step 2: concatenate with original DataFrame

# rename column to concatenate
email_df.columns = ['account_name' , 'domain'] 

# concatenate 'domain' to  original DataFrame 
df = pd.concat([df, email_df['domain']], axis = 1)

In [6]:
# Step 3: aggregate emails into groups

# specify domain types 
professional = [    '@mmm.com',
                    '@amex.com',
                    '@apple.com',
                    '@boeing.com',
                    '@caterpillar.com',
                    '@chevron.com', 
                    '@cisco.com', 
                    '@cocacola.com', 
                    '@disney.com', 
                    '@dupont.com', 
                    '@exxon.com', 
                    '@ge.org',
                    '@goldmansacs.com',
                    '@homedepot.com', 
                    '@ibm.com', 
                    '@intel.com', 
                    '@jnj.com',
                    '@jpmorgan.com',
                    '@mcdonalds.com', 
                    '@merck.com', 
                    '@microsoft.com',
                    '@nike.com', 
                    '@pfizer.com', 
                    '@pg.com', 
                    '@travelers.com',
                    '@unitedtech.com',
                    '@unitedhealth.com', 
                    '@verizon.com', 
                    '@visa.com', 
                    '@walmart.com'       ]

personal   = [      '@gmail.com', 
                    '@yahoo.com', 
                    '@protonmail.com'    ]

junk       = [      '@me.com',
                    '@aol.com',
                    '@hotmail.com', 
                    '@live.com', 
                    '@msn.com',
                    '@passport.com'      ]

# create an empty placeholder list
placeholder_lst = []


# loop to group observations by domain type
for domain in df['domain']:
        if '@' + domain in personal:
            placeholder_lst.append('personal')
                  
        elif '@' + domain in junk:
            placeholder_lst.append('junk')
        
        elif '@' + domain in professional:
            placeholder_lst.append('professional')
            
        else:
            print('Unknown')

# concatenate with original DataFrame
df['group_domain'] = pd.Series(placeholder_lst)

In [7]:
# Step 4: get dummy variables for 'domain'

# one hot encode variable 
one_hot_domain    = pd.get_dummies(df['group_domain'])

# join coding together
df = df.join([one_hot_domain])

# save new columns
new_columns = df.columns

**Develop New Features:**

1) Count names and assign values to new column:

In [8]:
# New column: NUMBER_OF_NAMES

# split names using text_split_feature
text_split_feature('NAME', df, sep='')

2) Combine features to make new ones:

In [9]:
# creating a column for average cost per meal
df['AVG_PRICE_PER_ORDER'] = df['REVENUE']/df['TOTAL_MEALS_ORDERED']

In [10]:
# find the median to set thresholds:
df['WEEKLY_PLAN'].median()

# repeat step above for following features:
# professional
# UNIQUE_MEALS_PURCH
# TASTES_AND_PREFERENCES
# MEDIAN_MEAL_RATING
# AVG_PRICE_PER_ORDER

7.0

In [11]:
# make new columns with 0 as placeholder 
df['WEEKLY_WORKING']    = 0   # weekly plans using professional emails
df['UNIQUE_TASTE_PREF'] = 0   # made unique purchases based on prerferences 
df['MEDIAN_RATER']      = 0   # median rating based on cost per meal 


# iterate over each original column to change values in the new columns:
for index, value in df.iterrows(): 

    # WEEKLY_WORKING 
    if df.loc[index, 'WEEKLY_PLAN'] >= 7 and \
    df.loc[index, 'professional'] == 1:
        df.loc[index, 'WEEKLY_WORKING'] = 1      
    
    # UNIQUE_TASTE_PREF
    if df.loc[index, 'UNIQUE_MEALS_PURCH'] > 5 and \
    df.loc[index, 'TASTES_AND_PREFERENCES'] == 1:
        df.loc[index, 'UNIQUE_TASTE_PREF'] = 1 
    
    # MEDIAN_RATER
    if df.loc[index, 'MEDIAN_MEAL_RATING'] <= 3 and \
    df.loc[index, 'AVG_PRICE_PER_ORDER'] > 34:
        df.loc[index, 'MEDIAN_RATER'] = 1

# PREPARE FOR MODELING 

**Drop Unnecessary Features:**

In [12]:
# drop variables after they've been encoded
df = df.drop('NAME', axis = 1)
df = df.drop('EMAIL', axis = 1)
df = df.drop('FIRST_NAME', axis = 1)
df = df.drop('FAMILY_NAME', axis =1)
df = df.drop('domain', axis = 1)
df = df.drop('group_domain', axis = 1)

In [None]:
# check correlation with all x-variables
df_corr = df.corr(method = 'pearson').round(2)

df_corr['CROSS_SELL_SUCCESS'].sort_values(ascending = False)

**Train and Split Data for Testing:**

In [13]:
# making a copy of the dataset
df_data = pd.DataFrame.copy(df) 

# declare explanatory variables - drop y-variable only
df_data = df.drop(['CROSS_SELL_SUCCESS'], axis = 1)

# declare response variable
df_target =  df.loc[ : , 'CROSS_SELL_SUCCESS']

# train-test split with stratification
x_train, x_test, y_train, y_test = train_test_split(df_data,
                                                    df_target,
                                                    random_state = 219,
                                                    test_size    = 0.25,
                                                    stratify     = df_target)

# merge training data for statsmodels
df_train = pd.concat([x_train, y_train], axis = 1)

**Develop a Logistic Regression with Summary Results:**

In [14]:
# print all features in dataset 
for val in df_data:
    print(f" {val} + ")

 REVENUE + 
 TOTAL_MEALS_ORDERED + 
 UNIQUE_MEALS_PURCH + 
 CONTACTS_W_CUSTOMER_SERVICE + 
 PRODUCT_CATEGORIES_VIEWED + 
 AVG_TIME_PER_SITE_VISIT + 
 MOBILE_NUMBER + 
 CANCELLATIONS_BEFORE_NOON + 
 CANCELLATIONS_AFTER_NOON + 
 TASTES_AND_PREFERENCES + 
 PC_LOGINS + 
 MOBILE_LOGINS + 
 WEEKLY_PLAN + 
 EARLY_DELIVERIES + 
 LATE_DELIVERIES + 
 PACKAGE_LOCKER + 
 REFRIGERATED_LOCKER + 
 AVG_PREP_VID_TIME + 
 LARGEST_ORDER_SIZE + 
 MASTER_CLASSES_ATTENDED + 
 MEDIAN_MEAL_RATING + 
 AVG_CLICKS_PER_VISIT + 
 TOTAL_PHOTOS_VIEWED + 
 HAS_CANCELLATIONS_BEFORE_NOON + 
 HAS_CANCELLATIONS_AFTER_NOON + 
 HAS_WEEKLY_PLAN + 
 HAS_EARLY_DELIVERIES + 
 HAS_LATE_DELIVERIES + 
 HAS_TOTAL_PHOTOS_VIEWED + 
 HAS_MASTER_CLASSES_ATTENDED + 
 junk + 
 personal + 
 professional + 
 NUMBER_OF_NAMES + 
 AVG_PRICE_PER_ORDER + 
 WEEKLY_WORKING + 
 UNIQUE_TASTE_PREF + 
 MEDIAN_RATER + 


In [15]:
# create a logreg using significant variables only (p-value < 0.05)
logit_sig = smf.logit(formula = """  CROSS_SELL_SUCCESS ~
                                        junk + 
                                        professional + 
                                        NUMBER_OF_NAMES + 
                                        CANCELLATIONS_BEFORE_NOON + 
                                        MOBILE_NUMBER + 
                                        REFRIGERATED_LOCKER +
                                        CONTACTS_W_CUSTOMER_SERVICE + 
                                        MEDIAN_RATER + 
                                        UNIQUE_TASTE_PREF""",  
                                        data    = df_train)

# fitting the model object
results_sig = logit_sig.fit()

# checking the results SUMMARY
results_sig.summary()

Optimization terminated successfully.
         Current function value: 0.542122
         Iterations 6


0,1,2,3
Dep. Variable:,CROSS_SELL_SUCCESS,No. Observations:,1459.0
Model:,Logit,Df Residuals:,1449.0
Method:,MLE,Df Model:,9.0
Date:,"Mon, 25 Jan 2021",Pseudo R-squ.:,0.1367
Time:,23:46:31,Log-Likelihood:,-790.96
converged:,True,LL-Null:,-916.19
Covariance Type:,nonrobust,LLR p-value:,7.923e-49

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.7062,0.324,-5.270,0.000,-2.341,-1.072
junk,-1.3095,0.157,-8.339,0.000,-1.617,-1.002
professional,0.5960,0.144,4.129,0.000,0.313,0.879
NUMBER_OF_NAMES,0.5567,0.094,5.894,0.000,0.372,0.742
CANCELLATIONS_BEFORE_NOON,0.2791,0.046,6.007,0.000,0.188,0.370
MOBILE_NUMBER,0.9185,0.178,5.162,0.000,0.570,1.267
REFRIGERATED_LOCKER,0.5069,0.209,2.430,0.015,0.098,0.916
CONTACTS_W_CUSTOMER_SERVICE,0.0658,0.028,2.367,0.018,0.011,0.120
MEDIAN_RATER,-0.4282,0.126,-3.390,0.001,-0.676,-0.181


**Develop a dictionary:**

In [16]:
# create a dictionary to store candidate models with significant variables 

candidate_dict = {
 
 # significant variables only
 'logit_sig' : ['professional','junk','NUMBER_OF_NAMES',
                'CANCELLATIONS_BEFORE_NOON','MOBILE_NUMBER', 'MOBILE_LOGINS',
                'REFRIGERATED_LOCKER','CONTACTS_W_CUSTOMER_SERVICE',
                'MEDIAN_RATER', 'UNIQUE_TASTE_PREF',
                'CANCELLATIONS_AFTER_NOON'],
 
 'logit_sig2' : [ 'MOBILE_NUMBER', 'CANCELLATIONS_BEFORE_NOON', 
                  'TASTES_AND_PREFERENCES', 'PC_LOGINS', 'EARLY_DELIVERIES',
                  'REFRIGERATED_LOCKER', 'junk', 'professional', 
                  'NUMBER_OF_NAMES', 'CONTACTS_W_CUSTOMER_SERVICE']
    

}

# APPLY CLASSIFICATION MODELS

## LOGISTIC REGRESSION

In [17]:
# train/test split with the logit_sig variables
df_data   =  df.loc[ : , candidate_dict['logit_sig']]
df_target =  df.loc[ : , 'CROSS_SELL_SUCCESS']

# train/test split
x_train, x_test, y_train, y_test = train_test_split(
                                                    df_data,
                                                    df_target,
                                                    random_state = 219,
                                                    test_size    = 0.25,
                                                    stratify     = df_target)

In [74]:
########################################
# find best hyperparameters for tuning
########################################

# declaring a hyperparameter space
C_space          = pd.np.arange(0.1, 5.0, 0.1)
warm_start_space = [True, False]
solver_space     = ['newton-cg', 'sag', 'lbfgs']


# creating a hyperparameter grid
param_grid = {'C'          : C_space,
              'warm_start' : warm_start_space,
              'solver'     : solver_space}


# INSTANTIATING the model object without hyperparameters
lr_tuned = LogisticRegression(random_state = 219,
                              max_iter     = 1000)


# GridSearchCV object
lr_tuned_cv = RandomizedSearchCV(estimator           = lr_tuned,   # the model object
                                 param_distributions = param_grid, # parameters to tune
                                 cv                  = 3,          # how many folds in cross-validation
                                 n_iter              = 250,        # number of combinations of hyperparameters to try
                                 random_state        = 219,        # starting point for random sequence
                                 scoring = make_scorer(
                                           roc_auc_score,
                                           needs_threshold = False)) # scoring criteria (AUC)


# FITTING to the FULL DATASET (due to cross-validation)
lr_tuned_cv.fit(df_data, df_target)


# printing the optimal parameters and best score
print("Tuned Parameters  :", lr_tuned_cv.best_params_)
print("Tuned CV AUC      :", lr_tuned_cv.best_score_.round(4))

  C_space          = pd.np.arange(0.1, 5.0, 0.1)


Tuned Parameters  : {'warm_start': True, 'solver': 'newton-cg', 'C': 3.9000000000000004}
Tuned CV AUC      : 0.6528


In [75]:
lr_tuned_cv.best_estimator_

LogisticRegression(C=3.9000000000000004, max_iter=1000, random_state=219,
                   solver='newton-cg', warm_start=True)

In [18]:
# instantiate a logistic regression model with tuned values
lr_tuned = LogisticRegression(C=3.9000000000000004, 
                              max_iter=1000, random_state=219,
                              solver='newton-cg', warm_start=True)

# fit to training set
lr_tuned_fit = lr_tuned.fit(x_train, y_train)

# predict based on the testing set
lr_tuned_pred = lr_tuned.predict(x_test)


# score the results
print('Training ACCURACY:', lr_tuned.score(x_train, y_train).round(4))
print('Testing  ACCURACY:', lr_tuned.score(x_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = lr_tuned_pred).round(4))


# save scoring data for future use
lr_tuned_train_score = lr_tuned.score(x_train, y_train).round(4) # accuracy
lr_tuned_test_score  = lr_tuned.score(x_test, y_test).round(4)   # accuracy


# save the AUC score
lr_tuned_auc         = roc_auc_score(y_true  = y_test,
                                     y_score = lr_tuned_pred).round(4) # auc

Training ACCURACY: 0.7382
Testing  ACCURACY: 0.7515
AUC Score        : 0.6664


In [19]:
# unpacking the confusion matrix
lr_tuned_tn, \
lr_tuned_fp, \
lr_tuned_fn, \
lr_tuned_tp = confusion_matrix(y_true = y_test, y_pred = lr_tuned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {lr_tuned_tn}
False Positives: {lr_tuned_fp}
False Negatives: {lr_tuned_fn}
True Positives : {lr_tuned_tp}
""")


True Negatives : 67
False Positives: 89
False Negatives: 32
True Positives : 299



In [20]:
# declaring model performance objects
lr_train_acc = lr_tuned.score(x_train, y_train).round(4)
lr_test_acc  = lr_tuned.score(x_test, y_test).round(4)
lr_auc       = roc_auc_score(y_true  = y_test,
                             y_score = lr_tuned_pred).round(4)

# creating a dictionary for model results
model_performance = {
    
    'Model Name'        : ['Tuned Logistic Regression'],
           
    'AUC Score'         : [lr_tuned_auc],
    
    'Training Accuracy' : [lr_tuned_train_score],
           
    'Testing Accuracy'  : [lr_tuned_test_score],

    'Confusion Matrix'  : [(lr_tuned_tn, lr_tuned_fp, lr_tuned_fn, lr_tuned_tp)]}


# converting model_performance into a DataFrame
model_performance = pd.DataFrame(model_performance)


# sending model results to Excel
model_performance.to_excel('model_performance.xlsx',
                           index = False)


# checking for results
pd.read_excel('model_performance.xlsx')

Unnamed: 0,Model Name,AUC Score,Training Accuracy,Testing Accuracy,Confusion Matrix
0,Tuned Logistic Regression,0.6664,0.7382,0.7515,"(67, 89, 32, 299)"


## DECISION TREE

In [21]:
# train/test split with the FULL dataset (all x-variables)
df_data   =  df.drop([ 'CROSS_SELL_SUCCESS'], axis = 1)
df_target =  df.loc[ : , 'CROSS_SELL_SUCCESS']
 
# train/test split
x_train, x_test, y_train, y_test = train_test_split(
                                                    df_data,
                                                    df_target,
                                                    random_state = 219,
                                                    test_size    = 0.25,
                                                    stratify     = df_target)

In [168]:
########################################
# find best hyperparameters for tuning
########################################

# set the parameters and distributions to sample
param_grid = {"max_depth": [3, 8],
              "min_samples_leaf": [1,25],
              "criterion": ["gini", "entropy"]}

# Instantiate a Decision Tree classifier: tree
tree = DecisionTreeClassifier()

# Instantiate the RandomizedSearchCV object: tree_cv
tree_tuned_cv = RandomizedSearchCV(tree, param_grid, 
                             cv=7,                    
                             random_state = 219)       

# Fit it to the data
tree_tuned_cv.fit(df_data, df_target)


# printing the optimal parameters and best score
print("Tuned Parameters  :", tree_tuned_cv.best_params_)
print("Tuned CV AUC      :", tree_tuned_cv.best_score_.round(4))



Tuned Parameters  : {'min_samples_leaf': 1, 'max_depth': 3, 'criterion': 'entropy'}
Tuned CV AUC      : 0.7492


In [169]:
tree_tuned_cv.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=3)

In [22]:
# instantiate a decision tree with tuned values
tree_tuned = DecisionTreeClassifier(criterion='gini', 
                                    max_depth=3,
                                    min_samples_leaf = 25,
                                    random_state=219)


# fit to the training set
tree_tuned_fit = tree_tuned.fit(x_train, y_train)

# predict based on the testing set
tree_tuned_pred = tree_tuned.predict(x_test)



# score the results
print('Training ACCURACY:', tree_tuned.score(x_train, y_train).round(4))
print('Testing  ACCURACY:', tree_tuned.score(x_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = tree_tuned_pred).round(4))


# save scoring data for future use
tree_tuned_train_score = tree_tuned.score(x_train, y_train).round(4) # accuracy
tree_tuned_test_score  = tree_tuned.score(x_test, y_test).round(4)   # accuracy


# save the AUC score
tree_tuned_auc         = roc_auc_score(y_true  = y_test,
                                       y_score = tree_tuned_pred).round(4) # auc

Training ACCURACY: 0.7402
Testing  ACCURACY: 0.7762
AUC Score        : 0.732


In [23]:
# unpacking the confusion matrix
tree_tuned_tn, \
tree_tuned_fp, \
tree_tuned_fn, \
tree_tuned_tp = confusion_matrix(y_true = y_test, y_pred = tree_tuned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {tree_tuned_tn}
False Positives: {tree_tuned_fp}
False Negatives: {tree_tuned_fn}
True Positives : {tree_tuned_tp}
""")


True Negatives : 95
False Positives: 61
False Negatives: 48
True Positives : 283



In [24]:
# declaring model performance objects
tree_train_acc = tree_tuned.score(x_train, y_train).round(4)
tree_test_acc  = tree_tuned.score(x_test, y_test).round(4)
tree_auc       = roc_auc_score(y_true  = y_test,
                             y_score = tree_tuned_pred).round(4)

# appending to model_performance
model_performance = model_performance.append(
                          {'Model Name'             : 'Tuned Tree (Full)',
                          'Training Accuracy'  : tree_train_acc,
                          'Testing Accuracy'   : tree_test_acc,
                          'AUC Score'          : tree_auc,
                          'Confusion Matrix'   : (tree_tuned_tn,
                                                  tree_tuned_fp,
                                                  tree_tuned_fn,
                                                  tree_tuned_tp)},
                          ignore_index = True)

## RANDOM FOREST

In [25]:
# train/test split with the logit_sig variables
df_data   =  df.loc[ : , candidate_dict['logit_sig']]
df_target =  df.loc[ : , 'CROSS_SELL_SUCCESS']

# train/test split
x_train, x_test, y_train, y_test = train_test_split(
                                                    df_data,
                                                    df_target,
                                                    random_state = 219,
                                                    test_size    = 0.25,
                                                    stratify     = df_target)

In [152]:
########################################
# find best hyperparameters for tuning
########################################


# declaring a hyperparameter space
estimator_space  = pd.np.arange(100, 1100, 250)
leaf_space       = pd.np.arange(1, 31, 10)
criterion_space  = ['gini', 'entropy']
bootstrap_space  = [True, False]
warm_start_space = [True, False]


# creating a hyperparameter grid
param_grid = {'n_estimators'     : estimator_space,
              'min_samples_leaf' : leaf_space,
              'criterion'        : criterion_space,
              'bootstrap'        : bootstrap_space,
              'warm_start'       : warm_start_space}


# INSTANTIATING the model object without hyperparameters
forest_grid = RandomForestClassifier(random_state = 219)


# GridSearchCV object
forest_cv = RandomizedSearchCV(estimator           = forest_grid,
                               param_distributions = param_grid,
                               cv                  = 5,
                               n_iter              = 500,
                               scoring             = make_scorer(roc_auc_score,
                                                     needs_threshold = False))


# FITTING to the FULL DATASET (due to cross-validation)
forest_cv.fit(df_data, df_target)


# printing the optimal parameters and best score
print("Tuned Parameters  :", forest_cv.best_params_)
print("Tuned Training AUC:", forest_cv.best_score_.round(4))

  estimator_space  = pd.np.arange(100, 1100, 250)
  leaf_space       = pd.np.arange(1, 31, 10)


Tuned Parameters  : {'warm_start': True, 'n_estimators': 100, 'min_samples_leaf': 11, 'criterion': 'entropy', 'bootstrap': False}
Tuned Training AUC: 0.6874


In [153]:
forest_cv.best_estimator_

RandomForestClassifier(bootstrap=False, criterion='entropy',
                       min_samples_leaf=11, random_state=219, warm_start=True)

In [26]:
# building a model based on tuning results
forest_tuned = RandomForestClassifier(criterion='entropy',
                                      bootstrap=False, 
                                      min_samples_leaf=11, 
                                      random_state=219, 
                                      warm_start=True)

# FITTING the model object
forest_tuned_fit = forest_tuned.fit(x_train, y_train)


# PREDICTING based on the testing set
forest_tuned_pred = forest_tuned_fit.predict(x_test)


# SCORING the results
print('Forest Tuned Training ACCURACY:', forest_tuned.score(x_train, y_train).round(4))
print('Forest Tuned Testing  ACCURACY:', forest_tuned.score(x_test, y_test).round(4))
print('Forest Tuned AUC Score        :', roc_auc_score(y_true  = y_test,
                                                       y_score = forest_tuned_pred).round(4))


# saving scoring data for future use
forest_tuned_train_score = forest_tuned.score(x_train, y_train).round(4) # accuracy
forest_tuned_test_score  = forest_tuned.score(x_test, y_test).round(4)   # accuracy


# saving the AUC score
forest_tuned_auc = roc_auc_score(y_true  = y_test,
                                 y_score = forest_tuned_pred).round(4) # auc

Forest Tuned Training ACCURACY: 0.7814
Forest Tuned Testing  ACCURACY: 0.7782
Forest Tuned AUC Score        : 0.6979


In [27]:
# unpacking the confusion matrix
tuned_rf_tn, \
tuned_rf_fp, \
tuned_rf_fn, \
tuned_rf_tp = confusion_matrix(y_true = y_test, y_pred = forest_tuned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {tuned_rf_tn}
False Positives: {tuned_rf_fp}
False Negatives: {tuned_rf_fn}
True Positives : {tuned_rf_tp}
""")


True Negatives : 74
False Positives: 82
False Negatives: 26
True Positives : 305



In [28]:
# declaring model performance objects
tuned_rf_train_acc = forest_tuned_fit.score(x_train, y_train).round(4)
tuned_rf_test_acc  = forest_tuned_fit.score(x_test, y_test).round(4)
tuned_rf_auc       = roc_auc_score(y_true  = y_test,
                                   y_score = forest_tuned_pred).round(4)


# appending to model_performance
model_performance = model_performance.append(
                          {'Model Name'         : 'Tuned Random Forest',
                           'Training Accuracy'  : tuned_rf_train_acc,
                           'Testing Accuracy'   : tuned_rf_test_acc,
                           'AUC Score'          : tuned_rf_auc,
                           'Confusion Matrix'   : (tuned_rf_tn,
                                                   tuned_rf_fp,
                                                   tuned_rf_fn,
                                                   tuned_rf_tp)},
                          ignore_index = True)

## GBM

In [29]:
# train/test split with FULL dataset (all x-variables)
df_data   =  df.drop([ 'CROSS_SELL_SUCCESS'], axis = 1)
df_target =  df.loc[ : , 'CROSS_SELL_SUCCESS']
 
# train/test split
x_train, x_test, y_train, y_test = train_test_split(
                                                    df_data,
                                                    df_target,
                                                    random_state = 219,
                                                    test_size    = 0.25,
                                                    stratify     = df_target)

In [30]:
# building a model with hyperparameter manually tuned 
gbm_tuned = GradientBoostingClassifier(learning_rate = 0.3,
                                        criterion     = 'friedman_mse',
                                        min_samples_leaf =1,
                                        max_depth     = 1,
                                        warm_start    = False,
                                        random_state  = 219)

# FIT step is needed 
gbm_tuned_fit = gbm_tuned.fit(x_train, y_train)


# PREDICTING based on the testing set
gbm_tuned_pred = gbm_tuned_fit.predict(x_test)


# SCORING the results
print('Training ACCURACY:', gbm_tuned_fit.score(x_train, y_train).round(4))
print('Testing  ACCURACY:', gbm_tuned_fit.score(x_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = gbm_tuned_pred).round(4))

Training ACCURACY: 0.7752
Testing  ACCURACY: 0.7598
AUC Score        : 0.6758


In [31]:
# unpacking the confusion matrix
gbm_tuned_tn, \
gbm_tuned_fp, \
gbm_tuned_fn, \
gbm_tuned_tp = confusion_matrix(y_true = y_test, y_pred = gbm_tuned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {gbm_tuned_tn}
False Positives: {gbm_tuned_fp}
False Negatives: {gbm_tuned_fn}
True Positives : {gbm_tuned_tp}
""")


True Negatives : 69
False Positives: 87
False Negatives: 30
True Positives : 301



In [32]:
# declaring model performance objects
tuned_gbm_train_acc = gbm_tuned_fit.score(x_train, y_train).round(4)
tuned_gbm_test_acc  = gbm_tuned_fit.score(x_test, y_test).round(4)
tuned_gbm_auc       = roc_auc_score(y_true  = y_test,
                                   y_score = gbm_tuned_pred).round(4)


# appending to model_performance
model_performance = model_performance.append(
                          {'Model Name'         : 'Tuned GBM (Full)',
                           'Training Accuracy'  : tuned_gbm_train_acc,
                           'Testing Accuracy'   : tuned_gbm_test_acc,
                           'AUC Score'          : tuned_gbm_auc,
                           'Confusion Matrix'   : (gbm_tuned_tn,
                                                   gbm_tuned_fp,
                                                   gbm_tuned_fn,
                                                   gbm_tuned_tp)},
                          ignore_index = True)

# FINAL MODEL OUTPUT

**Compare Model Scores:**

In [33]:
# show model with highest AUC at top 
model_performance.sort_values(by = 'AUC Score',
                              ascending = False)

Unnamed: 0,Model Name,AUC Score,Training Accuracy,Testing Accuracy,Confusion Matrix
1,Tuned Tree (Full),0.732,0.7402,0.7762,"(95, 61, 48, 283)"
2,Tuned Random Forest,0.6979,0.7814,0.7782,"(74, 82, 26, 305)"
3,Tuned GBM (Full),0.6758,0.7752,0.7598,"(69, 87, 30, 301)"
0,Tuned Logistic Regression,0.6664,0.7382,0.7515,"(67, 89, 32, 299)"


In [34]:
# saving results to Excel
model_performance.to_excel('./model_results/classification_model_performance.xlsx',
                           index = False)

## Display Chosen Model & Output

Tuned Decision Tree is the model with the best AUC score.    
The training-testing gap also remains below 0.05. Therefore, the model was not overfitted. 

In [35]:
# display chosen model only 
final_model =  model_performance.loc[ 1 , : ]
final_model

Model Name           Tuned Tree (Full)
AUC Score                        0.732
Training Accuracy               0.7402
Testing Accuracy                0.7762
Confusion Matrix     (95, 61, 48, 283)
Name: 1, dtype: object