# Week 8 Bank Data Case Study

## Load Packages

In [1]:
import pandas as pd
import numpy as np

## Read the Data

In this section we read in the data. 

In [3]:
# Load the raw case-study extract from its relative path.
df = pd.read_csv(filepath_or_buffer="../../../case_8.csv")
# Preview the first five rows to sanity-check the parse.
df.head(n=5)

Unnamed: 0,ID,target,v1,v2,v3,v4,v5,v6,v7,v8,...,v122,v123,v124,v125,v126,v127,v128,v129,v130,v131
0,3,1,1.335739,8.727474,C,3.921026,7.915266,2.599278,3.176895,0.012941,...,8.0,1.98978,0.035754,AU,1.804126,3.113719,2.024285,0,0.636365,2.857144
1,4,1,1.630686,7.464411,C,4.145098,9.191265,2.436402,2.483921,2.30163,...,6.822439,3.549938,0.598896,AF,1.672658,3.239542,1.957825,0,1.925763,1.739389
2,5,1,0.943877,5.310079,C,4.410969,5.326159,3.979592,3.928571,0.019645,...,9.333333,2.477596,0.013452,AE,1.773709,3.922193,1.120468,2,0.883118,1.176472
3,6,1,0.797415,8.304757,C,4.22593,11.627438,2.0977,1.987549,0.171947,...,7.018256,1.812795,0.002267,CJ,1.41523,2.954381,1.990847,1,1.677108,1.034483
4,8,1,1.630686,7.464411,C,4.145098,8.742359,2.436402,2.483921,1.496569,...,6.822439,3.549938,0.919812,Z,1.672658,3.239542,2.030373,0,1.925763,1.739389


No obvious issues such as parsing errors or missing values. Let's see what data types we have.

In [4]:
# List every column with its dtype (verbose output is not truncated for wide frames).
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114321 entries, 0 to 114320
Data columns (total 133 columns):
 #   Column  Dtype  
---  ------  -----  
 0   ID      int64  
 1   target  int64  
 2   v1      float64
 3   v2      float64
 4   v3      object 
 5   v4      float64
 6   v5      float64
 7   v6      float64
 8   v7      float64
 9   v8      float64
 10  v9      float64
 11  v10     float64
 12  v11     float64
 13  v12     float64
 14  v13     float64
 15  v14     float64
 16  v15     float64
 17  v16     float64
 18  v17     float64
 19  v18     float64
 20  v19     float64
 21  v20     float64
 22  v21     float64
 23  v22     object 
 24  v23     float64
 25  v24     object 
 26  v25     float64
 27  v26     float64
 28  v27     float64
 29  v28     float64
 30  v29     float64
 31  v30     object 
 32  v31     object 
 33  v32     float64
 34  v33     float64
 35  v34     float64
 36  v35     float64
 37  v36     float64
 38  v37     float64
 39  v38     int64  
 40  v

We see mostly floats, plus a few object columns that we should probably recast. We have about 114K observations, which is plenty to work with. The variables are anonymized (no descriptive names), as expected. He said there are no missing values, but let's check anyway.

In [5]:
# True if at least one cell anywhere in the frame is missing.
df.isna().to_numpy().any()

False

Fine. He told the truth. How about that target variable?

In [6]:
# Frequency of each target class, most common first.
counts = df["target"].value_counts()
print(counts)
# Fraction of rows whose target label is 0, rounded to four decimals.
print(round(counts.loc[0] / counts.sum(), 4))

1    87021
0    27300
Name: target, dtype: int64
0.2388


The target is binary and somewhat imbalanced (about 24% of the rows are class 0), but not severely so.

## Data Cleaning

boring

In [None]:
# NOTE(review): disabled leftover — this notebook's frame is `df` (columns v1..v131), and no
# `data` frame or `minimum_nights` column is defined here; presumably copied from another notebook.
#data['minimum_nights'] = data['minimum_nights'].astype('float')

## EDA

boring

## Modeling Prep

In [14]:
def transform_data(data):
    """One-hot encode categorical columns and standardise float64 columns.

    The input frame is left untouched; all work happens on a copy.

    Categorical handling: every column whose dtype is categorical, object or
    string is imputed with its most frequent value, expanded into indicator
    columns named ``<column>_<level>`` and then removed. (The previous check
    only matched ``category`` dtype, so ``object`` columns coming straight from
    ``read_csv`` were silently skipped.)

    Numeric handling: every ``float64`` column is imputed with its median and
    rescaled to zero mean and unit variance using the population standard
    deviation (``ddof=0``). A column with zero variance is only centred, not
    divided, so it does not become NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to transform.

    Returns
    -------
    pandas.DataFrame
        New frame: the non-categorical columns in their original order,
        followed by the indicator columns.
    """
    # Work on a copy so the caller's frame is never mutated (the old in-place
    # `fillna` on a column selection is also a no-op under copy-on-write).
    result = data.copy()

    #OH encode
    label_encode = [
        name for name, dtype in result.dtypes.items()
        if isinstance(dtype, pd.CategoricalDtype)
        or pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
    ]
    dummy_frames = []
    for var in label_encode:
        #impute to Mode
        print(f"{var} was OHE'd")
        modes = result[var].mode()
        # An all-missing column has no mode; leave it unfilled rather than crash.
        filled = result[var] if modes.empty else result[var].fillna(modes.iloc[0])
        dummy_frames.append(pd.get_dummies(filled, prefix=var))

    normalize = [name for name, dtype in result.dtypes.items() if dtype == np.float64]
    for col in normalize:
        #change impution scheme?
        print(f"{col} was Standard Scaled")
        filled = result[col].fillna(result[col].median())
        scale = filled.std(ddof=0)
        # Zero (or undefined) spread: divide by 1 so the column is only centred.
        if not scale > 0:
            scale = 1.0
        result[col] = (filled - filled.mean()) / scale

    # Swap the raw categorical columns for their indicator columns in one concat.
    result = result.drop(columns=label_encode)
    if dummy_frames:
        result = pd.concat([result] + dummy_frames, axis=1)
    #data.drop(columns=target_col, inplace=True)

    return result

In [20]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score #https://scikit-learn.org/stable/modules/model_evaluation.html

In [21]:
# Five stratified random splits, each holding out 20% of the rows; seeded for repeatability.
cv = StratifiedShuffleSplit(
    n_splits=5,
    test_size=0.2,
    random_state=42,
)

In [42]:
# Candidate values for each random-forest hyperparameter.
n_estimators = [*range(80, 110, 10)]
max_features = [*range(5, 50, 5)]
min_samples_split = [*range(500, 701, 100)]
min_samples_leaf = [10, 20]

# Echo the candidate lists so the search space is visible in the cell output.
print(f'n_estimator_grid_search:{n_estimators}')
print(f'max_features_grid_search:{max_features}')
print(f'min_samples_split_grid_search:{min_samples_split}')
print(f'min_samples_leaf_grid_search:{min_samples_leaf}')

# Search space handed to the hyperparameter search (keys are estimator argument names).
param_dist = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

# Display name -> scikit-learn scorer string, evaluated together for every candidate.
scoring = {
    'Accuracy': 'accuracy',
    'F-1 Score': 'f1',
    'Log Loss': 'neg_log_loss',
}

n_estimator_grid_search:[80, 90, 100]
max_features_grid_search:[5, 10, 15, 20, 25, 30, 35, 40, 45]
min_samples_split_grid_search:[500, 600, 700]
min_samples_leaf_grid_search:[10, 20]


In [17]:
# Feature matrix: everything except the identifier and the label, numeric columns only
# (object-typed columns are excluded by the dtype filter).
features_only = df.drop(columns=["ID", "target"])
X = features_only.select_dtypes(include="number")
print("The shape of X is: ", X.shape)

# Label vector, copied so later edits cannot write back into `df`.
y = df["target"].copy()
print("The shape of y is: ", y.shape)

The shape of X is:  (114321, 112)
The shape of y is:  (114321,)


## Random Forest
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import pickle
# Shallow baseline forest fitted on the full data set, using all available cores.
# NOTE(review): this same depth-2 estimator is later passed to the randomized search, so every
# sampled candidate inherits max_depth=2 — confirm that is intended (the recorded accuracy is
# identical, 0.761207, for all candidates in the saved results).
clf = RandomForestClassifier(
    n_jobs=-1,
    random_state=0,
    max_depth=2,
)
clf.fit(X, y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

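Randomized search for random forest (the cell below samples `n_iter_search` parameter combinations with `RandomizedSearchCV` rather than exhaustively searching the grid)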

In [66]:
%%time
#%%script false --no-raise-error

n_iter_search = 2
rf_random_search = RandomizedSearchCV(clf, param_distributions=param_dist, scoring=scoring, cv = cv, random_state=42,
                                   n_iter=n_iter_search, refit='Accuracy')
rf_random_search.fit(X, y)

filename = 'rf_random_search.p'
pickle.dump(rf_random_search, open(filename, 'wb'))

Wall time: 53.8 s


In [50]:
# Reload the saved random-forest search and tabulate one row per sampled candidate.
# Only unpickle files you created yourself: unpickling can execute arbitrary code.
rf_random_search = pd.read_pickle('rf_random_search.p')
pd.DataFrame.from_dict(rf_random_search.cv_results_)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_features,params,split0_test_Accuracy,...,mean_test_Log Loss,std_test_Log Loss,rank_test_Log Loss,split0_train_Log Loss,split1_train_Log Loss,split2_train_Log Loss,split3_train_Log Loss,split4_train_Log Loss,mean_train_Log Loss,std_train_Log Loss
0,33.225282,0.171574,0.344889,0.002626,100,600,20,45,"{'n_estimators': 100, 'min_samples_split': 600...",0.761207,...,-0.511646,0.000566,1,-0.510321,-0.510189,-0.510482,-0.510396,-0.510773,-0.510432,0.000196
1,23.361127,0.04948,0.311668,0.002335,90,500,10,35,"{'n_estimators': 90, 'min_samples_split': 500,...",0.761207,...,-0.51481,0.000694,5,-0.513723,-0.513504,-0.513954,-0.513783,-0.513657,-0.513724,0.000148
2,29.816636,0.160534,0.341702,0.002661,100,600,10,40,"{'n_estimators': 100, 'min_samples_split': 600...",0.761207,...,-0.51273,0.000558,3,-0.51157,-0.511346,-0.511733,-0.511543,-0.511883,-0.511615,0.000182
3,13.777653,0.035279,0.304035,0.002134,90,500,10,20,"{'n_estimators': 90, 'min_samples_split': 500,...",0.761207,...,-0.523464,0.000511,13,-0.522663,-0.522218,-0.522765,-0.522569,-0.522973,-0.522638,0.000249
4,20.266122,0.049475,0.306377,0.002861,90,600,10,30,"{'n_estimators': 90, 'min_samples_split': 600,...",0.761207,...,-0.517998,0.000559,10,-0.517101,-0.516783,-0.517239,-0.516857,-0.517197,-0.517035,0.000183
5,8.294916,0.025482,0.328474,0.003879,100,500,20,10,"{'n_estimators': 100, 'min_samples_split': 500...",0.761207,...,-0.530915,0.0004,15,-0.530427,-0.530465,-0.530478,-0.530053,-0.530432,-0.530371,0.00016
6,22.523329,0.039502,0.334075,0.003347,100,500,20,30,"{'n_estimators': 100, 'min_samples_split': 500...",0.761207,...,-0.517721,0.000554,6,-0.516797,-0.516469,-0.516948,-0.516661,-0.516922,-0.516759,0.000177
7,9.489747,0.027689,0.276266,0.001937,80,700,20,15,"{'n_estimators': 80, 'min_samples_split': 700,...",0.761207,...,-0.527724,0.000626,14,-0.527316,-0.52702,-0.526991,-0.526637,-0.526954,-0.526983,0.000216
8,20.281353,0.045832,0.307075,0.002103,90,500,20,30,"{'n_estimators': 90, 'min_samples_split': 500,...",0.761207,...,-0.517997,0.000559,9,-0.517101,-0.516783,-0.517239,-0.516857,-0.517197,-0.517035,0.000183
9,26.566817,0.073715,0.285262,0.002561,80,500,10,45,"{'n_estimators': 80, 'min_samples_split': 500,...",0.761207,...,-0.511835,0.000579,2,-0.510575,-0.510333,-0.510658,-0.510614,-0.510924,-0.510621,0.000189


## XGBoost

In [61]:
from xgboost import XGBClassifier
#https://xgboost.readthedocs.io/en/latest/build.html

  import pandas.util.testing as tm


In [62]:
# A parameter grid for XGBoost
params = {
        'learning_rate': [0.005, 0.01, 0.02, 0.05, 0.1],
        'n_estimators': [100,200,400,600,800,1000],
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

xgb = XGBClassifier(objective='binary:logistic',
                    silent=True, nthread=-1)

In [67]:
%%time
#%%script false --no-raise-error
n_iter_search = 2
xgb_random_search = RandomizedSearchCV(xgb, param_distributions=params, scoring=scoring, cv = cv, random_state=42,
                                   n_iter=n_iter_search, refit='Accuracy')
xgb_random_search.fit(X, y)

filename = 'xgb_random_search.p'
pickle.dump(xgb_random_search, open(filename, 'wb'))

Wall time: 12min 35s


In [None]:
# Reload the saved XGBoost search and tabulate one row per sampled candidate.
# Only unpickle files you created yourself: unpickling can execute arbitrary code.
xgb_random_search = pd.read_pickle('xgb_random_search.p')
pd.DataFrame.from_dict(xgb_random_search.cv_results_)

## SVC

In [69]:
from sklearn.svm import SVC
#https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [68]:
# Search space for SVC: regularisation strength, kernel coefficient and kernel type.
# NOTE(review): 0.01 is absent from the otherwise decade-spaced gamma values — confirm intentional.
param_grid = dict(
    C=[1, 10, 100, 1000],
    gamma=[1, 0.1, 0.001, 0.0001],
    kernel=['linear', 'rbf'],
)

In [None]:
%%time
#%%script false --no-raise-error
n_iter_search = 2
svc_random_search = RandomizedSearchCV(SVC(), param_distributions=param_grid, scoring=scoring, cv = 2, random_state=42,
                                   n_iter=n_iter_search, refit='Accuracy')
svc_random_search.fit(X, y)

filename = 'svc_random_search.p'
pickle.dump(svc_random_search, open(filename, 'wb'))

## Linear SVC

In [None]:
from sklearn.svm import LinearSVC
#https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html

In [None]:
# Search space for LinearSVC. The loss name is 'squared_hinge' (the former 'square_hinge' is not
# an accepted value and made every candidate that sampled it fail).
param_grid = {'C':[1,10,100,1000],'penalty':['l1','l2'], 'loss':['hinge','squared_hinge']}

In [None]:
# Randomized hyperparameter search for LinearSVC with 2-fold cross-validation.
n_iter_search = 2
# LinearSVC has no `predict_proba`, so the 'neg_log_loss' scorer cannot be evaluated for it;
# keep every other metric from the shared `scoring` dict.
lsvc_scoring = {name: metric for name, metric in scoring.items() if metric != 'neg_log_loss'}
# NOTE(review): not every penalty/loss pair in the grid is supported by LinearSVC (e.g. 'l1' with
# 'hinge'); such candidates fail to fit and are scored as NaN by the search — confirm acceptable.
lsvc_random_search = RandomizedSearchCV(LinearSVC(), param_distributions=param_grid, scoring=lsvc_scoring, cv=2, random_state=42,
                                        n_iter=n_iter_search, refit='Accuracy')
lsvc_random_search.fit(X, y)

# Persist the fitted search. The context manager guarantees the file is flushed and closed;
# the previous bare `open(...)` left the handle to the garbage collector.
filename = 'lsvc_random_search.p'
with open(filename, 'wb') as pickle_file:
    pickle.dump(lsvc_random_search, pickle_file)