# Week 8 Bank Data Case Study

## Load Packages

In [1]:
import pandas as pd
import numpy as np

## Read the Data

In this section we read in the data. 

In [2]:
df = pd.read_csv("../Data/case_8.csv")
df.head()

Unnamed: 0,ID,target,v1,v2,v3,v4,v5,v6,v7,v8,...,v122,v123,v124,v125,v126,v127,v128,v129,v130,v131
0,3,1,1.335739,8.727474,C,3.921026,7.915266,2.599278,3.176895,0.012941,...,8.0,1.98978,0.035754,AU,1.804126,3.113719,2.024285,0,0.636365,2.857144
1,4,1,1.630686,7.464411,C,4.145098,9.191265,2.436402,2.483921,2.30163,...,6.822439,3.549938,0.598896,AF,1.672658,3.239542,1.957825,0,1.925763,1.739389
2,5,1,0.943877,5.310079,C,4.410969,5.326159,3.979592,3.928571,0.019645,...,9.333333,2.477596,0.013452,AE,1.773709,3.922193,1.120468,2,0.883118,1.176472
3,6,1,0.797415,8.304757,C,4.22593,11.627438,2.0977,1.987549,0.171947,...,7.018256,1.812795,0.002267,CJ,1.41523,2.954381,1.990847,1,1.677108,1.034483
4,8,1,1.630686,7.464411,C,4.145098,8.742359,2.436402,2.483921,1.496569,...,6.822439,3.549938,0.919812,Z,1.672658,3.239542,2.030373,0,1.925763,1.739389


No obvious issues like parsing errors or missings. Lets see what we have for data types.

In [3]:
df.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114321 entries, 0 to 114320
Data columns (total 133 columns):
ID        int64
target    int64
v1        float64
v2        float64
v3        object
v4        float64
v5        float64
v6        float64
v7        float64
v8        float64
v9        float64
v10       float64
v11       float64
v12       float64
v13       float64
v14       float64
v15       float64
v16       float64
v17       float64
v18       float64
v19       float64
v20       float64
v21       float64
v22       object
v23       float64
v24       object
v25       float64
v26       float64
v27       float64
v28       float64
v29       float64
v30       object
v31       object
v32       float64
v33       float64
v34       float64
v35       float64
v36       float64
v37       float64
v38       int64
v39       float64
v40       float64
v41       float64
v42       float64
v43       float64
v44       float64
v45       float64
v46       float64
v47       object
v48       float64


We see mostly floats. There are some object data types we should probably recast. We have 114K observations, plenty to work with.  No variable names, as expected. He said there are no missings but lets check anyway.

In [4]:
df.isnull().values.any()

False

Fine. He told the truth. How about that target variable

In [5]:
counts = df.target.value_counts()
print(counts)
print(round(counts[0]/sum(counts),4))

1    87021
0    27300
Name: target, dtype: int64
0.2388


The target is binary and a little unbalanced, but not terrible.

In [6]:
df.apply(lambda x: len(x.unique()))

ID        114321
target         2
v1         64488
v2         64525
v3             3
           ...  
v127       64489
v128       65688
v129          10
v130       64477
v131       64415
Length: 133, dtype: int64

## Data Cleaning

boring

In [7]:
#data['minimum_nights'] = data['minimum_nights'].astype('float')

In [14]:
from sklearn.preprocessing import StandardScaler

def transform_data(data):
    #OH encode
    label_encode = [
        x for x, y in dict(data.dtypes).items()
        if type(y) == pd.CategoricalDtype
    ]
    #for var in label_encode:
    for var in ('v3','v22','v24','v30','v31','v47','v52','v56','v66','v71','v74','v75','v79','v91','v107','v110','v112','v113','v125'):
        #impute to Mode
        print(f"{var} was OHE'd")
        data[var].fillna(data[var].mode()[0], inplace=True)
        cat_list = pd.get_dummies(data[var], prefix=var)
        data_new = data.join(cat_list)
        data = data_new

    normalize = [x for x, y in dict(data.dtypes).items() if y == np.float64]
    for col in normalize:
        #change impution scheme?
        data[col].fillna(data[col].median(), inplace=True)
        normalizer = StandardScaler()
        ar_train = data[col].to_numpy().reshape(-1, 1)
        normalizer.fit(ar_train)
        data.loc[:, col] = normalizer.transform(ar_train)
    
    data.drop(columns=label_encode, inplace=True)
    #data.drop(columns=target_col, inplace=True)
    
    return data

In [15]:
df_transform = transform_data(df)

v3 was OHE'd
v22 was OHE'd
v24 was OHE'd
v30 was OHE'd
v31 was OHE'd
v47 was OHE'd
v52 was OHE'd
v56 was OHE'd
v66 was OHE'd
v71 was OHE'd
v74 was OHE'd
v75 was OHE'd
v79 was OHE'd
v91 was OHE'd
v107 was OHE'd
v110 was OHE'd
v112 was OHE'd
v113 was OHE'd
v125 was OHE'd


In [11]:
df_transform.apply(lambda x: len(x.unique()))

ID        114321
target         2
v1         64488
v2         64525
v3             3
           ...  
v125_V         2
v125_W         2
v125_X         2
v125_Y         2
v125_Z         2
Length: 18707, dtype: int64

In [10]:
df_transform.head()

Unnamed: 0,ID,target,v1,v2,v3,v4,v5,v6,v7,v8,...,v125_Q,v125_R,v125_S,v125_T,v125_U,v125_V,v125_W,v125_X,v125_Y,v125_Z
0,3,1,-0.362671,0.5676624,C,-0.259746,-0.535879,0.3614557,1.56529,-0.7032151,...,0,0,0,0,0,0,0,0,0,0
1,4,1,2.730298e-16,7.983532e-16,C,0.0,0.290849,9.85525e-16,0.0,0.3815862,...,0,0,0,0,0,0,0,0,0,0
2,5,1,-0.8445118,-0.9682275,C,0.3082,-2.213376,3.424656,3.263175,-0.7000377,...,0,0,0,0,0,0,0,0,0,0
3,6,1,-1.024604,0.3776793,C,0.093701,1.86926,-0.7516472,-1.121205,-0.6278492,...,0,0,0,0,0,0,0,0,0,0
4,8,1,2.730298e-16,7.983532e-16,C,0.0,0.0,9.85525e-16,0.0,-1.052455e-16,...,0,0,0,0,0,0,0,0,0,1


## EDA

boring

## Modeling Prep

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score #https://scikit-learn.org/stable/modules/model_evaluation.html

In [None]:
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

In [None]:
n_estimators= list(range(80, 110, 10))
max_features = list(range(6, len(features_xform.columns), 5))
min_samples_split = list(range(500, 701, 100))
min_samples_leaf = [10, 20]
print(f'n_estimator_grid_search:{n_estimators}')
print(f'max_features_grid_search:{max_features}')
print(f'min_samples_split_grid_search:{min_samples_split}')
print(f'min_samples_leaf_grid_search:{min_samples_leaf}')
scoring = {
            'Accuracy':'accuracy'
            , 'F-1 Score':'f1'
            , 'Log Loss':'log_loss'}

## XGBoost

In [None]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold

# create variables we are more familiar with
if 'target' in df_transform:
    y = df_transform['target'].values # get the labels we want
    del df_transform['target'] # get rid of the class label
    X = df_transform.values # use everything else to predict!
    
yhat = np.zeros(y.shape) # we will fill this with predictions
yhat_score = np.zeros((y.shape[0],4))

# create cross validation iterator
cv = StratifiedKFold(n_splits=5, random_state=1234)

In [None]:
def per_class_accuracy(ytrue,yhat):
    conf = mt.confusion_matrix(ytrue,yhat)
    norm_conf = conf.astype('float') / conf.sum(axis=1)[:, np.newaxis]
    return np.diag(norm_conf)

In [None]:
#KGBoost 
import pickle

from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error, classification_report

def XGBoost(boost, tree, depth, delta_step, etaparm):
    for train, test in cv.split(X,y):
        xgb_model = xgb.XGBClassifier(booster=boost, tree_method=tree, 
                                      max_depth=depth, max_delta_step=delta_step,eta=etaparm
                                     ).fit(X[train], y[train])
        yhat[test] = xgb_model.predict(X[test],ntree_limit=xgb_model.best_iteration)
    
    print (mt.classification_report(y, yhat, digits = 3))

In [None]:
%%time

boost='gbtree'
tree='auto'
depth=6
delta_step=0
etaparm=0.3

XGBoost(boost,tree,depth,delta_step,etaparm)

In [None]:
%%time
print('Fit the model...')
# XGBoost params:
xgboost_params = { 
   "objective": "binary:logistic",
   "booster": "gbtree",
   "eval_metric": "logloss",
   "eta": 0.01, 
   "subsample": 0.5,
   "colsample_bytree": 0.5,
   "max_depth": 3
}
boost_round = 50
clf = xgb.train(xgboost_params,xgtrain,num_boost_round=boost_round,verbose_eval=True,maximize=False)

In [None]:
#Make predict
print('Predict...')
test_preds = clf.predict(xgtest, ntree_limit=clf.best_iteration)
# Save results

In [None]:
import numpy as np
from sklearn.metrics import log_loss, accuracy_score
print(log_loss(y_test,test_preds))
print(accuracy_score(y_test,np.rint(test_preds)))

## Random Forest
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, y)

## SVM