> **Bank "churn" dataset**
<br>` 'Exited' is our classification target `
<br>` 1 - went elsewhere (nonzero is True) `
<br>` 0 - remains as a customer `

**Import the main libraries**

In [None]:
import pandas as pd
import numpy as np

from time import time

_import the local library_

In [None]:
# make the parent folder (where the lib folder lives) importable
import sys

if ".." not in sys.path:
    sys.path.insert(0, '..')

In [None]:
from mylib import show_labels_dist, show_metrics, bias_var_metrics

**Import the Dataset**

In [None]:
## Load the churn dataset from the sibling 'datasets' folder.
## Forward slashes are accepted by Windows as well as Linux and macOS,
## so this single relative path works on every platform
## (the backslash form only resolves on Windows).
data = pd.read_csv('../datasets/churn_modelling.csv')

# shape gives the dimensions of the dataset as (rows, columns)
print('Dataset dimensions: {} rows, {} columns'.format(data.shape[0], data.shape[1]))

In [None]:
data.info()

In [None]:
data.head(13)

***
**Data Preparation and EDA** (unique to this dataset)
* _Check for missing values_
* _Quick visual check of unique values_
* _Split the classification feature out of the dataset_
* _Check column names of categorical attributes ( for get_dummies() )_
* _Check column names of numeric attributes ( for Scaling )_

**Check for missing values**

In [None]:
# Count the missing cells in every column and report the affected features.
# isnull().sum() counts the cells that are actually missing; measuring the
# length of the notnull() mask cannot work, because the mask always has one
# entry per row and is therefore always as long as the dataframe.
null_counts = data.isnull().sum()
with_nulls = null_counts[null_counts > 0]

print('Missing Values - ')
for col, n_null in with_nulls.items():
    print('\t',col,':',n_null,'null values')

# cnt = number of features that contain at least one missing value
cnt = len(with_nulls)
print('Total',cnt,'features with null values')

# address missing values here

**Quick visual check of unique values, deal with unique identifiers**

In [None]:
# Find the columns that hold a single value (n_eq_one) and the columns
# that hold a different value on every row (n_eq_all)
n_rows = data.shape[0]
print('Unique value count (',n_rows,'Rows in the dataset )')

# nunique(dropna=False) counts a missing value as one more distinct value
unique_counts = {name: data[name].nunique(dropna=False) for name in data.columns}
for name, lc in unique_counts.items():
    print(name, ' ::> ', lc)

n_eq_one = [name for name, lc in unique_counts.items() if lc == 1]
n_eq_all = [name for name, lc in unique_counts.items() if lc == n_rows]

In [None]:
# A column with a single value cannot separate the classes: remove it
if n_eq_one:
    print('Dropping single-valued features')
    print(n_eq_one)
    data.drop(columns=n_eq_one, inplace=True)

# A column with a different value on every row acts as a row identifier:
# remove it (binning it would be the alternative)
if n_eq_all:
    print('Dropping unique identifiers')
    print(n_eq_all)
    data.drop(columns=n_eq_all, inplace=True)

# continue with feature selection / feature engineering

In [None]:
# let's "bin" the EstimatedSalary and the Balance

In [None]:
print('Estimated Salary - minValue: ',data['EstimatedSalary'].min(),
      '  maxValue: ',data['EstimatedSalary'].max())

In [None]:
print('Balance - minValue: ',data['Balance'].min(),
      '  maxValue: ',data['Balance'].max())

In [None]:
# Bin EstimatedSalary and Balance into the named ranges.
# The edges are the round thousands that the labels name, and right=False
# makes every bin [low, high): a fractional amount such as 999.50 or 35999.50
# stays in the range its label describes instead of slipping into the next one
# (which happened with integer edges like 999 / 35999 and right-closed bins).
# The first bin ends at the smallest positive float, so '(Zero)' holds the
# exact zeros (plus anything from -1 up to zero, as before).
# Amounts of 300000 or more fall outside the bins and become NaN.
range_labels = ['(Zero)','Below 1k','1k-35k','36k-59k','60k-95k','96k-119k','120k-179k','180k-239k','240k-300k']
cut_bins = [-1, np.nextafter(0, 1), 1000, 36000, 60000, 96000, 120000, 180000, 240000, 300000]
data['SalaryRange'] = pd.cut(data['EstimatedSalary'], bins=cut_bins, labels=range_labels, right=False)
data['BalanceRange'] = pd.cut(data['Balance'], bins=cut_bins, labels=range_labels, right=False)

In [None]:
data.head(6)

In [None]:
print('Unique value count: Estimated Salary ',len(data['EstimatedSalary'].unique()),
      '  SalaryRange ',len(data['SalaryRange'].unique()))

In [None]:
print('Unique value count: Balance ',len(data['Balance'].unique()),
      '  BalanceRange ',len(data['BalanceRange'].unique()))

In [None]:
# The binned categories replace the raw amounts, so the detail columns go.
#   inplace=True changes the dataframe held in memory;
#   without it drop() returns a new dataframe and leaves this one untouched
data.drop(columns=['EstimatedSalary', 'Balance'], inplace=True)

In [None]:
# The surname will not help predict the outcome either: remove it
data.drop(columns='Surname', inplace=True)
data.head(2)

In [None]:
data.info()

**<br>Classification target feature**
<br>"the Right Answers", or more formally "the desired outcome"
<br>Must be in a separate dataset for classification ...

In [None]:
## 'Exited' is our classification target 
## 1 (nonzero is True) - went elsewhere, zero - remains as a customer
print(data['Exited'].value_counts())

In [None]:
## Text labels look better in the confusion matrix

## Vectorised relabelling: 1 becomes 'Gone', every other value becomes 'Here'
## (a lambda with apply(), or a list comprehension, would have the same effect)
data['Exited'] = np.where(data['Exited'] == 1, 'Gone', 'Here')

## Let's change the name to 'Status' too - 'rename' is like 'drop':
## with inplace=True the original DataFrame is changed,
## without it a new DataFrame is returned
data.rename(columns={'Exited': 'Status'}, inplace=True)

data['Status'].value_counts()

In [None]:
data.info()

* Split the classification feature out of the dataset 

In [None]:
## Feature being predicted ("the Right Answer")
labels_col = 'Status'
y = data[labels_col]

## Features used for prediction
# drop() without inplace returns a brand-new dataframe, so X is independent
# of 'data' and there is no view-versus-copy ambiguity
X = data.drop(columns=[labels_col])

**<br>Check column names of categorical attributes**
<br>Features with text values (categorical attributes) need to be normalised
<br>by changing them to numeric types that the algorithms find easier to work with

In [None]:
# text and categorical features are the ones that need one-hot encoding
categori = X.select_dtypes(include=['object','category']).columns
print(list(categori))

In [None]:
# show how the rows are spread over the values of each categorical feature
for name in categori:
    counts = X[name].value_counts()
    print('Distribution of categories in', name)
    print(counts, end='\n\n')

* 'one hot' encoding transforms a single column of text values into 
multiple columns of discrete values: 
it creates a new column for each unique value and puts
(one) in the column for which it is true and (zero) in the others

In [None]:
Country = pd.get_dummies(X.Geography)
Country.head()

In [None]:
# swap the text column 'Geography' for its one-hot columns
X = pd.concat([X.drop(columns='Geography'), Country], axis=1)
X.info()

In [None]:
# the automatic way adds the original feature name
X = pd.get_dummies(X)

In [None]:
X.info()

In [None]:
# Drop one-hot columns with no values (no data in this category).
# get_dummies() produces uint8 columns in pandas 1.x and bool columns from
# pandas 2.0 on, so both dtypes are selected; selecting uint8 alone finds
# nothing on pandas 2 and the empty categories would silently stay in X.
onehot = X.select_dtypes(include=['uint8', 'bool']).columns
for col in onehot:
    lc = X[col].nunique(dropna=False)
    if lc == 1:
        # a single distinct value means the column is constant
        print('Dropping ',col, ' ::> ', lc)
        X.drop(col, axis=1, inplace=True)

In [None]:
X.info()

**<br>Check column names of numeric attributes**
<br>Features with numeric values need to be normalised by changing the values to
small numbers in a specific range (scaling). _Note that scaling comes_ after _the test//train split!_

In [None]:
# float and integer features are the ones that will be scaled
numeri = X.select_dtypes(include=['float64','int64']).columns
print(list(numeri))

***

**<br>Create Test // Train Datasets**
> Split X and y datasets into Train and Test subsets,<br>keeping relative proportions of each class (stratify)

In [None]:
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for testing; stratify=y keeps the relative
# proportions of each class the same in both subsets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=50, stratify=y)

# train_test_split picks rows at random,
#      so every subset is renumbered from zero
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

***
Next are standard steps for all datasets: _scaling, classifiers, results_

**Scaling** comes _after_ test // train split

In [None]:
# data before normalization
X_test.head()

In [None]:
# scaling the Numeric columns
# StandardScaler: centres each column on zero with unit variance
#                 (the values are NOT confined to a fixed range)
# MinMaxScaler:   rescales each column to the range zero to 1

# from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# sklearn docs say
#   "Don't cheat - fit only on training data, then transform both"
# MinMaxScaler works column by column, so one scaler fitted on all numeric
# columns of the training data gives the same values as one scaler per
# column, without any reshape() of single columns.
num_cols = list(numeri)
if num_cols:    # fit() raises an error on a dataframe without columns
    scaler = MinMaxScaler().fit(X_train[num_cols])
    X_train[num_cols] = scaler.transform(X_train[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols])
    

In [None]:
# data after normalization
X_test.head()

**<br>Classifier Selection**

In [None]:
# Build the list of classifiers to evaluate as (display name, estimator) pairs.
# Each candidate is an import line plus an append line; remove the leading '#'
# from both lines of a pair to add that classifier to the run.
# All estimators are created with their default parameters.

# prepare list
models = []

##  --  Linear  --  ## 
#from sklearn.linear_model import LogisticRegression 
#models.append (("LogReg",LogisticRegression())) 
#from sklearn.linear_model import SGDClassifier 
#models.append (("StocGradDes",SGDClassifier())) 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
models.append(("LinearDA", LinearDiscriminantAnalysis())) 
#from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 
#models.append(("QuadraticDA", QuadraticDiscriminantAnalysis())) 

##  --  Support Vector  --  ## 
#from sklearn.svm import SVC 
#models.append(("SupportVectorClf", SVC())) 
#from sklearn.svm import LinearSVC 
#models.append(("LinearSVC", LinearSVC())) 
#from sklearn.linear_model import RidgeClassifier
#models.append (("RidgeClf",RidgeClassifier())) 

##  --  Non-linear  --  ## 
#from sklearn.tree import DecisionTreeClassifier 
#models.append (("DecisionTree",DecisionTreeClassifier())) 
#from sklearn.naive_bayes import GaussianNB 
#models.append (("GaussianNB",GaussianNB())) 
#from sklearn.neighbors import KNeighborsClassifier 
#models.append(("K-NNeighbors", KNeighborsClassifier())) 

##  --  Ensemble: bagging  --  ## 
#from sklearn.ensemble import RandomForestClassifier 
#models.append(("RandomForest", RandomForestClassifier())) 
##  --  Ensemble: boosting  --  ## 
#from sklearn.ensemble import AdaBoostClassifier 
#models.append(("AdaBoost", AdaBoostClassifier())) 
#from sklearn.ensemble import GradientBoostingClassifier 
#models.append(("GradientBoost", GradientBoostingClassifier())) 

##  --  NeuralNet (simplest)  --  ## 
#from sklearn.linear_model import Perceptron 
#models.append (("SingleLayerPtron",Perceptron())) 
#from sklearn.neural_network import MLPClassifier 
#models.append(("MultiLayerPtron", MLPClassifier()))

# show which classifiers are active for this run
print(models)

**<br>Target Label Distributions** (standard block)

In [None]:
# from our local library
show_labels_dist(X_train,X_test,y_train,y_test)

**<br>Fit and Predict** (standard block)

In [None]:
# Fit every selected classifier on the training subset, predict the test
# subset and report the metrics; the predictions are kept in 'results'
results = []

for legend_line in (
        'macro average: unweighted mean per label',
        'weighted average: support-weighted mean per label',
        'MCC: correlation between prediction and ground truth',
        '     (+1 perfect, 0 random prediction, -1 inverse)\n'):
    print(legend_line)

for name, clf in models:
    started = time()
    print('Confusion Matrix:', name)

    clf.fit(X_train, y_train)
    ygx = clf.predict(X_test)
    results.append((name, ygx))

    # the run time covers fitting plus prediction
    elapsed = round(time() - started, 2)
    print("Run Time {} seconds".format(elapsed) + '\n')

    # clf.classes_ (note the trailing underscore) holds the labels exactly
    # as the classifier has coded them, so the confusion matrix rows and
    # columns are labelled to match
    show_metrics(y_test, ygx, clf.classes_)   # from our local library
    print('\nParameters: ', clf.get_params(), '\n\n')

**Bias - Variance Decomposition** (standard block)

In [None]:
# bias_var_metrics comes from our local library
# 'folds' is the number of (cross-validation) folds: lower it for faster results
folds = 20
for name, clf in models:
    print('Bias // Variance Decomposition:', name)
    bias_var_metrics(X_train, X_test, y_train, y_test, clf, folds)

***

***

***
**<br>Feature Selection using Correlations**<br>
> Same rule as scaling applies here:<br>
> "Don't cheat - get the filter only on the training data<br>
> then apply the filter to both the training and test data"

In [None]:
# save our original datasets before we test the reduced ones
# copy() takes a real snapshot; a plain assignment would only give the same
# dataframe a second name, so any in-place change to X_train / X_test
# would alter the "saved" data as well
XtrainOriginal = X_train.copy()
XtestOriginal = X_test.copy()

**<br>Pearson correlation, Mutual Information, Symmetric Uncertainty**
<br>The Pearson correlation coefficient (r) is the most common way of measuring a linear correlation. It is a number between -1 and 1 that measures the strength and direction of the relationship between two variables.
<br><br>
Mutual Information (MI) is a measure of the information that X and Y share - in effect, how much knowing one of these variables reduces uncertainty about the other (information gain). In other words, MI tells us how useful the feature X is at predicting the random variable Y on a scale of zero to one, with higher numbers indicating better predictors.
<br><br>
MI has a lot of advocates because it can capture many types of dependencies. There is a nice illustration of this in the sklearn documentation, comparing MI to the (ANOVA) F-test, which captures only linear dependency like the Pearson r metric:<br>
https://scikit-learn.org/stable/auto_examples/feature_selection/plot_f_test_vs_mi.html
<br><br>
Symmetric uncertainty (SU) compensates for mutual information's bias towards features having large number of different values and normalizes within range [0,1].
<br><br>
_IMPORTANT: Pearson Correlation and MI/SU are complementary, in the sense that high values for one do not mean high values for the other, so we should check both when using them for feature selection_

In [None]:
# from our local library
from mylib import filter_fcy, rpt_ycor, get_filter 

# for graphs
import matplotlib.pyplot as plt
import seaborn as sns

# The correlation filter requires numeric values for the target feature
from sklearn.preprocessing import LabelEncoder

## Feature being predicted ("the Right Answer"), coded as integers
target_encoder = LabelEncoder()
ynum = target_encoder.fit_transform(y_train)

In [None]:
# split the features into a list to drop and a list to keep
# (filter_fcy requires numeric labels)
droplist, keeplist = filter_fcy(X_train, ynum)
n_keep, n_drop = len(keeplist), len(droplist)
print('Floor Filter:',n_keep,'features to keep,',n_drop,'to drop')

In [None]:
# take a look ...
rpt_ycor(droplist)

In [None]:
rpt_ycor(keeplist)

In [None]:
# make a dataframe from the Keep List
ffdf = pd.DataFrame(keeplist, columns=['Feature','PCy','SUy','MIy'])
#ffdf.head()

In [None]:
# --  --
# Get this many highest Symmetric Uncertainty
sunf = 6        # (number of features to select)
# Get this many highest & lowest Pearson Correlations from the rest
pcnf = 2        # (number of features to select)
# --  --

# features with the highest Symmetric Uncertainty against the target
corhi = ffdf.sort_values('SUy',ascending=False).head(sunf)
hicor = corhi['Feature'].tolist()
# these are selected, so drop them out
tmp_df = ffdf[~ffdf.Feature.isin(hicor)]

# from the rest take both ends of the Pearson ranking:
# the top rows (highest values) and the bottom rows (lowest values)
by_pearson = tmp_df.sort_values('PCy',ascending=False)
pcorhi = by_pearson.head(pcnf)
pcorlo = by_pearson.tail(pcnf)

# head() and tail() overlap when fewer than 2*pcnf features are left,
# so each name is added only once - a repeated name in hicor could
# otherwise produce a repeated column when the filter is applied
for feature in list(pcorhi['Feature']) + list(pcorlo['Feature']):
    if feature not in hicor:
        hicor.append(feature)

# create a new dataframe with just the rows of those features ...
hcXtrain = ffdf[ffdf.Feature.isin(hicor)]

In [None]:
# ... for visualisation
# cc is the metric column to plot, cl the matching text for the title
cc = 'PCy'
cl = 'Pearson Correlation'
#cc = 'SUy'
#cl = 'Symmetric Uncertainty'
#cc = 'MIy'
#cl = 'Mutual Information'

# one horizontal bar per selected feature, largest value at the top
ranked = hcXtrain.sort_values(cc, ascending=False)
ax = sns.barplot(x=cc, y="Feature", data=ranked)
ax.set(title=cl + ' with Target')
plt.show()

**<br>Evaluate: Full Keep List**

In [None]:
# apply the filter to create new train and test dataframes
# The filter is always applied to the saved full-width datasets, so this
# cell gives the same result however often - and in whatever order - it is
# run; filtering X_train itself would start from whatever reduced set an
# earlier run had left behind.
kXtrain = XtrainOriginal.filter(get_filter(keeplist))
kXtest = XtestOriginal.filter(get_filter(keeplist))

X_train = kXtrain
X_test = kXtest

# Create a list of the feature names
cols = list(X_train.columns)

In [None]:
# Create a list of the feature names
cols = list(X_train.columns)

**<br>Fit and Predict** (standard block)

**Bias - Variance Decomposition** (standard block)

**<br>Evaluate: just the high-correlation features**

In [None]:
# apply the filter to create new train and test dataframes
# The filter is always applied to the saved full-width datasets, so this
# cell gives the same result however often - and in whatever order - it is
# run (every name in hicor comes from the keep list, so no feature is lost
# by starting from the full set).
hXtrain = XtrainOriginal.filter(get_filter(hicor))
hXtest = XtestOriginal.filter(get_filter(hicor))

X_train = hXtrain
X_test = hXtest

# Create a list of the feature names
cols = list(X_train.columns)

**<br>Fit and Predict**

In [None]:
# add standard blocks for performance metrics
# and blocks for appropriate visualisations

***
- _Bonus - Pairwise Correlation of features_

In [None]:
# Rank2D performs pairwise comparisons of each feature in the data set 
# with a specific metric or algorithm (default: Pearson correlation) 
# then returns them ranked as a lower left triangle diagram.

from yellowbrick.features.rankd import Rank2D

# Rank the features of the current training set. (The name 'mX' used here
# before is not defined anywhere in this notebook, so the cell stopped
# with a NameError.)
# astype(float) turns the one-hot columns into plain numbers so that the
# whole dataframe reaches the visualizer as one numeric array.
rank_input = X_train.astype(float)

visualizer = Rank2D()
visualizer.fit(rank_input, y_train)
visualizer.transform(rank_input)
visualizer.show()

- _Correlation Matrix "HeatMap"_

In [None]:
# plot the full heatmap with values
from matplotlib.colors import ListedColormap
# Generate Color Map
colormap = sns.diverging_palette(220, 10, as_cmap=True)

# pairwise ranking matrix computed by yellowbrick's Rank2D
hm=visualizer.ranks_

# Label the axes with the feature names the visualizer was fitted on, so
# the tick labels always match the rows and columns of the matrix.
# (The name 'merged' used here before is not defined anywhere in this
# notebook, so the cell stopped with a NameError.)
feature_names = list(visualizer.features_)

plt.figure(figsize=(12,12))
sns.heatmap(hm, annot=True, cmap=colormap, xticklabels=feature_names, yticklabels=feature_names)
plt.show()

**<br>Target Label Distributions** (standard block)

In [None]:
# from our local library
show_labels_dist(X_train,X_test,y_train,y_test)

**<br>Fit and Predict** (standard block)

In [None]:
# Fit every selected classifier on the current training subset, predict the
# test subset and report the metrics; the predictions are kept in 'results'
results = []

for legend_line in (
        'macro average: unweighted mean per label',
        'weighted average: support-weighted mean per label',
        'MCC: correlation between prediction and ground truth',
        '     (+1 perfect, 0 random prediction, -1 inverse)\n'):
    print(legend_line)

for name, clf in models:
    started = time()
    print('Confusion Matrix:', name)

    clf.fit(X_train, y_train)
    ygx = clf.predict(X_test)
    results.append((name, ygx))

    # the run time covers fitting plus prediction
    elapsed = round(time() - started, 2)
    print("Run Time {} seconds".format(elapsed) + '\n')

    # clf.classes_ (note the trailing underscore) holds the labels exactly
    # as the classifier has coded them, so the confusion matrix rows and
    # columns are labelled to match
    show_metrics(y_test, ygx, clf.classes_)   # from our local library
    print('\nParameters: ', clf.get_params(), '\n\n')

**Bias - Variance Decomposition** (standard block)

In [None]:
# bias_var_metrics comes from our local library
# 'folds' is the number of (cross-validation) folds: lower it for faster results
folds = 20
for name, clf in models:
    print('Bias // Variance Decomposition:', name)
    bias_var_metrics(X_train, X_test, y_train, y_test, clf, folds)

***

***