> **Bank "churn" dataset**
<br>` 'Exited' is our classification target `
<br>` 1 - went elsewhere (nonzero is True) `
<br>` 0 - remains as a customer `

**Importing the libraries**

In [None]:
import pandas as pd
import numpy as np

**Importing the Dataset**

In [None]:
## file path: forward slashes are accepted by Python on Windows as well as on
## Unix/macOS, so this single spelling works everywhere
## (the backslash form '..\\datasets\\...' only resolves on Windows)
data = pd.read_csv('../datasets/churn_modelling.csv')

# the shape attribute gives the dimensions of the dataset: (rows, columns)
print('Dataset dimensions: {} rows, {} columns'.format(
    data.shape[0], data.shape[1]))

In [None]:
data.info()

In [None]:
data.head(2)

***
**Data Preparation and EDA** (unique to this dataset)
* _Check for missing values_
* _Quick visual check of unique values_
* _Split the classification feature out of the dataset_
* _Check column names of categorical attributes ( for get_dummies() )_
* _Check column names of numeric attributes ( for Scaling )_

**Check for missing values**

In [None]:
# Report every column that contains null values.
# isnull() gives a True/False mask with one entry per row; summing it counts
# the nulls.  (Taking len() of such a mask always returns the number of rows,
# so it can never reveal a missing value.)
cnt = 0
print('Missing Values - ')
for col in data.columns:
    n_null = int(data[col].isnull().sum())
    if n_null > 0:
        cnt = cnt + 1
        print('\t', col, ':', n_null, 'null values')
print('Total', cnt, 'features with null values')

# address missing values here

**Quick visual check of unique values, deal with unique identifiers**

In [None]:
# Collect the names of two kinds of uninformative columns:
#   n_eq_one - columns holding a single distinct value (constants)
#   n_eq_all - columns where every row has a different value (identifiers)
n_rows = data.shape[0]
n_eq_one, n_eq_all = [], []

print('Unique value count (', n_rows, 'Rows in the dataset )')
for name, series in data.items():
    n_unique = len(series.unique())
    print(name, ' ::> ', n_unique)
    if n_unique == 1:
        n_eq_one.append(name)
    if n_unique == n_rows:
        n_eq_all.append(name)

In [None]:
# Remove the columns collected in n_eq_one / n_eq_all by the previous cell.
# NOTE: drop(..., inplace=True) modifies `data` itself, so this cell is not
#   idempotent - the two lists still name the removed columns, and running
#   the cell a second time without re-running the detection cell fails
#   because those columns no longer exist.
# NOTE(review): any column whose values are all distinct lands in n_eq_all,
#   not only identifiers - check the printed list before trusting the drop.

# Drop columns with only one value
if len(n_eq_one) > 0:
    print('Dropping single-valued features')
    print(n_eq_one)
    data.drop(n_eq_one, axis=1, inplace=True)

# Drop or bin columns with number of unique values == number of rows
if len(n_eq_all) > 0:
    print('Dropping unique identifiers')
    print(n_eq_all)
    data.drop(n_eq_all, axis=1, inplace=True)

# continue with feature selection / feature engineering

In [None]:
# let's "bin" the EstimatedSalary and the Balance

In [None]:
print('Estimated Salary - minValue: ',data['EstimatedSalary'].min(),
      '  maxValue: ',data['EstimatedSalary'].max())

In [None]:
print('Balance - minValue: ',data['Balance'].min(),
      '  maxValue: ',data['Balance'].max())

In [None]:
# Turn the two money columns into ordered categories.
# The same bin edges and labels are used for both columns: cut_bins has ten
# edges, which define the nine intervals named in range_labels.
# pd.cut closes each interval on the right by default, i.e. the bins are
#   (-1, 0], (0, 999], (999, 35999], ... , (239999, 299999]
# so the first bin holds exactly-zero amounts.
# NOTE(review): a value outside (-1, 299999] gets NaN instead of a label -
#   compare the edges with the min/max values printed in the cells above.
range_labels = ['(Zero)','Below 1k','1k-35k','36k-59k','60k-95k','96k-119k','120k-179k','180k-239k','240k-300k']
cut_bins = [-1, 0, 999, 35999, 59999, 95999, 119999, 179999, 239999, 299999]
data['SalaryRange'] = pd.cut(data['EstimatedSalary'], bins=cut_bins, labels=range_labels)
data['BalanceRange'] = pd.cut(data['Balance'], bins=cut_bins, labels=range_labels)

In [None]:
data.head(6)

In [None]:
print('Unique value count: Estimated Salary ',len(data['EstimatedSalary'].unique()),
      '  SalaryRange ',len(data['SalaryRange'].unique()))

In [None]:
print('Unique value count: Balance ',len(data['Balance'].unique()),
      '  BalanceRange ',len(data['BalanceRange'].unique()))

In [None]:
# Drop the raw amounts and keep only the binned categories
# (SalaryRange / BalanceRange).
#   inplace=True changes the DataFrame held in `data` directly;
#   without it drop() returns a new DataFrame and `data` stays as it was
data.drop(['EstimatedSalary'], axis=1, inplace=True)
data.drop(['Balance'], axis=1, inplace=True)

In [None]:
# Remove one more column that will not help predict the outcome
data.drop(['Surname'], axis=1, inplace=True)
data.head(2)

In [None]:
data.info()

**Classification target feature**
<br>"the Right Answers", or more formally "the desired outcome"
<br>For classification it must be kept in a separate dataset from the features used for prediction.

In [None]:
# this is a binary classification dataset
twoclass = True

In [None]:
## 'Exited' is our classification target 
## 1 (nonzero is True) - went elsewhere, zero - remains as a customer
print(data['Exited'].value_counts())

In [None]:
## Text labels look better in the confusion matrix:
##   1 becomes 'Gone', every other value becomes 'Here'
## (a list comprehension over the column would give the same result)
data['Exited'] = data['Exited'].map(lambda flag: 'Gone' if flag == 1 else 'Here')

## Rename the column to 'Status' as well.
## Like drop(), rename() only changes the original DataFrame when
## inplace=True is given; otherwise it returns a new DataFrame
data.rename(columns={'Exited': 'Status'}, inplace = True)

data['Status'].value_counts()

In [None]:
data.info()

* Split the classification feature out of the dataset 

In [None]:
## Name of the column being predicted ("the Right Answer")
labels_col = 'Status'

## y: the target labels only
y = data[labels_col]

## X: a separate frame holding every other column (the predictors);
##    `data` itself keeps the target column
X = data.drop(columns=[labels_col])

In [None]:
# generate a sorted list of unique labels to use later
from sklearn.utils.multiclass import unique_labels
targetlabels = unique_labels(y)

**Check column names of categorical attributes**
<br>Features with text values (categorical attributes) need to be encoded
<br>by changing them to numeric types that the algorithms find easier to work with

In [None]:
categori = X.select_dtypes(include=['object','category']).columns
print(categori.to_list())

In [None]:
# check the distribution of the feature values 
for col in categori:
    print('Distribution of categories in', col)
    print(X[col].value_counts())
    print()

* 'one hot' encoding transforms a single column of text values into 
multiple columns of discrete values: 
it creates a new column for each unique value and puts
(one) in the column for which it is true and (zero) in the others

In [None]:
# Manual one-hot encoding of a single column (shown for illustration).
# No `prefix` is passed, so the new columns are named after the category
# values only - they do NOT start with 'Geography'.  The feature-grouping
# cell further down matches dummy columns by name prefix, which is why
# Geography needs the manual patch there.
Country = pd.get_dummies(X.Geography)
Country.head()

In [None]:
X = pd.concat([X, Country], axis=1)
X.drop('Geography', axis=1, inplace=True)
X.info()

In [None]:
# the automatic way adds the original feature name
X = pd.get_dummies(X)

In [None]:
X.info()

In [None]:
# Drop one-hot columns with no values (no data in this category).
# These appear when a bin defined for pd.cut has no members: get_dummies still
# creates a column for it, and that column is constant.
# pandas < 2.0 creates dummy columns as uint8, pandas >= 2.0 as bool,
# so both dtypes must be selected - 'uint8' alone finds nothing on pandas 2.x.
onehot = X.select_dtypes(include=['uint8', 'bool']).columns
for col in onehot:
    lc = len(X[col].unique())
    if lc == 1:
        print('Dropping ',col, ' ::> ', lc)
        X.drop(col, axis=1, inplace=True)

In [None]:
X.info()

**Check column names of numeric attributes**
<br>Features with numeric values need to be normalised
<br>by changing them to small numbers in a specific range (scaling)

In [None]:
numeri = X.select_dtypes(include=['float64','int64']).columns
print(numeri.to_list())

In [None]:
# The proper place to do scaling comes later in the pipeline ,,, 

***
**<br>Create Test // Train Datasets**
> Split X and y datasets into Train and Test subsets,<br>keeping relative proportions of each class (stratify)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test =train_test_split(X, y,
                                                   test_size=0.2,
                                                   random_state=50,
                                                   stratify=y)

**<br>Target Label Distributions**

In [None]:
# Report the size of each split, then the class balance in train and test.
# the shape attribute gives the dimensions of the dataset: (rows, columns)
print('X_train: {} rows, {} columns'.format(X_train.shape[0], X_train.shape[1]))
print('X_test:  {} rows, {} columns'.format(X_test.shape[0], X_test.shape[1]))
print()
print('y_train: {} rows, 1 column'.format(y_train.shape[0]))
print('y_test:  {} rows, 1 column'.format(y_test.shape[0]))
print()

## Here's a nice report:  
# 1. series to dataframe conversion (one column, named by labels_col)
my_train = pd.DataFrame(y_train)
my_test = pd.DataFrame(y_test)
# 2. [[ -- ]] selects a one-column DataFrame; value_counts is applied to that
#    column, giving a frame indexed by label with the counts as values
av_train = my_train[[labels_col]].apply(lambda x: x.value_counts())
av_test = my_test[[labels_col]].apply(lambda x: x.value_counts())
# 3. add a new column: each count as a percentage of the column total
#    (at this point each frame holds only the counts column)
av_train['pct_train'] = round((100 * av_train / av_train.sum()),2)
av_test['pct_test'] = round((100 * av_test / av_test.sum()),2)
# 4. combine the dataframes side by side (train columns, then test columns)
av_tt = pd.concat([av_train,av_test], axis=1) 
# 5. print the report
print('Frequency and Distribution of labels')
print(av_tt)

***
Next are standard steps for all datasets: _scaling, classifiers, results_

**Scaling** comes _after_ test // train split

In [None]:
# from above
# numeri = X.select_dtypes(include=['float64','int64']).columns
print(numeri.to_list())

In [None]:
# data before normalization
X_train.head()

In [None]:
# scaling the Numeric columns
# from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler maps each column onto the range zero to 1 (using the min/max
# it saw during fit); StandardScaler instead centres each column on mean 0
# with unit variance, so its output has no fixed range.
# ColumnTransformer returns a numpy.ndarray so we lose the feature names;
# we write the scaled values back column by column to preserve the dataframe

# sklearn docs say
#   "Don't cheat - fit only on training data, then transform both"
#   (test values outside the training min/max therefore fall outside 0..1)

num_cols = list(numeri)

# Work on explicit copies: the frames returned by train_test_split are
# selections taken from X, and assigning into them directly can trigger
# pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

if num_cols:            # nothing to scale when there are no numeric columns
    # one scaler handles all numeric columns; each column is still scaled
    # independently, using the training data only
    scaler = MinMaxScaler().fit(X_train[num_cols])
    train_scaled = scaler.transform(X_train[num_cols])
    test_scaled = scaler.transform(X_test[num_cols])

    # assign 1-D slices so every column is replaced by a plain float column
    for pos, col in enumerate(num_cols):
        X_train[col] = train_scaled[:, pos]
        X_test[col] = test_scaled[:, pos]
    

In [None]:
# data after normalization
X_train.head()

**<br>Function** to calculate performance metrics

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef

In [None]:
def show_metrics(y_test,ygx,lbls):
    """Print a confusion matrix, FPR/FNR rates, a classification report and MCC.

    Parameters
    ----------
    y_test : array-like
        True labels.
    ygx : array-like
        Predicted labels, same length as ``y_test``.
    lbls : sequence
        Class labels, in the order to use for the rows/columns of the
        confusion matrix and for the report (e.g. ``clf.classes_``).

    Returns
    -------
    str
        The separator string ``'~~~~'``.
    """
    lbls = list(lbls)
    # arrays make the element-wise `== label` comparisons below work even
    # when plain lists are passed in
    y_true = np.asarray(y_test)
    y_pred = np.asarray(ygx)

    # rows are the TRUE classes, columns the PREDICTED classes
    tptn_df = pd.DataFrame(confusion_matrix(y_true, y_pred, labels=lbls),
                           index=['true:{:}'.format(x) for x in lbls],
                           columns=['pred:{:}'.format(x) for x in lbls])
    print(tptn_df)
    print("\n~~~~")

    # per-class counts, treating each class in turn as "positive"
    counts = tptn_df.values
    TP = np.diag(counts)
    FP = counts.sum(axis=0) - TP
    FN = counts.sum(axis=1) - TP
    TN = np.sum(counts) - (FP + FN + TP)

    # a class with no negatives (or no positives) gives 0/0; keep the nan
    # result but do not emit a RuntimeWarning for it
    with np.errstate(divide='ignore', invalid='ignore'):
        # false positive rates
        FPR = FP/(FP+TN)
        # false negative rates
        FNR = FN/(TP+FN)
        # overall: rates computed from the counts pooled over all classes
        sfpr = FP.sum()/(FP.sum()+TN.sum())
        sfnr = FN.sum()/(TP.sum()+FN.sum())

    if len(lbls) >2:
        for x in range(len(lbls)):
            print('{:>12} : '.format(lbls[x]),
                  'FPR = %.3f   FNR = %.3f' % (FPR[x], FNR[x]))
        print()

    print('{:>12} : '.format('macro avg'),
          'FPR = %.3f   FNR = %.3f'  % (FPR.mean(), FNR.mean()))
    print('weighted avg :  FPR = %.3f   FNR = %.3f' % (sfpr, sfnr))

    print("\n~~~~")

    #    macro average: unweighted mean per label
    # weighted average: support-weighted mean per label
    # `labels` pins the row order so that target_names cannot be attached
    # to the wrong class when lbls is not in sorted order
    print(classification_report(y_true, y_pred, digits=3,
                                labels=lbls,
                                target_names=[str(x) for x in lbls]))

    print("~~~~")
    # Matthews correlation coefficient:
    #   correlation between prediction and ground truth
    #   (+1 perfect, 0 random prediction, -1 inverse)
    mcc = matthews_corrcoef(y_true, y_pred)
    print('MCC: Overall :  %.3f' % mcc)
    if len(lbls) >2:
        # one-vs-rest MCC for every class
        for tc in lbls:
            bin_mcc = matthews_corrcoef(y_true == tc, y_pred == tc)
            print('{:>12} :'.format(tc),' %.3f' % bin_mcc)

    return '~~~~'

**<br>Classifier Selection**

In [None]:
# prepare list of (display name, estimator) pairs.
# Un-comment an import/append pair to add that classifier.
# The "Fit and Predict" loop evaluates every entry; the feature-importance
# and SequentialFeatureSelector cells further down use only models[0].
models = []

##  --  Linear  --  ## 
#from sklearn.linear_model import LogisticRegression 
#models.append (("LogReg",LogisticRegression())) 
#from sklearn.linear_model import SGDClassifier 
#models.append (("StocGradDes",SGDClassifier())) 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
models.append(("LinearDA", LinearDiscriminantAnalysis())) 
#from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 
#models.append(("QuadraticDA", QuadraticDiscriminantAnalysis())) 

##  --  Support Vector  --  ## 
#from sklearn.svm import SVC 
#models.append(("SupportVectorClf", SVC())) 
#from sklearn.svm import LinearSVC 
#models.append(("LinearSVC", LinearSVC())) 
#from sklearn.linear_model import RidgeClassifier
#models.append (("RidgeClf",RidgeClassifier())) 

##  --  Non-linear  --  ## 
#from sklearn.tree import DecisionTreeClassifier 
#models.append (("DecisionTree",DecisionTreeClassifier())) 
#from sklearn.naive_bayes import GaussianNB 
#models.append (("GaussianNB",GaussianNB())) 
#from sklearn.neighbors import KNeighborsClassifier 
#models.append(("K-NNeighbors", KNeighborsClassifier())) 

##  --  Ensemble: bagging  --  ## 
#from sklearn.ensemble import RandomForestClassifier 
#models.append(("RandomForest", RandomForestClassifier())) 
##  --  Ensemble: boosting  --  ## 
#from sklearn.ensemble import AdaBoostClassifier 
#models.append(("AdaBoost", AdaBoostClassifier())) 
#from sklearn.ensemble import GradientBoostingClassifier 
#models.append(("GradientBoost", GradientBoostingClassifier())) 

##  --  NeuralNet (simplest)  --  ## 
#from sklearn.linear_model import Perceptron 
#models.append (("SingleLayerPtron",Perceptron())) 
#from sklearn.neural_network import MLPClassifier 
#models.append(("MultiLayerPtron", MLPClassifier()))

print(models)

**<br>Fit and Predict**

In [None]:
# evaluate each model in turn:
#   fit on the training split, predict the test split, print the metrics.
# `results` collects (model name, predicted labels) for every model.
results = []
for name, clf in models:
    print('Confusion Matrix:', name)
    clf.fit(X_train, y_train)
    ygx = clf.predict(X_test)

    results.append((name, ygx))
    
    # Easy way to ensure that the confusion matrix rows and columns
    #   are labeled exactly as the classifier has coded the classes
    #   [[note the _ at the end of clf.classes_ ]]
    show_metrics(y_test,ygx,clf.classes_)
    print('\nParameters: ', clf.get_params(), '\n\n')

 ***
 **_These examples only work with one classifier_**
 ***

***
**<br>Feature Importance Permutation**
<br>This can be used with any classifier or regressor to estimate feature importance.
<br>
The mlxtend function `feature_importance_permutation` returns two arrays: the first array (here: `imp_vals`) contains the actual 
   importance values we are interested in. The second array is assigned to ` _ ` because we are not using it.
When `num_rounds` > 1 the permutation is repeated multiple times 
   with different random seeds, and the first array holds 
   the average of the importance values,
   with all individual values from these runs in the second array. 

In [None]:
# for graphs
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Create a list of the feature names
cols = list(X_train.columns)

In [None]:
# works best with numeric values for the target feature
from sklearn.preprocessing import LabelEncoder
## Feature being predicted ("the Right Answer")
ynum = LabelEncoder().fit_transform(y_train)

In [None]:
# note: for roc_auc_score
# we need to predict the probabilities, 
#     y_prob = clf.predict_proba(X_test)
# instead of predicting the class (like above)
#     y_pred = clf.predict(X_test)
#
# multiclass (ovr) works fine with
#     roc_auc_score(y_test, y_prob)
# but for binary classification
# output from model.predict_proba() is a matrix with 2 columns, one for each class
# to calculate roc, we need to provide the probability of the positive class:
#     roc_auc_score(y_test, y_prob[:,1])

In [None]:
# note: for feature_importance_permutation
# metric MUST be "r2" for regression 
#        or "accuracy" for classification 
#        or a custom function with signature func(y_true, y_pred)

from sklearn.metrics import roc_auc_score

def ovr_roc_auc(y_true, y_pred):
    """Macro-averaged ROC AUC computed from predicted class probabilities.

    Intended as the custom ``metric`` for feature_importance_permutation
    when ``predict_method`` is ``clf.predict_proba``.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Output of ``predict_proba``: one column of probabilities per class.

    Returns
    -------
    float
        The ROC AUC score.

    Notes
    -----
    The binary / multiclass decision is taken from the number of probability
    columns rather than from a notebook-level flag, so the function cannot
    get out of step with the data it is given.
    """
    y_pred = np.asarray(y_pred)
    if y_pred.ndim == 2 and y_pred.shape[1] == 2:
        # binary: roc_auc_score wants the probability of the positive class,
        # which is the second column
        return roc_auc_score(y_true, y_pred[:, 1],
                             average='macro')
    # multiclass: one-vs-rest on the full probability matrix
    return roc_auc_score(y_true, y_pred,
                         multi_class='ovr',
                         average='macro')

In [None]:
from mlxtend.evaluate import feature_importance_permutation
# NOTE: clf is the very estimator object stored in `models` (not a copy), so
#   the fit below re-trains models[0] on the integer-encoded labels `ynum`
#   and on plain arrays instead of the dataframe / text labels used earlier.
clf = models[0][1]
clf.fit(X_train.values, ynum)

# imp_vals: one importance value per column of X_train, in column order.
# The second return value is ignored (assigned to _).
imp_vals, _ = feature_importance_permutation(
    predict_method=clf.predict_proba,      ## see note
    X=X_train.values,                      ## cannot use dataframe
    y=ynum,                                ## numeric labels
    metric=ovr_roc_auc,                    ## custom scorer
    num_rounds=1, seed=1)

#print(imp_vals)
#zz = sorted(zip(imp_vals,cols),reverse=True)
#list(zz)

In [None]:
nf = 8     # number of features to take from each end of the ranking

# one row per feature: its name and its permutation-importance score
midf = pd.DataFrame({'Name': X_train.columns, 'Score': imp_vals})
#midf.head()

# extract the top nf
mihi = midf.sort_values('Score', ascending=False).head(nf)
#mihi

# extract the low nf
milo = midf.sort_values('Score', ascending=True).head(nf)
#milo

# merge
hilo = pd.concat([mihi, milo])
# with fewer than 2*nf features the two selections overlap; keep each
# feature only once (midf has a unique index, so the index identifies it)
hilo = hilo[~hilo.index.duplicated(keep='first')]

In [None]:
hilo

In [None]:
# quick sns.barplot
ptitle = models[0][0]
ptitle += ": feature importance via permutation"
sns.barplot(x = "Score", y = "Name", data = hilo).set(title=ptitle)
plt.show()

In [None]:
# Plot ALL feature importances, highest first
ptitle = models[0][0]
ptitle += ": feature importance via permutation"

# positions of the features ordered from most to least important
indices = np.flip(np.argsort(imp_vals))
positions = range(X_train.shape[1])

plt.figure()
plt.title(ptitle)
plt.bar(positions, imp_vals[indices], color="b")
# the tick labels must follow the same ordering as the bars, otherwise
# every bar is labelled with the wrong feature name
plt.xticks(positions, np.asarray(cols)[indices], rotation=90)
# bars are centred on 0 .. n-1, so start left of 0 to show the first bar fully
plt.xlim([-1, X_train.shape[1]])
plt.yticks(np.arange(-0.02, 0.10, 0.02))
plt.grid(False)
plt.show()

***
**<br>SequentialFeatureSelector**
<br>Sequential feature selection algorithms are a family of greedy search algorithms that are used to reduce an initial d-dimensional feature space to a k-dimensional feature subspace where k < d. 
<br><br>
The motivation behind feature selection algorithms is to automatically select a subset of features most relevant to the problem. The goal of feature selection is two-fold: We want to improve the computational efficiency and reduce the model's generalization error by removing irrelevant features or noise.
<br><br>
In a nutshell, sequential feature algorithms (SFAs) remove or add one feature at a time based on the classifier performance until a feature subset of the desired size k is reached. The 'floating' algorithms have an additional exclusion or inclusion step to remove features once they were included (or excluded) so that a larger number of feature subset combinations can be sampled.
<br><br>
http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/ has tutorial videos, diagrams<br>
http://rasbt.github.io/mlxtend/api_subpackages/mlxtend.feature_selection/#sequentialfeatureselector documents the parameters

In [None]:
# For SFS we can group one-hot encoded features as a single feature
# feature_groups : the features within a group are always selected together, never split
# HOWEVER
# the `feature_group` list must contain ALL features and
# there should be no common feature between any two distinct groups of features provided

# from above:
#      categori holds the names of the categorical columns as they were
#      before pandas.get_dummies() replaced them with '<name>_<value>' columns

all_cols = X_train.columns

# create the groups of features: one group per categorical attribute, holding
# the positions of its dummy columns.  Matching on '<name>_' instead of the
# bare name stops an attribute from capturing unrelated columns that merely
# begin with the same letters.
# An attribute with no matching columns yields an empty group; it is kept so
# that g[i] still corresponds to categori[i], and has to be filled or removed
# before g is passed to SFS.
g = []
for c in categori.tolist():
    is_member = all_cols.str.startswith(c + '_')
    g.append([pos for pos, hit in enumerate(is_member) if hit])
#print(g)

# get the others and add each one as an individual list:
# every column position not already in a group becomes its own
# single-feature group, in ascending position order
grouped = {pos for members in g for pos in members}
for pos in range(len(all_cols)):
    if pos not in grouped:
        g.append([pos])

print(g)

In [None]:
# patch - only for the demo (churn) notebook!!
#     [Geography was not done automatically by pandas]
# g[0] is the group belonging to the first entry of `categori`; it is
# overwritten with hard-coded column positions for the Geography dummies.
# The last three groups are then removed so those positions are not listed
# a second time as single-feature groups.
# NOTE(review): positions 6, 7, 8 and the count of 3 are hard-coded -
#   verify them against list(X_train.columns) and the g printed above
#   whenever the columns change.
g[0] = [6, 7, 8]
g=g[:-3]
print(g)

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

In [None]:
# Direction of the search; also used for the plot title in the next cell
#   True  - forward selection
#   False - backward selection
go_fwd = False
 
# Selector built around the first classifier in `models`, evaluating whole
# feature groups (g) rather than single columns.
sfs = SFS(models[0][1], 
          k_features='parsimonious', 
          forward=go_fwd, 
          floating=True,
          cv=2,      # CVfolds
          feature_groups=g,
          scoring='roc_auc_ovr',
          verbose=0, 
          n_jobs= -1)

# run the search on the training split only
sfs = sfs.fit(X_train, y_train)

In [None]:
# Plot the selection results recorded by the fitted selector
# (matplotlib.pyplot is already imported as plt further up)
fig1 = plot_sfs(sfs.get_metric_dict(), kind='std_dev')

# the title names the search direction chosen via go_fwd
direction = 'Forward' if go_fwd else 'Backward'
plt.title('Sequential [{}] Selection (w. StdDev)'.format(direction))

#plt.ylim([0.8, 1])
plt.grid()
plt.show()

In [None]:
sfs.k_feature_idx_

In [None]:
sfs.k_feature_names_

In [None]:
# uncomment the line below to see a dataframe of all of the details
#pd.DataFrame.from_dict(sfs.get_metric_dict()).T

In [None]:
# create new dataframes with just those columns
mXtrain = X_train.filter(sfs.k_feature_names_)
mXtest = X_test.filter(sfs.k_feature_names_)

mXtrain.info()

In [None]:
# Keep references to the full-width frames, then make the reduced frames
# (selected features only) the working X_train / X_test for the cells below.
# NOTE: run this cell only once per session - a second run would store the
#   already-reduced frames in XtrainOriginal / XtestOriginal.
XtrainOriginal = X_train
XtestOriginal = X_test

X_train = mXtrain
X_test = mXtest

In [None]:
# Create a list of the feature names
cols = list(X_train.columns)

**<br>Fit and Predict**