> **Bank "churn" dataset**
<br>` 'Exited' is our classification target `
<br>` 1 - went elsewhere (nonzero is True) `
<br>` 0 - remains as a customer `

**Importing the libraries**

In [None]:
import pandas as pd
import numpy as np

**Importing the Dataset**

In [None]:
## file path: forward slashes resolve on Windows, macOS and Linux alike,
## whereas the back-slashed form ('..\\datasets\\...') only resolves on Windows
data = pd.read_csv('../datasets/churn_modelling.csv')

# shape gives the dimensions of the dataset as a (rows, columns) tuple
print('Dataset dimensions: {} rows, {} columns'.format(*data.shape))

In [None]:
# Summary of the freshly loaded frame: column names, dtypes and non-null counts
data.info()

In [None]:
# Preview the first 13 rows of the raw data
data.head(13)

***
**Data Preparation and EDA** (unique to this dataset)
* _Check for missing values_
* _Quick visual check of unique values_
* _Split the classification feature out of the dataset_
* _Check column names of categorical attributes ( for get_dummies() )_
* _Check column names of numeric attributes ( for Scaling )_

**Check for missing values**

In [None]:
# Count the nulls in every column in one vectorised pass.
# (Comparing len(pd.notnull(column)) with len(data) cannot detect anything:
#  the boolean mask always has exactly one entry per row, null or not.)
null_counts = data.isnull().sum()
null_counts = null_counts[null_counts > 0]   # keep only the affected columns

print('Missing Values - ')
for col, n_null in null_counts.items():
    print('\t', col, ':', n_null, 'null values')

# cnt = number of features that contain at least one null
cnt = len(null_counts)
print('Total', cnt, 'features with null values')

# address missing values here

**Quick visual check of unique values, deal with unique identifiers**

In [None]:
# Find the two kinds of column that carry no predictive signal:
#   n_eq_one - constant columns (a single distinct value)
#   n_eq_all - columns with a different value on every row (identifier-like)
n_rows = data.shape[0]
print('Unique value count (',n_rows,'Rows in the dataset )')

# distinct-value count per column, in column order
unique_counts = {col: len(data[col].unique()) for col in data.columns}
for col, lc in unique_counts.items():
    print(col, ' ::> ', lc)

n_eq_one = [col for col, lc in unique_counts.items() if lc == 1]
n_eq_all = [col for col, lc in unique_counts.items() if lc == n_rows]

In [None]:
# Remove the columns flagged by the unique-value scan, announcing each group:
#   - columns holding a single value
#   - columns whose number of unique values equals the number of rows
for message, columns in (('Dropping single-valued features', n_eq_one),
                         ('Dropping unique identifiers', n_eq_all)):
    if columns:
        print(message)
        print(columns)
        data.drop(columns=columns, inplace=True)

# continue with feature selection / feature engineering

In [None]:
# let's "bin" the EstimatedSalary and the Balance

In [None]:
# Smallest and largest salary figure in the raw column
salary = data['EstimatedSalary']
print('Estimated Salary - minValue: ', salary.min(), '  maxValue: ', salary.max())

In [None]:
# Smallest and largest account balance in the raw column
balance = data['Balance']
print('Balance - minValue: ', balance.min(), '  maxValue: ', balance.max())

In [None]:
# Human-readable range labels; the first bin is reserved for exact zeros.
range_labels = ['(Zero)','Below 1k','1k-35k','36k-59k','60k-95k','96k-119k','120k-179k','180k-239k','240k-300k']

# Round figure at which each non-zero range stops (exclusive).
upper_limits = [1000, 36000, 60000, 96000, 120000, 180000, 240000, 300000]

# pd.cut() uses right-closed intervals, so each edge is set to the largest
# float *below* the round figure. Whole-number amounts land in the same bin
# as with integer edges (999, 35999, ...), while fractional amounts such as
# 999.50 now stay in 'Below 1k' instead of spilling into the next range.
cut_bins = [-1, 0] + [np.nextafter(float(limit), 0.0) for limit in upper_limits]

# Values outside (-1, 300000) match no bin and become NaN.
data['SalaryRange'] = pd.cut(data['EstimatedSalary'], bins=cut_bins, labels=range_labels)
data['BalanceRange'] = pd.cut(data['Balance'], bins=cut_bins, labels=range_labels)

In [None]:
# Preview the first 6 rows, now including the SalaryRange / BalanceRange columns
data.head(6)

In [None]:
# Compare the number of distinct raw salaries with the number of distinct bins
n_salary_raw = len(data['EstimatedSalary'].unique())
n_salary_binned = len(data['SalaryRange'].unique())
print('Unique value count: Estimated Salary ', n_salary_raw, '  SalaryRange ', n_salary_binned)

In [None]:
# Compare the number of distinct raw balances with the number of distinct bins
n_balance_raw = len(data['Balance'].unique())
n_balance_binned = len(data['BalanceRange'].unique())
print('Unique value count: Balance ', n_balance_raw, '  BalanceRange ', n_balance_binned)

In [None]:
# Keep the binned categories and discard the detailed amounts they came from.
#   inplace=True modifies the dataframe held in memory;
#   without it drop() returns a new frame and leaves `data` untouched
data.drop(columns=['EstimatedSalary', 'Balance'], inplace=True)

In [None]:
# One more column that will not help predict the outcome: the customer surname
data.drop(columns=['Surname'], inplace=True)
data.head(2)

In [None]:
# Re-check the column list and dtypes after binning and dropping columns
data.info()

**Classification target feature**
<br>"the Right Answers", or more formally "the desired outcome"
<br>Must be in a separate dataset for classification.

In [None]:
## 'Exited' is our classification target:
##   1 (nonzero is True) - went elsewhere, zero - remains as a customer
## Show how many rows fall into each class
exit_counts = data['Exited'].value_counts()
print(exit_counts)

In [None]:
## Text labels look better in the confusion matrix, so recode the flag as
## words: a value of 1 becomes 'Gone', anything else becomes 'Here'.
## (a one-off 'lambda' passed to .apply(), or a list comprehension, would
##  have the same effect as the vectorised np.where used here)
data['Exited'] = np.where(data['Exited'] == 1, 'Gone', 'Here')

## Rename the column to 'Status' as well - 'rename' behaves like 'drop':
## with inplace=True the original DataFrame is changed,
## otherwise a new DataFrame is returned
data.rename(columns={'Exited': 'Status'}, inplace=True)

data['Status'].value_counts()

In [None]:
# Confirm the target column now appears as 'Status'
data.info()

* Split the classification feature out of the dataset 

In [None]:
## Name of the feature being predicted ("the Right Answer")
labels_col = 'Status'

## y holds the target, X holds every other column (the predictors)
y = data[labels_col]
X = data.drop(columns=[labels_col])

In [None]:
# generate a sorted list of unique labels to use later
# (unique_labels extracts the distinct class labels found in y)
from sklearn.utils.multiclass import unique_labels
targetlabels = unique_labels(y)

**Check column names of categorical attributes**
<br>Features with text values (categorical attributes) need to be normalised
<br>by changing them to numeric types that the algorithms find easier to work with

In [None]:
# Names of the text / categorical predictors (candidates for one-hot encoding)
categorical_frame = X.select_dtypes(include=['object','category'])
categori = categorical_frame.columns
print(categori.tolist())

In [None]:
# For every categorical predictor, show how often each category occurs,
# followed by a blank line
for col, counts in ((name, X[name].value_counts()) for name in categori):
    print(f'Distribution of categories in {col}')
    print(counts, end='\n\n')

* 'one hot' encoding transforms a single column of text values into 
multiple columns of discrete values: 
it creates a new column for each unique value and puts
(one) in the column for which it is true and (zero) in the others

In [None]:
# Manual one-hot encoding: one indicator column per distinct Geography value
Country = pd.get_dummies(X['Geography'])
Country.head()

In [None]:
# Swap the text column for its indicator columns: remove 'Geography' and
# append the Country frame on the right
X = pd.concat([X.drop(columns='Geography'), Country], axis=1)
X.info()

In [None]:
# the automatic way adds the original feature name
# (called on the whole frame, get_dummies encodes the remaining
#  text / categorical columns and leaves the numeric ones as they are)
X = pd.get_dummies(X)

In [None]:
# Inspect the columns and dtypes produced by the one-hot encoding
X.info()

In [None]:
# Drop one-hot columns with no values (no data in this category).
# get_dummies() produces uint8 indicator columns on pandas < 2.0 and bool
# columns from pandas 2.0 onwards, so both dtypes are selected here;
# filtering on uint8 alone silently skips this clean-up on newer versions.
onehot = X.select_dtypes(include=['uint8', 'bool']).columns
for col in onehot:
    lc = len(X[col].unique())
    if lc == 1:
        # a single distinct value means the indicator never varies
        print('Dropping ',col, ' ::> ', lc)
        X.drop(col, axis=1, inplace=True)

In [None]:
# Column list after removing the constant one-hot columns
X.info()

**Check column names of numeric attributes**
<br>Features with numeric values need to be normalised
<br>by changing them to small numbers in a specific range (scaling)

In [None]:
# Names of the float64 / int64 predictors (candidates for scaling)
numeric_frame = X.select_dtypes(include=['float64','int64'])
numeri = numeric_frame.columns
print(numeri.tolist())

In [None]:
# The proper place to do scaling comes later in the pipeline ,,, 

***
**<br>Checking Correlations**<br>
_using X and y datasets (complete, not normalised)_<br>
_some argue that only the training data and labels should be used..._

In [None]:
# for graphs
import matplotlib.pyplot as plt
import seaborn as sns

# works best with numeric values for the target feature
from sklearn.preprocessing import LabelEncoder
## Feature being predicted ("the Right Answer")
## NOTE(review): LabelEncoder numbers the class labels in sorted order, so with
## the 'Gone' / 'Here' labels this presumably yields Gone=0, Here=1 -- i.e. a
## positive correlation points towards *staying*; confirm that is the polarity
## the charts are meant to show
ynum = LabelEncoder().fit_transform(y)

In [None]:
# Pearson Correlation
# pandas df.corr() gives different results than yellowbrick! so we do this directly with numpy

nf = 5        # (number of features to select from each end of the ranking)

# Pearson r of every feature against the numeric target, computed with numpy
cormx = [(col, np.corrcoef(X[col].values, ynum)[0, 1]) for col in X.columns]

# convert to dataframe
cordf = pd.DataFrame(cormx, columns=['Name','Score'])

# A constant column has zero variance, so its r is NaN; sort_values() puts NaN
# last, where it would otherwise occupy the "lowest" slots. Drop those rows
# first, then sort once and reuse the ranking for both ends.
ranked = cordf.dropna(subset=['Score']).sort_values('Score', ascending=False)

corhi = ranked.head(nf)   # strongest positive correlations
corlo = ranked.tail(nf)   # strongest negative correlations

# Join; with fewer than 2*nf usable features the two ends overlap,
# so keep each feature only once
corhl = pd.concat([corhi, corlo])
corhl = corhl[~corhl.index.duplicated()]

corcols = corhl['Name'].values

print('Pearson correlation with classification target')
print(corhl)

In [None]:
# Horizontal bars: one per selected feature, length = Pearson score
ax = sns.barplot(data=corhl, x='Score', y='Name')
ax.set(title='Pearson Correlation with Target')
plt.show()

**<br>Mutual Info Classification**
<br>Calculates a mutual information value for each independent variable with respect to the dependent variable, so that the ones with the highest information gain can be selected. In other words, it tells us how useful the feature X is at predicting the random variable Y: the score is zero when the two are independent, and higher numbers indicate better predictors (the score is never negative, but it is not capped at one).<br><br>Use this as the primary measure of dependence, and use Pearson r just to find the "polarity" (-/+)

In [None]:
from sklearn.feature_selection import mutual_info_classif

nf = 10        # (number of features to select)

# This takes a bit longer to run than the Pearson correlations ...
# random_state is fixed so that repeated runs give the same scores
minf = mutual_info_classif(X, y, random_state=111)

# one row per feature: its name and its mutual-information score
midf = pd.DataFrame({'Name': X.columns, 'Score': minf})

# the nf best-scoring features, best first
mihi = midf.sort_values(by='Score', ascending=False).head(nf)
micols = mihi['Name'].values

print('Mutual Information with classification target')
print(mihi)

In [None]:
# Horizontal bars: one per selected feature, length = mutual-information score
ax = sns.barplot(data=mihi, x='Score', y='Name')
ax.set(title='Mutual Info Correlation with Target')
plt.show()

In [None]:
# Combine both selections: start with the Pearson picks, then append every
# mutual-information pick that is not already present
merged = list(corcols)
for name in micols:
    if name not in merged:
        merged.append(name)

# new dataframe restricted to the merged feature list
mX = X.filter(items=merged)

mX.info()

- _Pairwise Correlation of features_

In [None]:
# Rank2D performs pairwise comparisons of each feature in the data set 
# with a specific metric or algorithm (default: Pearson correlation) 
# then returns them ranked as a lower left triangle diagram.

from yellowbrick.features.rankd import Rank2D

# Fit on the reduced feature frame (mX), then transform and render the diagram;
# the fitted visualizer is reused afterwards for its ranks_ matrix
visualizer = Rank2D()
visualizer.fit(mX, y)
visualizer.transform(mX)
visualizer.show()

- _Correlation Matrix "HeatMap"_

In [None]:
# plot the full heatmap with values
# NOTE(review): ListedColormap is imported but not referenced in this cell
from matplotlib.colors import ListedColormap
# Generate Color Map
colormap = sns.diverging_palette(220, 10, as_cmap=True)

# from yellowbrick.Rank2D
# (pairwise score matrix computed by the fitted Rank2D visualizer)
hm=visualizer.ranks_

# annotated square heatmap; tick labels follow the merged feature list,
# which is also the column order of the frame Rank2D was fitted on
plt.figure(figsize=(12,12))
sns.heatmap(hm, annot=True, cmap=colormap, xticklabels=merged, yticklabels=merged)
plt.show()