# Preprocessing

In [5]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd
pd.set_option('display.precision', 3)

# Data Visualisation Libraries
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'
import seaborn as sns
sns.set_style('darkgrid')

# Statistics
from scipy.stats import chi2_contingency
from imblearn.over_sampling import SMOTE, SMOTENC

# Machine Learning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import learning_curve

from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, recall_score, precision_score, auc, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix
import scikitplot as skplt


In [6]:
# Read the training and test splits from ../Dataset.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../Dataset/train_data.csv', '../Dataset/test_data.csv')
)

## Parameters and variables 

In [21]:
# Seed for the stochastic steps; it is passed to the oversampler when the
# training set is resampled further down.
random_state = 42

## Drop columns 

In [8]:
# Remove the 'RowNumber', 'CustomerId' and 'Surname' columns from the training frame.
# Reassigning (rather than `inplace=True`) together with `errors='ignore'` keeps
# this cell safe to re-run: a second execution is a no-op instead of a KeyError.
train_data = train_data.drop(
    columns=['RowNumber', 'CustomerId', 'Surname'],
    errors='ignore',
)
train_data.columns

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')

In [9]:
# Remove the 'RowNumber', 'CustomerId' and 'Surname' columns from the test frame.
# Reassigning (rather than `inplace=True`) together with `errors='ignore'` keeps
# this cell safe to re-run: a second execution is a no-op instead of a KeyError.
test_data = test_data.drop(
    columns=['RowNumber', 'CustomerId', 'Surname'],
    errors='ignore',
)
test_data.columns

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')

In [7]:
# Column groups by variable type; `categorical` drives the chi-square test below.
continuous = [
    'Age',
    'CreditScore',
    'Balance',
    'EstimatedSalary',
]
categorical = [
    'Geography',
    'Gender',
    'Tenure',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
]

## Feature selection

'EstimatedSalary' displays a uniform distribution for both types of customers (as seen in the EDA step) and can be dropped, since it does not provide any value in predicting our target variable.

In [10]:
# Chi-square test of independence between each categorical feature and 'Exited'.
chi2_results = [
    chi2_contingency(pd.crosstab(train_data[column], train_data['Exited']))
    for column in categorical
]

# The first two entries of each result are the test statistic and its p-value.
chi2_array = [result[0] for result in chi2_results]
p_array = [result[1] for result in chi2_results]

df_chi = pd.DataFrame(
    {'Variable': categorical, 'Chi-square': chi2_array, 'p-value': p_array}
)
# Strongest association with the target first.
df_chi.sort_values(by='Chi-square', ascending=False)

Unnamed: 0,Variable,Chi-square,p-value
3,NumOfProducts,1233.595,3.767e-267
0,Geography,230.748,7.828999999999999e-51
5,IsActiveMember,195.315,2.199e-44
1,Gender,90.173,2.183e-21
2,Tenure,15.197,0.125
4,HasCrCard,0.301,0.5833


'Tenure' and 'HasCrCard' have a small chi-square statistic and a p-value greater than 0.05 (the standard cut-off value), so we cannot reject the hypothesis that they are independent of 'Exited'. This supports our initial hypothesis that these two features do not convey any useful information.

### Drop 3 columns: EstimatedSalary, Tenure, HasCrCard

In [12]:
# Columns removed by the feature-selection step.
features_drop = [
    'EstimatedSalary',
    'Tenure',
    'HasCrCard',
]
train_data = train_data.drop(columns=features_drop)

## Encoding Categorical Features

In [13]:
# Gender: LabelEncoder assigns integer codes in sorted label order.
# The fitted encoder is kept (instead of being discarded) so that the mapping
# stays inspectable via `gender_encoder.classes_` and can be reused.
gender_encoder = LabelEncoder()
train_data['Gender'] = gender_encoder.fit_transform(train_data['Gender'])

# Geography: collapse to a single indicator, Germany (1) vs. Spain/France (0).
geography_codes = {
    'Germany': 1,
    'Spain': 0,
    'France': 0,
}
geography_encoded = train_data['Geography'].map(geography_codes)

# `Series.map` silently yields NaN for any value missing from the dict. That
# also happens when this cell is re-run on the already-encoded column, which
# would turn the whole feature into NaN. Fail loudly instead of corrupting it.
unmapped = geography_encoded.isna()
if unmapped.any():
    unexpected = sorted(train_data.loc[unmapped, 'Geography'].astype(str).unique())
    raise ValueError(
        f"Unexpected 'Geography' values (was this cell already run?): {unexpected}"
    )

train_data['Geography'] = geography_encoded

## Scaling

In [14]:
# Standardise the listed numeric columns; the scaler is fitted on the training frame.
scl_columns = ['CreditScore', 'Age', 'Balance']

scaler = StandardScaler()
scaler.fit(train_data[scl_columns])
train_data[scl_columns] = scaler.transform(train_data[scl_columns])


In [17]:
# Separate the feature matrix from the 'Exited' target column.
X_train, Y_train = train_data.drop(columns='Exited'), train_data['Exited']

## Addressing Class Imbalance using SMOTE function

In [19]:
# Class counts of the target before any resampling is applied.
Y_train.value_counts()

Exited
0    6356
1    1644
Name: count, dtype: int64

There is an imbalance in the classes to be predicted, with one class (0 – retained) much more prevalent than the other (1 – churned).

In [22]:
# Plain SMOTE interpolates every column, so the encoded categorical features
# would receive fractional, meaningless values in the synthetic rows.
# SMOTENC interpolates only the continuous columns and takes each categorical
# value from the most frequent category among the nearest neighbours.
# The mask marks the columns of X_train that are listed in `categorical`.
categorical_mask = X_train.columns.isin(categorical)

over = SMOTENC(
    categorical_features=categorical_mask,
    sampling_strategy='auto',
    random_state=random_state,
)
X_train, Y_train = over.fit_resample(X_train, Y_train)

# Both classes should now have the same number of rows.
Y_train.value_counts()

Exited
0    6356
1    6356
Name: count, dtype: int64