In [1]:
# Importing the libraries
import xgboost
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
df = pd.read_csv('Churn_Modelling.csv')
print(df.head())
print("\n")

   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

In [3]:
# Importing the dataset

print(df['Exited'].value_counts())
print("\n")

unique=df.Geography.unique()
print(unique)

count=df.Gender.unique()
print(count)

0    7963
1    2037
Name: Exited, dtype: int64


['France' 'Spain' 'Germany']
['Female' 'Male']


# Upsampling to negate Class Imbalance

In [4]:
from sklearn.utils import resample
df_majority=df[df.Exited==0] ## all rows where Exited==0
df_minority=df[df.Exited==1] ## all rows where Exited==1

df_minority_upsampled=resample(df_minority,replace=True,n_samples=7963,random_state=123)
df_upsampled=pd.concat([df_minority_upsampled,df_majority])

In [6]:
print(df_upsampled['Exited'].value_counts())

1    7963
0    7963
Name: Exited, dtype: int64


In [7]:
X = df.drop(['Exited','RowNumber','CustomerId','Surname'], axis=1)
y = df.Exited

X_upsampled = df_upsampled.drop(['Exited','RowNumber','CustomerId','Surname'], axis=1)
y_upsampled = df_upsampled.Exited

In [8]:
### Convert Non Numerical Categorical column into numeric ones
##Check the categories which are non numerical

categoryList = list(X.select_dtypes(include=['object']).columns)
print(categoryList)
print("\n")

## Create dummy variables for non numerical categorical variables
dummies = pd.get_dummies(X[categoryList], prefix= categoryList)
print(dummies.head())
print("\n")

X.drop(categoryList, axis=1, inplace = True) ## Drop Non numerical categorical columns
print(X.head())
print("\n")


X=pd.concat([X,dummies], axis =1 ) ## added encoded categorical columns


#### Do the same steps for upsampled #####
categoryList = list(X_upsampled.select_dtypes(include=['object']).columns)
print(categoryList)
print("\n")

## Create dummy variables for non numerical categorical variables
dummies = pd.get_dummies(X_upsampled[categoryList], prefix= categoryList)
print(dummies.head())
print("\n")

X_upsampled.drop(categoryList, axis=1, inplace = True) ## Drop Non numerical categorical columns
print(X_upsampled.head())
print("\n")


X_upsampled=pd.concat([X_upsampled,dummies], axis =1 ) ## added encoded categorical columns

['Geography', 'Gender']


   Geography_France  Geography_Germany  Geography_Spain  Gender_Female  \
0                 1                  0                0              1   
1                 0                  0                1              1   
2                 1                  0                0              1   
3                 1                  0                0              1   
4                 0                  0                1              1   

   Gender_Male  
0            0  
1            0  
2            0  
3            0  
4            0  


   CreditScore  Age  Tenure    Balance  NumOfProducts  HasCrCard  \
0          619   42       2       0.00              1          1   
1          608   41       1   83807.86              1          0   
2          502   42       8  159660.80              3          1   
3          699   39       1       0.00              2          0   
4          850   43       2  125510.82              1          1   

   IsActiveMembe

In [9]:
print(X.head())
print("\n")
print(X_upsampled.head())

   CreditScore  Age  Tenure    Balance  NumOfProducts  HasCrCard  \
0          619   42       2       0.00              1          1   
1          608   41       1   83807.86              1          0   
2          502   42       8  159660.80              3          1   
3          699   39       1       0.00              2          0   
4          850   43       2  125510.82              1          1   

   IsActiveMember  EstimatedSalary  Geography_France  Geography_Germany  \
0               1        101348.88                 1                  0   
1               1        112542.58                 0                  0   
2               0        113931.57                 1                  0   
3               0         93826.63                 1                  0   
4               1         79084.10                 0                  0   

   Geography_Spain  Gender_Female  Gender_Male  
0                0              1            0  
1                1              1         

In [13]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train, X_test_upsampled, y_train, y_test_upsampled = train_test_split(X_upsampled, y_upsampled, test_size = 0.2, random_state = 0)

# Importing few Bagging and Boosting Algorithms

In [11]:
from sklearn. ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score 

  from numpy.core.umath_tests import inner1d


# AdaBoost

In [14]:
## Boosting
#Initially, all observations in the dataset are given equal weights.
#A model is built on a subset of data.
#Using this model, predictions are made on the whole dataset.
#Errors are calculated by comparing the predictions and actual values.
#While creating the next model, higher weights are given to the data points which were predicted incorrectly.
#Weights can be determined using the error value. For instance, higher the error more is the weight assigned to the observation.
#This process is repeated until the error function does not change, or the maximum limit of the number of estimators is reached.

adb = AdaBoostClassifier(DecisionTreeClassifier(),n_estimators = 3, learning_rate = 0.001)
adb.fit(X_train,y_train) ## Tries to find the pattern in the data

print("Training=",adb.score(X_train,y_train))
print("Testing=",adb.score(X_test,y_test))

accuracies_adaboost= cross_val_score(estimator = adb, X = X_train, y = y_train, cv = 10) 
accuracies_adaboost_mean=accuracies_adaboost.mean()*100
print("Accuracy AdaBoost=",accuracies_adaboost_mean)

accuracies_adaboost_std=accuracies_adaboost.std()*100
print("Standard Deviation AdaBoost=",accuracies_adaboost_std)

Training= 1.0
Testing= 0.9705
Accuracy AdaBoost= 90.59663909554395
Standard Deviation AdaBoost= 0.8373626437711816


# Gradient Boost Classifier

In [15]:
## Gradient Boost Classifier

from sklearn.ensemble import GradientBoostingClassifier
model= GradientBoostingClassifier(learning_rate=0.01,random_state=1)
model.fit(X_train, y_train)

gbc_train=model.score(X_train,y_train)
print("gbc_train=",gbc_train)

gbc_test=model.score(X_test,y_test)
print("gbc_test=",gbc_test)

accuracies_gboost= cross_val_score(estimator = model, X = X_train, y = y_train, cv = 10) 
accuracies_gboost_mean=accuracies_gboost.mean()*100
print("Accuracy Gradient Boost=",accuracies_gboost_mean)

accuracies_gboost_std=accuracies_gboost.std()*100
print("Standard Deviation Gradient Boost=",accuracies_gboost_std)

gbc_train= 0.7637362637362637
gbc_test= 0.803
Accuracy Gradient Boost= 75.98894074519046
Standard Deviation Gradient Boost= 1.1003439524579555


# Meta Bagging Estimator

In [16]:
# Bagging meta estimator
#Random subsets are created from the original dataset (Bootstrapping).
#The subset of the dataset includes all features.
#A user-specified base estimator is fitted on each of these smaller sets.
#Predictions from each model are combined to get the final result.


from sklearn.ensemble import BaggingClassifier
from sklearn import tree

model = BaggingClassifier(tree.DecisionTreeClassifier(random_state=1))
model.fit(X_train, y_train)

print("mgc_train=",model.score(X_train, y_train))
print("mgc_test=",model.score(X_test,y_test))

accuracies_Bagging_MEst= cross_val_score(estimator = model, X = X_train, y = y_train, cv = 10) 
accuracies_Bagging_MEst_Mean=accuracies_Bagging_MEst.mean()*100

print("Accuracy Bagging=",accuracies_Bagging_MEst_Mean)

accuracies_bagging_MEst_std=accuracies_Bagging_MEst_Mean.std()*100
print("Standard Deviation Bagging=",accuracies_bagging_MEst_std)

mgc_train= 0.9974097331240188
mgc_test= 0.984
Accuracy Bagging= 93.0223125856784
Standard Deviation Bagging= 0.0


# Voting Classifier

In [17]:
## Bagging on multiple classifiers:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

lr = LogisticRegression()
svm=SVC(kernel = 'rbf', C=10,gamma=1)
knn=KNeighborsClassifier(n_neighbors = 4, metric = 'minkowski', p = 2)
nb=GaussianNB()
dt=DecisionTreeClassifier(criterion = 'entropy')
rf=RandomForestClassifier(n_estimators = 10, criterion = 'entropy')
#ann_classifier=KerasClassifier(build_fn=buildClassifier_DP,batch_size = 25, epochs = 100)


evc = VotingClassifier( estimators= [('lr',lr),('dt',dt),('svm',svm),('knn',knn),('nb',nb),('rf',rf)], voting = 'hard')
evc.fit(X_train,y_train)

print("evc_train=",evc.score(X_train,y_train))
print("evc_test",evc.score(X_test,y_test))

#Training_Testing_Difference_EVC=(evc.score(X_train, y_train) - evc.score(X_test, y_test))*100
#print(Training_Testing_Difference_EVC)

accuracies_EVC= cross_val_score(estimator = evc, X = X_train, y = y_train, cv = 10) 
accuracies_EVC_mean=accuracies_EVC.mean()*100
print("Accuracy EVC=",accuracies_EVC_mean)

accuracies_EVC_std=accuracies_EVC_mean.std()*100
print("Standard Deviation EVC=",accuracies_EVC_std)

  if diff:


evc_train= 0.984850863422292


  if diff:


evc_test 0.977


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Accuracy EVC= 92.11932905133776
Standard Deviation EVC= 0.0


  if diff:


# Max Voting

In [None]:
import statistics
model1 = tree.DecisionTreeClassifier()
model2 = KNeighborsClassifier()
model3= LogisticRegression()

model1.fit(X_train,y_train)
model2.fit(X_train,y_train)
model3.fit(X_train,y_train)

pred1=model1.predict(X_test)
pred2=model2.predict(X_test)
pred3=model3.predict(X_test)

final_pred = np.array([])
for i in range(0,len(X_test)):
    final_pred = np.append(final_pred, statistics.mode([pred1[i], pred2[i], pred3[i]]))
    
#print("maxvoting_test",evc.score(X_test,final_pred))
#print("\n")

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix,classification_report

cm = confusion_matrix(y_test, final_pred)
print(classification_report(y_test, final_pred))

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=25) ## Hyperparameter
classifier.fit(X_train,y_train)
predictions = classifier.predict(X_test)


from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

from sklearn.model_selection import cross_val_score 

accuracies_logistic= cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10) 
accuracies_logistic_mean=accuracies_logistic.mean()*100
print("Mean Accuracy:Random Forest=",accuracies_logistic_mean)

accuracies_logistic_std=accuracies_logistic.std()*100
print("Standard Deviation:Random Forest=",accuracies_logistic_std)


## Hyper Parameter Tuning
print('Parameters currently in use:\n')
print(classifier.get_params())
print("\n")

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 200, num = 5)] ## play with start and stop

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 20, num = 5)] ## change 10,20 and 2
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,15]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,10]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

from sklearn.model_selection import RandomizedSearchCV

rf_random = RandomizedSearchCV(estimator = classifier, param_distributions = random_grid, n_iter = 100, cv = 3, 
                               verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train,y_train)
print("Best Parameters are:",rf_random.best_params_)

In [None]:
best_random = rf_random.best_estimator_
best_random.fit(X_train,y_train)

predictions = best_random.predict(X_test)


from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

from sklearn.model_selection import cross_val_score 

accuracies_rf= cross_val_score(estimator = best_random, X = X_train, y = y_train, cv = 10) 
accuracies_rf_mean=accuracies_rf.mean()*100
print("Mean Accuracy:Random Forest=",accuracies_logistic_mean)

accuracies_rf_std=accuracies_logistic.std()*100
print("Standard Deviation:Random Forest=",accuracies_rf_std)

In [None]:
X_test.info()

# Light GBM

In [None]:
## conda install -c conda-forge lightgbm

import lightgbm as lgb
d_train = lgb.Dataset(X_train, label=y_train)
params = {}
params['learning_rate'] = 0.003
params['boosting_type'] = 'gbdt' ## gradient boosting
params['objective'] = 'binary' ## since its a classification problem
params['metric'] = 'binary_logloss'
params['sub_feature'] = 0.5
params['num_leaves'] = 10
params['min_data'] = 50
params['max_depth'] = 10
clf = lgb.train(params, d_train, 100)


#Prediction
y_pred=clf.predict(X_test)

#convert into binary values
for i in range(0,2000): ## 10000 indicates the number of rows in the dataset
    if y_pred[i]>=0.5:       # setting threshold to .5
       y_pred[i]=1
    else:  
       y_pred[i]=0
    
from sklearn.metrics import confusion_matrix,classification_report
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))  

# XgBoost

In [None]:
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

# Making the Confusion Matrix

from sklearn.metrics import confusion_matrix,classification_report
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))

from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)

mean_score = cross_val_score(estimator = classifier, X = X_train, y = y_train, scoring="roc_auc", cv = 7).mean()
std_score = cross_val_score(estimator = classifier, X = X_train, y = y_train, scoring="roc_auc", cv = 7).std()


print ("Accuracy=",mean_score*100)
print("Standard Deviation",std_score*100)

# Cat Boost

In [None]:
# Importing the dataset
#from catboost import CatBoostRegressor -- use "pip install catboost" 

df = pd.read_csv('Churn_Modelling.csv')

X = df.drop(['Exited','RowNumber','CustomerId','Surname'], axis=1)
y = df.Exited

X = df.drop(['Exited','RowNumber','CustomerId','Surname'], axis=1)
y = df.Exited

X_upsampled = df_upsampled.drop(['Exited','RowNumber','CustomerId','Surname'], axis=1)
y_upsampled = df_upsampled.Exited

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train, X_test_upsampled, y_train, y_test_upsampled = train_test_split(X_upsampled, y_upsampled, test_size = 0.2, random_state = 0)


categorical_features_indices = np.where(X.dtypes == np.object)[0]

In [None]:
X.info()

In [None]:
categorical_features_indices

In [None]:
from catboost import CatBoostClassifier

### In Case of Regression
##from catboost import CatBoostRegressor
##model=CatBoostRegressor(iterations=50, depth=3, learning_rate=0.1, loss_function='RMSE')
##model.fit(X_train, y_train,cat_features=categorical_features_indices,eval_set=(X_validation, y_validation),plot=True)

model=CatBoostClassifier(iterations=100, depth=3, learning_rate=0.1, loss_function='Logloss')
model.fit(X_train, y_train,cat_features=categorical_features_indices,eval_set=(X_test, y_test),plot=True)

y_pred=model.predict(X_test)

from sklearn.metrics import confusion_matrix,classification_report
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))

# Stacking

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

## Create a stacking with xgboost and random Forest.
## Homework: add Gradient Boost, AdaBoost, Light GBM and CatBoost
xgb = XGBClassifier()
xgb.fit(X_train,y_train)
pred_val_xgb=xgb.predict(X_train)
test_pred_xgb=xgb.predict(X_test)




best_random.fit(X_train,y_train) ## Best Hyperparameters of random Forest found above
pred_val_rf=best_random.predict(X_train)
test_pred_rf=best_random.predict(X_test)


lr = LogisticRegression()
stacked_predictions=np.column_stack((pred_val_rf,pred_val_xgb))

#stacked_predictions[0:10]
stacked_test_predictions=np.column_stack((test_pred_rf,test_pred_xgb))
#stacked_test_predictions[0:10]

## Building Meta Model
lr.fit(stacked_predictions,y_train)

y_pred=lr.predict(stacked_test_predictions)
y_pred


from sklearn.metrics import confusion_matrix,classification_report
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred)) 

accuracies_lr= cross_val_score(estimator = lr, X = stacked_predictions, y = y_train, cv = 10) 
accuracies_lr_mean=accuracies_lr.mean()*100
print("Accuracy Stacking=",accuracies_lr_mean)

accuracies_lr_std=accuracies_lr_mean.std()*100
print("Standard Deviation lr=",accuracies_lr_std)

# Blending

In [None]:
df = pd.read_csv('Churn_Modelling.csv')

X = df.drop(['Exited','RowNumber','CustomerId','Surname'], axis=1)
y = df.Exited

X = df.drop(['Exited','RowNumber','CustomerId','Surname'], axis=1)
y = df.Exited

X_upsampled = df_upsampled.drop(['Exited','RowNumber','CustomerId','Surname'], axis=1)
y_upsampled = df_upsampled.Exited


### Convert Non Numerical Categorical column into numeric ones
##Check the categories which are non numerical

categoryList = list(X.select_dtypes(include=['object']).columns)
print(categoryList)
print("\n")

## Create dummy variables for non numerical categorical variables
dummies = pd.get_dummies(X[categoryList], prefix= categoryList)
print(dummies.head())
print("\n")

X.drop(categoryList, axis=1, inplace = True) ## Drop Non numerical categorical columns
print(X.head())
print("\n")


X=pd.concat([X,dummies], axis =1 ) ## added encoded categorical columns


#### Do the same steps for upsampled #####
categoryList = list(X_upsampled.select_dtypes(include=['object']).columns)
print(categoryList)
print("\n")

## Create dummy variables for non numerical categorical variables
dummies = pd.get_dummies(X_upsampled[categoryList], prefix= categoryList)
print(dummies.head())
print("\n")

X_upsampled.drop(categoryList, axis=1, inplace = True) ## Drop Non numerical categorical columns
print(X_upsampled.head())
print("\n")


X_upsampled=pd.concat([X_upsampled,dummies], axis =1 ) ## added encoded categorical columns



from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train, X_test_upsampled, y_train, y_test_upsampled = train_test_split(X_upsampled, y_upsampled, test_size = 0.2, random_state = 0)

In [None]:
## Mix independent variables and stacked outputs 

from sklearn.model_selection import train_test_split


model1 = rf_random.best_estimator_ ## Best hyperparameters of Random Forest
model1.fit(X_train, y_train)
val_pred1=model1.predict(X_train)
test_pred1=model1.predict(X_test)


model2 = XGBClassifier()
model2.fit(X_train,y_train)
val_pred2=model2.predict(X_train)
test_pred2=model2.predict(X_test)


stacked_predictions=np.column_stack((X_train,val_pred1,val_pred2))
stacked_test_predictions=np.column_stack((X_test,test_pred1,test_pred2))

stacked_predictions=pd.DataFrame(stacked_predictions)
stacked_test_predictions=pd.DataFrame(stacked_test_predictions)

## Building Meta Model
lr = LogisticRegression()
lr.fit(stacked_predictions,y_train)

y_pred=lr.predict(stacked_test_predictions)
y_pred

from sklearn.metrics import confusion_matrix,classification_report
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))  