In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
churn = pd.read_csv('Churn_Modelling.csv')
churn

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [3]:
churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [4]:
churn.duplicated().any()

False

# data preprocessing

## drop irrelevant columns/data fields

In [5]:
churn.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [6]:
churn.drop(columns = ['RowNumber', 'CustomerId','Surname' ], inplace = True)

In [7]:
churn

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


# Encode categorical features to number(int/float)

In [8]:
# from sklearn.preprocessing import LabelEncoder

In [9]:
# le = LabelEncoder()

In [10]:
# churn['Geography'] = le.fit_transform(churn['Geography'])
# churn['Gender'] = le.fit_transform(churn['Gender'])

In [11]:
# churn

In [12]:
churn_encoded = pd.get_dummies(churn, columns = ['Geography', 'Gender'], drop_first = True, dtype = 'int64')

In [13]:
churn_encoded

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.00,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8,159660.80,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.00,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.10,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,39,5,0.00,2,1,0,96270.64,0,0,0,1
9996,516,35,10,57369.61,1,1,1,101699.77,0,0,0,1
9997,709,36,7,0.00,1,0,1,42085.58,1,0,0,0
9998,772,42,3,75075.31,2,1,0,92888.52,1,1,0,1


# separate features and target variables into in to X and y

In [14]:
X = churn_encoded.drop(columns= 'Exited')
y = churn_encoded['Exited']

In [15]:
X

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.00,1,1,1,101348.88,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,1,0
2,502,42,8,159660.80,3,1,0,113931.57,0,0,0
3,699,39,1,0.00,2,0,0,93826.63,0,0,0
4,850,43,2,125510.82,1,1,1,79084.10,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,39,5,0.00,2,1,0,96270.64,0,0,1
9996,516,35,10,57369.61,1,1,1,101699.77,0,0,1
9997,709,36,7,0.00,1,0,1,42085.58,0,0,0
9998,772,42,3,75075.31,2,1,0,92888.52,1,0,1


# Feature Scaling

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
scaler = StandardScaler()

In [18]:
X = scaler.fit_transform(X)

In [19]:
X

array([[-0.32622142,  0.29351742, -1.04175968, ..., -0.57873591,
        -0.57380915, -1.09598752],
       [-0.44003595,  0.19816383, -1.38753759, ..., -0.57873591,
         1.74273971, -1.09598752],
       [-1.53679418,  0.29351742,  1.03290776, ..., -0.57873591,
        -0.57380915, -1.09598752],
       ...,
       [ 0.60498839, -0.27860412,  0.68712986, ..., -0.57873591,
        -0.57380915, -1.09598752],
       [ 1.25683526,  0.29351742, -0.69598177, ...,  1.72790383,
        -0.57380915,  0.91241915],
       [ 1.46377078, -1.04143285, -0.35020386, ..., -0.57873591,
        -0.57380915, -1.09598752]])

# split the data into training and test sets

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=42)

In [22]:
X_train

array([[ 0.36701255, -0.66001848,  0.34135195, ..., -0.57873591,
        -0.57380915,  0.91241915],
       [-0.19171334,  0.29351742, -0.35020386, ...,  1.72790383,
        -0.57380915,  0.91241915],
       [-0.94702796, -1.42284721, -0.69598177, ..., -0.57873591,
         1.74273971,  0.91241915],
       ...,
       [ 0.87400456, -0.08789694, -1.38753759, ..., -0.57873591,
        -0.57380915, -1.09598752],
       [ 0.17042381,  0.38887101,  1.03290776, ..., -0.57873591,
        -0.57380915,  0.91241915],
       [ 0.48082708,  1.15169974, -1.38753759, ...,  1.72790383,
        -0.57380915,  0.91241915]])

# importing models

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# initialize models

In [24]:
lrc = LogisticRegression()
svc = SVC()
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier()
knn = KNeighborsClassifier()

# fit data

In [25]:
lrc.fit(X_train, y_train)
svc.fit(X_train, y_train)
dtc.fit(X_train, y_train)
rfc.fit(X_train, y_train)
knn.fit(X_train, y_train)

In [26]:
lrc_predict = lrc.predict(X_test)
svc_predict = svc.predict(X_test)
dtc_predict = dtc.predict(X_test)
rfc_predict = rfc.predict(X_test)
knn_predict = knn.predict(X_test)

In [27]:
print(lrc_predict, svc_predict, dtc_predict, rfc_predict, knn_predict)

[0 0 0 ... 0 0 0] [0 0 0 ... 1 0 0] [1 0 0 ... 0 0 1] [0 0 0 ... 1 0 1] [0 0 0 ... 1 0 0]


# Evaluation models

In [28]:
from sklearn import metrics

# Accuracy

In [29]:
lrc_acc = metrics.accuracy_score(y_test, lrc_predict)
svc_acc = metrics.accuracy_score(y_test, svc_predict)
dtc_acc = metrics.accuracy_score(y_test, dtc_predict)
rfc_acc = metrics.accuracy_score(y_test, rfc_predict)
knn_acc = metrics.accuracy_score(y_test, knn_predict)

In [30]:
print(lrc_acc, svc_acc, dtc_acc, rfc_acc, knn_acc)

0.811 0.856 0.7785 0.865 0.8295


# F1 Score

In [31]:
lrc_f1 = metrics.f1_score(y_test, lrc_predict)
svc_f1 = metrics.f1_score(y_test, svc_predict)
dtc_f1 = metrics.f1_score(y_test, dtc_predict)
rfc_f1 = metrics.f1_score(y_test, rfc_predict)
knn_f1 = metrics.f1_score(y_test, knn_predict)

In [32]:
print(lrc_f1, svc_f1, dtc_f1, rfc_f1, knn_f1)

0.2947761194029851 0.5102040816326531 0.4744958481613286 0.5833333333333334 0.4612954186413902


In [33]:
print(metrics.classification_report(y_test, lrc_predict))

              precision    recall  f1-score   support

           0       0.83      0.96      0.89      1607
           1       0.55      0.20      0.29       393

    accuracy                           0.81      2000
   macro avg       0.69      0.58      0.59      2000
weighted avg       0.78      0.81      0.77      2000



In [34]:
print(metrics.classification_report(y_test, svc_predict))

              precision    recall  f1-score   support

           0       0.87      0.97      0.92      1607
           1       0.77      0.38      0.51       393

    accuracy                           0.86      2000
   macro avg       0.82      0.68      0.71      2000
weighted avg       0.85      0.86      0.84      2000



In [35]:
print(metrics.classification_report(y_test, rfc_predict))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.74      0.48      0.58       393

    accuracy                           0.86      2000
   macro avg       0.81      0.72      0.75      2000
weighted avg       0.86      0.86      0.85      2000



# save the model

In [36]:
import joblib

In [37]:
rfc = RandomForestClassifier()
rfc.fit(X, y)

In [38]:
joblib.dump(rfc, 'churn_model')

['churn_model']

In [39]:
churn_model = joblib.load('churn_model')

# prepare new data

In [40]:
churn.columns

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')

In [41]:
def prepare_input(data):
    input_df = pd.DataFrame([data])
    # input_df['Geography'] = le.transform(input_df['Geography'])
    # input_df['Gender'] = le.transform(input_df['Gender'])
    scale_input = scaler.transform(input_df)
    return scale_input

In [42]:
def predict_for_new_user(data, model):
    prepared_data = prepare_input(data)
    prediction = model.predict(prepared_data)
    return prediction[0]

In [43]:
new_user_data = {
       'CreditScore' : 600 ,
       'Geography' : 'Germany' ,
       'Gender' : 'Male',
       'Age' : 45,
       'Tenure' : 2,
       'Balance' : 60000,
       'NumOfProducts' : 2,
       'HasCrCard' : 1,
       'IsActiveMember' : 1,
       'EstimatedSalary' : 50000
}
predict_for_new_user(new_user_data, churn_model)

NameError: name 'le' is not defined