In [23]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv("churn.csv")

In [4]:
df.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [5]:
df.shape

(10000, 14)

In [9]:
def label_encoder(column):
    le = LabelEncoder().fit(column)
    print(column.name, le.classes_)
    return le.transform(column)

In [10]:
cols = ["Geography", "Gender"]

In [11]:
for col in cols:
    df[col] = label_encoder(df[col])

Geography ['France' 'Germany' 'Spain']
Gender ['Female' 'Male']


In [12]:
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [13]:
df = df.drop(["RowNumber", "CustomerId", "Surname"], axis=1)

In [15]:
X = df.drop(["Exited"], axis=1)
y = df["Exited"]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4242)

In [17]:
X_train.shape, X_test.shape

((8000, 10), (2000, 10))

In [18]:
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_pred = nb_model.predict(X_test)
print("GaussianNB: ", accuracy_score(y_test, y_pred))

GaussianNB:  0.782


In [19]:
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)
print("Decision Tree: ", accuracy_score(y_test, y_pred))

Decision Tree:  0.7845


In [20]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print("Random Forest: ", accuracy_score(y_test, y_pred))

Random Forest:  0.852


In [21]:
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)
print("KNN: ", accuracy_score(y_test, y_pred))

KNN:  0.759


In [22]:
svc_model = SVC()
svc_model.fit(X_train, y_train)
y_pred = svc_model.predict(X_test)
print("SVC: ", accuracy_score(y_test, y_pred))

SVC:  0.7885


In [24]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
print("XGB: ", accuracy_score(y_test, y_pred))

XGB:  0.8485


In [25]:
pickle.dump(rf_model, open("RF.pkl", "wb"))