In [7]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
!pip install xgboost
!pip install imbalanced_learn



In [9]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score,f1_score,recall_score

In [10]:
customer_data = pd.read_csv("/content/drive/MyDrive/Churn_Modelling.csv")
customer_data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [28]:
customer_data['Exited'].value_counts()

Exited
0    7963
1    2037
Name: count, dtype: int64

In [12]:
# Train
dataset = customer_data.drop(["RowNumber","Surname","CustomerId"],axis=1)

In [15]:
# Tạo Onehot
dataset_dummy = pd.get_dummies(dataset, drop_first=True)

x = dataset_dummy.drop(["Exited"],axis=1)
y = dataset_dummy["Exited"]
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2, random_state=42)

scaler = MinMaxScaler()
scaler.fit(x_train[['CreditScore','Age','Tenure','Balance','NumOfProducts','EstimatedSalary']])
x_train[['CreditScore','Age','Tenure','Balance','NumOfProducts','EstimatedSalary']] = scaler.transform(x_train[['CreditScore','Age','Tenure','Balance','NumOfProducts','EstimatedSalary']])
x_test[['CreditScore','Age','Tenure','Balance','NumOfProducts','EstimatedSalary']] = scaler.transform(x_test[['CreditScore','Age','Tenure','Balance','NumOfProducts','EstimatedSalary']])

# Đánh giá hiệu suất
def print_scores(y_true, y_pred):
  print(classification_report(y_true,y_pred))


In [16]:
# Train model với imbalance data
model = LogisticRegression(class_weight="balanced")
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
print_scores(y_test,y_pred)

              precision    recall  f1-score   support

           0       0.91      0.72      0.80      1607
           1       0.38      0.71      0.50       393

    accuracy                           0.72      2000
   macro avg       0.65      0.72      0.65      2000
weighted avg       0.81      0.72      0.74      2000



In [29]:
# Sủ dụng phương pháp Undersampling : Near Miss
dataset_dummy = pd.get_dummies(dataset, drop_first=True)

X = dataset_dummy.drop(["Exited"],axis=1)
y = dataset_dummy["Exited"]

from imblearn.under_sampling import NearMiss
nm = NearMiss()

x_res , y_res = nm.fit_resample(X,y)
x_res.shape, y_res.shape

((4074, 11), (4074,))

In [31]:
x_train,x_test,y_train,y_test = train_test_split(x_res,y_res,test_size=0.2, random_state=42)

scaler = MinMaxScaler()
scaler.fit(x_train[['CreditScore','Age','Tenure','Balance','NumOfProducts','EstimatedSalary']])
x_train[['CreditScore','Age','Tenure','Balance','NumOfProducts','EstimatedSalary']] = scaler.transform(x_train[['CreditScore','Age','Tenure','Balance','NumOfProducts','EstimatedSalary']])
x_test[['CreditScore','Age','Tenure','Balance','NumOfProducts','EstimatedSalary']] = scaler.transform(x_test[['CreditScore','Age','Tenure','Balance','NumOfProducts','EstimatedSalary']])

model = LogisticRegression()
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
print_scores(y_test,y_pred)


              precision    recall  f1-score   support

           0       0.85      0.97      0.91       443
           1       0.96      0.80      0.87       372

    accuracy                           0.89       815
   macro avg       0.90      0.88      0.89       815
weighted avg       0.90      0.89      0.89       815



In [32]:
# Sử dụng kỹ thuật oversampling: SMOTE
from imblearn.over_sampling import SMOTE

sm = SMOTE()
dataset_dummy = pd.get_dummies(dataset,drop_first=True)

x = dataset_dummy.drop(["Exited"],axis=1)
y = dataset_dummy['Exited']

x_sm, y_sm = sm.fit_resample(x, y)
x_sm.shape, y_sm.shape


((15926, 11), (15926,))

In [33]:
x_train,x_test,y_train,y_test = train_test_split(x_sm,y_sm,test_size=0.2, random_state=42)

scaler = MinMaxScaler()
scaler.fit(x_train[['CreditScore','Age','Tenure','Balance','NumOfProducts','EstimatedSalary']])
x_train[['CreditScore','Age','Tenure','Balance','NumOfProducts','EstimatedSalary']] = scaler.transform(x_train[['CreditScore','Age','Tenure','Balance','NumOfProducts','EstimatedSalary']])
x_test[['CreditScore','Age','Tenure','Balance','NumOfProducts','EstimatedSalary']] = scaler.transform(x_test[['CreditScore','Age','Tenure','Balance','NumOfProducts','EstimatedSalary']])

model = LogisticRegression()
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
print_scores(y_test,y_pred)


              precision    recall  f1-score   support

           0       0.78      0.78      0.78      1633
           1       0.77      0.77      0.77      1553

    accuracy                           0.77      3186
   macro avg       0.77      0.77      0.77      3186
weighted avg       0.77      0.77      0.77      3186

