# Lab - Imbalanced Data

We will be using the files_for_lab/customer_churn.csv dataset to build a churn predictor.

In [1]:
import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import imblearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

Instructions
Load the dataset and explore the variables.

In [3]:
data = pd.read_csv('customer_churn.csv')
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


We will try to predict variable Churn using a logistic regression on variables tenure, SeniorCitizen,MonthlyCharges.

In [4]:
# Slice the data we want to observe

In [5]:
data.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [6]:
data2 = data[['tenure','SeniorCitizen', 'MonthlyCharges', 'Churn']]
data2

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,Churn
0,1,0,29.85,No
1,34,0,56.95,No
2,2,0,53.85,Yes
3,45,0,42.30,No
4,2,0,70.70,Yes
...,...,...,...,...
7038,24,0,84.80,No
7039,72,0,103.20,No
7040,11,0,29.60,No
7041,4,1,74.40,Yes


In [7]:
data2.dtypes

tenure              int64
SeniorCitizen       int64
MonthlyCharges    float64
Churn              object
dtype: object

We will try to predict variable Churn using a logistic regression on variables tenure, SeniorCitizen, MonthlyCharges.

In [8]:
data3 = data.drop(['tenure','SeniorCitizen', 'MonthlyCharges','Churn'], axis=1)
data3

Unnamed: 0,customerID,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,TotalCharges
0,7590-VHVEG,Female,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85
1,5575-GNVDE,Male,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,1889.5
2,3668-QPYBK,Male,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,108.15
3,7795-CFOCW,Male,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),1840.75
4,9237-HQITU,Female,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,151.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,Yes,Yes,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,1990.5
7039,2234-XADUH,Female,Yes,Yes,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),7362.9
7040,4801-JZAZL,Female,Yes,Yes,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,346.45
7041,8361-LTMKD,Male,Yes,No,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,306.6


In [9]:
num = data2.select_dtypes(include = np.number)
cat = data2.select_dtypes(include = np.object)
num
cat

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,1,0,29.85
1,34,0,56.95
2,2,0,53.85
3,45,0,42.30
4,2,0,70.70
...,...,...,...
7038,24,0,84.80
7039,72,0,103.20
7040,11,0,29.60
7041,4,1,74.40


Unnamed: 0,Churn
0,No
1,No
2,Yes
3,No
4,Yes
...,...
7038,No
7039,No
7040,No
7041,Yes


In [10]:
cat = pd.get_dummies(cat,drop_first=True)
cat.head()

Unnamed: 0,Churn_Yes
0,0
1,0
2,1
3,0
4,1


In [11]:
data2 = pd.concat([num, cat], axis=1)
data2

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,Churn_Yes
0,1,0,29.85,0
1,34,0,56.95,0
2,2,0,53.85,1
3,45,0,42.30,0
4,2,0,70.70,1
...,...,...,...,...
7038,24,0,84.80,0
7039,72,0,103.20,0
7040,11,0,29.60,0
7041,4,1,74.40,1


Extract the independent variables and scale them.

In [12]:
y = data2['Churn_Yes']
X = data2.drop(['Churn_Yes'], axis=1)

In [13]:
transformer = StandardScaler().fit(X)
X_scaled = pd.DataFrame(transformer.transform(X),columns=X.columns)
X_scaled.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,-1.277445,-0.439916,-1.160323
1,0.066327,-0.439916,-0.259629
2,-1.236724,-0.439916,-0.36266
3,0.514251,-0.439916,-0.746535
4,-1.236724,-0.439916,0.197365


Build the logistic regression model.

In [31]:
# Assign train_test_split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=77)

In [32]:
# Fit model and run LogisticRegression
from sklearn.linear_model import LogisticRegression
classification = LogisticRegression(random_state=0, solver='saga',
                  multi_class='multinomial').fit(X_train, y_train)

In [33]:
predictions = classification.predict(X_test)
classcore = classification.score(X_test, y_test)
print("The classification score of the model is of", classcore)

The classification score of the model is of 0.7892122072391767


In [34]:
from sklearn import neighbors
clf = neighbors.KNeighborsClassifier(n_neighbors=3, weights='uniform')
clf.fit(X, y)
predictions_clf = clf.predict(X_test)
clf.score(X_test, y_test)

KNeighborsClassifier(n_neighbors=3)

0.7395315826827538

In [35]:
print(y_test.value_counts())

0    1042
1     367
Name: Churn_Yes, dtype: int64


In [36]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

array([[950,  92],
       [205, 162]], dtype=int64)

Oversample 

In [37]:
from sklearn.utils import resample

In [38]:
train = pd.concat([X_train, y_train],axis=1)
train.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,Churn_Yes
3013,-0.137274,-0.439916,0.207336,0
3115,-0.7888,-0.439916,0.654359,1
1378,1.043616,-0.439916,0.067745,0
2031,-1.114563,-0.439916,-1.319855,0
866,1.206498,-0.439916,-0.856214,0


In [39]:
no_churn = train[train['Churn_Yes']==0]
yes_churn = train[train['Churn_Yes']==1]

In [40]:
yes_churn_oversampled = resample(yes_churn, replace=True, n_samples = len(no_churn), random_state=0)

In [41]:
display(no_churn.shape)
display(yes_churn_oversampled.shape)

(4132, 4)

(4132, 4)

In [42]:
train_oversampled = pd.concat([no_churn,yes_churn_oversampled])
train_oversampled.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,Churn_Yes
3013,-0.137274,-0.439916,0.207336,0
1378,1.043616,-0.439916,0.067745,0
2031,-1.114563,-0.439916,-1.319855,0
866,1.206498,-0.439916,-0.856214,0
4929,1.247218,2.273159,0.639403,0


In [43]:
y_train_over = train_oversampled['Churn_Yes'].copy()
X_train_over = train_oversampled.drop('Churn_Yes',axis = 1).copy()

Undersample

In [44]:
no_churn_undersampled = resample(no_churn, replace=False, n_samples = len(yes_churn), random_state=0)

In [45]:
display(yes_churn.shape)
display(no_churn_undersampled.shape)

(1502, 4)

(1502, 4)

In [46]:
train_undersampled = pd.concat([yes_churn,no_churn_undersampled])
train_undersampled.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,Churn_Yes
3115,-0.7888,-0.439916,0.654359,1
6926,0.962175,-0.439916,0.302058,1
1053,-1.114563,-0.439916,-0.839596,1
2679,-1.155283,-0.439916,0.328647,1
131,-0.300156,-0.439916,-0.20479,1


In [49]:
y_train_under = train_undersampled['Churn_Yes'].copy()
X_train_under = train_undersampled.drop('Churn_Yes',axis = 1)

SMOTE

In [50]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=100,sampling_strategy='minority',k_neighbors=3)
X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train,y_train)

Evaluate the model

In [51]:
# Original train/test

from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train, y_train)
LR.score(X_test, y_test)

LogisticRegression(random_state=0)

0.7892122072391767

In [52]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

pred = LR.predict(X_test)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  0.6377952755905512
recall:  0.44141689373297005
f1:  0.5217391304347826


Other results : Undersampling, Oversampling & SMOTE 

In [53]:
# Oversampled train/test results

In [54]:
LR = LogisticRegression(max_iter=1000)
LR.fit(X_train_over, y_train_over)
pred = LR.predict(X_test)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

LogisticRegression(max_iter=1000)

precision:  0.504708097928437
recall:  0.7302452316076294
f1:  0.5968819599109132


In [55]:
# Undersampled train/test results
LR = LogisticRegression(max_iter=1000)
LR.fit(X_train_under, y_train_under)
pred = LR.predict(X_test)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

LogisticRegression(max_iter=1000)

precision:  0.5028142589118199
recall:  0.7302452316076294
f1:  0.5955555555555555


In [56]:
# SMOTE train/test results
LR = LogisticRegression(max_iter=1000)
LR.fit(X_train_SMOTE, y_train_SMOTE)
pred = LR.predict(X_test)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

LogisticRegression(max_iter=1000)

precision:  0.5037593984962406
recall:  0.7302452316076294
f1:  0.596218020022247
