In [1]:
import pandas as pd
import numpy as np


### Load the dataset and explore the variables.

In [2]:
customer_churn = pd.read_csv('files_for_lab/customer_churn.csv')

In [3]:
customer_churn.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
customer_churn.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SeniorCitizen,7043.0,0.162147,0.368612,0.0,0.0,0.0,0.0,1.0
tenure,7043.0,32.371149,24.559481,0.0,9.0,29.0,55.0,72.0
MonthlyCharges,7043.0,64.761692,30.090047,18.25,35.5,70.35,89.85,118.75


In [5]:
customer_churn.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [6]:
customer_churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [7]:
customer_churn.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [8]:
customer_churn.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [9]:
customer_churn.shape

(7043, 21)

### trying to predict variable Churn using a logistic regression on variables tenure, SeniorCitizen,MonthlyCharges

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [11]:
# Define the predictor variables and target variable

X = customer_churn[['tenure','SeniorCitizen','MonthlyCharges']]
y = customer_churn['Churn']

In [12]:
# Convert the target variable to numerical
y = np.where(y == 'Yes',1,0)

In [13]:
# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

In [14]:
# Fit the logistic regression model
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

In [15]:
# Predict the target variable on the testing set
y_pred = clf.predict(X_test)


In [16]:
# Calculate the accuracy of the model
acc = accuracy_score(y_test, y_pred)
print ("Accuracy",acc)

Accuracy 0.7821149751596878


### Extract the target variable.

In [17]:
y = customer_churn['Churn']
y.describe()

count     7043
unique       2
top         No
freq      5174
Name: Churn, dtype: object

### Extract the independent variables and scale them.

In [18]:
X = customer_churn.iloc[:,:-1]

In [19]:
X.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
dtype: object

In [20]:
#Encode categorical variables:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
categorical_features = X.select_dtypes(include=['object']).columns

X[categorical_features] = X[categorical_features].apply(lambda col: le.fit_transform(col))


In [21]:
X.dtypes

customerID            int64
gender                int64
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
MultipleLines         int64
InternetService       int64
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract              int64
PaperlessBilling      int64
PaymentMethod         int64
MonthlyCharges      float64
TotalCharges          int64
dtype: object

In [22]:
# scale them
from sklearn.preprocessing import StandardScaler 
scaler = StandardScaler()
X_scaler = scaler.fit_transform(X)

In [23]:
display (X_scaler)

array([[ 0.91189045, -1.00955867, -0.43991649, ...,  0.39855772,
        -1.16032292, -0.39860759],
       [ 0.21690598,  0.99053183, -0.43991649, ...,  1.33486261,
        -0.25962894, -0.94876238],
       [-0.47070074,  0.99053183, -0.43991649, ...,  1.33486261,
        -0.36266036, -1.64188328],
       ...,
       [-0.07574495, -1.00955867, -0.43991649, ...,  0.39855772,
        -1.1686319 , -0.13968008],
       [ 1.18683477,  0.99053183,  2.27315869, ...,  1.33486261,
         0.32033821, -0.31653445],
       [-0.63694614,  0.99053183, -0.43991649, ..., -1.47405205,
         1.35896134,  1.13801338]])

### build the logistic regression model

In [24]:
#logistic regression model
from sklearn.linear_model import LogisticRegression
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)

LogisticRegression()

### Evaluate the model.




In [25]:
# Predict the target values
from sklearn.metrics import accuracy_score, confusion_matrix

y_pred = logistic_regression.predict(X_test)


In [26]:
# Calculate accuracy score

acc = accuracy_score(y_test, y_pred)
print ("Accuracy:", acc)

Accuracy: 0.7821149751596878


In [27]:
# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm)

Confusion Matrix: 
 [[932 109]
 [198 170]]


#### The accuracy of the model is high because there are a lot of cases in the test data that are predicted correctly, i.e. the model is able to classify the customer churn cases correctly in more than 70% of the cases. This high accuracy could be due to several factors such as the simplicity of the model, the size of the data, or the presence of a strong relationship between the predictors and the target variable.

Apply imblearn.over_sampling.SMOTE to the dataset. Build and evaluate the logistic regression model. Is it there any improvement?

In [28]:
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [29]:
# Apply SMOTE oversampling to the training data

X_resampled, y_resampled = SMOTE().fit_resample (X_train, y_train)

In [30]:
# Fit logistic regression model to the resampled data

log_reg = LogisticRegression()
log_reg.fit(X_resampled, y_resampled)

LogisticRegression()

In [31]:
# Predict on the test data

y_pred = log_reg.predict(X_test)

In [32]:
acc = accuracy_score(y_test, y_pred)
print ("Accuracy:", acc)

Accuracy: 0.7083037615330021


In [33]:
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[738 303]
 [108 260]]


when apply SMOTE to oversample the minority class in the training data, and then trained a logistic regression model on the oversampled data. The performance of the logistic regression model was then evaluated on the test data using accuracy and confusion matrix. The accuracy of the logistic regression model after using SMOTE was 70.76%, which is lower than the accuracy of 78.21% obtained without using SMOTE. The confusion matrix showed that the model misclassified more examples after using SMOTE compared to without using SMOTE.