In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, confusion_matrix, classification_report, RocCurveDisplay, f1_score
from imblearn.over_sampling import SMOTE

In [2]:
churnData = pd.read_csv('files_for_lab/Customer-Churn.csv')
churnData

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [6]:
churnData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

In [13]:
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce')

In [14]:
churnData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7032 non-null   float64
 15  Churn             7043 non-null   object 
dtypes: float64(2), int64(2), object(12)
memory

In [16]:
churnData.isna().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [20]:
churnData['TotalCharges'] = churnData['TotalCharges'].fillna(churnData['TotalCharges'].median())
churnData['TotalCharges'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 7043 entries, 0 to 7042
Series name: TotalCharges
Non-Null Count  Dtype  
--------------  -----  
7043 non-null   float64
dtypes: float64(1)
memory usage: 55.2 KB


In [22]:
X = churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
X

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
0,1,0,29.85,29.85
1,34,0,56.95,1889.50
2,2,0,53.85,108.15
3,45,0,42.30,1840.75
4,2,0,70.70,151.65
...,...,...,...,...
7038,24,0,84.80,1990.50
7039,72,0,103.20,7362.90
7040,11,0,29.60,346.45
7041,4,1,74.40,306.60


In [25]:
y = churnData['Churn']
y = y.apply(lambda x: 1 if x == 'Yes' else 0)
y.value_counts()

Churn
0    5174
1    1869
Name: count, dtype: int64

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=0)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = LogisticRegression()
model.fit(X_train, y_train)

y_train_pre = model.predict(X_train)
y_test_pre = model.predict(X_test)

print(classification_report(y_test, y_test_pre))
print(classification_report(y_train, y_train_pre))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86      1298
           1       0.62      0.46      0.53       463

    accuracy                           0.78      1761
   macro avg       0.72      0.68      0.69      1761
weighted avg       0.77      0.78      0.77      1761

              precision    recall  f1-score   support

           0       0.82      0.91      0.86      3876
           1       0.65      0.46      0.54      1406

    accuracy                           0.79      5282
   macro avg       0.74      0.69      0.70      5282
weighted avg       0.78      0.79      0.78      5282



In [29]:
y.value_counts()/len(y)

Churn
0    0.73463
1    0.26537
Name: count, dtype: float64

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=0)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

sm = SMOTE(k_neighbors=5)
X_train, y_train = sm.fit_resample(X_train, y_train)
X_test, y_test = sm.fit_resample(X_test, y_test)
    
model = LogisticRegression()
model.fit(X_train, y_train)

y_train_pre = model.predict(X_train)
y_test_pre = model.predict(X_test)

print(classification_report(y_test, y_test_pre))
print(classification_report(y_train, y_train_pre))

              precision    recall  f1-score   support

           0       0.73      0.71      0.72      1298
           1       0.72      0.74      0.73      1298

    accuracy                           0.73      2596
   macro avg       0.73      0.73      0.73      2596
weighted avg       0.73      0.73      0.73      2596

              precision    recall  f1-score   support

           0       0.75      0.73      0.74      3876
           1       0.74      0.75      0.74      3876

    accuracy                           0.74      7752
   macro avg       0.74      0.74      0.74      7752
weighted avg       0.74      0.74      0.74      7752



In [58]:
from imblearn.under_sampling import RandomUnderSampler

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

rus = RandomUnderSampler(random_state=42)
X_train, y_train = rus.fit_resample(X_train, y_train)
X_test, y_test = rus.fit_resample(X_test, y_test)
    
model = LogisticRegression()
model.fit(X_train, y_train)

y_train_pre = model.predict(X_train)
y_test_pre = model.predict(X_test)

print(classification_report(y_test, y_test_pre))
print(classification_report(y_train, y_train_pre))


              precision    recall  f1-score   support

           0       0.76      0.70      0.73       479
           1       0.72      0.77      0.75       479

    accuracy                           0.74       958
   macro avg       0.74      0.74      0.74       958
weighted avg       0.74      0.74      0.74       958

              precision    recall  f1-score   support

           0       0.74      0.74      0.74      1390
           1       0.74      0.74      0.74      1390

    accuracy                           0.74      2780
   macro avg       0.74      0.74      0.74      2780
weighted avg       0.74      0.74      0.74      2780

