In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

In [4]:
import random
from imblearn.over_sampling import SMOTE

In [5]:
file_path = "Telco-Customer-Churn.csv"

df = pd.read_csv(file_path)

In [6]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [7]:
print(df.dtypes)

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object


In [8]:
df = df.drop("customerID", axis = 1)


In [9]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [10]:
# convert object to float
df["TotalCharges"] = pd.to_numeric(df['TotalCharges'], errors = "coerce")
# drops rows with NaN total charges that results from coerce
df = df.dropna(how = "any", axis = 0)

In [11]:
df['Churn'] = np.where(df.Churn == 'Yes', 1, 0)

In [12]:
# Converting all categorical variables into dummy variables
# Note we have a new df now named as df_dummies and now we will perform on it instead of df
df_dummies = pd.get_dummies(df)
df_dummies.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,1,0,0,1,1,...,0,1,0,0,0,1,0,0,1,0
1,0,34,56.95,1889.5,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,1
2,0,2,53.85,108.15,1,0,1,1,0,1,...,0,1,0,0,0,1,0,0,0,1
3,0,45,42.3,1840.75,0,0,1,1,0,1,...,0,0,1,0,1,0,1,0,0,0
4,0,2,70.7,151.65,1,1,0,1,0,1,...,0,1,0,0,0,1,0,0,1,0


In [57]:
# Split the data into features (X) and target variable (y)
X = df_dummies.drop("Churn", axis = 1)
y = df_dummies["Churn"]

# Split the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # seed in random generation


In [58]:
# Train the logistic regression model
logistic_regression = LogisticRegression()

logistic_regression.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [59]:
# Predicting 
y_pred = logistic_regression.predict(X_test)

In [60]:
y_pred

array([0, 0, 0, ..., 1, 0, 1])

In [61]:
y_test

5561    0
5814    0
2645    0
3983    1
6438    1
       ..
2757    0
5702    1
1662    1
2766    0
2918    0
Name: Churn, Length: 1407, dtype: int32

In [62]:
f1_lr = f1_score(y_test, y_pred, average='binary')
print(f1_lr)

0.5834586466165415


In [63]:
accuracy_mlp = accuracy_score(y_test, y_pred, normalize=True)
print(accuracy_mlp)

0.8031272210376688


In [64]:
roc_mlp = roc_auc_score(y_test, y_pred, average='macro')
print(roc_mlp)

0.7137396807494086


#Random sampling:
    

In [82]:
df_dummies['Churn'].count()

7032

In [84]:
class_counts = df['Churn'].value_counts()
print(class_counts)

0    5163
1    1869
Name: Churn, dtype: int64


In [85]:
# Get the minimum class count
min_count = class_counts.min()
print(min_count)

1869


In [95]:
a= 0
b = df_dummies['Churn'].index

In [130]:

# Randomly sample the dataset to balance it (undersampling)
balanced_df = df.groupby('Churn').apply(lambda x: x.sample(min_count)).reset_index(drop=True)

# Checking the size of each class after applying undersampling
class_sizes = balanced_df['Churn'].value_counts()
print(class_sizes)


0    1869
1    1869
Name: Churn, dtype: int64


In [131]:
balanced_df[balanced_df['Churn']==1]['Churn'].count()/balanced_df['Churn'].count()

0.5

In [132]:
balanced_df[balanced_df['Churn']==0]['Churn'].count()/balanced_df['Churn'].count()

0.5

# Undersampling

In [133]:
# Split the data into features (X) and target variable (y)
X = balanced_df.drop("Churn", axis = 1)
y = balanced_df["Churn"]

# Splot the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # seed in random generation


In [134]:
# Train the logistic regression model
logistic_regression = LogisticRegression()

logistic_regression.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [136]:
# Predicting 
y_pred = logistic_regression.predict(X_test)

In [137]:
f1_lr = f1_score(y_test, y_pred, average='binary')
print(f1_lr)

0.7845579078455791


In [138]:
accuracy_mlp = accuracy_score(y_test, y_pred, normalize=True)
print(accuracy_mlp)

0.7687165775401069


In [139]:
roc_mlp = roc_auc_score(y_test, y_pred, average='macro')
print(roc_mlp)

0.7687165775401069


In [149]:
# Random oversampling
# Count the number of observations in each class
class_counts = df_dummies['Churn'].value_counts()
print(class_counts)

# Get the maximum class count (majority class count)
max_count = class_counts.max()
print(max_count)

# Perform oversampling
oversampled_df = (df_dummies.groupby('Churn')
                  .apply(lambda x: x.sample(max_count, replace=True))
                  .reset_index(drop=True))

# Checking the size of each class after applying oversampling
class_sizes = oversampled_df['Churn'].value_counts()
print(class_sizes)

0    5163
1    1869
Name: Churn, dtype: int64
5163
0    5163
1    5163
Name: Churn, dtype: int64


# Oversampling usign SMOT

In [140]:
# Split the data into features (X) and target variable (y)
X = df_dummies.drop("Churn", axis = 1)
y = df_dummies["Churn"]

# Apply SMOTE to balance the dataset
smote = SMOTE()
X, y = smote.fit_resample(X, y)

# Splot the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # seed in random generation


In [141]:
# Train the logistic regression model
logistic_regression = LogisticRegression()

logistic_regression.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [142]:
# Predicting 
y_pred = logistic_regression.predict(X_test)

f1_lr = f1_score(y_test, y_pred, average='binary')
print(f1_lr)

accuracy_mlp = accuracy_score(y_test, y_pred, normalize=True)
print(accuracy_mlp)

roc_mlp = roc_auc_score(y_test, y_pred, average='macro')
print(roc_mlp)

0.8582159624413147
0.8538238141335914
0.8535972842192131


# class weighting

In [143]:
# Split the data into features (X) and target variable (y)
X = df_dummies.drop("Churn", axis = 1)
y = df_dummies["Churn"]


# Splot the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # seed in random generation


In [144]:
from sklearn.linear_model import LogisticRegression
# Define class weights
class_weights = {0: 0.2, 1: 0.8}
# Create logistic regression object with class weights
lr = LogisticRegression(class_weight=class_weights)
# Train the model using the training data
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(class_weight={0: 0.2, 1: 0.8})

In [145]:
# Predicting 
y_pred = logistic_regression.predict(X_test)

f1_lr = f1_score(y_test, y_pred, average='binary')
print(f1_lr)

accuracy_mlp = accuracy_score(y_test, y_pred, normalize=True)
print(accuracy_mlp)

roc_mlp = roc_auc_score(y_test, y_pred, average='macro')
print(roc_mlp)

0.616751269035533
0.7853589196872779
0.7444898726443913
