## Churn Prediction: Logistic Regression



### Data Preparation

In [1]:
import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt

In [2]:
# Load the raw dataset from a CSV file; the path is relative to the notebook's working directory.
customers = pd.read_csv('input/customer.csv')

In [3]:
# Optional one-off download of the dataset; left commented out.
# `-O` (capital letter O) is wget's output-file flag, and the target path matches
# the file read by pd.read_csv above. The URL gets its own variable name so that
# uncommenting this cell does not overwrite the `customers` DataFrame.
# data_url = 'https://...'
# !wget $data_url -O input/customer.csv

In [4]:
# Preview the first five rows, transposed so that each column is listed as a row.
customers.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [5]:
# List the column names as they appear in the raw file (before any renaming).
customers.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [6]:
# Make the schema uniform: lower-case every column name.
customers.columns = customers.columns.str.lower()

# Keep the names of the columns whose dtype is object, i.e. the ones holding strings.
categorical_columns = customers.columns[customers.dtypes == 'object']

# Apply the same convention to the values of those columns:
# lower-case text, with spaces replaced by underscores.
for c in categorical_columns:
    customers[c] = (
        customers[c]
        .str.lower()
        .str.replace(' ', '_')
    )

In [7]:
# Check the effect of the normalisation on the first five rows.
customers.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,no
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,no
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,yes
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,no
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,yes


In [8]:
# Inspect the dtype pandas inferred for each column.
customers.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [9]:
# "totalcharges" was read with an object dtype although it holds amounts, so parse it into a numeric dtype.
# errors='coerce' converts every value that cannot be parsed (the empty cells) to NaN instead of raising.
customers['totalcharges'] = pd.to_numeric(customers['totalcharges'], errors='coerce')

In [10]:
# Show the customers whose total charges are missing (NaN), selecting rows and columns in one step.
customers.loc[customers['totalcharges'].isna(), ['customerid', 'totalcharges']]

Unnamed: 0,customerid,totalcharges
488,4472-lvygi,
753,3115-czmzd,
936,5709-lvoeq,
1082,4367-nuyao,
1340,1371-dwpaz,
3331,7644-omvmy,
3826,3213-vvolg,
4380,2520-sgtta,
5218,2923-arzlg,
6670,4075-wkniu,


In [11]:
# Replace the missing total charges (NaN) with 0.
# The filled Series is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `customers.totalcharges`: that form mutates an
# intermediate object, triggers a FutureWarning, and no longer updates the
# DataFrame in pandas 3.0.
customers['totalcharges'] = customers['totalcharges'].fillna(0)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customers.totalcharges.fillna(0, inplace=True)


In [12]:
# Inspect the values of the target column before encoding it.
customers['churn']

0        no
1        no
2       yes
3        no
4       yes
       ... 
7038     no
7039     no
7040     no
7041    yes
7042     no
Name: churn, Length: 7043, dtype: object

In [13]:
# Encode the target as integers: 'yes' becomes 1, every other value becomes 0.
# The dtype guard makes the cell safe to re-run: once the column is numeric,
# comparing it with 'yes' again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(customers['churn']):
    customers['churn'] = (customers['churn'] == 'yes').astype(int)


In [14]:
# Confirm that the target column now holds integers.
customers.churn

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: churn, Length: 7043, dtype: int64

### Setting Up the Validation Framework

In [16]:
# Three-way split with scikit-learn: 60% train, 20% validation, 20% test.
# Step 1 holds out 20% of all rows as the test set; the remaining 80% is the "full train" set.
# Step 2 takes the validation set out of the full train set. It has to be 20% of the
# whole dataset, which is 0.2 / 0.8 = 0.25 of the full train set.
# random_state is fixed in both calls so the splits are reproducible.

from sklearn.model_selection import train_test_split

customers_full_train, customers_test = train_test_split(
    customers,
    test_size=0.2,
    random_state=1,
)
customers_train, customers_val = train_test_split(
    customers_full_train,
    test_size=0.25,
    random_state=1,
)

# Number of rows in the train / validation / test partitions.
tuple(len(part) for part in (customers_train, customers_val, customers_test))

(4225, 1409, 1409)

In [17]:
# Give each split a fresh 0..n-1 index; drop=True discards the old row labels
# instead of keeping them as an extra column.
customers_train, customers_val, customers_test = (
    part.reset_index(drop=True)
    for part in (customers_train, customers_val, customers_test)
)

In [18]:
# Extract the target of each split as a NumPy array.
# Selecting the column alone would give a pandas Series; `.to_numpy()` returns
# the underlying NumPy array instead.
y_train = customers_train['churn'].to_numpy()
y_val = customers_val['churn'].to_numpy()
y_test = customers_test['churn'].to_numpy()


In [19]:
# The target now lives in y_train / y_val / y_test, so remove it from the feature frames.
customers_train = customers_train.drop(columns='churn')
customers_val = customers_val.drop(columns='churn')
customers_test = customers_test.drop(columns='churn')

### Exploratory Data Analysis

In [20]:
# Re-index the full train set from 0; drop=True discards the old row labels
# rather than keeping them as a column.
customers_full_train = customers_full_train.reset_index(drop=True)

In [None]:
# Count the missing values per column; every count is 0, so nothing is missing in the full train set.
customers_full_train.isna().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [24]:
# Distribution of the target variable. normalize=True reports each class as a
# share of all rows (a proportion between 0 and 1) instead of a raw count.
customers_full_train['churn'].value_counts(normalize=True)

churn
0    0.730032
1    0.269968
Name: proportion, dtype: float64

In [27]:
# Overall churn rate: the target is encoded as 0/1, so its mean is the fraction of churned customers.
global_churn_rate = customers_full_train['churn'].mean()
round(global_churn_rate, 2)

np.float64(0.27)

In [None]:
# Let's look at categorical and numerical variables.