# Telco Customer Churn Project

Libraries needed

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.utils import resample


Dataset cleaned 

In [2]:
telco_df = pd.read_csv('TelcoClean.csv')

## Machine learning: trying to predict churn

The objetive is to detect the highest quatity of possitive churns because these are the customers who the company wants to retain 

### Preparing data

First of all, I'll drop the CustomerID column because it's an identifier

In [3]:
telco_df = telco_df.drop(['customerID'], axis=1)

I'll drop the TotalCharges column because it's mainly the same as MonthlyCharges x Tenure so the correlation is very high 

In [4]:
telco_df = telco_df.drop(['TotalCharges'], axis=1)

### Encoding categorical data

I'll use ordinal encoding

In [5]:
telco_df['gender'] = telco_df['gender'].map({'Male': 0, 'Female': 1}).astype(int)
telco_df['Partner'] = telco_df['Partner'].map({'No': 0, 'Yes': 1}).astype(int)
telco_df['Dependents'] = telco_df['Dependents'].map({'No': 0, 'Yes': 1}).astype(int)
telco_df['PhoneService'] = telco_df['PhoneService'].map({'No': 0, 'Yes': 1}).astype(int)
telco_df['PaperlessBilling'] = telco_df['PaperlessBilling'].map({'No': 0, 'Yes': 1}).astype(int)

#I'll encode No Service = No
telco_df['MultipleLines'] = telco_df['MultipleLines'].map({'No': 0, 'Yes': 1, 'No phone service': 0}).astype(int)

telco_df['OnlineSecurity'] = telco_df['OnlineSecurity'].map({'No': 0, 'Yes': 1, 'No internet service': 0}).astype(int)
telco_df['OnlineBackup'] = telco_df['OnlineBackup'].map({'No': 0, 'Yes': 1, 'No internet service': 0}).astype(int)
telco_df['DeviceProtection'] = telco_df['DeviceProtection'].map({'No': 0, 'Yes': 1, 'No internet service': 0}).astype(int)
telco_df['TechSupport'] = telco_df['TechSupport'].map({'No': 0, 'Yes': 1, 'No internet service': 0}).astype(int)
telco_df['StreamingTV'] = telco_df['StreamingTV'].map({'No': 0, 'Yes': 1, 'No internet service': 0}).astype(int)
telco_df['StreamingMovies'] = telco_df['StreamingMovies'].map({'No': 0, 'Yes': 1, 'No internet service': 0}).astype(int)

telco_df['InternetService'] = telco_df['InternetService'].map({'No': 0, 'Fiber optic': 1, 'DSL': 2}).astype(int)
telco_df['Contract'] = telco_df['Contract'].map({'Month-to-month': 0, 'Two year': 1, 'One year': 2}).astype(int)
telco_df['PaymentMethod'] = telco_df['PaymentMethod'].map({'Electronic check': 0, 'Mailed check': 1, 'Bank transfer (automatic)': 2, 'Credit card (automatic)': 3}).astype(int)


I'll encode the target

In [6]:
telco_df['Churn'] = telco_df['Churn'].map({'No': 0, 'Yes': 1}).astype(int)

### Splitting the data

In [7]:
y = telco_df['Churn']
X = telco_df.drop(['Churn'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Scaling the data 

In [8]:
transformer = MinMaxScaler().fit(X_train)
X_train_scaled = pd.DataFrame(transformer.transform(X_train), columns=X.columns)
X_test_scaled = pd.DataFrame(transformer.transform(X_test), columns=X.columns)

In [9]:
X_train_scaled

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges
0,1.0,0.0,0.0,0.0,0.591549,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.666667,0.306468
1,0.0,0.0,0.0,0.0,0.028169,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.000000,0.349751
2,1.0,0.0,1.0,0.0,0.760563,1.0,1.0,0.5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.000000,0.592040
3,0.0,0.0,1.0,1.0,0.619718,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.5,0.0,0.666667,0.362189
4,1.0,0.0,1.0,1.0,0.760563,1.0,1.0,0.5,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.333333,0.822388
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5620,0.0,0.0,0.0,0.0,0.028169,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.319900
5621,1.0,0.0,0.0,0.0,0.704225,1.0,1.0,0.5,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.000000,0.927861
5622,0.0,0.0,0.0,0.0,0.112676,1.0,1.0,0.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.000000,0.625871
5623,1.0,0.0,0.0,0.0,0.690141,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.666667,0.014925


In [10]:
y_train = y_train.reset_index(drop=True) 
y_test = y_test.reset_index(drop=True) 

### Logistic Regression Model

First of all, I will try a a Logistic Regression Model

In [11]:
LR = LogisticRegression(max_iter=1000)
LR.fit(X_train_scaled, y_train)
pred = LR.predict(X_test_scaled)

print("LR score:", LR.score(X_test_scaled, y_test))
print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

LR score: 0.7896233120113717
precision:  0.63
recall:  0.5053475935828877
f1:  0.5608308605341247


In [12]:
confusion_matrix(y_test,pred)

array([[922, 111],
       [185, 189]])

As I can observe I'm been able to detect the half of the possitive customer churns (recall)

It is not a bad accuracy but I will try other models to see if I can improve it

### KNN model

In [13]:
KNN = KNeighborsClassifier(n_neighbors = 8) 
KNN.fit(X_train,y_train)
pred = KNN.predict(X_test)
print("KNN score:",KNN.score(X_test,y_test))
print("KNN precision: ",precision_score(y_test,pred))
print("KNN recall: ",recall_score(y_test,pred))
print("KNN f1: ",f1_score(y_test,pred))

KNN score: 0.7697228144989339
KNN precision:  0.6008064516129032
KNN recall:  0.3983957219251337
KNN f1:  0.47909967845659157


I've obtained better results with LR this time

### Oversampling

Let's check the target 

In [14]:
y.value_counts()

0    5163
1    1869
Name: Churn, dtype: int64

I'm going to do oversampling because the data is unbalanced

In [15]:
train = pd.concat([X_train_scaled, y_train],axis=1)

In [16]:
no_churn = train[train['Churn']==0]
yes_churn = train[train['Churn']==1]

In [17]:
yes_churn_oversampled = resample(yes_churn, replace=True, n_samples = len(no_churn), random_state=0)

In [18]:
train_oversampled = pd.concat([no_churn,yes_churn_oversampled],axis=0)

In [19]:
y_train_over = train_oversampled['Churn'].copy()
X_train_over = train_oversampled.drop('Churn',axis = 1).copy()

Trying Logistic Regression Model

In [20]:
LR_over = LogisticRegression(max_iter=1000)
LR_over.fit(X_train_over, y_train_over)
pred = LR_over.predict(X_test_scaled)

print("LR with Oversampling score:", LR_over.score(X_test_scaled, y_test))
print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

LR with Oversampling score: 0.7484008528784648
precision:  0.5180505415162455
recall:  0.767379679144385
f1:  0.6185344827586207


In [21]:
confusion_matrix(y_test,pred)

array([[766, 267],
       [ 87, 287]])

Using oversampling the results are better, in fact, now I'm detecting the 75% of the customer churns, this means that I can predict 1 of 4 churns more using oversampling and the company will be able to try to retain 3 of 4 customers who want leave.