In [1]:
# Importing libraries
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Lift pandas' display limits so frames are shown with every column and row
for display_opt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_opt, None)

# Load the customer churn data from the CSV next to the notebook
churn_df = pd.read_csv('Telco_Customer_Churn.csv')

# Peek at the first rows
churn_df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Keep only the four predictors (gender, PaymentMethod, MonthlyCharges, tenure)
# together with the Churn target; .copy() detaches the result from the full frame.
keep_cols = ['gender', 'PaymentMethod', 'MonthlyCharges', 'tenure', 'Churn']
churn_df = churn_df.loc[:, keep_cols].copy()

churn_df.head()

Unnamed: 0,gender,PaymentMethod,MonthlyCharges,tenure,Churn
0,Female,Electronic check,29.85,1,No
1,Male,Mailed check,56.95,34,No
2,Male,Mailed check,53.85,2,Yes
3,Male,Bank transfer (automatic),42.3,45,No
4,Female,Electronic check,70.7,2,Yes


In [4]:
# Work on a separate frame in which every missing value is replaced by zero;
# fillna returns a new object, so churn_df itself is left untouched.
df = churn_df.fillna(0)

In [5]:
# One-hot encode the categorical columns into machine readable dummy variables

encode = ['gender', 'PaymentMethod']

# One indicator frame per categorical column, each prefixed with the column name
dummy_frames = [pd.get_dummies(df[name], prefix=name) for name in encode]

# Swap the original categorical columns for their indicator columns,
# appended after the remaining columns in the order listed in `encode`
df = pd.concat([df.drop(columns=encode), *dummy_frames], axis=1)

In [6]:
# Preview the first rows of df after the categorical columns were replaced by dummy columns
df.head()

Unnamed: 0,MonthlyCharges,tenure,Churn,gender_Female,gender_Male,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,29.85,1,No,1,0,0,0,1,0
1,56.95,34,No,0,1,0,0,0,1
2,53.85,2,Yes,0,1,0,0,0,1
3,42.3,45,No,0,1,1,0,0,0
4,70.7,2,Yes,1,0,0,0,1,0


In [7]:
# Map the Churn column to binary labels: 'Yes' -> 1, anything else -> 0.
# (numpy is imported with the other libraries in the first cell.)
#
# A value that is already 1 is also mapped to 1, so the cell is safe to re-run:
# comparing the already-encoded integers against 'Yes' alone would silently
# turn every label into 0 on a second execution.
df['Churn'] = np.where(df['Churn'].isin(['Yes', 1]), 1, 0)

In [8]:
# Preview df again now that Churn holds 0/1 instead of 'Yes'/'No'
df.head()

Unnamed: 0,MonthlyCharges,tenure,Churn,gender_Female,gender_Male,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,29.85,1,0,1,0,0,0,1,0
1,56.95,34,0,0,1,0,0,0,1
2,53.85,2,1,0,1,0,0,0,1
3,42.3,45,0,0,1,1,0,0,0
4,70.7,2,1,1,0,0,0,1,0


In [9]:
# Target vector: the binary Churn label
Y = df['Churn']
# Feature matrix: every remaining column
X = df.drop(columns='Churn')

In [10]:
# Create the model and fit it on the full feature matrix X and target Y.
# random_state is pinned so that Restart & Run All rebuilds the same forest
# (and therefore the same saved model); the seed value itself is arbitrary.
clf = RandomForestClassifier(random_state=42)
clf.fit(X, Y)

RandomForestClassifier()

In [11]:
# Save the fitted model in pickle format.
# The context manager closes the file handle (flushing the bytes to disk)
# as soon as the dump finishes, even if pickling raises.
with open('churn_clf.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)