# Step 1 :  Import Library and Dataset

In [43]:
import pandas as pd
import numpy as np

In [44]:
# Read the data in
employee = pd.read_csv("churn.csv")

In [45]:
employee.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,No,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,No,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,No,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,No,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer,42.3,1840.75,No
4,9237-HQITU,Female,No,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


# Step 2 : Data Pre-Processing

### Univariate Analysis

In [46]:
employee.describe()

Unnamed: 0,tenure,MonthlyCharges
count,7043.0,7043.0
mean,32.371149,64.761692
std,24.559481,30.090047
min,0.0,18.25
25%,9.0,35.5
50%,29.0,70.35
75%,55.0,89.85
max,72.0,118.75


### Removing Irrelavent Variable

In [47]:
employee = employee.drop(['customerID'],axis=1)
employee.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [48]:
#Replacing spaces with null values in total charges column
employee['TotalCharges'] =employee["TotalCharges"].replace(" ",np.nan).astype(float) 
# string cannot be convert float direclty 

### Checking Missing Value

In [49]:
# Do we have NA's in data
employee.isna().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [50]:
employee.TotalCharges.fillna(employee.TotalCharges.mean(),inplace=True) # one column at a time bb

In [51]:
# Do we have NA's in data
employee.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [52]:
employee.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,No,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,No,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer,42.3,1840.75,No
4,Female,No,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [53]:
employee.OnlineSecurity=employee.OnlineSecurity.replace({'No internet service' : 'No'})
employee.OnlineBackup=employee.OnlineBackup.replace({'No internet service' : 'No'})
employee.DeviceProtection=employee.DeviceProtection.replace({'No internet service' : 'No'})
employee.TechSupport=employee.TechSupport.replace({'No internet service' : 'No'})
employee.StreamingTV=employee.StreamingTV.replace({'No internet service' : 'No'})
employee.StreamingMovies=employee.StreamingMovies.replace({'No internet service' : 'No'})
employee.MultipleLines=employee.MultipleLines.replace({'No phone service' : 'No'})

In [54]:
employee.MultipleLines.value_counts()

No     4072
Yes    2971
Name: MultipleLines, dtype: int64

### Taking subset data of Number 

In [55]:
# #Employee Numeric columns
employee_num = employee[employee.select_dtypes(include=[np.number]).columns.tolist()]
employee_num.head(3)

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,1,29.85,29.85
1,34,56.95,1889.5
2,2,53.85,108.15


### Taking subset data of Category 

In [56]:
employee_dummies = employee[employee.select_dtypes
                        (include=['object']).columns.tolist()]
employee_dummies.head(3)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
0,Female,No,Yes,No,No,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,No
1,Male,No,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,No
2,Male,No,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,Yes


In [57]:
employee_dummies.describe(include=[object])

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
count,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043
unique,2,2,2,2,2,2,3,2,2,2,2,2,2,3,2,4,2
top,Male,No,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,No
freq,3555,5901,3641,4933,6361,4072,3096,5024,4614,4621,4999,4336,4311,3875,4171,2365,5174


### Converting Quality Variable to Number

In [58]:
from sklearn.preprocessing import LabelEncoder
employee_dummies=employee_dummies.apply(LabelEncoder().fit_transform)
employee_dummies.head(3)
# label in ascending order

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,2,0
1,1,0,0,0,1,0,0,1,0,1,0,0,0,1,0,3,0
2,1,0,0,0,1,0,0,1,1,0,0,0,0,0,1,3,1


### Combine to Dataset

In [59]:
employee_combined = pd.concat([employee_num, employee_dummies],axis=1)

In [60]:
employee_combined.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
0,1,29.85,29.85,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,2,0
1,34,56.95,1889.5,1,0,0,0,1,0,0,1,0,1,0,0,0,1,0,3,0
2,2,53.85,108.15,1,0,0,0,1,0,0,1,1,0,0,0,0,0,1,3,1
3,45,42.3,1840.75,1,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0
4,2,70.7,151.65,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,2,1


# Step 3: Data Partition

In [61]:
#Dividing data into train and test dataset
from sklearn.model_selection import train_test_split
#from random import seed

#seed(20)
train_x = employee_combined.drop(['Churn'],axis=1)
train_y = employee_combined['Churn']

# Train test split

X_train, X_test, y_train, y_test =train_test_split(train_x,
                    train_y,test_size=0.3,random_state=231)

# Step 4: Model Building

In [62]:
#Import Tree Classifier model
from sklearn import tree

dt = tree.DecisionTreeClassifier()
#Train the model using the training sets
dt.fit(X_train,y_train)

DecisionTreeClassifier()

# Step 5: Plotting the Tree 

In [63]:
# Ploting Tree
import graphviz 
from six import StringIO
#from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
import pydot

In [64]:
df=pd.concat([y_train,X_train],axis=1)

In [65]:
features = list(df.columns[1:])
features

['tenure',
 'MonthlyCharges',
 'TotalCharges',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

In [66]:
dot_data = StringIO()
export_graphviz(dt, out_file=dot_data,filled=True,
                feature_names=features,rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

InvocationException: GraphViz's executables not found

# Step 6 : Predictions on Train Dataset

In [None]:
train=pd.concat([X_train,y_train],axis=1)
train.head()

In [None]:
train['Predicted']=dt.predict(X_train)
train.head()

# Step 7 : Model Performance Metrics

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(train['Churn'],train['Predicted'])
print(confusion_matrix)

In [None]:
Accuracy_Train=((3616+1306)/(4930)*100)
print(Accuracy_Train)

In [None]:
Accuracy_Train=((3353+529)/(4930)*100)
print(Accuracy_Train)

# Step 8 : Predictions on Test Dataset

In [None]:
test=pd.concat([X_test,y_test],axis=1)

In [None]:
test['Predicted']=dt.predict(X_test)
test.head()

# Step 9 : Model Performance Metrics on Test data 

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(test['Churn'],test['Predicted'])
print(confusion_matrix)

In [None]:
Accuracy_test=((1441+203)/(2113)*100)
Accuracy_test

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, test['Predicted']))

### Model Improvement by Pruning Method

In [None]:
#Import Tree Classifier model
from sklearn import tree

dt = tree.DecisionTreeClassifier(criterion='gini',
                                 min_samples_leaf=40,
                                 min_samples_split=100,
                                 max_depth=4)
#Train the model using the training sets
dt.fit(X_train,y_train)

### After Running this go back then run from  Plotting the Graph

# Exporting Model

In [None]:
import os 
os.chdir(r"C:\Users\vinit\Downloads\Data Science\decision tree")

In [None]:
import pickle
# Saving model
pickle.dump(dt, open('model.pkl','wb'))
model=pickle.load(open('model.pkl','rb'))