## Importing Libraries 

In [1]:
import numpy as np
import pandas as pd
                                                                                   
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

import pickle

import warnings
warnings.filterwarnings('ignore')

## Loading Data

In [8]:
telco_customer = pd.read_csv("ChurnTrainDataset.csv.csv")

## Preprocessing

In [9]:
# Encoding categorical data using cat codes
for col in telco_customer.columns[telco_customer.dtypes == 'object']:
    if col!='churn':
        telco_customer[col]=telco_customer[col].astype('category').cat.codes

# Fill Null Values of target column
telco_customer['churn'] =  telco_customer['churn'].fillna(telco_customer['churn'].mode()[0])

# Manual encoding the target variable
a={'yes':1,'no':0}
telco_customer['churn']=telco_customer['churn'].map(a)

## Seperate Features & Target Variable 

In [10]:
X = telco_customer.drop('churn',axis=1)
y=telco_customer['churn']

In [11]:
cat_cols = X.select_dtypes(include=['int8']).columns.values

In [12]:
num_cols = X.select_dtypes(include=['float64']).columns.values


## Train Test Split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

## Feature Engineering pipeline

In [14]:
# Imputation Transformer to fill null values
fill_null_col = ColumnTransformer([
        ('FillCat',SimpleImputer(strategy='most_frequent'),cat_cols),
        ('FillNumeric',SimpleImputer(strategy='median'),num_cols),
    ],remainder='passthrough')

# Scaling
scale_col = ColumnTransformer([
    ('scale', StandardScaler(),slice(5,18))
])

# Principal Component Analysis
pca_col = ColumnTransformer([
    ('PCA', PCA(n_components=10),slice(0,18))
])

# # Model
logistic=LogisticRegression()
decision=DecisionTreeClassifier()
random = RandomForestClassifier()

## Model Pipeline

In [15]:
pipeline_model1 = Pipeline([('fill_null_col', fill_null_col),
                 ('scale_col', scale_col),
                 ('pca_col', pca_col),
                 ('classifier1', logistic)])

pipeline_model2 = Pipeline([('fill_null_col', fill_null_col),
                 ('scale_col', scale_col),
                 ('pca_col', pca_col),
                 ('classifier2', decision)])

pipeline_model3 = Pipeline([('fill_null_col', fill_null_col),
                 ('scale_col', scale_col),
                 ('pca_col', pca_col),
                 ('classifier3', random)])

In [16]:
pipelines=[pipeline_model1,pipeline_model2,pipeline_model3]

In [17]:
best_accuracy=0.0
best_classifier=0
best_pipeline=""

In [18]:
pipe_dict={0:'Logistic Regression',1:'Decision Tree',2:'Random Forest'}

# Fitting pipelines
for pipe in pipelines:
    pipe.fit(X_train,y_train)

In [19]:
for i,model in enumerate(pipelines):
    print("{} Test Accuracy : {}".format(pipe_dict[i],model.score(X_test,y_test)))

Logistic Regression Test Accuracy : 0.8552941176470589
Decision Tree Test Accuracy : 0.8105882352941176
Random Forest Test Accuracy : 0.8811764705882353


In [20]:
for i,model in enumerate(pipelines):
    pred=model.predict(X_test)
    print("\n{} Classification Report :\n {}".format(pipe_dict[i],classification_report(y_test,pred)))


Logistic Regression Classification Report :
               precision    recall  f1-score   support

           0       0.85      1.00      0.92       724
           1       1.00      0.02      0.05       126

    accuracy                           0.86       850
   macro avg       0.93      0.51      0.48       850
weighted avg       0.88      0.86      0.79       850


Decision Tree Classification Report :
               precision    recall  f1-score   support

           0       0.90      0.88      0.89       724
           1       0.38      0.42      0.40       126

    accuracy                           0.81       850
   macro avg       0.64      0.65      0.64       850
weighted avg       0.82      0.81      0.81       850


Random Forest Classification Report :
               precision    recall  f1-score   support

           0       0.88      0.99      0.93       724
           1       0.88      0.23      0.36       126

    accuracy                           0.88       850
  

In [21]:
for i,model in enumerate(pipelines):
    if model.score(X_test,y_test)>best_accuracy:
        best_accuracy=model.score(X_test,y_test)
        best_pipeline=model
        best_classifier=i
print('Classifier with the best accuracy:{}'.format(pipe_dict[best_classifier]))

Classifier with the best accuracy:Random Forest


## Best Hyperparameters for Random Forest Classifier Using GridSearchCV using a pipeline

In [None]:
params = [{"classifier3__n_estimators": range(100, 501, 100),
           "classifier3__max_depth":range(10,30,10),
           "classifier3__min_samples_leaf":[1, 2, 4],
           "classifier3__min_samples_split": [2, 5, 10],
          }]
                 
randomized_cv = GridSearchCV(estimator = pipeline_model3,
                           param_grid = params,
                           cv = 10 ,verbose = 1, n_jobs=-1)

randomized_cv.fit(X_train,y_train)
randomized_cv.best_params_

In [None]:
randomized_cv = Pipeline([('fill_null_col', fill_null_col),
                 ('scale_col', scale_col),
                 ('pca_col', pca_col),
                 ('classifier3', RandomForestClassifier(max_depth=20,min_samples_leaf=1,min_samples_split=5,n_estimators=400))])

randomized_cv.fit(X_train,y_train)
filename = 'modelv1.pkl'
pickle.dump(randomized_cv, open(filename, 'wb'))

## Load the model from disk

In [None]:
filename = 'modelv1.pkl'
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

## Model Prediction

In [None]:
prediction = loaded_model.predict(X_test)
prediction

## Model Evaluation

In [None]:
conf_matrix = confusion_matrix(y_test, prediction)
print("confusion matrix")
print(conf_matrix)
print(classification_report(y_test,prediction))