<a href="https://colab.research.google.com/github/swethag04/ml-projects/blob/main/classification/comparing_classification_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv('sample_data/telecom_churn.csv')
df.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [4]:
df.shape

(3333, 20)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   State                   3333 non-null   object 
 1   Account length          3333 non-null   int64  
 2   Area code               3333 non-null   int64  
 3   International plan      3333 non-null   object 
 4   Voice mail plan         3333 non-null   object 
 5   Number vmail messages   3333 non-null   int64  
 6   Total day minutes       3333 non-null   float64
 7   Total day calls         3333 non-null   int64  
 8   Total day charge        3333 non-null   float64
 9   Total eve minutes       3333 non-null   float64
 10  Total eve calls         3333 non-null   int64  
 11  Total eve charge        3333 non-null   float64
 12  Total night minutes     3333 non-null   float64
 13  Total night calls       3333 non-null   int64  
 14  Total night charge      3333 non-null   

In [6]:
# Checking for nulls
df.isnull().sum()

State                     0
Account length            0
Area code                 0
International plan        0
Voice mail plan           0
Number vmail messages     0
Total day minutes         0
Total day calls           0
Total day charge          0
Total eve minutes         0
Total eve calls           0
Total eve charge          0
Total night minutes       0
Total night calls         0
Total night charge        0
Total intl minutes        0
Total intl calls          0
Total intl charge         0
Customer service calls    0
Churn                     0
dtype: int64

In [7]:
# Converting object Column to numeric
object_cols = ['State', 'International plan', 'Voice mail plan' ]
LE = LabelEncoder()
for i in object_cols:
  df[i] = df[[i]].apply(LE.fit_transform)
df.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,16,128,415,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,35,107,415,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,31,137,415,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,35,84,408,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,36,75,415,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [8]:
# prediction class
df['Churn'].value_counts()

False    2850
True      483
Name: Churn, dtype: int64

In [9]:
X = df.drop(['Churn'], axis=1)
y = df['Churn']
X_scaled = X_scaled = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, stratify = y, random_state = 42)

In [10]:
# Logistic Regression
params = {
          'C': [0.1, 1, 10, 100],
          'penalty': ['l1', 'l2'],
          'solver': ['liblinear']
          }
t0= time.time()
lgr_grid = GridSearchCV(param_grid=params, cv=5,
                        estimator = LogisticRegression(random_state=42)).fit(X_train, y_train)
lgr_fit_time= time.time()-t0
print(f"Logistic regression training time: {lgr_fit_time}s")

lgr_train_accuracy =  lgr_grid.score(X_train, y_train)
lgr_test_accuracy =  lgr_grid.score(X_test, y_test)
print("Logistic regression train accuracy: ", lgr_train_accuracy)
print("Logistic regression test accuracy: ", lgr_test_accuracy)

Logistic regression training time: 31.653472423553467s
Logistic regression train accuracy:  0.8611444577831132
Logistic regression test accuracy:  0.8597122302158273


In [11]:
#KNN
params = {'n_neighbors': [3,5,7,9],
          'weights': ['uniform', 'distance'],
           'p': [1,2]}
t0= time.time()
knn_grid = GridSearchCV(param_grid=params, cv=5,
                   estimator = KNeighborsClassifier()).fit(X_train, y_train)
knn_fit_time= time.time()-t0
print(f"KNN classifier training time: {knn_fit_time} s")

knn_train_accuracy =  knn_grid.score(X_train, y_train)
knn_test_accuracy =  knn_grid.score(X_test, y_test)
print("KNN classifier train accuracy: ", knn_train_accuracy)
print("KNN classifier test accuracy: ", knn_test_accuracy)

KNN classifier training time: 4.618025302886963 s
KNN classifier train accuracy:  0.9267707082833133
KNN classifier test accuracy:  0.8884892086330936


In [12]:
#SVC
params = {'kernel': ['rbf', 'poly', 'linear', 'sigmoid'],
         'gamma': [0.1, 1.0, 10.0, 100.0],}
t0= time.time()
svc_grid = GridSearchCV(param_grid=params, cv=5,
                   estimator = SVC())
svc_grid.fit(X_train, y_train)
svc_fit_time = time.time()-t0
print(f"SVC training time: {svc_fit_time}s")

svc_train_accuracy =  svc_grid.score(X_train, y_train)
svc_test_accuracy =  svc_grid.score(X_test, y_test)
print("SVC train accuracy: ", svc_train_accuracy)
print("SVC test accuracy: ", svc_test_accuracy)

SVC training time: 17.39738154411316s
SVC train accuracy:  0.9715886354541817
SVC test accuracy:  0.9004796163069544


In [15]:
#Decision tree
params = {'criterion': ['gini', 'entropy'],
         'max_depth': np.arange(3, 15)}
t0= time.time()
dt_grid = GridSearchCV(param_grid=params, cv=5,
                   estimator = DecisionTreeClassifier())
dt_grid.fit(X_train, y_train)
dt_fit_time = time.time()-t0
print(f"Decision tree training time: {dt_fit_time}s")

dt_train_accuracy =  dt_grid.score(X_train, y_train)
dt_test_accuracy =  dt_grid.score(X_test, y_test)
print("Decision tree train accuracy: ", dt_train_accuracy)
print("Decision tree test accuracy: ", dt_test_accuracy)

Decision tree training time: 3.1807384490966797s
Decision tree train accuracy:  0.9755902360944377
Decision tree test accuracy:  0.9304556354916067


In [16]:
results_df = pd.DataFrame()
results ={}
results = {'model':['KNN', "Logistic Regression", 'SVC', 'Decision Tree'],
           'train score': [knn_train_accuracy, lgr_train_accuracy, svc_train_accuracy, dt_train_accuracy  ],
           'test score': [knn_test_accuracy, lgr_test_accuracy, svc_test_accuracy, dt_test_accuracy],
           'average fit time': [knn_fit_time, lgr_fit_time, svc_fit_time,dt_fit_time ]}
results_df = pd.DataFrame.from_dict(results)
print(results_df)

                 model  train score  test score  average fit time
0                  KNN     0.926771    0.888489          4.618025
1  Logistic Regression     0.861144    0.859712         31.653472
2                  SVC     0.971589    0.900480         17.397382
3        Decision Tree     0.975590    0.930456          3.180738
