In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [2]:
churn_df  = pd.read_csv(r"C:\Users\Mohit sharma\Downloads\Telecom Churn.csv")
churn_df.head(10)

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False
5,AL,118,510,Yes,No,0,223.4,98,37.98,220.6,101,18.75,203.9,118,9.18,6.3,6,1.7,0,False
6,MA,121,510,No,Yes,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,False
7,MO,147,415,Yes,No,0,157.0,79,26.69,103.1,94,8.76,211.8,96,9.53,7.1,6,1.92,0,False
8,LA,117,408,No,No,0,184.5,97,31.37,351.6,80,29.89,215.8,90,9.71,8.7,4,2.35,1,False
9,WV,141,415,Yes,Yes,37,258.6,84,43.96,222.0,111,18.87,326.4,97,14.69,11.2,5,3.02,0,False


In [3]:
churn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   State                   3333 non-null   object 
 1   Account length          3333 non-null   int64  
 2   Area code               3333 non-null   int64  
 3   International plan      3333 non-null   object 
 4   Voice mail plan         3333 non-null   object 
 5   Number vmail messages   3333 non-null   int64  
 6   Total day minutes       3333 non-null   float64
 7   Total day calls         3333 non-null   int64  
 8   Total day charge        3333 non-null   float64
 9   Total eve minutes       3333 non-null   float64
 10  Total eve calls         3333 non-null   int64  
 11  Total eve charge        3333 non-null   float64
 12  Total night minutes     3333 non-null   float64
 13  Total night calls       3333 non-null   int64  
 14  Total night charge      3333 non-null   

In [4]:
churn_df['Churn'].value_counts()

False    2850
True      483
Name: Churn, dtype: int64

In [5]:
churn_df['Churn'].value_counts()/len(churn_df)

False    0.855086
True     0.144914
Name: Churn, dtype: float64

In [6]:
churn_df.isnull().sum()

State                     0
Account length            0
Area code                 0
International plan        0
Voice mail plan           0
Number vmail messages     0
Total day minutes         0
Total day calls           0
Total day charge          0
Total eve minutes         0
Total eve calls           0
Total eve charge          0
Total night minutes       0
Total night calls         0
Total night charge        0
Total intl minutes        0
Total intl calls          0
Total intl charge         0
Customer service calls    0
Churn                     0
dtype: int64

In [7]:
churn_df.columns

Index(['State', 'Account length', 'Area code', 'International plan',
       'Voice mail plan', 'Number vmail messages', 'Total day minutes',
       'Total day calls', 'Total day charge', 'Total eve minutes',
       'Total eve calls', 'Total eve charge', 'Total night minutes',
       'Total night calls', 'Total night charge', 'Total intl minutes',
       'Total intl calls', 'Total intl charge', 'Customer service calls',
       'Churn'],
      dtype='object')

In [8]:
cat_columns = ['State','Area code','International plan','Voice mail plan']
num_columns = ['Account length','Number vmail messages', 'Total day minutes',
       'Total day calls', 'Total day charge', 'Total eve minutes',
       'Total eve calls', 'Total eve charge', 'Total night minutes',
       'Total night calls', 'Total night charge', 'Total intl minutes',
       'Total intl calls', 'Total intl charge', 'Customer service calls']

In [9]:
X = churn_df.drop(['Churn'],axis=1)
y = churn_df.Churn

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=101)

In [11]:
from sklearn.pipeline import Pipeline
cat_pipe = Pipeline([
    ('imputer',SimpleImputer(strategy='constant',fill_value='missing')),
    ('onehot',OneHotEncoder(drop='first'))
])

num_pipe = Pipeline([
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
])

preprocessing_pipe = ColumnTransformer([
    ('cat',cat_pipe,cat_columns),
    ('num',num_pipe,num_columns)
]) 

X_train = preprocessing_pipe.fit_transform(X_train)
X_test = preprocessing_pipe.transform(X_test)

In [12]:
pd.DataFrame(X_train.toarray()).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,59,60,61,62,63,64,65,66,67,68
count,2499.0,2499.0,2499.0,2499.0,2499.0,2499.0,2499.0,2499.0,2499.0,2499.0,...,2499.0,2499.0,2499.0,2499.0,2499.0,2499.0,2499.0,2499.0,2499.0,2499.0
mean,0.02561,0.013605,0.018007,0.010004,0.020408,0.02401,0.014406,0.018407,0.018407,0.016006,...,-2.455463e-16,3.254922e-16,-3.256921e-16,-2.483452e-16,2.698477e-16,-1.980098e-16,9.862726000000001e-17,-1.339909e-16,2.180906e-16,-2.05118e-16
std,0.158001,0.115869,0.133004,0.099538,0.14142,0.15311,0.11918,0.134446,0.134446,0.125525,...,1.0002,1.0002,1.0002,1.0002,1.0002,1.0002,1.0002,1.0002,1.0002,1.0002
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-3.961989,-4.998761,-3.962092,-3.52538,-3.258254,-3.527105,-3.679733,-1.788524,-3.680541,-1.186191
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.6779245,-0.6524825,-0.6784373,-0.6623186,-0.6907015,-0.6601229,-0.6217347,-0.5943163,-0.6157563,-0.4334506
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01738578,-0.003038614,0.01711512,0.002676181,0.02821328,0.003530277,0.02584143,-0.196247,0.0238509,-0.4334506
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.6793055,0.6464052,0.6803162,0.68559,0.695777,0.6848809,0.6734176,0.5998915,0.6767832,0.3192896
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,3.181637,3.493967,3.180608,3.877167,3.006574,3.87484,3.515557,6.172861,3.51504,5.588471


In [13]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(solver='liblinear')
log_reg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
y_pred_log_reg = log_reg.predict(X_test)

In [15]:
#Train score for Logistic regression
train_score = log_reg.score(X_train,y_train)
train_score

0.8671468587434974

In [16]:
#Test score for Logistic regression
test_score = log_reg.score(X_test,y_test)
test_score

0.8609112709832134

In [17]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
#Classification report for logistic regression tree
print(classification_report(y_test,y_pred_log_reg))

              precision    recall  f1-score   support

       False       0.89      0.95      0.92       729
        True       0.40      0.21      0.28       105

    accuracy                           0.86       834
   macro avg       0.65      0.58      0.60       834
weighted avg       0.83      0.86      0.84       834



In [18]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=6)
tree.fit(X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [19]:
y_pred_tree = tree.predict(X_test)

In [20]:
#Train score for Decision Tree Classifier
Train_score = tree.score(X_train,y_train)
Train_score

0.9615846338535414

In [21]:
#Test score for Decision Tree Classifier
Test_score = tree.score(X_test,y_test)
Test_score

0.9400479616306955

In [22]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
#Classification report for Decision tree
print(classification_report(y_test,y_pred_tree))

              precision    recall  f1-score   support

       False       0.96      0.97      0.97       729
        True       0.78      0.72      0.75       105

    accuracy                           0.94       834
   macro avg       0.87      0.85      0.86       834
weighted avg       0.94      0.94      0.94       834



In [23]:
confusion_matrix(y_test, y_pred_tree)

array([[708,  21],
       [ 29,  76]], dtype=int64)