In [81]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,recall_score,classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from imblearn.combine import SMOTEENN
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('tel_churn.csv')

In [7]:
df.head()


Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,0,1,0,0,1,1,0,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.5,0,0,1,1,0,1,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,1,0,1,1,0,1,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.3,1840.75,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
4,0,70.7,151.65,1,1,0,1,0,1,0,...,0,0,1,0,1,0,0,0,0,0


### Creating independent and dependent variable

In [8]:
x = df.drop('Churn',axis=1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.50,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.30,1840.75,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70.70,151.65,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,0,1,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,0
7028,0,103.20,7362.90,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
7029,0,29.60,346.45,1,0,0,1,0,1,1,...,0,0,1,0,1,0,0,0,0,0
7030,1,74.40,306.60,0,1,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [9]:
y = df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

### Decision tree classifier

In [12]:
model_dt = DecisionTreeClassifier()

In [14]:
model_dt.fit(x_train,y_train)

In [16]:
y_pred = model_dt.predict(x_test)

In [17]:
print(classification_report(y_test,y_pred,labels=[0,1]))

              precision    recall  f1-score   support

           0       0.81      0.80      0.80      1031
           1       0.46      0.47      0.47       376

    accuracy                           0.71      1407
   macro avg       0.63      0.64      0.64      1407
weighted avg       0.71      0.71      0.71      1407



Since we have an imbalanced dataset so accuracy cannot be considered as the metric, instead we use precision or recall of the minority class to understand the performance of the model.
In order to work with imbalanced dataset we use SMOTEENN.

### Upsampling

In [21]:
sm = SMOTEENN()

In [24]:
x_resampled, y_resampled = sm.fit_resample(x,y)

In [31]:
xr_train,xr_test,yr_train,yr_test = train_test_split(x_resampled,y_resampled,test_size=0.2)

In [32]:
model_dtr = DecisionTreeClassifier()

In [33]:
model_dtr.fit(xr_train,yr_train)

In [34]:
yr_pred = model_dtr.predict(xr_test)

In [35]:
print(classification_report(yr_test,yr_pred,labels=[0,1]))

              precision    recall  f1-score   support

           0       0.93      0.92      0.93       510
           1       0.94      0.95      0.94       654

    accuracy                           0.94      1164
   macro avg       0.94      0.94      0.94      1164
weighted avg       0.94      0.94      0.94      1164



### Random Forest Classifier

#### Upsampling

In [38]:
sm = SMOTEENN()

In [39]:
x_resampled1, y_resampled1 = sm.fit_resample(x,y)

In [40]:
xrf_train,xrf_test,yrf_train,yrf_test = train_test_split(x_resampled1, y_resampled1,test_size=0.2)

In [105]:
model_rf = RandomForestClassifier(n_estimators=120)

In [106]:
model_rf.fit(xrf_train,yrf_train)

In [107]:
yrf_pred = model_rf.predict(xrf_test)

In [108]:
print(classification_report(yrf_test,yrf_pred,labels=[0,1]))

              precision    recall  f1-score   support

           0       0.97      0.94      0.95       524
           1       0.95      0.98      0.96       657

    accuracy                           0.96      1181
   macro avg       0.96      0.96      0.96      1181
weighted avg       0.96      0.96      0.96      1181



### Logistic Regression

In [69]:
lr = LogisticRegression()

In [75]:
xlr_train,xlr_test,ylr_train,ylr_test = train_test_split(x_resampled1, y_resampled1,test_size=0.2)

In [76]:
lr.fit(xlr_train,ylr_train)

In [77]:
ylr_pred = lr.predict(xrf_test)

In [78]:
print(classification_report(yrf_test,yrf_pred,labels=[0,1]))

              precision    recall  f1-score   support

           0       0.96      0.93      0.94       524
           1       0.95      0.97      0.96       657

    accuracy                           0.95      1181
   macro avg       0.95      0.95      0.95      1181
weighted avg       0.95      0.95      0.95      1181



### SVM

In [82]:
sv = SVC()

In [83]:
xsv_train,xsv_test,ysv_train,ysv_test = train_test_split(x_resampled1, y_resampled1,test_size=0.2)

In [84]:
sv.fit(xsv_train,ysv_train)

In [85]:
ysv_pred = sv.predict(xsv_test)

In [86]:
print(classification_report(ysv_test,ysv_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.71      0.83      0.77       521
           1       0.85      0.74      0.79       660

    accuracy                           0.78      1181
   macro avg       0.78      0.78      0.78      1181
weighted avg       0.79      0.78      0.78      1181



With Random Forest we are getting better results as compared to other algorithms, so we will use the same. Also there were so significant improvement in the performance of the model with hyperparameter tuning.

In [109]:
import pickle

In [110]:
pickle.dump(model_rf,open('model.pkl','wb'))

In [111]:
load = pickle.load(open('model.pkl','rb'))

In [112]:
result = load.score(xrf_test,yrf_test)
result

0.9593564775613886