### Import Libraries

In [2]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

<font size="4">Read CSV</font>

In [3]:
df = pd.read_csv("tel_churn.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56.95,1889.5,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42.3,1840.75,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70.7,151.65,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


Drop the "Unnamed: 0" column

In [4]:
df = df.drop("Unnamed: 0", axis = 1)
df.head(5)

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,0,1,0,0,1,1,0,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.5,0,0,1,1,0,1,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,1,0,1,1,0,1,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.3,1840.75,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
4,0,70.7,151.65,1,1,0,1,0,1,0,...,0,0,1,0,1,0,0,0,0,0


In [5]:
# Create X and Y variables
x= df.drop("Churn", axis = 1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.50,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.30,1840.75,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70.70,151.65,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,0,1,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,0
7028,0,103.20,7362.90,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
7029,0,29.60,346.45,1,0,0,1,0,1,1,...,0,0,1,0,1,0,0,0,0,0
7030,1,74.40,306.60,0,1,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [6]:
y = df["Churn"]
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

## Create Model

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2) # 20% testing data, 80% training data

### Decision Tree Classifier

In [8]:
model_dt = DecisionTreeClassifier(criterion="gini", random_state=100, max_depth=6, min_samples_leaf=8)

In [9]:
model_dt.fit(x_train, y_train)

In [10]:
y_pred = model_dt.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [11]:
model_dt.score(x_test,y_test)

0.7874911158493249

In [12]:
print(classification_report(y_test, y_pred, labels = [0,1])) # accuracy = (true positives + true negatives) / total outcome

              precision    recall  f1-score   support

           0       0.81      0.93      0.87      1038
           1       0.66      0.39      0.49       369

    accuracy                           0.79      1407
   macro avg       0.74      0.66      0.68      1407
weighted avg       0.77      0.79      0.77      1407



In [13]:
print(confusion_matrix(y_test, y_pred))

[[965  73]
 [226 143]]


The precision, recall, and f1-score are too low for churned customers, so this model is not effective. This is because the data is imbalanced.

We are going to use SMOTEENN to over/under sample our data.

In [14]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [15]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [16]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [17]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.9468354430379747
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       522
           1       0.94      0.97      0.95       663

    accuracy                           0.95      1185
   macro avg       0.95      0.94      0.95      1185
weighted avg       0.95      0.95      0.95      1185



In [18]:
print(metrics.confusion_matrix(yr_test, yr_predict))

[[479  43]
 [ 20 643]]


Now we can see quite better results, i.e. Accuracy: 94 %, and a very good recall, precision & f1 score for minority class.

Let's try with some other classifier!

**Random Forest Classifier**

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)


In [21]:
model_rf.fit(x_train,y_train)

In [22]:
y_pred=model_rf.predict(x_test)

In [23]:
model_rf.score(x_test,y_test)

0.8009950248756219

In [24]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.82      0.93      0.87      1038
           1       0.70      0.43      0.53       369

    accuracy                           0.80      1407
   macro avg       0.76      0.68      0.70      1407
weighted avg       0.79      0.80      0.78      1407



Based on the results showing imbalanced precision and recall scores between the two classes (0 and 1), it is recommended to use SMOTEENN resampling technique to balance the data and improve the model's performance.

In [26]:
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x,y)

In [27]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2)

In [28]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [29]:
model_rf_smote.fit(xr_train1,yr_train1)

In [30]:
yr_predict1 = model_rf_smote.predict(xr_test1)


In [31]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)

In [32]:
print(model_score_r1)
print(metrics.classification_report(yr_test1, yr_predict1))

0.9284497444633731
              precision    recall  f1-score   support

           0       0.96      0.88      0.92       552
           1       0.90      0.97      0.93       622

    accuracy                           0.93      1174
   macro avg       0.93      0.93      0.93      1174
weighted avg       0.93      0.93      0.93      1174



The results indicate a highly effective model in predicting customer churn. For the non-churned customers (class 0), the model achieved a precision of 0.96, meaning a high proportion of customers predicted as non-churned were indeed non-churned, and a recall of 0.88, indicating that 88% of all actual non-churned customers were correctly identified. On the other hand, for the churned customers (class 1), the model demonstrated excellent performance with a precision of 0.90, implying that a significant portion of customers predicted as churned indeed churned, and a high recall of 0.97, indicating that 97% of all actual churned customers were correctly classified. The overall accuracy of the model is 0.93, indicating its proficiency in correctly classifying both non-churned and churned customers. These impressive results suggest that the model is well-suited for identifying potential churned customers, which is crucial for businesses in retaining their valuable clientele.


**Pickling the Model**

We will be using the Random Forest Model for our Flask application.

In [33]:
import pickle

In [34]:
filename = 'my_model.sav'

In [35]:
pickle.dump(model_rf_smote, open(filename, 'wb'))

In [36]:
load_model = pickle.load(open(filename, 'rb'))

In [37]:
model_score_r1 = load_model.score(xr_test1, yr_test1)

In [38]:
model_score_r1

0.9284497444633731

**Our final model i.e. RF Classifier with SMOTEENN, is now ready and dumped in model.sav, which we will use and prepare API's so that we can access our model from UI**