In [2]:
import pandas as pd 
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [3]:
data = pd.read_csv("telcom.csv") #Load data

In [4]:
data.shape #Rows,colums

(7043, 21)

In [5]:
data.isnull().sum() #check null values in data

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [6]:
data = data.drop('customerID',axis=1) #Drop the irrelevant column

In [7]:
data.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [8]:
data['Churn'].value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: int64

In [9]:
data['Churn'] = data['Churn'].map({'No':0,'Yes':1}) #turing categorical to numeric value

In [10]:
data['Churn'].value_counts()

Churn
0    5174
1    1869
Name: count, dtype: int64

In [11]:
X = data.drop('Churn',axis=1) #seprate the colums X and Y
Y= data['Churn']

In [12]:
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'],errors='coerce') #converting the total charges into the numeric value

In [13]:
data['TotalCharges'].isnull().sum() #found 11 nan value in total charges

11

In [14]:
data.dropna(inplace = True)

In [15]:
X_train,X_test,Y_train,Y_test= train_test_split(X,Y,test_size=0.2,random_state=42,stratify=Y) #Train and test split

In [16]:
cat_cols = X_train.select_dtypes(include='object').columns #diffrentiate the numeric and the categorical columns 
num_cols = X_train.select_dtypes(exclude='object').columns 

In [17]:
#Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),num_cols),
        ('cat',OneHotEncoder(handle_unknown='ignore'),cat_cols)
    ])

In [18]:
#Add model to pipeline
model = Pipeline(steps=[
    ('preprocessing',preprocessor),
    ('classifier',LogisticRegression(max_iter=1000))
])

In [19]:
model.fit(X_train,Y_train)

In [22]:
Y_pred = model.predict(X_test) #Predictions

In [28]:
from sklearn.metrics import accuracy_score #Accuracy
print("Accuracy:" , accuracy_score(Y_test,Y_pred))

Accuracy: 0.794180269694819


In [30]:
from sklearn.metrics import classification_report
print(classification_report(Y_test,Y_pred)) #classification report

              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1035
           1       0.63      0.54      0.58       374

    accuracy                           0.79      1409
   macro avg       0.74      0.71      0.72      1409
weighted avg       0.79      0.79      0.79      1409



In [32]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(Y_test,Y_pred) #confusion matrix
print(cm)

[[917 118]
 [172 202]]


In [33]:
y_prob = model.predict_proba(X_test)[:,1] 

In [34]:
y_prob

array([0.03286906, 0.70502069, 0.06194178, ..., 0.13675628, 0.0070805 ,
       0.00733226])

In [35]:
import joblib
import os

In [36]:
os.makedirs("model",exist_ok=True) #create model folder

In [37]:
joblib.dump(model,"model/churn_model.pkl") #save trained pipeline model

['model/churn_model.pkl']

In [38]:
print("Model save succesfully as model/churn_model.pkl")

Model save succesfully as model/churn_model.pkl
