In [80]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [4]:
df = pd.read_csv(r"/Users/ziadali/Documents/Personal Projects/Projects/Customer Churn Prediction/processed_data.csv")

In [14]:
df.head(1)

Unnamed: 0,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Value
0,Male,Young,0,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1


In [None]:
#drop useless features
df.drop(columns=['CustomerID','Zip Code','Churn Score','CLTV','Monthly_Charges_bin','Total_Charges_bin','Tenure_Months_bin','City'] , inplace= True)

In [None]:
#convert binary_objects features to integers
Gender_dic = {'Male' : 1 , 'Female' : 0}
df['Gender'] = df['Gender'].replace(Gender_dic)
Senior_citizen_dic = {'Young' : 0 , 'Senior' : 1}
df['Senior Citizen'] = df['Senior Citizen'].replace(Senior_citizen_dic)
Dependents_dic = {'Yes' : 1 , 'No' : 0}
df['Dependents'] = df['Dependents'].replace(Dependents_dic)
Phone_service_dic = {'Yes' : 1 , 'No' : 0}
df['Phone Service'] = df['Phone Service'].replace(Phone_service_dic)
Paperless_Billing_dic = {'Yes' : 1 , 'No' : 0}
df['Paperless Billing'] = df['Paperless Billing'].replace(Paperless_Billing_dic)

  df['Paperless Billing'] = df['Paperless Billing'].replace(Paperless_Billing_dic)


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             7043 non-null   int64  
 1   Senior Citizen     7043 non-null   int64  
 2   Partner            7043 non-null   int64  
 3   Dependents         7043 non-null   int64  
 4   Tenure Months      7043 non-null   int64  
 5   Phone Service      7043 non-null   int64  
 6   Multiple Lines     7043 non-null   object 
 7   Internet Service   7043 non-null   object 
 8   Online Security    7043 non-null   object 
 9   Online Backup      7043 non-null   object 
 10  Device Protection  7043 non-null   object 
 11  Tech Support       7043 non-null   object 
 12  Streaming TV       7043 non-null   object 
 13  Streaming Movies   7043 non-null   object 
 14  Contract           7043 non-null   object 
 15  Paperless Billing  7043 non-null   int64  
 16  Payment Method     7043 

In [67]:
#spliting the data
y = df['Churn Value']
X = df.drop(columns=['Churn Value'])

In [68]:
numerical_features = [
    'Tenure Months',
    'Monthly Charges',
    'Total Charges'
]
binary_features = [
    'Gender',
    'Phone Service',
    'Senior Citizen',
    'Dependents',
    'Partner',
    'Paperless Billing'
]
categorical_features = [
    'Multiple Lines',
    'Internet Service',
    'Online Security',
    'Online Backup',
    'Device Protection',
    'Tech Support',
    'Streaming TV',
    'Streaming Movies',
    'Contract',
    'Payment Method'
]

In [69]:
#num_column_transform
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [70]:
#cat_column_transform
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(
        drop='first',
        handle_unknown='ignore'
    ))
])

In [71]:
#columns_trans
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical_features),
        ('cat', cat_transformer, categorical_features)
    ],
    remainder='passthrough'
)

In [72]:
model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000
)

In [73]:
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', model)
])

In [74]:
#train_test_split
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size=0.2 , stratify=y ,random_state=42)

In [75]:
#train_model
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [77]:
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]

In [81]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.73      0.81      1035
           1       0.51      0.78      0.62       374

    accuracy                           0.74      1409
   macro avg       0.71      0.76      0.71      1409
weighted avg       0.80      0.74      0.76      1409

