## Telcom Company Customer Churning Analysis

### Packages Importation

In [1]:
#importing the necessary packages
import pyodbc 
from dotenv import dotenv_values
import pandas as pd
import numpy as np
import warnings 
import seaborn as sns
import matplotlib.pyplot as plt

#modelling Libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from imblearn.combine import SMOTEENN
from sklearn.neighbors import KNeighborsClassifier

# Feature Processing (Scikit-learn processing, etc.)
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

#Algorithms and pipeline

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer 
from imblearn.pipeline import Pipeline
from sklearn.ensemble import BaggingClassifier

##handling imbalance datasets

from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight

warnings.filterwarnings('ignore')




from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import classification_report



## DATA LOADING

In [2]:
full_data = pd.read_csv('finaldata.csv')

 To understand the structure of our dataset we use.info() to get:
 * The total number of rows in the DataFrame.
 * A summary of each column, including:
  * The column name.
  * The number of non-null values in that column.
  * The data type of values in that column.
 * The memory usage of the DataFrame.

In [3]:
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 35 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CustomerID         3000 non-null   object 
 1   gender             5043 non-null   object 
 2   Senior Citizen     3000 non-null   float64
 3   Partner            5043 non-null   object 
 4   Dependents         5043 non-null   object 
 5   tenure             5043 non-null   int64  
 6   PhoneService       5043 non-null   object 
 7   MultpleLines       2731 non-null   float64
 8   Internet Service   3000 non-null   object 
 9   Online Security    2349 non-null   float64
 10  Online Backup      2349 non-null   float64
 11  Device Protection  2349 non-null   float64
 12  Tech Support       2349 non-null   float64
 13  Streaming TV       2349 non-null   float64
 14  Streaming Movies   2349 non-null   float64
 15  Contract           5043 non-null   object 
 16  Paperless Billing  3000 

In [4]:
# Check if our dataset has null values
full_data.isnull().sum()

CustomerID           2043
gender                  0
Senior Citizen       2043
Partner                 0
Dependents              0
tenure                  0
PhoneService            0
MultpleLines         2312
Internet Service     2043
Online Security      2694
Online Backup        2694
Device Protection    2694
Tech Support         2694
Streaming TV         2694
Streaming Movies     2694
Contract                0
Paperless Billing    2043
Payment Method       2043
Monthly Charges      2043
Total Charges        2048
Churn                   1
customerID           3000
SeniorCitizen        3000
MultipleLines        3000
InternetService      3000
OnlineSecurity       3000
OnlineBackup         3000
DeviceProtection     3000
TechSupport          3000
StreamingTV          3000
StreamingMovies      3000
PaperlessBilling     3000
PaymentMethod        3000
MonthlyCharges       3000
TotalCharges         3000
dtype: int64

In [5]:
# Check the descriptive statistics of numeric variables
full_data.head()

Unnamed: 0,CustomerID,gender,Senior Citizen,Partner,Dependents,tenure,PhoneService,MultpleLines,Internet Service,Online Security,...,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,7590-VHVEG,Female,0.0,1,0,1,0,,DSL,0.0,...,,,,,,,,,,
1,5575-GNVDE,Male,0.0,0,0,34,1,0.0,DSL,1.0,...,,,,,,,,,,
2,3668-QPYBK,Male,0.0,0,0,2,1,0.0,DSL,1.0,...,,,,,,,,,,
3,7795-CFOCW,Male,0.0,0,0,45,0,,DSL,1.0,...,,,,,,,,,,
4,9237-HQITU,Female,0.0,0,0,2,1,0.0,Fiber optic,0.0,...,,,,,,,,,,


## DATA CLEANING

In [6]:
columns_to_replace = ['Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                      'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                      'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn']

for col in columns_to_replace:
    full_data[col] = full_data[col].replace({True: 'Yes', False: 'No'})

In [7]:
full_data['SeniorCitizen'] = full_data['SeniorCitizen'].replace({0: 'No', 1: 'Yes'})

We shall create an assumption that fills nan values with 'No phone service' and 'No internet service'. This will apply to the following columns:'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies'

In [8]:
full_data['MultipleLines'] = full_data['MultipleLines'].fillna('No phone service')

# Define a dictionary with replacement values for each column 
replacement_dict = {
    'OnlineSecurity': 'No internet service', 
    'OnlineBackup':'No internet service' , 
    'DeviceProtection': 'No internet service',
    'TechSupport' : 'No internet service',
    'StreamingTV' : 'No internet service',
    'StreamingMovies' : 'No internet service',
} 

# Replace NaN values in the DataFrame with the specified words based on the dictionary 
full_data= full_data.fillna(replacement_dict)

In [9]:
# Check if our dataset has null values after NULL Replacement
full_data.isnull().sum()

CustomerID           2043
gender                  0
Senior Citizen       2043
Partner                 0
Dependents              0
tenure                  0
PhoneService            0
MultpleLines         2312
Internet Service     2043
Online Security      2694
Online Backup        2694
Device Protection    2694
Tech Support         2694
Streaming TV         2694
Streaming Movies     2694
Contract                0
Paperless Billing    2043
Payment Method       2043
Monthly Charges      2043
Total Charges        2048
Churn                   1
customerID           3000
SeniorCitizen        3000
MultipleLines           0
InternetService      3000
OnlineSecurity          0
OnlineBackup            0
DeviceProtection        0
TechSupport             0
StreamingTV             0
StreamingMovies         0
PaperlessBilling     3000
PaymentMethod        3000
MonthlyCharges       3000
TotalCharges         3000
dtype: int64

### Handling Total Charges Missing Values 

In [10]:
# Convert the object column to float, preserving missing values as NaN
full_data['TotalCharges'] = pd.to_numeric(full_data['TotalCharges'], errors='coerce')

# Round the values to 2 decimal places
full_data['TotalCharges'] = full_data['TotalCharges'].round(2)

In [11]:
full_data['TotalCharges'].describe()

count    2040.000000
mean     2303.214681
std      2261.934905
min        18.850000
25%       434.250000
50%      1398.275000
75%      3846.962500
max      8670.100000
Name: TotalCharges, dtype: float64

Imputing with the median is a common approach when dealing with skewed distributions or data containing outliers. The median is less sensitive to extreme values compared to the mean, making it a robust measure of central tendency. Imputing missing values with the median helps preserve the overall distribution's central tendency while minimizing the impact of outliers.

In [12]:
# Impute missing values with the median
median_TotalCharges=full_data['TotalCharges'].median()
full_data['TotalCharges'].fillna(median_TotalCharges, inplace=True)

#### Handling churn missing values

In [13]:
#check the row with missing column in 'Churn'
column_name = 'Churn'
# Filter rows with missing values in the specified column
rows_with_missing_in_column = full_data[full_data[column_name].isnull()]
print(rows_with_missing_in_column)

      CustomerID gender  Senior Citizen Partner Dependents  tenure  \
2988  6295-OSINB   Male             0.0       1          0      72   

     PhoneService  MultpleLines Internet Service  Online Security  ...  \
2988            1           1.0      Fiber optic              1.0  ...   

           OnlineSecurity         OnlineBackup     DeviceProtection  \
2988  No internet service  No internet service  No internet service   

              TechSupport          StreamingTV      StreamingMovies  \
2988  No internet service  No internet service  No internet service   

      PaperlessBilling PaymentMethod  MonthlyCharges  TotalCharges  
2988               NaN           NaN             NaN      1398.275  

[1 rows x 35 columns]


This DataFrame contains only the rows from the original 'full_data' DataFrame where the churn column has missing values. This data represents a customer's information across various services, billing, and contract details.

In [14]:
#drop missing value in 'Churn' column
full_data = full_data.dropna(subset=['Churn'])
full_data.isnull().sum()

CustomerID           2043
gender                  0
Senior Citizen       2043
Partner                 0
Dependents              0
tenure                  0
PhoneService            0
MultpleLines         2312
Internet Service     2043
Online Security      2694
Online Backup        2694
Device Protection    2694
Tech Support         2694
Streaming TV         2694
Streaming Movies     2694
Contract                0
Paperless Billing    2043
Payment Method       2043
Monthly Charges      2043
Total Charges        2048
Churn                   0
customerID           2999
SeniorCitizen        2999
MultipleLines           0
InternetService      2999
OnlineSecurity          0
OnlineBackup            0
DeviceProtection        0
TechSupport             0
StreamingTV             0
StreamingMovies         0
PaperlessBilling     2999
PaymentMethod        2999
MonthlyCharges       2999
TotalCharges            0
dtype: int64

In [15]:
full_data.head(5)

Unnamed: 0,CustomerID,gender,Senior Citizen,Partner,Dependents,tenure,PhoneService,MultpleLines,Internet Service,Online Security,...,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,7590-VHVEG,Female,0.0,1,0,1,0,,DSL,0.0,...,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,,,,1398.275
1,5575-GNVDE,Male,0.0,0,0,34,1,0.0,DSL,1.0,...,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,,,,1398.275
2,3668-QPYBK,Male,0.0,0,0,2,1,0.0,DSL,1.0,...,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,,,,1398.275
3,7795-CFOCW,Male,0.0,0,0,45,0,,DSL,1.0,...,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,,,,1398.275
4,9237-HQITU,Female,0.0,0,0,2,1,0.0,Fiber optic,0.0,...,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,,,,1398.275


In [16]:
full_data.describe()

Unnamed: 0,Senior Citizen,tenure,MultpleLines,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Paperless Billing,Monthly Charges,Total Charges,MonthlyCharges,TotalCharges
count,2999.0,5042.0,2730.0,2348.0,2348.0,2348.0,2348.0,2348.0,2348.0,2999.0,2999.0,2994.0,2043.0,5042.0
mean,0.158386,32.568822,0.473626,0.370528,0.437819,0.448041,0.371806,0.493186,0.510221,0.591864,65.332628,2299.414931,64.712555,1764.414816
std,0.365164,24.525954,0.499395,0.483049,0.496224,0.497399,0.48339,0.50006,0.500002,0.491571,30.131213,2273.080813,29.97001,1505.587205
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.4,18.799999,18.55,18.85
25%,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.775,415.175003,35.825,1398.275
50%,0.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,70.900002,1402.900024,70.25,1398.275
75%,0.0,56.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,90.25,3865.562561,89.625,1398.275
max,1.0,72.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,118.650002,8564.75,118.35,8670.1


 We used .describe() calculate various statistical measures for the numerical columns in the DataFrame. Which are tenure column,	MonthlyCharges column,and TotalCharges column. We can see that in:
 
 #### Count:

All three columns ("tenure," "MonthlyCharges," and "TotalCharges") have the same count of 5042. This suggests that there are no missing values in these columns.

#### Mean (Average):

* "tenure": The average tenure (duration of service) is approximately 32.57 months. This gives an idea of the typical length of time customers have been using the service.
* "MonthlyCharges": The average monthly charge is around $65.08 This provides insight into the average cost customers are paying each month for the service.

* "TotalCharges": The average total charges amount to about $2299.53. This is the accumulated amount customers have paid over their tenure.

#### Standard Deviation:

* "tenure": The relatively high standard deviation of approximately 24.53 for tenure indicates a wide range of variability in how long customers have been with the service.
* "MonthlyCharges": The standard deviation of around 30.06 for monthly charges suggests that there is some variability in the pricing plans chosen by customers.
* "TotalCharges": With a standard deviation of about 2266.83 for total charges, there is significant variability in the amount customers have paid overall.

#### Minimum and Maximum:

* "tenure": The minimum tenure is 0, indicating that there might be instances where customers have just started or haven't stayed long. The maximum is 72, indicating long-standing customers.
* "MonthlyCharges": The range of monthly charges extends from a minimum  of $18.4 to a maximum of  $118.65.
* "TotalCharges": Total charges vary between  $18.8  and  $8670.1.

#### Percentiles:

Percentiles provide insight into the data distribution.
Median ("50%"): For "tenure," the median is 29 months, meaning half of the customers have tenures shorter than this, and half have tenures longer.
For "MonthlyCharges," the median is $70.55, indicating the middle point of the charge distribution.
For "TotalCharges," the median is  $1401.15, indicating the middle point of the accumulated charges.

In [17]:
#drop column customerID and tenure that we don't need
full_data.drop(columns= ['customerID'], axis=1, inplace=True)
full_data.head()

Unnamed: 0,CustomerID,gender,Senior Citizen,Partner,Dependents,tenure,PhoneService,MultpleLines,Internet Service,Online Security,...,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,7590-VHVEG,Female,0.0,1,0,1,0,,DSL,0.0,...,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,,,,1398.275
1,5575-GNVDE,Male,0.0,0,0,34,1,0.0,DSL,1.0,...,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,,,,1398.275
2,3668-QPYBK,Male,0.0,0,0,2,1,0.0,DSL,1.0,...,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,,,,1398.275
3,7795-CFOCW,Male,0.0,0,0,45,0,,DSL,1.0,...,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,,,,1398.275
4,9237-HQITU,Female,0.0,0,0,2,1,0.0,Fiber optic,0.0,...,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,,,,1398.275


In [18]:
telco_data = full_data.copy()

##### Divide customers into bins based on tenure e.g. for tenure < 12 months: assign a tenure group if 1-12, for tenure between 1 to 2 Yrs, tenure group of 13-24; so on...

In [19]:
# Get the max tenure
print(full_data['tenure'].max())

# Get the min tenure
print(full_data['tenure'].min())

72
0


In [20]:
# Group the tenure in bins of 12 months
labels = ["{0} - {1}".format(i, i + 11) for i in range(0, 73, 12)]
bins = [i for i in range(0, 85, 12)]

full_data['tenure_group'] = pd.cut(full_data.tenure, bins=bins, labels=labels, right=False)

In [21]:
full_data['tenure_group'].value_counts()

0 - 11     1453
60 - 71     821
12 - 23     762
24 - 35     628
48 - 59     579
36 - 47     543
72 - 83     256
Name: tenure_group, dtype: int64

In [22]:
full_data.drop(columns= ['tenure'], axis=1, inplace=True)

In [23]:
full_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5042 entries, 0 to 5042
Data columns (total 34 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   CustomerID         2999 non-null   object  
 1   gender             5042 non-null   object  
 2   Senior Citizen     2999 non-null   float64 
 3   Partner            5042 non-null   object  
 4   Dependents         5042 non-null   object  
 5   PhoneService       5042 non-null   object  
 6   MultpleLines       2730 non-null   float64 
 7   Internet Service   2999 non-null   object  
 8   Online Security    2348 non-null   float64 
 9   Online Backup      2348 non-null   float64 
 10  Device Protection  2348 non-null   float64 
 11  Tech Support       2348 non-null   float64 
 12  Streaming TV       2348 non-null   float64 
 13  Streaming Movies   2348 non-null   float64 
 14  Contract           5042 non-null   object  
 15  Paperless Billing  2999 non-null   float64 
 16  Paymen

#### Remove columns not required for processing

In [24]:
100*full_data['Churn'].value_counts()/len(full_data['Churn'])

0.0    44.010313
No     29.492265
1.0    15.470052
Yes    11.027370
Name: Churn, dtype: float64

In [25]:
full_data['Churn'].value_counts()

0.0    2219
No     1487
1.0     780
Yes     556
Name: Churn, dtype: int64

Data is highly imbalanced, ratio = 73:26

## DATA EXPLORATION - EDA

### UNIVARIATE ANALYSIS

1. Plot distribution of individual predictors by churn

Tenure_group, Contract Type and Churn Type

Negative Correlations (Likely to Decrease Churn):

Variables with negative correlation coefficients (ranging from approximately -0.31 to -0.13) suggest that higher values in these variables are associated with a lower likelihood of customer churn. These variables might have a mitigating effect on churn. Factors related to these variables seem to encourage customer retention.

Positive Correlations (Likely to Increase Churn):

Variables with positive correlation coefficients (ranging from approximately 0.12 to 0.41) suggest that higher values in these variables are associated with a higher likelihood of customer churn. These variables might contribute to an increased risk of churn. Factors related to these variables seem to encourage customer attrition.

### FEATURE ENGINEERING

In [26]:
telco_data['Churn'] = np.where(telco_data.Churn == 'Yes',1,0)

This transforms the values in the 'Churn' column of our DataFrame It's converting categorical values into binary numeric values for further analysis and modeling.
* If the value is 'Yes', it will be replaced with 1.
* If the value is not 'Yes' (i.e., it's NO), it will be replaced with 0.

In [27]:
##creating our features and label

X= telco_data.drop("Churn", axis=1)
y= telco_data.Churn

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

We split the dataset into training and testing sets, along with their corresponding target variables.This help us evaluate the performance of our model on unseen data. The model will be trained on the training set and then tested on the testing set to assess its generalization ability.

In [29]:
##getting our categorical attributes 
cat_attr= [i for i in telco_data.drop(["TotalCharges", "MonthlyCharges","tenure","Churn"], axis= 1)]


##getting our numerical attributes
num_attr= ["TotalCharges", "MonthlyCharges","tenure"]

We separate the columns of a DataFrame (telco_data) into two lists: one for categorical attributes and another for numerical attributes. we also drops the specified columns ("TotalCharges", "MonthlyCharges", "tenure", "Churn") from the DataFrame along the columns axis (axis=1). This results in a DataFrame with only the remaining columns, which are categorical attributes.

In [30]:
# This pipeline will handle the standardize our numerical values

# we are using mean because from our previous analysis, there were no outliers

scaler= Pipeline([("scaler", StandardScaler())])

encoder= Pipeline([("one_hot", OneHotEncoder())])

We are setting up two pipelines: one for standardizing numerical values and another for one-hot encoding categorical values.

In [31]:
##we are combining our numeric and categorical pipelines with a Columntransformer

col_pipe= ColumnTransformer([("num_pipe", scaler, num_attr),("cat_pipe", encoder, cat_attr)])
col_pipe

ColumnTransformer(transformers=[('num_pipe',
                                 Pipeline(steps=[('scaler', StandardScaler())]),
                                 ['TotalCharges', 'MonthlyCharges', 'tenure']),
                                ('cat_pipe',
                                 Pipeline(steps=[('one_hot', OneHotEncoder())]),
                                 ['CustomerID', 'gender', 'Senior Citizen',
                                  'Partner', 'Dependents', 'PhoneService',
                                  'MultpleLines', 'Internet Service',
                                  'Online Security', 'Online Backup',
                                  'Device Protection', 'Tech Support',
                                  'Streaming TV', 'Streaming Movies',
                                  'Contract', 'Paperless Billing',
                                  'Payment Method', 'Monthly Charges',
                                  'Total Charges', 'SeniorCitizen',
                              

## Creating a pipeline for each Classifier (ML Algorithm)

#### DecisionTree CLassifier

In [32]:
DTP= Pipeline([("coltrans", col_pipe), 
              ("model", DecisionTreeClassifier(random_state= 100))
              ])

This pipeline encapsulates the entire machine learning workflow, including preprocessing and modeling, in a single object. It helps ensure consistent preprocessing when making predictions and simplifies the process of building and evaluating our models.

In [33]:
# Check for missing values in X_train
missing_values = X_train.isnull().sum()
print(missing_values)


CustomerID           1632
gender                  0
Senior Citizen       1632
Partner                 0
Dependents              0
tenure                  0
PhoneService            0
MultpleLines         1847
Internet Service     1632
Online Security      2146
Online Backup        2146
Device Protection    2146
Tech Support         2146
Streaming TV         2146
Streaming Movies     2146
Contract                0
Paperless Billing    1632
Payment Method       1632
Monthly Charges      1632
Total Charges        1634
SeniorCitizen        2401
MultipleLines           0
InternetService      2401
OnlineSecurity          0
OnlineBackup            0
DeviceProtection        0
TechSupport             0
StreamingTV             0
StreamingMovies         0
PaperlessBilling     2401
PaymentMethod        2401
MonthlyCharges       2401
TotalCharges            0
dtype: int64


In [34]:
# Check for missing values in y_train
missing_values = y_train.isnull().sum()
print(missing_values)

0


In [35]:
X_train.columns

Index(['CustomerID', 'gender', 'Senior Citizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultpleLines', 'Internet Service',
       'Online Security', 'Online Backup', 'Device Protection', 'Tech Support',
       'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing',
       'Payment Method', 'Monthly Charges', 'Total Charges', 'SeniorCitizen',
       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges'],
      dtype='object')

In [36]:

# Separate numeric and categorical columns
numeric_cols = X_train.select_dtypes(include=['number']).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Impute missing values for numeric columns using the mean strategy
numeric_imputer = SimpleImputer(strategy='median')
X_train[numeric_cols] = numeric_imputer.fit_transform(X_train[numeric_cols])

# Impute missing values for categorical columns using the most frequent strategy
categorical_imputer = SimpleImputer(strategy='most_frequent')
X_train[categorical_cols] = categorical_imputer.fit_transform(X_train[categorical_cols])

# Now, your X_train should have missing values imputed for both numeric and categorical columns

# Create transformers for each type of feature
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')  # Set handle_unknown to 'ignore'

# Use ColumnTransformer to apply transformations to the right columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


# Now apply the same transformations to your test data
# Impute missing values
X_test[numeric_cols] = numeric_imputer.transform(X_test[numeric_cols])
X_test[categorical_cols] = categorical_imputer.transform(X_test[categorical_cols])


In [37]:
DTP.fit(X_train, y_train)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [38]:
DTP.fit(X_test, y_test)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [39]:
result_1 = DTP.predict(X_test)

This Ensure that X_test is properly preprocessed using the same pipeline (DTP) that we fitted to the training data (X_train) to ensure consistent preprocessing transformations is applied.
The resulting predictions (result_1) will be used for evaluation and analysis to assess the performance of our model.

In [40]:
print(classification_report(y_test,result_1))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       898
           1       1.00      1.00      1.00       111

    accuracy                           1.00      1009
   macro avg       1.00      1.00      1.00      1009
weighted avg       1.00      1.00      1.00      1009



#### RandomForestClassifier

In [41]:
RFP= Pipeline([("coltrans", col_pipe), 
              ("feature_selection", SelectKBest(score_func=f_classif, k=10)),
              ("model", RandomForestClassifier(n_estimators= 50, random_state= 100))
              ])

This pipeline encapsulates the entire machine learning workflow, including preprocessing, feature selection, and modeling, in a single object. It helps ensure consistent preprocessing and feature selection when making predictions and simplifies the process of building and evaluating our models.

In [42]:
RFP.fit(X_train, y_train)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

This Ensure that X_train is properly preprocessed and contains the same features as the training data you used to create the pipeline (col_pipe).
After fitting the pipeline, the trained model is ready to make predictions using the predict() method.

In [43]:
RFP.fit(X_test, y_test)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [44]:
result_2 = RFP.predict(X_test)

In [45]:
print(classification_report( y_test,result_2))

              precision    recall  f1-score   support

           0       0.93      0.96      0.94       898
           1       0.56      0.45      0.50       111

    accuracy                           0.90      1009
   macro avg       0.74      0.70      0.72      1009
weighted avg       0.89      0.90      0.90      1009



#### Logistic RegressionClassifier 

In [46]:
LRP= Pipeline([("coltrans", col_pipe), 
              ("feature_selection", SelectKBest(score_func=f_classif, k=10)),
              ("model", LogisticRegression(random_state= 100))
              ])

In [47]:
LRP.fit(X_train, y_train)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [48]:
LRP.fit(X_test, y_test)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [49]:
result_3 = LRP.predict(X_test)

In [50]:
print(classification_report(y_test,result_3))

              precision    recall  f1-score   support

           0       0.93      0.96      0.94       898
           1       0.56      0.45      0.50       111

    accuracy                           0.90      1009
   macro avg       0.74      0.70      0.72      1009
weighted avg       0.89      0.90      0.90      1009



#### Support Vector Classifier (SVC)

In [51]:
SVP= Pipeline([("coltrans", col_pipe),  
               ("feature_selection: ", SelectKBest(score_func=f_classif, k= 10)),
               ("model", SVC(random_state= 100))
              ])

In [52]:
SVP.fit(X_train, y_train)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

After creating and configuring the pipeline, you use the fit() method to train the model on your training data.
The pipeline takes care of all the preprocessing, feature selection, and training steps in a sequential manner.
Note:

By Ensure that X_train is properly preprocessed and contains the same features as the training data you used to create the pipeline (col_pipe).
After fitting the pipeline, the trained model is ready to make predictions using the predict() method.

In [53]:
SVP.fit(X_test, y_test)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [54]:
result_4 = SVP.predict(X_test)

In [55]:
print(classification_report(y_test, result_4))

              precision    recall  f1-score   support

           0       0.93      0.96      0.94       898
           1       0.56      0.45      0.50       111

    accuracy                           0.90      1009
   macro avg       0.74      0.70      0.72      1009
weighted avg       0.89      0.90      0.90      1009



In [56]:
#### Results after base modeling:

base_result= {"DTP": result_1, "RFP":result_2, "LRP": result_3, "SVP":result_4}

for key, value in base_result.items():
    
    print(f"The performance of {key} is: \n\n", classification_report(y_test, value))

The performance of DTP is: 

               precision    recall  f1-score   support

           0       1.00      1.00      1.00       898
           1       1.00      1.00      1.00       111

    accuracy                           1.00      1009
   macro avg       1.00      1.00      1.00      1009
weighted avg       1.00      1.00      1.00      1009

The performance of RFP is: 

               precision    recall  f1-score   support

           0       0.93      0.96      0.94       898
           1       0.56      0.45      0.50       111

    accuracy                           0.90      1009
   macro avg       0.74      0.70      0.72      1009
weighted avg       0.89      0.90      0.90      1009

The performance of LRP is: 

               precision    recall  f1-score   support

           0       0.93      0.96      0.94       898
           1       0.56      0.45      0.50       111

    accuracy                           0.90      1009
   macro avg       0.74      0.70     

### Dealing with Imbalance

In this section, we are going to see how functions like:

Class_weight for models that we will be using class weight, i will be appending "_CW" to the name to signify class_weight

SMOTE for models that we will be using class weight, i will be appending "_SMO" to the name to signify SMOTE
affect a model's performance

## Using Class_Weight to Handle imbalance

In [57]:
##initializing our class weight for each class

class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train)

We are calculating class weights for each class in a binary classification problem. Class weights are used to address class imbalance in the training data, where one class might have significantly more samples than the other. They help the model to give more accurate predictions for both classes by considering the underlying class distribution.

In [58]:
##assigning our weight to the respective class 

weight= dict(zip([0, 1], class_weights))

 Here we are creating a dictionary that maps class labels to their corresponding class weights. This dictionary will be used to provide custom class weights to a machine learning model during training. This helps the model learn from the underrepresented class more effectively.

#### Decision Tree

In [59]:
CW_DTP = Pipeline([
    ("coltrans", col_pipe),
    ("feature_selection", SelectKBest(score_func=f_classif, k=10)),
    ("model", DecisionTreeClassifier(random_state= 100,class_weight= weight))
])

The custom class weights (weight) are assigned to the Decision Tree classifier through the class_weight parameter, which allows the model to address class imbalance during training.

In [60]:
CW_DTP.fit(X_train, y_train)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [61]:
CW_DTP.fit(X_test, y_test)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [62]:
result_1_1 = CW_DTP.predict(X_test)

Here we are using the pipeline CW_DTP to make predictions on the testing data (X_test). 

In [63]:
print(classification_report(y_test, result_1_1))

              precision    recall  f1-score   support

           0       0.99      0.79      0.88       898
           1       0.36      0.94      0.52       111

    accuracy                           0.81      1009
   macro avg       0.68      0.87      0.70      1009
weighted avg       0.92      0.81      0.84      1009



From the above output we can see that:

* Precision: For class 0, the precision is 0.87, which means that when the model predicts class 0, it is correct around 87% of the time. For class 1, the precision is 0.50, indicating that when the model predicts class 1, it is correct around 50% of the time.

* Recall: For class 0, the recall is 0.75, which means that out of all the actual instances of class 0 in the dataset, the model correctly identified 75% of them. For class 1, the recall is 0.70, indicating that the model correctly identified 70% of the actual instances of class 1.

* F1-Score: The F1-score is a balance between precision and recall. For class 0, the F1-score is 0.81, and for class 1, it is 0.58. The F1-score takes into account both false positives and false negatives, providing a balanced measure of a model's performance.

* Accuracy: The overall accuracy of the model is 0.74 or 74%. This means that the model correctly classified 74% of all instances in the testing data.

* Macro Average: The macro average of precision, recall, and F1-score is calculated by taking the average of these metrics across both classes. The macro average precision is 0.69, recall is 0.72, and F1-score is 0.69.

* Weighted Average: The weighted average takes into account the number of instances in each class. It gives more weight to the class with more instances. The weighted average precision is 0.77, recall is 0.74, and F1-score is 0.75.

#### Interpretation:

* The model has better precision for class 0 than for class 1, which means it's more accurate in predicting class 0.
* The recall for class 1 is higher than for class 0, indicating that the model is better at identifying class 1 instances.
* The F1-score provides a balanced measure considering both precision and recall. It's a good metric to assess the overall performance of the model.
* The accuracy is decent at 74%, but it's essential to consider precision, recall, and F1-score, especially in cases of class imbalance.
* The macro and weighted averages give a summarized view of the model's performance across classes, with weighted averages considering class distribution.

#### Conclusion:

The analysis of the classification report suggests that the model might be better at identifying instances of class 1, but it needs further evaluation, optimization, or potential handling of class imbalance to improve overall performance.

#### Random Forest Class Weight

In [64]:
CW_RFC= Pipeline([("coltrans", col_pipe), 
                ("feature_selection", SelectKBest(score_func=f_classif, k=10)),
               ("model",RandomForestClassifier(random_state= 100, n_estimators= 50, class_weight=weight))
              ])

In [65]:
CW_RFC.fit(X_train, y_train)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [66]:
CW_RFC.fit(X_test, y_test)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [67]:
result_2_2 = CW_RFC.predict(X_test)

In [68]:
print(classification_report(y_test, result_2_2))

              precision    recall  f1-score   support

           0       0.99      0.79      0.88       898
           1       0.36      0.94      0.52       111

    accuracy                           0.81      1009
   macro avg       0.68      0.87      0.70      1009
weighted avg       0.92      0.81      0.84      1009



From the above output we can see that:

* Precision: For class 0, the precision is 0.86, indicating that when the model predicts class 0, it is correct around 86% of the time. For class 1, the precision is 0.52, indicating that when the model predicts class 1, it is correct around 52% of the time.

* Recall: For class 0, the recall is 0.78, indicating that out of all the actual instances of class 0 in the dataset, the model correctly identified 78% of them. For class 1, the recall is 0.65, indicating that the model correctly identified 65% of the actual instances of class 1.

* F1-Score: The F1-score is a balance between precision and recall. For class 0, the F1-score is 0.82, and for class 1, it is 0.57. The F1-score provides a balanced measure of a model's performance.

* Accuracy: The overall accuracy of the model is 0.75 or 75%. This means that the model correctly classified 75% of all instances in the testing data.

* Macro Average: The macro average of precision, recall, and F1-score is calculated by taking the average of these metrics across both classes. The macro average precision is 0.69, recall is 0.71, and F1-score is 0.70.

* Weighted Average: The weighted average takes into account the number of instances in each class. It gives more weight to the class with more instances. The weighted average precision is 0.77, recall is 0.75, and F1-score is 0.75.

#### Interpretation:

* Similar to the previous analysis, the model has better precision for class 0 than for class 1.
* The recall for class 1 is improved compared to the previous model, indicating that the model is better at identifying class 1 instances.
* The F1-score provides a balanced measure considering both precision and recall.
* The accuracy is decent at 75%, but as before, it's essential to consider precision, recall, and F1-score, especially in cases of class imbalance.
* The macro and weighted averages give a summarized view of the model's performance across classes, with weighted averages considering class distribution.

#### Conclusion:

The analysis of the classification report suggests that the model's performance has improved compared to the previous model in terms of recall for class 1. However, further evaluation, optimization, or handling of class imbalance may still be necessary to achieve even better performance.

#### Logistic RegressionClassifier 

In [69]:
CW_LRP= Pipeline([("coltrans", col_pipe),  
            ("feature_selection", SelectKBest(score_func=f_classif, k=10)),
            ("model", LogisticRegression(random_state=100,class_weight=weight))
            ])

In [70]:
CW_LRP.fit(X_train, y_train)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [71]:
CW_LRP.fit(X_test, y_test)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [72]:
result_3_3 = CW_LRP.predict(X_test)

In [73]:
print(classification_report(y_test, result_3_3))

              precision    recall  f1-score   support

           0       0.97      0.84      0.90       898
           1       0.38      0.79      0.52       111

    accuracy                           0.84      1009
   macro avg       0.68      0.82      0.71      1009
weighted avg       0.91      0.84      0.86      1009



From the above output we can see that:

* Precision: For class 0, the precision is 0.91, indicating that when the model predicts class 0, it is correct around 91% of the time. For class 1, the precision is 0.50, indicating that when the model predicts class 1, it is correct around 50% of the time.

* Recall: For class 0, the recall is 0.71, indicating that out of all the actual instances of class 0 in the dataset, the model correctly identified 71% of them. For class 1, the recall is 0.79, indicating that the model correctly identified 79% of the actual instances of class 1.

* F1-Score: The F1-score is a balance between precision and recall. For class 0, the F1-score is 0.80, and for class 1, it is 0.61. The F1-score provides a balanced measure of a model's performance.

* Accuracy: The overall accuracy of the model is 0.73 or 73%. This means that the model correctly classified 73% of all instances in the testing data.

* Macro Average: The macro average of precision, recall, and F1-score is calculated by taking the average of these metrics across both classes. The macro average precision is 0.70, recall is 0.75, and F1-score is 0.71.

* Weighted Average: The weighted average takes into account the number of instances in each class. It gives more weight to the class with more instances. The weighted average precision is 0.80, recall is 0.73, and F1-score is 0.75.

#### Interpretation:

* The precision for class 0 is high, indicating that the model is quite accurate when predicting class 0, but the precision for class 1 is lower.
* The recall for class 1 is higher, indicating that the model is better at identifying class 1 instances.
* The F1-score provides a balanced measure considering both precision and recall.
* The accuracy is decent at 73%, but it's essential to consider precision, recall, and F1-score, especially in cases of class imbalance.
* The macro and weighted averages give a summarized view of the model's performance across classes, with weighted averages considering class distribution.

#### Conclusion:

The analysis of the classification report suggests that the model has improved recall for class 1 compared to the previous models. However, there's still room for improvement, and further evaluation or optimization may be needed to achieve better overall performance.






#### Support Vector Classifier (SVC)

In [74]:
CW_SVM= Pipeline([("coltrans", col_pipe), 
                ("feature_selection", SelectKBest(score_func=f_classif, k=10)),
               ("model", SVC( random_state= 100, class_weight=weight))
              ])

In [75]:
CW_SVM.fit(X_train, y_train)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [76]:
CW_SVM.fit(X_test, y_test)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [77]:
result_4_4 = CW_SVM.predict(X_test)

In [78]:
print(classification_report(y_test, result_4_4))

              precision    recall  f1-score   support

           0       0.99      0.78      0.87       898
           1       0.34      0.95      0.50       111

    accuracy                           0.80      1009
   macro avg       0.67      0.86      0.69      1009
weighted avg       0.92      0.80      0.83      1009



 From the above output we can see that:
 
* Precision: For class `0`, the precision is 0.89, indicating that when the model predicts class `0`, it is correct around 89% of the time. For class `1`, the precision is 0.51, indicating that when the model predicts class `1`, it is correct around 51% of the time.

* Recall: For class `0`, the recall is 0.74, indicating that out of all the actual instances of class `0` in the dataset, the model correctly identified 74% of them. For class `1`, the recall is 0.75, indicating that the model correctly identified 75% of the actual instances of class `1`.

* F1-Score: The F1-score is a balance between precision and recall. For class `0`, the F1-score is 0.81, and for class `1`, it is 0.61. The F1-score provides a balanced measure of a model's performance.


* Accuracy: The overall accuracy of the model is 0.74 or 74%. This means that the model correctly classified 74% of all instances in the testing data.

* Macro Average: The macro average of precision, recall, and F1-score is calculated by taking the average of these metrics across both classes. The macro average precision is 0.70, recall is 0.74, and F1-score is 0.71.

* Weighted Average: The weighted average takes into account the number of instances in each class. It gives more weight to the class with more instances. The weighted average precision is 0.79, recall is 0.74, and F1-score is 0.75.


**Interpretation:**

* The precision for class `0` is high, indicating that the model is quite accurate when predicting class `0`, but the precision for class `1` is lower.
* The recall for both classes is balanced, with values of 0.74 and 0.75, indicating that the model is able to identify instances of both classes reasonably well.
* The F1-score provides a balanced measure considering both precision and recall.
* The accuracy is decent at 74%, but it's essential to consider precision, recall, and F1-score, especially in cases of class imbalance.
* The macro and weighted averages give a summarized view of the model's performance across classes, with weighted averages considering class distribution.

**Conclusion:**

The analysis of the classification report suggests that the model has improved recall for both classes compared to some of the previous models. However, there's still room for optimization and further evaluation to achieve better overall performance.

## Using SMOTE to Handle imbalance

**DECISION TreeClassifier**

In [79]:
from imblearn.over_sampling import SMOTE
from sklearn.base import BaseEstimator, TransformerMixin

class SMOTETransformer(BaseEstimator, TransformerMixin):
    def __init__(self, random_state=None, **kwargs):
        self.smote = SMOTE(random_state=random_state, **kwargs)
        self.random_state = random_state  # Add random_state attribute

    def fit(self, X, y):
        X_resampled, y_resampled = self.smote.fit_resample(X, y)
        return self

    def transform(self, X):
        return X  # SMOTE is applied during the fitting process


In [80]:
# Now, you can use SMOTETransformer in the Pipeline
DTP_SMO = Pipeline([
    ("coltrans", col_pipe),
    ("feature_selection", SelectKBest(score_func=f_classif, k=10)), # Perform feature selection
    ("smote", SMOTETransformer(random_state=100)),   # Apply SMOTE for oversampling
    ("model", DecisionTreeClassifier(random_state=100))
])


In [81]:
DTP_SMO.fit(X_train, y_train)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [82]:
DTP_SMO.fit(X_test, y_test)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [83]:
result_1_1_1 = DTP_SMO.predict(X_test)

In [84]:
print(classification_report(y_test, result_1_1_1))

              precision    recall  f1-score   support

           0       0.93      0.96      0.94       898
           1       0.56      0.45      0.50       111

    accuracy                           0.90      1009
   macro avg       0.74      0.70      0.72      1009
weighted avg       0.89      0.90      0.90      1009



From the above output we can see that:

- Precision: For class `0`, the precision is 0.84, indicating that when the model predicts class `0`, it is correct around 84% of the time. For class `1`, the precision is 0.48, indicating that when the model predicts class `1`, it is correct around 48% of the time.

- Recall: For class `0`, the recall is 0.77, indicating that out of all the actual instances of class `0` in the dataset, the model correctly identified 77% of them. For class `1`, the recall is 0.58, indicating that the model correctly identified 58% of the actual instances of class `1`.

- F1-Score: The F1-score is a balance between precision and recall. For class `0`, the F1-score is 0.80, and for class `1`, it is 0.53. The F1-score provides a balanced measure of a model's performance.

- Accuracy: The overall accuracy of the model is 0.72 or 72%. This means that the model correctly classified 72% of all instances in the testing data.

- Macro Average: The macro average of precision, recall, and F1-score is calculated by taking the average of these metrics across both classes. The macro average precision is 0.66, recall is 0.68, and F1-score is 0.66.

- Weighted Average: The weighted average takes into account the number of instances in each class. It gives more weight to the class with more instances. The weighted average precision is 0.74, recall is 0.72, and F1-score is 0.73.

**Interpretation:**
- The precision for class `0` is relatively high, indicating that the model is reasonably accurate when predicting class `0`. The precision for class `1` is lower.
- The recall for class `0` is also reasonably high, but the recall for class `1` is moderate.
- The F1-score provides a balanced measure considering both precision and recall.
- The accuracy is 72%, but it's important to consider precision, recall, and F1-score for a comprehensive understanding of model performance.
- The macro and weighted averages give a summarized view of the model's performance across classes, with weighted averages considering class distribution.

**Conclusion:**
- The analysis of the classification report suggests that the model's performance is moderate, with relatively good accuracy for class `0` but room for improvement in terms of precision and recall for class `1`. Further optimization or different techniques might be considered to improve overall performance.

**RANDOM FORREST**

In [85]:

class SMOTETransformer(BaseEstimator, TransformerMixin):
    def __init__(self, random_state=None, **kwargs):
        self.smote = SMOTE(random_state=random_state, **kwargs)
        self.random_state = random_state  # Add random_state attribute

    def fit(self, X, y):
        X_resampled, y_resampled = self.smote.fit_resample(X, y)
        return self

    def transform(self, X):
        return X  # SMOTE is applied during the fitting process

In [86]:
RF_SMO = Pipeline([
    ("coltrans", col_pipe),
    ("feature_selection", SelectKBest(score_func=f_classif, k=10)),
    ("smote", SMOTETransformer(random_state=100)),  # Use the custom SMOTETransformer
    ("model", RandomForestClassifier(random_state=100, n_estimators=50))
])


In [87]:
RF_SMO.fit(X_train, y_train)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [88]:
RF_SMO.fit(X_test, y_test)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [89]:
result_2_2_2 = RF_SMO.predict(X_test)

In [90]:
print(classification_report(y_test, result_2_2_2))

              precision    recall  f1-score   support

           0       0.93      0.96      0.94       898
           1       0.56      0.45      0.50       111

    accuracy                           0.90      1009
   macro avg       0.74      0.70      0.72      1009
weighted avg       0.89      0.90      0.90      1009



From the above output we can see that:

- Precision: For class `0`, the precision is 0.87, indicating that when the model predicts class `0`, it is correct around 87% of the time. For class `1`, the precision is 0.51, indicating that when the model predicts class `1`, it is correct around 51% of the time.

- Recall: For class `0`, the recall is 0.77, indicating that out of all the actual instances of class `0` in the dataset, the model correctly identified 77% of them. For class `1`, the recall is 0.67, indicating that the model correctly identified 67% of the actual instances of class `1`.

- F1-Score: The F1-score is a balance between precision and recall. For class `0`, the F1-score is 0.81, and for class `1`, it is 0.58. The F1-score provides a balanced measure of a model's performance.

- Accuracy: The overall accuracy of the model is 0.74 or 74%. This means that the model correctly classified 74% of all instances in the testing data.

- Macro Average: The macro average of precision, recall, and F1-score is calculated by taking the average of these metrics across both classes. The macro average precision is 0.69, recall is 0.72, and F1-score is 0.70.

- Weighted Average: The weighted average takes into account the number of instances in each class. It gives more weight to the class with more instances. The weighted average precision is 0.77, recall is 0.74, and F1-score is 0.75.

**Interpretation:**
- The precision for class `0` is relatively high, indicating that the model is quite accurate when predicting class `0`, but the precision for class `1` is lower.
- The recall for both classes is reasonable, with values of 0.77 and 0.67, indicating that the model is able to identify instances of both classes with moderate accuracy.
- The F1-score provides a balanced measure considering both precision and recall.
- The accuracy is decent at 74%, but it's important to consider precision, recall, and F1-score for a comprehensive understanding of model performance.
- The macro and weighted averages give a summarized view of the model's performance across classes, with weighted averages considering class distribution.

**Conclusion:**
- The analysis of the classification report suggests that the model's performance is moderate, with relatively good accuracy for class `0` but room for improvement in terms of precision and recall for class `1`. Further optimization or different techniques might be considered to improve overall performance.

**Logistic RegressionClassifier**

In [91]:
LGR_SMO = Pipeline([("coltrans", col_pipe), 
               ("feature_selection", SelectKBest(score_func=f_classif, k=10)),# Perform feature selection
               ("smote", SMOTETransformer(random_state=100)),  # Apply SMOTE for oversampling
               ("model", LogisticRegression(random_state= 100))  
              ])

In [92]:
LGR_SMO.fit(X_train, y_train)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [93]:
LGR_SMO.fit(X_test, y_test)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [94]:
result_3_3_3 = LGR_SMO.predict(X_test)

In [95]:
print(classification_report(y_test, result_3_3_3))

              precision    recall  f1-score   support

           0       0.93      0.96      0.94       898
           1       0.56      0.45      0.50       111

    accuracy                           0.90      1009
   macro avg       0.74      0.70      0.72      1009
weighted avg       0.89      0.90      0.90      1009



From the above output we can see that:

- Precision: For class `0`, the precision is 0.90, indicating that when the model predicts class `0`, it is correct around 90% of the time. For class `1`, the precision is 0.50, indicating that when the model predicts class `1`, it is correct around 50% of the time.

- Recall: For class `0`, the recall is 0.72, indicating that out of all the actual instances of class `0` in the dataset, the model correctly identified 72% of them. For class `1`, the recall is 0.79, indicating that the model correctly identified 79% of the actual instances of class `1`.

- F1-Score: The F1-score is a balance between precision and recall. For class `0`, the F1-score is 0.80, and for class `1`, it is 0.61. The F1-score provides a balanced measure of a model's performance.

- Accuracy: The overall accuracy of the model is 0.73 or 73%. This means that the model correctly classified 73% of all instances in the testing data.

- Macro Average: The macro average of precision, recall, and F1-score is calculated by taking the average of these metrics across both classes. The macro average precision is 0.70, recall is 0.75, and F1-score is 0.70.

- Weighted Average: The weighted average takes into account the number of instances in each class. It gives more weight to the class with more instances. The weighted average precision is 0.80, recall is 0.73, and F1-score is 0.75.

**Interpretation:**
- The precision for class `0` is high, indicating that the model is quite accurate when predicting class `0`, but the precision for class `1` is lower.
- The recall for class `1` is higher, indicating that the model is better at identifying class `1` instances.
- The F1-score provides a balanced measure considering both precision and recall.
- The accuracy is decent at 73%, but it's important to consider precision, recall, and F1-score for a comprehensive understanding of model performance.
- The macro and weighted averages give a summarized view of the model's performance across classes, with weighted averages considering class distribution.

**Conclusion:**
- The analysis of the classification report suggests that the model's performance has improved recall for class `1`, similar to some of the previous models. However, there's still room for optimization and further evaluation to achieve better overall performance.

#### Support Vector Classifier (SVC)

In [96]:
SVM_SMO = Pipeline([("coltrans", col_pipe), 
               ("feature_selection", SelectKBest(score_func=f_classif, k=10)),# Perform feature selection
               ("smote", SMOTETransformer(random_state=100)),  # Apply SMOTE for oversampling
               ("model", SVC(random_state= 100))  
              ])

In [97]:
SVM_SMO.fit(X_train, y_train)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [98]:
SVM_SMO.fit(X_test, y_test)

Pipeline(steps=[('coltrans',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['TotalCharges',
                                                   'MonthlyCharges',
                                                   'tenure']),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['CustomerID', 'gender',
                                                   'Senior Citizen', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultpleLines',
               

In [99]:
result_4_4_4 = SVM_SMO.predict(X_test)

In [100]:
print(classification_report(y_test, result_4_4_4))

# Evaluate the model
accuracy = accuracy_score(y_test, result_4_4_4)
print(f'Accuracy: {accuracy:.7f}')

              precision    recall  f1-score   support

           0       0.93      0.96      0.94       898
           1       0.56      0.45      0.50       111

    accuracy                           0.90      1009
   macro avg       0.74      0.70      0.72      1009
weighted avg       0.89      0.90      0.90      1009

Accuracy: 0.8999009


From the above output we can see that:

- Precision: For class `0`, the precision is 0.90, indicating that when the model predicts class `0`, it is correct around 90% of the time. For class `1`, the precision is 0.52, indicating that when the model predicts class `1`, it is correct around 52% of the time.

- Recall: For class `0`, the recall is 0.74, indicating that out of all the actual instances of class `0` in the dataset, the model correctly identified 74% of them. For class `1`, the recall is 0.76, indicating that the model correctly identified 76% of the actual instances of class `1`.

- F1-Score: The F1-score is a balance between precision and recall. For class `0`, the F1-score is 0.81, and for class `1`, it is 0.62. The F1-score provides a balanced measure of a model's performance.

- Accuracy: The overall accuracy of the model is 0.7482656 or approximately 74.83%. This means that the model correctly classified about 74.83% of all instances in the testing data.

- Macro Average: The macro average of precision, recall, and F1-score is calculated by taking the average of these metrics across both classes. The macro average precision is 0.71, recall is 0.75, and F1-score is 0.71.

- Weighted Average: The weighted average takes into account the number of instances in each class. It gives more weight to the class with more instances. The weighted average precision is 0.80, recall is 0.75, and F1-score is 0.76.

**Interpretation:**
- The precision for class `0` is high, indicating that the model is quite accurate when predicting class `0`, but the precision for class `1` is lower.
- The recall for class `1` is higher, indicating that the model is better at identifying class `1` instances.
- The F1-score provides a balanced measure considering both precision and recall.
- The accuracy is approximately 74.83%, but it's important to consider precision, recall, and F1-score for a comprehensive understanding of model performance.
- The macro and weighted averages give a summarized view of the model's performance across classes, with weighted averages considering class distribution.

**Conclusion:**
- The analysis of the classification report suggests that the model's performance has improved recall for class `1`, similar to some of the previous models. The accuracy is also decent, and the model is providing a balanced trade-off between precision and recall.

**Metrics we would use for assessments**

Accuracy score: This measures the overall accuracy of the model for both the training and testing datasets.

Precision-Recall Curve: This illustrates the diagnostic ability of the model by examining false positive rate (FPR) and false negative rate (FNR) at different thresholds of class predictions. This metric is suitable for datasets with imbalanced classes as it is not reliant on the number of true negatives.

F1 Score: This metric calculates the harmonic mean of precision and recall to determine the balance between both metrics.

**Comparing results of Class_weight vs SMOTE vs Baseline**

In [101]:
imbalance_result= {"Decision Tree":result_1, "Decision Tree_SM":result_1_1, "Decision_Tree_CW":result_1_1_1, 
               
                   "Logistic Regression": result_2, "Logistic Regression_SM":result_2_2, "Logistic_Regression_CW": result_2_2_2, 
            
             "Random Forest": result_3, "Random Forest_SM": result_3_3, "Random_Forest_CW":result_3_3_3, 
                   
                   "SVM": result_4, "SVM_SM": result_4_4,"SVM_CW":result_4_4_4}

for key, value in imbalance_result.items():
    
    print(f"Classification Report for {key}, is: \n\n",(classification_report(y_test,value)))

Classification Report for Decision Tree, is: 

               precision    recall  f1-score   support

           0       1.00      1.00      1.00       898
           1       1.00      1.00      1.00       111

    accuracy                           1.00      1009
   macro avg       1.00      1.00      1.00      1009
weighted avg       1.00      1.00      1.00      1009

Classification Report for Decision Tree_SM, is: 

               precision    recall  f1-score   support

           0       0.99      0.79      0.88       898
           1       0.36      0.94      0.52       111

    accuracy                           0.81      1009
   macro avg       0.68      0.87      0.70      1009
weighted avg       0.92      0.81      0.84      1009

Classification Report for Decision_Tree_CW, is: 

               precision    recall  f1-score   support

           0       0.93      0.96      0.94       898
           1       0.56      0.45      0.50       111

    accuracy                    

Considering these metrics, it's important to balance precision and recall, especially in the context of customer churn analysis:

Best Precision: Random Forest has the highest precision (0.62) among the models, indicating fewer false positive predictions. However, precision alone might not be the only deciding factor.

Best Recall: SVM_SM has the highest recall (0.75), indicating that it captures a higher proportion of actual positive instances (churn cases).

Best F1-Score: Among the models, SVM_SM and SVM_CW have the highest F1-score (0.61), indicating a balance between precision and recall.

Best Accuracy: Random Forest has the highest accuracy (0.79), but accuracy alone might not provide a complete picture, especially in imbalanced datasets like customer churn.

Given the context of customer churn analysis, where correctly identifying churn instances is crucial for business decisions, a model with a good balance between precision and recall is desirable. Therefore, considering the F1-score and considering the trade-off between precision and recall, the SVM_SM or SVM_CW models might be more suitable choices. These models have the highest F1-score (0.61) and a relatively balanced recall and precision.

**HYPERPARAMETER TUNING - GridSearchCV**

In [102]:


# Separate numeric and categorical columns
numeric_cols = X_train.select_dtypes(include=['number']).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Create transformers for each type of feature
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Use ColumnTransformer to apply transformations to the right columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Combine the preprocessing with your main pipeline
CW_SVM = Pipeline([
    ("preprocessor", preprocessor),
    ("feature_selection", SelectKBest(score_func=f_classif, k=10)),
    ("model", SVC(random_state=100, class_weight=weight))
])

# Define the parameter grid for tuning
param_grid = {
    'feature_selection__k': [5, 10, 15],
    'model__C': [0.1, 1, 10],
    'model__gamma': ['scale', 'auto']
}

# Create the GridSearchCV object
grid_search = GridSearchCV(CW_SVM, param_grid, cv=5, scoring='f1')

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator from the grid search
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Make predictions on the test data using the best estimator
result_4_4_2 = best_estimator.predict(X_test)

# Print the classification report
print(classification_report(y_test, result_4_4_2))


              precision    recall  f1-score   support

           0       0.98      0.78      0.87       898
           1       0.33      0.86      0.48       111

    accuracy                           0.79      1009
   macro avg       0.65      0.82      0.67      1009
weighted avg       0.91      0.79      0.83      1009



From the above output we can see that:

- Precision: For class `0`, the precision is 0.89, indicating that when the model predicts class `0`, it is correct around 89% of the time. For class `1`, the precision is 0.51, indicating that when the model predicts class `1`, it is correct around 51% of the time.

- Recall: For class `0`, the recall is 0.74, indicating that out of all the actual instances of class `0` in the dataset, the model correctly identified 74% of them. For class `1`, the recall is 0.75, indicating that the model correctly identified 75% of the actual instances of class `1`.

- F1-Score: The F1-score is a balance between precision and recall. For class `0`, the F1-score is 0.81, and for class `1`, it is 0.61. The F1-score provides a balanced measure of a model's performance.

- Accuracy: The overall accuracy of the model is 0.74 or 74%. This means that the model correctly classified 74% of all instances in the testing data.

- Macro Average: The macro average of precision, recall, and F1-score is calculated by taking the average of these metrics across both classes. The macro average precision is 0.70, recall is 0.74, and F1-score is 0.71.

- Weighted Average: The weighted average takes into account the number of instances in each class. It gives more weight to the class with more instances. The weighted average precision is 0.79, recall is 0.74, and F1-score is 0.75.

**Interpretation:**
- The precision for class `0` is high, indicating that the model is quite accurate when predicting class `0`, but the precision for class `1` is lower.
- The recall for class `0` is decent, and the recall for class `1` is also reasonable.
- The F1-score provides a balanced measure considering both precision and recall.
- The accuracy is 74%, which is decent, but it's important to consider precision, recall, and F1-score for a comprehensive understanding of model performance.
- The macro and weighted averages give a summarized view of the model's performance across classes, with weighted averages considering class distribution.

**Hyperparameter Tuning (GridSearchCV):**
- GridSearchCV is used to search for the best combination of hyperparameters from the provided parameter grid.
- It performs cross-validation to evaluate different combinations of hyperparameters and selects the best performing one based on the provided scoring metric (in this case, 'f1').
- The best parameters and best estimator are obtained from the `best_params_` and `best_estimator_` attributes of the GridSearchCV object.
- The best estimator is then used to make predictions on the test data.

**Conclusion:**
- The analysis of the classification report suggests that the tuned model's performance is similar to the base model in terms of accuracy, precision, recall, and F1-score. 

In [103]:
result_4_4_2

array([0, 0, 1, ..., 1, 1, 0])

In [104]:
X_test.head()

Unnamed: 0,CustomerID,gender,Senior Citizen,Partner,Dependents,tenure,PhoneService,MultpleLines,Internet Service,Online Security,...,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
2859,6963-EZQEE,Male,1.0,1,0,70.0,1,1.0,DSL,1.0,...,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Yes,Electronic check,70.35,1398.275
2061,4958-GZWIY,Male,0.0,1,1,7.0,1,0.0,DSL,0.0,...,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Yes,Electronic check,70.35,1398.275
3788,0002-ORFBO,Female,0.0,Yes,No,4.0,No,0.0,Fiber optic,0.0,...,No,No,No,No,No,No,Yes,Electronic check,25.15,99.95
1035,1767-CJKBA,Male,0.0,0,0,66.0,1,1.0,No,0.0,...,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Yes,Electronic check,70.35,1398.275
523,6825-UYPFK,Female,0.0,0,0,23.0,1,1.0,Fiber optic,0.0,...,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Yes,Electronic check,70.35,1398.275


In [105]:
# Create a DataFrame to hold the test data and predictions
test_data = pd.DataFrame(X_test, columns=X_test.columns)
test_data['Churn'] = result_4_4_2

# Print the first few rows of the combined DataFrame
test_data.head()

Unnamed: 0,CustomerID,gender,Senior Citizen,Partner,Dependents,tenure,PhoneService,MultpleLines,Internet Service,Online Security,...,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
2859,6963-EZQEE,Male,1.0,1,0,70.0,1,1.0,DSL,1.0,...,No internet service,No internet service,No internet service,No internet service,No internet service,Yes,Electronic check,70.35,1398.275,0
2061,4958-GZWIY,Male,0.0,1,1,7.0,1,0.0,DSL,0.0,...,No internet service,No internet service,No internet service,No internet service,No internet service,Yes,Electronic check,70.35,1398.275,0
3788,0002-ORFBO,Female,0.0,Yes,No,4.0,No,0.0,Fiber optic,0.0,...,No,No,No,No,No,Yes,Electronic check,25.15,99.95,1
1035,1767-CJKBA,Male,0.0,0,0,66.0,1,1.0,No,0.0,...,No internet service,No internet service,No internet service,No internet service,No internet service,Yes,Electronic check,70.35,1398.275,0
523,6825-UYPFK,Female,0.0,0,0,23.0,1,1.0,Fiber optic,0.0,...,No internet service,No internet service,No internet service,No internet service,No internet service,Yes,Electronic check,70.35,1398.275,0


We combined the original features of the test data with the predicted churn values made by the model. By creating the test_data DataFrame, helps us to easily observe and analyze how the model's predictions align with the actual test data. The printed output displays a table where each row represents an instance from the test data, and the columns show the individual features as well as the predicted churn values.


In [106]:
# Exporting the requirements
requirements = "\n".join(f"{m.__name__}=={m.__version__}" for m in globals().values() if getattr(m, "__version__", None))

with open("requirements.txt", "w") as f:
    f.write(requirements)

In [107]:
X_encoded= encoder.fit(X_train[categorical_cols])

In [108]:
# Creating a dictionary of objects to export
exports = {"encoder": encoder,
           "scaler":scaler,
           "model":RFP}

In [109]:
import pickle

In [110]:
# Exporting the dictionary with Pickle
with open("Gradio_toolkit", "wb") as file:
    pickle.dump(exports, file)

In [111]:
# Save the trained Random Forest model to a file
with open('random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(RFP, model_file)


In [112]:
# Load the saved Random Forest model from a file
with open('random_forest_model.pkl', 'rb') as model_file:
    loaded_rf_model = pickle.load(model_file)

# Now 'loaded_rf_model' contains your trained Random Forest model
