# Cell 1: Import Libraries


pandas / numpy → handling dataset

train_test_split → splitting data

ColumnTransformer / Pipeline → preprocessing & pipeline

LogisticRegression / RandomForestClassifier → ML models

GridSearchCV → hyperparameter tuning

joblib → export & reuse model

In [34]:
# Data manipulation
import pandas as pd
import numpy as np

# ML utilities
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Save/load model
import joblib

# Cell 2: Load Dataset

Reads CSV into a DataFrame

df.head() shows first 5 rows

Expected Output: Table with columns like customerID, gender, tenure, TotalCharges, Churn

In [35]:
# Read the Telco Customer Churn CSV; the path is relative to the working directory
csv_path = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path)

# Preview the first five rows (left as the cell's final expression)
df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


# Cell 3: Data Set 
# 1. df.shape

df is your pandas DataFrame containing the dataset.

.shape returns a tuple (number_of_rows, number_of_columns).

print("Dataset Shape:", df.shape) will show something like:

Dataset Shape: (7043, 21)

# 2. df.info()

df.info() prints a summary of the DataFrame.

Output includes:

Number of rows

Column names

Non-null counts (helps check for missing values)

Data type of each column (int64, float64, object)

# 3. Outcome
7043 rows → 7043 customers

21 columns → customerID + 19 features + target (Churn)


In [36]:
# Report the (rows, columns) tuple first
dataset_shape = df.shape
print("Dataset Shape:", dataset_shape)

# Then print the per-column summary: non-null counts and dtypes
df.info()

Dataset Shape: (7043, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling

# Cell 4: Data Cleaning & Target Encoding

# 1. pd.to_numeric(..., errors="coerce")

Converts TotalCharges from object → float

Invalid entries become NaN

# 2.df.dropna(inplace=True)

Removes every row that contains a NaN in any column, which includes the rows whose TotalCharges became NaN in step 1

Ensures all features are numeric/categorical

# 3.Target Encoding

Churn: Yes → 1 (customer left)

No → 0 (customer stayed)

# 4.Drop ID column

customerID is unique → irrelevant for prediction

# 5.Check dataset

.shape → rows & columns after cleaning

.info() → confirms data types & no missing values

In [37]:
# Convert TotalCharges to numeric; entries that cannot be parsed become NaN
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Drop missing values (any row containing a NaN, including the coerced ones)
df.dropna(inplace=True)

# Convert target to binary.
# Build the encoded column first and validate it before assigning:
# Series.map() returns NaN for every label that is not a key of the dict, so
# an unexpected label (or re-running this cell once Churn is already 0/1)
# would otherwise silently overwrite the target with NaN.
churn_encoded = df["Churn"].map({"Yes": 1, "No": 0})
unmapped = churn_encoded.isna()
if unmapped.any():
    bad_labels = df.loc[unmapped, "Churn"].unique().tolist()
    raise ValueError(
        f"Unexpected Churn labels (expected 'Yes'/'No'): {bad_labels}"
    )
df["Churn"] = churn_encoded.astype("int64")

# Drop customerID: a per-customer identifier, not a predictive feature
df.drop("customerID", axis=1, inplace=True)

print("Cleaned Shape:", df.shape)

Cleaned Shape: (7032, 20)


# Cell 5: Features & Target Split
# 1. X = df.drop("Churn", axis=1)

Drops the target column Churn from the DataFrame

X contains all input features the model will use to predict churn

# 2. y = df["Churn"]

Extracts the target variable

y will be 0 or 1 (encoded in the previous cell)

# 3. print(X.shape) & print(y.shape)

Checks the number of rows and columns

Confirms that feature matrix and target vector match in rows

In [38]:
# Name of the label column produced by the cleaning step
target_column = "Churn"

# y is the label vector; X is every remaining column
y = df[target_column]
X = df.drop(columns=[target_column])

# Both shapes should report the same number of rows
print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (7032, 19)
y shape: (7032,)


# Cell 6: Train-Test Split

# train_test_split(...)

Splits your dataset into training and testing sets.

Training set → used to train the model

Testing set → used to evaluate the model

# Parameters explained:

X, y → features and target

test_size=0.2 → 20% of data for testing, 80% for training

random_state=42 → ensures reproducibility (same split every time)

stratify=y → maintains the same proportion of churn (1) and non-churn (0) in both sets

# X_train.shape[0] & X_test.shape[0]

Prints number of samples in each set

In [39]:
# Hold out 20% of the rows for testing. stratify=y keeps the churn ratio the
# same in both parts, and the fixed random_state makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

# Row counts of each part
print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))

Training samples: 5625
Testing samples: 1407


# Cell 7: Preprocessing Pipeline
#  Numeric Transformer
1. Pipeline: a sequence of steps; here, only scaling.

2. StandardScaler:  
    Transforms each numeric feature: (value - mean) / std

    Result: mean = 0, standard deviation = 1

Prevents numeric features with large values from dominating smaller ones

# Create numeric transformer pipeline
Scales numeric features → zero mean, unit variance

Improves the performance of models like Logistic Regression

# Create categorical transformer pipeline
Converts categorical values to binary columns (0/1)

handle_unknown="ignore" → prevents errors if unseen categories appear in test data

# Combining numeric and categorical transformers using ColumnTransformer
Applies numeric transformation to numeric features

Applies categorical transformation to categorical features

Returns a single transformed feature array ready for ML

# Confirmation
confirms the preprocessing pipeline is successfully created

In [40]:
# Split the feature columns by dtype family rather than by exact dtype names.
# "number" matches every integer/float width, and every other column is
# treated as categorical. Listing only int64/float64/object would leave a
# column of any other dtype (e.g. category, bool, int32) in neither group,
# and ColumnTransformer discards unlisted columns by default
# (remainder="drop"), so such a feature would vanish without any error.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(exclude="number").columns

# Numeric branch: standardize each column to zero mean / unit variance
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

# Categorical branch: one-hot encode; categories unseen during fit are
# ignored at transform time instead of raising
categorical_transformer = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Route each column group through its own branch and concatenate the results
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

print("Preprocessing pipeline ready")


Preprocessing pipeline ready


# Cell 8: Logistic Regression Pipeline

# Purpose:

Combines preprocessing and Logistic Regression into one unified pipeline.

Makes the workflow clean, reusable, and production-ready.

# Pipeline Steps

1. "preprocessor" → the ColumnTransformer from Cell 7

    Scales numeric features

     One-hot encodes categorical features

2. "classifier" → Logistic Regression model

    max_iter=1000 ensures the solver has enough iterations to converge

# Why use a pipeline?

   Automatically applies preprocessing during training and testing

   Prevents data leakage (the scaler and encoder are fitted on the training data only, then applied unchanged to validation/test data)

   Allows hyperparameter tuning with GridSearchCV seamlessly

   Easy to save and reuse with joblib

# print(log_pipeline)

Shows both preprocessing and model steps

Confirms pipeline is ready for training or GridSearchCV

In [41]:
# Two stages: the shared preprocessor, then the logistic-regression classifier.
# max_iter is raised to 1000 to give the solver more iterations to converge.
log_steps = [
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000)),
]
log_pipeline = Pipeline(steps=log_steps)

# Show the assembled pipeline
print(log_pipeline)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod'],
      dtype='object'))])),
                ('

# Cell 9: Hyperparameter Tuning for Logistic Regression
# Define hyperparameters to tune
  classifier__C → C parameter of Logistic Regression

    Controls regularization strength (C is the inverse of the regularization strength)

    Smaller C → stronger regularization → simpler model

  The classifier__ prefix is needed because the Logistic Regression is inside the pipeline under the step named "classifier"
# Set up GridSearchCV
  log_pipeline → pipeline with preprocessing + Logistic Regression

  log_params → hyperparameters to test

  cv=5 → 5-fold cross-validation

  Splits training data into 5 parts, trains on 4, validates on 1, repeats

  scoring="f1" → uses F1 score to evaluate model

  Good for imbalanced datasets like churn (Churn = 1 is a minority)

  n_jobs=-1 → uses all CPU cores for faster computation
# Train and search best parameters
   Fits the pipeline with all parameter combinations using cross-validation
   
   Finds the best C value based on highest F1 score
# View results
  best_params_ → the C value that performed best

  best_score_ → cross-validated F1 score for best model

In [42]:
# Candidate values for C; the "classifier__" prefix targets the pipeline step
# named "classifier"
log_params = {"classifier__C": [0.1, 1, 10]}

# 5-fold cross-validated search scored by F1, using all CPU cores
log_grid = GridSearchCV(
    estimator=log_pipeline,
    param_grid=log_params,
    scoring="f1",
    cv=5,
    n_jobs=-1,
)

# Run the search on the training split only
log_grid.fit(X_train, y_train)

# Winning parameter set and its mean cross-validated F1
print("Best LR Params:", log_grid.best_params_)
print("Best LR F1 Score:", log_grid.best_score_)


Best LR Params: {'classifier__C': 10}
Best LR F1 Score: 0.596454909838984


# Cell 10: Random Forest Pipeline
# Purpose

  Combines the preprocessing pipeline with a Random Forest classifier

  Makes the workflow clean, reusable, and ready for training/testing

# Pipeline Steps
"preprocessor" → ColumnTransformer from Cell 7

    Scales numeric features

    One-hot encodes categorical features

"classifier" → RandomForestClassifier

    random_state=42 ensures reproducibility of results
 
    Random Forest is an ensemble of decision trees that is robust to non-linear relationships; scikit-learn's implementation needs numeric input, so the categorical features are one-hot encoded by the preprocessor first

In [43]:
# Same preprocessor as the logistic-regression pipeline, followed by a random
# forest; random_state is fixed so repeated runs build the same forest.
rf_steps = [
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42)),
]
rf_pipeline = Pipeline(steps=rf_steps)

# Cell 11: Random Forest GridSearchCV
# Purpose:
Tune Random Forest hyperparameters (n_estimators, max_depth) using GridSearchCV.

# Pipeline:
    Uses rf_pipeline (preprocessing + Random Forest).

# Cross-validation: 
    cv=5 ensures robust evaluation.

# Scoring: 
    F1 score, suitable for imbalanced churn data.

# Fit: 
    Trains all parameter combinations and finds best params.

# Output:

  best_params_ → best n_estimators & max_depth

  best_score_ → corresponding F1 score

In [44]:
# 2 x 2 grid: number of trees and maximum tree depth (None = unlimited depth)
rf_params = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [None, 10],
}

# 5-fold cross-validated search scored by F1, using all CPU cores
rf_grid = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_params,
    scoring="f1",
    cv=5,
    n_jobs=-1,
)

# Run the search on the training split only
rf_grid.fit(X_train, y_train)

# Winning parameter set and its mean cross-validated F1
print("Best RF Params:", rf_grid.best_params_)
print("Best RF F1 Score:", rf_grid.best_score_)


Best RF Params: {'classifier__max_depth': 10, 'classifier__n_estimators': 200}
Best RF F1 Score: 0.5826828193237013


# Cell 12: Evaluate Random Forest Model
best_model → the trained Random Forest pipeline with best hyperparameters.

y_pred → predictions on the test set.

Evaluation metrics:

Accuracy → overall correct predictions

F1 Score → balance of precision & recall, important for churn

classification_report → detailed precision, recall, F1 for each class.

In [45]:
# NOTE(review): the recorded cross-validated F1 of the logistic-regression
# search (0.596) is higher than the random forest's (0.583); confirm that
# evaluating and exporting the random forest is intentional.
best_model = rf_grid.best_estimator_

# Predict churn labels for the held-out test rows
y_pred = best_model.predict(X_test)

# Headline metrics on the test split
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("F1 Score:", test_f1)

# Per-class precision / recall / F1 table
print(classification_report(y_test, y_pred))


Accuracy: 0.7910447761194029
F1 Score: 0.5676470588235294
              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1033
           1       0.63      0.52      0.57       374

    accuracy                           0.79      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407



# Cell 13: Save the Model
1. joblib.dump()

   Saves the entire trained pipeline (best_model) to a file

   Includes preprocessing + trained Random Forest classifier

   File name: "churn_prediction_pipeline.pkl"


In [46]:
# Persist the whole fitted pipeline (preprocessing + classifier) in one file
model_path = "churn_prediction_pipeline.pkl"
joblib.dump(best_model, model_path)

print("Model saved successfully")

Model saved successfully


# Cell 14: Load & Test Saved Model

In [47]:
import joblib
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Step 1: Load the saved model
# (joblib.load unpickles the file, so only load artifacts you trust)
model = joblib.load("churn_prediction_pipeline.pkl")
print("Model loaded successfully!\n")

# Step 2: Make predictions on the test set
# X_test and y_test must already exist in the session (from the split cell)
y_pred = model.predict(X_test)

# Step 3: Show first 20 predictions
first_twenty = y_pred[:20]
print("First 20 predictions (0=No Churn, 1=Churn):")
print(first_twenty, "\n")

# Step 4: Evaluate model
accuracy, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}\n")

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)


Model loaded successfully!

First 20 predictions (0=No Churn, 1=Churn):
[0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 1 1 1 0 0] 

Accuracy: 0.79
F1 Score: 0.57

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1033
           1       0.63      0.52      0.57       374

    accuracy                           0.79      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407

