In [3]:
#We needa do our modelling while handling class imbalance
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

df = pd.read_csv("../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv") #load data

In [4]:
#Prepare X-features and Y-Targets(needed for our train-test splitting later)

X = df.drop('Churn', axis=1) # Features, everything except target
Y = df['Churn'].map({'Yes': 1, 'No': 0}) # Target, encode as yes/no as 0/1

print(f"\nClass Distribution:")
print(f"   No Churn (0): {sum(Y == 0):,} customers ({np.mean(Y == 0):.1%})") #shows sum and percentage of non churners VS churners
print(f"   Churn (1):    {sum(Y == 1):,} customers ({np.mean(Y == 1):.1%})")
print(f"   Imbalance ratio: {sum(Y == 0)/sum(Y == 1):.1f}x more non-churners")


Class Distribution:
   No Churn (0): 5,174 customers (73.5%)
   Churn (1):    1,869 customers (26.5%)
   Imbalance ratio: 2.8x more non-churners


In [5]:
# Split BEFORE any resampling to avoid data leakage! (called-Stratified Splitting)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, 
    test_size=0.2,        # 20% for testing, 80% for training
    random_state=42,      #Using number 42 shuffle order
    stratify=Y           # Keep the ratio of classes(non-churners & churners) the same in both sets
)

print(f"   Training set: {X_train.shape[0]:,} samples")
print(f"   Testing set:  {X_test.shape[0]:,} samples")
print(f"   Training class ratio: {np.mean(Y_train == 0):.1%} No churn")
print(f"   Testing class ratio:  {np.mean(Y_test == 0):.1%} No churn")
print(f"   Testing class ratio:  {np.mean(Y_test == 1):.1%} churn")


   Training set: 5,634 samples
   Testing set:  1,409 samples
   Training class ratio: 73.5% No churn
   Testing class ratio:  73.5% No churn
   Testing class ratio:  26.5% churn


In [6]:
#Import our PREPROCESSING PIPELINE from src
import sys
import os

# Add project root to Python path(so we dont get that "src not defined" error again like we did in '01_eda.ipyn')
project_root = os.path.dirname(os.getcwd())

sys.path.insert(0, project_root)
print(f"Added to path: {project_root}")

# Now import
from src.preprocessing import create_preprocessing_pipeline
print("Pipeline imported!")


Added to path: C:\Users\User\Desktop\Data Science Projects\3)Telecome-churn-prediction proj
Pipeline imported!


In [7]:
#Use our preprocessing Pipeline
preprocessor_pipeline = create_preprocessing_pipeline()

# Fit ONLY on training data
preprocessor_pipeline.fit(X_train, Y_train)

#Transform X
X_train_processed = preprocessor_pipeline.transform(X_train)
X_test_processed = preprocessor_pipeline.transform(X_test)

print(f"Preprocessing done!")
print(f"   Training shape: {X_train_processed.shape}")
print(f"   Testing shape:  {X_test_processed.shape}")

#NOW u can easily try multiple models!

Preprocessing done!
   Training shape: (5634, 41)
   Testing shape:  (1409, 41)


In [8]:
#MODELLING STARTS FROM HERE:
#Once we find our perfect model to use then we can create our FINAL PIPELINE that will do the preprocessing+modelling=Prediction


In [9]:
# TRAIN LOGISTIC REGRESSION MODEL(WITH CLASS WEIGHTS TO HANDLE IMBALANCE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

#Create model with class weights to handle imbalance
lr_model = LogisticRegression(
    class_weight='balanced',  # CRITICAL for imbalance!
    random_state=42,
    max_iter=1000            # Ensure convergence
)


lr_model.fit(X_train_processed, Y_train)

Y_pred_lr= lr_model.predict(X_test_processed)
Y_pred_proba_lr = lr_model.predict_proba(X_test_processed)[:, 1]  # Probability of churn thats why used 1 bec 1=churn


#Evaluate the model
print(classification_report(Y_test, Y_pred_lr, target_names=['No Churn', 'Churn']))


roc_auc_lr = roc_auc_score(Y_test, Y_pred_proba_lr)
print(f"ROC-AUC Score: {roc_auc_lr:.3f}")  #That .3f just means show 3 decimal places


              precision    recall  f1-score   support

    No Churn       0.90      0.72      0.80      1035
       Churn       0.50      0.78      0.61       374

    accuracy                           0.74      1409
   macro avg       0.70      0.75      0.71      1409
weighted avg       0.80      0.74      0.75      1409

ROC-AUC Score: 0.842


In [10]:
#insights from our classification report and roc_auc_score
#Recall = 0.78: You're catching 78% of actual churners! This is EXCELLENT!
#(Missing only 22% of churners)

#Precision = 0.50: When you flag someone as "will churn", you're right 50% of the time
#(Half are false alarms - but that's OK for churn prediction!)

#High precision (0.90): When you say "won't churn", you're right 90% of the time
#Lower recall (0.72): incorrectly flagging 28% of loyal customers as "risky"
#Low precision (0.50): When you say "churn", you're right 50% of the time
#Higher recall (0.78): incorrectly flagging 22% of unloyal customers as "not risky"

#ROC-AUC = 0.842 (thats very good)

In [11]:
from sklearn.ensemble import RandomForestClassifier

#Create Random Forest with weights
rf_model = RandomForestClassifier(
    class_weight = 'balanced',
    random_state = 42,
    n_estimators = 100,
    max_depth = 10,   #
    n_jobs = -1     #Use all CPU cores, so it'll train the model faster
)

#Fit the model
rf_model.fit(X_train_processed, Y_train)

#Predict
Y_pred_rf = rf_model.predict(X_test_processed)
Y_pred_proba_rf = rf_model.predict_proba(X_test_processed)[:, 1]


#Evaluate the model
print(classification_report(Y_test, Y_pred_rf, target_names=['No Churn', 'Churn']))

roc_auc_rf = roc_auc_score(Y_test, Y_pred_proba_rf)
print(f"ROC-AUC Score: {roc_auc_rf:.3f}")

              precision    recall  f1-score   support

    No Churn       0.88      0.79      0.83      1035
       Churn       0.55      0.71      0.62       374

    accuracy                           0.77      1409
   macro avg       0.72      0.75      0.73      1409
weighted avg       0.79      0.77      0.78      1409

ROC-AUC Score: 0.837


In [16]:
#Logistic Regression is BETTER at:
#REM: Recall is how many churners are we actually catching?
#Catching more churners (78% recall vs 71% recall) â†’ Saves more revenue!
#Overall predictive skill (0.842 vs 0.837 ROC-AUC)

#Random Forest is BETTER at:
#Fewer false alarms (55% vs 50% wrong flags)
#Slightly better balanced performance (0.62 vs 0.61 F1)

#Business Wise
#Remember tho it costs less to save existing customers than too get new customers
#Therefore we will choose the Logistic Regression model
#Also log_reg model is easier to explain WHY a customer might churn


In [17]:
#OKAY WE GOING TO ADD OUR MLFLOW CODE HERE(Just after trainig the models):

# MLFLOW: TRACK AND COMPARE OUR MODELS

# Import our tracking functions that we created in our 'mlflow_tracking.py'
from mlflow_tracking import track_experiment, compare_models, show_mlflow_instructions

# Method 1: Track models ONE BY ONE
print("\n METHOD 1: Track Logistic Regression...")
lr_params = {
    'model': 'LogisticRegression',
    'class_weight': 'balanced',
    'max_iter': 1000,
    'random_state': 42
}

lr_tracked, lr_metrics = track_experiment(
    lr_model, 
    'Logistic_Regression_Churn',
    X_train_processed, X_test_processed, 
    Y_train, Y_test,
    lr_params
)

print("\n METHOD 1: Track Random Forest...")
rf_params = {
    'model': 'RandomForest',
    'n_estimators': 100,
    'max_depth': 10,
    'class_weight': 'balanced',
    'random_state': 42
}

rf_tracked, rf_metrics = track_experiment(
    rf_model,
    'Random_Forest_Churn',
    X_train_processed, X_test_processed,
    Y_train, Y_test,
    rf_params
)


# Method 2: OR compare both at once
print("\n METHOD 2: Compare both models at once...")
models_to_compare = {
    'Logistic_Regression': lr_model,
    'Random_Forest': rf_model
}

comparison_results = compare_models(
    models_to_compare,
    X_train_processed, X_test_processed,
    Y_train, Y_test
)

# Show how to view results
show_mlflow_instructions()

ImportError: cannot import name 'track_experiment' from 'mlflow_tracking' (C:\Users\User\Desktop\Data Science Projects\3)Telecome-churn-prediction proj\notebooks\mlflow_tracking.py)

In [18]:
# Debug import
import sys
print(f"Python path: {sys.path}")

# Try absolute import
import importlib.util
spec = importlib.util.spec_from_file_location(
    "mlflow_tracking", 
    r"C:\Users\User\Desktop\Data Science Projects\3)Telecome-churn-prediction proj\notebooks\mlflow_tracking.py"
)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)

print(f"Functions in module: {dir(module)}")

Python path: ['C:\\Users\\User\\Desktop\\Data Science Projects\\3)Telecome-churn-prediction proj', 'C:\\Users\\User\\Desktop\\Data Science Projects\\3)Telecome-churn-prediction proj\\notebooks', 'C:\\Users\\User\\anaconda3\\python311.zip', 'C:\\Users\\User\\anaconda3\\DLLs', 'C:\\Users\\User\\anaconda3\\Lib', 'C:\\Users\\User\\anaconda3', '', 'C:\\Users\\User\\anaconda3\\Lib\\site-packages', 'C:\\Users\\User\\anaconda3\\Lib\\site-packages\\win32', 'C:\\Users\\User\\anaconda3\\Lib\\site-packages\\win32\\lib', 'C:\\Users\\User\\anaconda3\\Lib\\site-packages\\Pythonwin']





Functions in module: ['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'compare_models', 'mlflow', 'pd', 'precision_score', 'recall_score', 'roc_auc_score', 'show_mlflow_instructions', 'track_experiment']


In [None]:
#Now lets create a completed Pipeline = preprocessing + model
import joblib
from sklearn.pipeline import Pipeline as SKPipeline

full_pipeline = SKPipeline([
    ('preprocessing', preprocessor_pipeline),  # Your existing preprocessing
    ('classifier', lr_model)                    # Your trained model
])

#-------------------------------------------------------------------------------------------------------------------------------
# NOW save the pipeline
#IMP!!= (This code helped save our pipeline where it suppose to be saved and not in the 'notebooks/'' folder location YAY)
import os

# Create models folder in project root (not in notebooks/)
os.makedirs('../models', exist_ok=True)

# Save with ../ to go up from notebooks/ to project root
model_path = '../models/churn_pipeline.pkl'
joblib.dump(full_pipeline, model_path)

print(f" Model saved to: {os.path.abspath(model_path)}")
print(f"File size: {os.path.getsize(model_path):,} bytes") #If shows 0 bytes then something not right
#-------------------------------------------------------------------------------------------------------------------------------

#Test our pipeline
test_customer = X_test.iloc[[0]]  # First test customer
prediction = full_pipeline.predict(test_customer)
probability = full_pipeline.predict_proba(test_customer)[0, 1]  #Do row 1 customer, get probability if churn(churn=1)

print(f"Test prediction: {'Churn' if prediction[0] == 1 else 'No Churn'}")
print(f"Probability: {probability:.1%}")

In [None]:
#Lets try reload the full_pipeline
loaded_pipeline = joblib.load('../models/churn_pipeline.pkl')