In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import mlflow
import mlflow.sklearn
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw Telco churn dataset (path is relative to the notebook's working directory)
df = pd.read_csv('../artifacts/TelcoCustomerChurn.csv')

# Handle missing values
# TotalCharges is coerced to numeric; entries that cannot be parsed become NaN.
# Those NaNs are then filled with the row's MonthlyCharges.
# The result is assigned back rather than calling fillna(..., inplace=True) on the
# column selection: the in-place form is chained assignment, which raises a
# FutureWarning in pandas 2.x and does not update `df` under Copy-on-Write.
df['TotalCharges'] = (
    pd.to_numeric(df['TotalCharges'], errors='coerce')
    .fillna(df['MonthlyCharges'])
)

# Drop customerID as it's not relevant for prediction
df = df.drop(columns='customerID')

In [3]:
# Preview the first rows to sanity-check the load and cleaning steps
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Convert categorical variables
# Every object-dtype column (this includes the 'Churn' target) is replaced by
# integer codes.
categorical_cols = df.select_dtypes(include=['object']).columns

# Keep one fitted encoder per column. Refitting a single shared encoder would
# discard every mapping except the last one, making it impossible to decode the
# integer codes or to apply the same encoding to new data later.
label_encoders = {}

for col in categorical_cols:
    # `le` stays bound to the most recently fitted encoder after the loop.
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [5]:
# Inspect the frame after encoding: categorical columns now hold integer codes
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85,0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.50,0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15,1
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.30,1840.75,0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.70,151.65,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,24,1,2,0,2,0,2,2,2,2,1,1,3,84.80,1990.50,0
7039,0,0,1,1,72,1,2,1,0,2,2,0,2,2,1,1,1,103.20,7362.90,0
7040,0,0,1,1,11,0,1,0,2,0,0,0,0,0,0,1,2,29.60,346.45,0
7041,1,1,1,0,4,1,2,1,0,0,0,0,0,0,0,1,3,74.40,306.60,1


In [6]:
# Separate the 'Churn' label from the predictor columns
y = df['Churn']
X = df.drop(columns='Churn')

# Hold out 20% of the rows as a test set, stratified on the label;
# the fixed seed makes the partition repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [7]:
# Standardise the continuous columns. The scaler is fitted on the training
# split only and the same fitted scaler is then applied to both splits.
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = StandardScaler()
scaler.fit(X_train[numerical_cols])

# X_train first, then X_test; each frame's columns are overwritten in place
for _split in (X_train, X_test):
    _split[numerical_cols] = scaler.transform(_split[numerical_cols])

In [8]:
# Use the SQLite database file 'mlflow.db' (relative path) as the MLflow tracking store
mlflow.set_tracking_uri('sqlite:///mlflow.db')
# Make 'telco_churn_prediction' the active experiment for the runs started below
mlflow.set_experiment('telco_churn_prediction')

def evaluate_model(y_true, y_pred, y_prob):
    """Score a set of predictions against the true labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, used for accuracy, precision, recall and F1.
        y_prob: Predicted scores/probabilities, used for ROC AUC.

    Returns:
        dict with keys 'accuracy', 'precision', 'recall', 'f1' and 'roc_auc',
        in that order.
    """
    # Metrics computed from the predicted labels
    label_scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    scores = {name: scorer(y_true, y_pred) for name, scorer in label_scorers}

    # ROC AUC is the only metric that consumes the scores rather than the labels
    scores['roc_auc'] = roc_auc_score(y_true, y_prob)
    return scores

2025/01/11 07:07:55 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/01/11 07:07:55 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

In [9]:
# Candidate classifiers, keyed by the name used for the MLflow run and registry entry.
# Each entry pairs a default-constructed estimator ('model') with the
# hyperparameters ('params') that the training loop applies to it.
models = {
    'decision_tree': dict(
        model=DecisionTreeClassifier(),
        params=dict(max_depth=5, min_samples_split=2, random_state=42),
    ),
    'random_forest': dict(
        model=RandomForestClassifier(),
        params=dict(n_estimators=100, max_depth=10, random_state=42),
    ),
    'gradient_boosting': dict(
        model=GradientBoostingClassifier(),
        params=dict(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
    ),
}

In [10]:
# Train and evaluate models
# One MLflow run per entry in `models`: log the hyperparameters, fit on the
# training split, score on the test split, then log and register the model.
for model_name, model_info in models.items():
    with mlflow.start_run(run_name=model_name):
        mlflow.log_params(model_info['params'])

        # set_params returns the estimator itself, so `model` is the very
        # instance stored in `models` and it is fitted in place.
        model = model_info['model'].set_params(**model_info['params'])
        model.fit(X_train, y_train)

        # Predicted labels plus the predicted probability taken from column 1
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]

        # Compute the test-set metrics and attach them to the active run
        metrics = evaluate_model(y_test, y_pred, y_prob)
        mlflow.log_metrics(metrics)

        # Store the fitted model as a run artifact and register it under
        # 'telco_churn_<model_name>'
        mlflow.sklearn.log_model(
            model,
            model_name,
            registered_model_name=f"telco_churn_{model_name}",
        )

        # Print a header line followed by one line per metric
        report_lines = [f"\nModel: {model_name}"]
        report_lines += [
            f"{metric_name}: {value:.4f}" for metric_name, value in metrics.items()
        ]
        print("\n".join(report_lines))

Successfully registered model 'telco_churn_decision_tree'.
Created version '1' of model 'telco_churn_decision_tree'.



Model: decision_tree
accuracy: 0.7850
precision: 0.6035
recall: 0.5535
f1: 0.5774
roc_auc: 0.8217


Successfully registered model 'telco_churn_random_forest'.
Created version '1' of model 'telco_churn_random_forest'.



Model: random_forest
accuracy: 0.7949
precision: 0.6441
recall: 0.5080
f1: 0.5680
roc_auc: 0.8356





Model: gradient_boosting
accuracy: 0.8027
precision: 0.6714
recall: 0.5027
f1: 0.5749
roc_auc: 0.8446


Successfully registered model 'telco_churn_gradient_boosting'.
Created version '1' of model 'telco_churn_gradient_boosting'.


In [11]:
# Transition the best model to production
from mlflow.tracking import MlflowClient

def load_production_model(model_name):
    """Load the Production-stage version of a registered churn model.

    Args:
        model_name: Suffix of the registered model name; the model loaded is
            ``telco_churn_{model_name}``.

    Returns:
        The object returned by ``mlflow.pyfunc.load_model`` for that model URI.
    """
    production_uri = f"models:/telco_churn_{model_name}/Production"
    return mlflow.pyfunc.load_model(model_uri=production_uri)



# Module-level MLflow client. Note that transition_model_to_production creates
# its own MlflowClient instead of using this one.
client = MlflowClient()

def transition_model_to_production(model_name, archive_existing_versions=False):
    """Move the latest unstaged version of a registered model to Production.

    Args:
        model_name: Suffix of the registered model name; the registry entry
            used is ``telco_churn_{model_name}``.
        archive_existing_versions: Forwarded unchanged to
            ``MlflowClient.transition_model_version_stage``. Defaults to False,
            which matches the call made before this parameter existed.

    Raises:
        IndexError: If the registered model has no version in the "None"
            stage (for example because it has already been promoted). The
            exception type is the one the unguarded ``[0]`` lookup raised
            before; only the message is now explicit.
    """
    registered_name = f"telco_churn_{model_name}"
    client = MlflowClient()

    unstaged_versions = client.get_latest_versions(registered_name, stages=["None"])
    if not unstaged_versions:
        raise IndexError(
            f"No version of '{registered_name}' is in stage 'None'; "
            "nothing to transition to Production."
        )

    client.transition_model_version_stage(
        name=registered_name,
        version=unstaged_versions[0].version,
        stage="Production",
        archive_existing_versions=archive_existing_versions,
    )

In [12]:
# Example: Transition the best performing model to production
# Note: You should choose the best model based on your evaluation metrics
# NOTE(review): in the metrics printed by the training cell, gradient_boosting
# has the highest accuracy and ROC AUC and decision_tree the highest F1, so
# confirm that random_forest is really the model intended for promotion.
transition_model_to_production('random_forest')


In [13]:
def get_all_runs():
    """Return the runs logged under the 'telco_churn_prediction' experiment.

    Returns:
        The result of ``mlflow.search_runs`` restricted to that experiment's id.
    """
    experiment_id = mlflow.get_experiment_by_name('telco_churn_prediction').experiment_id
    return mlflow.search_runs(experiment_ids=[experiment_id])

In [19]:
# Fetch every logged run, then keep only the run name and the five metric columns
runs_df = get_all_runs()
metrics_comparison = runs_df.loc[
    :,
    [
        'tags.mlflow.runName',
        'metrics.accuracy',
        'metrics.precision',
        'metrics.recall',
        'metrics.f1',
        'metrics.roc_auc',
    ],
]
# Heading and table, each terminated by a newline
print("\nModel Performance Comparison:", metrics_comparison, sep="\n")


Model Performance Comparison:
  tags.mlflow.runName  metrics.accuracy  metrics.precision  metrics.recall  \
0   gradient_boosting          0.802697           0.671429        0.502674   
1       random_forest          0.794890           0.644068        0.508021   
2       decision_tree          0.784954           0.603499        0.553476   

   metrics.f1  metrics.roc_auc  
0    0.574924         0.844602  
1    0.568012         0.835640  
2    0.577406         0.821742  


In [20]:
# Feature importances of the Random Forest estimator stored in `models`
# (the instance fitted by the training loop)
best_model = models['random_forest']['model']

# Pair each training column with its importance and rank from most to least important
feature_importance = pd.DataFrame(
    {
        'feature': X_train.columns,
        'importance': best_model.feature_importances_,
    }
).sort_values('importance', ascending=False)

# Show the ten highest-ranked features
print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


Top 10 Most Important Features:
             feature  importance
4             tenure    0.157593
18      TotalCharges    0.150879
17    MonthlyCharges    0.144349
14          Contract    0.127680
8     OnlineSecurity    0.077970
11       TechSupport    0.062379
7    InternetService    0.045406
16     PaymentMethod    0.044382
9       OnlineBackup    0.029205
15  PaperlessBilling    0.024329
