### Packages

In [1]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, roc_auc_score

  import pkg_resources  # noqa: TID251


In [2]:
file_path = 'C:/Users/ldmag/Documents/GitHub/Code-Assignments-Projects/Projects/MLOps Drift Detection and Pipeline Optimization/data/Telco-Churn.csv'
data = pd.read_csv(file_path)

## Testing environment

In [None]:
def load_and_prep_telco_data(file_path): # very minimal preprocessing
    df = pd.read_csv(file_path)
    if 'TotalCharges' in df.columns:
        df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
        df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)
    if 'Churn' in df.columns:
        df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
    categorical_columns = df.select_dtypes(include=['object']).columns
    for col in categorical_columns:
        if col != 'customerID':
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))
    if 'customerID' in df.columns:
        df = df.drop('customerID', axis=1)
    return df

def train_baseline_model(df):
    X = df.drop('Churn', axis=1)
    y = df['Churn']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_pred_proba)
    }
    return model, metrics, (X_train, X_test, y_train, y_test)

def main():
    mlflow.set_tracking_uri("http://localhost:5000")
    mlflow.set_experiment("telco-baseline")
    with mlflow.start_run(run_name="baseline_model"):
        df = load_and_prep_telco_data(file_path)
        model, metrics, data_splits = train_baseline_model(df)
        X_train, X_test, y_train, y_test = data_splits
        mlflow.log_param("model_type", "RandomForest")
        mlflow.log_param("n_estimators", 100)
        mlflow.log_param("max_depth", 10)
        mlflow.log_param("test_size", 0.2)
        mlflow.log_param("random_state", 42)
        mlflow.log_param("dataset_size", len(df))
        mlflow.log_param("n_features", len(df.columns) - 1)
        mlflow.log_param("churn_rate", df['Churn'].mean())
        for metric_name, value in metrics.items():
            mlflow.log_metric(metric_name, value)
        mlflow.sklearn.log_model(model, "Churn-RF-baseline", registered_model_name="telco_churn_baseline")
        X_train.to_csv("baseline_training_data.csv", index=False)
        mlflow.log_artifact("baseline_training_data.csv")
        print("\n Baseline experiment complete! Check MLflow UI for results.")

if __name__ == "__main__":
    main()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)
Successfully registered model 'telco_churn_baseline'.
2025/10/22 20:03:09 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: telco_churn_baseline, version 1
Created version '1' of model 'telco_churn_baseline'.



 Baseline experiment complete! Check MLflow UI for results.


## Simulating drift

In [None]:
import shap
import boto3
import os
from io import BytesIO

AWS_ACCESS_KEY_ID = os.getenv('MINIO_ACCESS_KEY')
AWS_SECRET_ACCESS_KEY = os.getenv('MINIO_SECRET_ACCESS_KEY')


def simulate_covariate(drift_strength: float):
    s3 = boto3.client('s3', endpoint_url='http://localhost:9000', aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY) #have to use localhost here
    obj = s3.get_object(Bucket='mlflow', Key='2/c806c8bb21434aaf96b80175f97dd7da/artifacts/baseline_training_data.csv')
    baseline_data = pd.read_csv(BytesIO(obj['Body'].read()))

    drift_explanations = []
    drifted = baseline_data.copy()
    numeric_columns = [col for col in numeric_columns if col in drifted.columns]

    for col in numeric_columns:
        if col == 'tenure':
            drifted[col] = drifted[col] + np.random.normal(5, 2, len(drifted)).astype('int64')
            drift_explanations.append(f'Increased Tenure')
        elif col == 'MonthlyCharges':
            drifted[col] = drifted[col] * (1+np.random.normal(0.15, 0.05, len(drifted)).astype('float64'))
            drift_explanations.append(f'Increased Monthly Charges / Inflation')
        elif col == 'TotalCharges':
            drifted[col] = drifted['tenure'] * drifted['MonthlyCharges'] + np.random.normal(0, 50, len(drifted))
            drift_explanations.append(f'Adjusted charges for changes in tenure and monthly charges')

    
    return baseline_data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,1,0,0,0,35,0,1,0,0,0,2,0,2,2,0,0,2,49.2,1701.65
1,1,0,1,1,15,1,0,1,2,0,0,0,0,0,0,0,3,75.1,1151.55
2,1,0,1,1,13,0,1,0,2,2,0,2,0,0,2,0,3,40.55,590.35
3,0,0,1,0,26,1,0,0,0,2,2,0,2,2,2,1,1,73.5,1905.7
4,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,2,44.55,44.55
5,1,0,0,1,66,1,0,0,0,2,2,2,2,2,2,1,1,79.5,5196.1
6,0,0,0,0,1,1,0,2,1,1,1,1,1,1,0,0,1,19.8,19.8
7,0,0,1,0,40,1,2,0,0,2,0,0,0,0,0,0,1,56.6,2379.1
8,0,0,1,0,65,1,0,0,0,0,0,2,2,0,2,1,1,59.8,3808.2
9,0,0,0,0,60,1,2,0,0,0,2,2,2,2,1,0,1,80.55,4847.05


In [10]:
baseline_data.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges'],
      dtype='object')

In [34]:
baseline_data['MonthlyCharges'] * baseline_data['tenure'] + np.random.normal(0, 50, len(baseline_data))

0       1690.486834
1       1122.561419
2        514.995418
3       1951.338967
4         84.255835
           ...     
5629    7786.728389
5630     125.447740
5631    2513.778761
5632     474.819637
5633     224.858769
Length: 5634, dtype: float64

In [35]:
baseline_data['TotalCharges']

0       1701.65
1       1151.55
2        590.35
3       1905.70
4         44.55
         ...   
5629    7707.70
5630      80.35
5631    2660.20
5632     482.80
5633     109.30
Name: TotalCharges, Length: 5634, dtype: float64

In [36]:
baseline_data.corr()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
gender,1.0,0.007095,-0.00515,0.008329,-0.003024,0.003666,-0.012235,0.003639,-0.022955,-0.013961,0.00463,-0.018022,-0.013591,-0.018821,-0.008318,-0.025967,0.005764,-0.022118,-0.00897
SeniorCitizen,0.007095,1.0,0.015729,-0.2133,0.01698,0.008483,0.143994,-0.033199,-0.113953,-0.009927,-0.019862,-0.152885,0.040783,0.043526,-0.142953,0.157326,-0.034873,0.220376,0.101996
Partner,-0.00515,0.015729,1.0,0.446276,0.373689,0.023485,0.151746,0.007331,0.152095,0.148506,0.16766,0.128177,0.140947,0.132637,0.294694,-0.014067,-0.160857,0.101317,0.315955
Dependents,0.008329,-0.2133,0.446276,1.0,0.154892,-0.004428,-0.011911,0.052915,0.144639,0.085048,0.074525,0.132809,0.046677,0.018039,0.240206,-0.116574,-0.045481,-0.114901,0.060534
tenure,-0.003024,0.01698,0.373689,0.154892,1.0,0.003653,0.349918,-0.034786,0.32214,0.376786,0.37382,0.321684,0.286577,0.300744,0.670011,0.010525,-0.373146,0.2567,0.829055
PhoneService,0.003666,0.008483,0.023485,-0.004428,0.003653,1.0,-0.018338,0.392659,-0.018954,0.022191,0.004724,-0.022236,0.05619,0.035936,-0.00138,0.005433,-0.00645,0.248911,0.111669
MultipleLines,-0.012235,0.143994,0.151746,-0.011911,0.349918,-0.018338,1.0,-0.103685,0.011646,0.130552,0.130892,0.020454,0.181295,0.188681,0.115106,0.15827,-0.183152,0.436575,0.458927
InternetService,0.003639,-0.033199,0.007331,0.052915,-0.034786,0.392659,-0.103685,1.0,-0.032926,0.034818,0.051899,-0.029863,0.103602,0.090085,0.093072,-0.139438,0.082943,-0.319716,-0.174073
OnlineSecurity,-0.022955,-0.113953,0.152095,0.144639,0.32214,-0.018954,0.011646,-0.032926,1.0,0.171442,0.162167,0.283377,0.042676,0.062637,0.378294,-0.161843,-0.089526,-0.055614,0.249384
OnlineBackup,-0.013961,-0.009927,0.148506,0.085048,0.376786,0.022191,0.130552,0.034818,0.171442,1.0,0.186642,0.197193,0.141195,0.133803,0.27849,-0.010509,-0.132885,0.126153,0.381308
