### Packages

In [1]:
import json
import os

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, RobustScaler

  import pkg_resources  # noqa: TID251


In [3]:
# Location of the Telco churn CSV. Set the TELCO_CHURN_CSV environment variable to
# point at a different copy of the file; when it is unset the original local path
# is used, so existing behaviour on the author's machine is unchanged.
file_path = os.environ.get(
    'TELCO_CHURN_CSV',
    'C:/Users/ldmag/Documents/GitHub/Code-Assignments-Projects/Projects/MLOps Drift Detection and Pipeline Optimization/data/Telco-Churn.csv',
)
# Raw, unprocessed copy of the dataset for ad-hoc inspection in the notebook.
data = pd.read_csv(file_path)

## Testing the environment with a baseline experiment

In [7]:
from cgi import test


def load_and_prep_telco_data(file_path): # very minimal preprocessing
    """Load the Telco churn CSV and apply minimal preprocessing.

    Each step is applied only when the relevant column is present:

    * ``TotalCharges`` is coerced to numeric; entries that cannot be parsed
      become NaN (they are not imputed here).
    * ``Churn`` is mapped from ``'Yes'``/``'No'`` to ``1``/``0``.
    * Every remaining object-dtype column except ``customerID`` is
      label-encoded to integer codes.
    * ``customerID`` is dropped.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV; passed straight to ``pandas.read_csv``.

    Returns
    -------
    df : pandas.DataFrame
        The preprocessed frame (still containing ``Churn`` when present).
    numeric_features : list of str
        Names of the continuous columns, restricted to those present in ``df``.
    categoric : list of str
        Names of the categorical columns, restricted to those present in ``df``.
    """
    df = pd.read_csv(file_path)
    if 'TotalCharges' in df.columns:
        df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    if 'Churn' in df.columns:
        df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
    categorical_columns = df.select_dtypes(include=['object']).columns
    for col in categorical_columns:
        if col != 'customerID':
            # Encoder is fitted on the string form so missing values get their own code.
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))
    if 'customerID' in df.columns:
        df = df.drop('customerID', axis=1)

    numeric_candidates = ['tenure', 'MonthlyCharges', 'TotalCharges']
    # NOTE(review): 'OnlineBackup' and 'SeniorCitizen' are in neither list, so any
    # consumer using these lists leaves them out of scaling/encoding -- confirm intended.
    categoric_candidates = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'DeviceProtection', 'TechSupport',
                'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
    # Every column access above is guarded, so the returned lists must not name
    # columns the frame does not have.
    numeric_features = [name for name in numeric_candidates if name in df.columns]
    categoric = [name for name in categoric_candidates if name in df.columns]
    return df, numeric_features, categoric

def preprocessing_pipeline(numeric_features, categoric_features):
    """Assemble the column-wise preprocessor used ahead of the classifier.

    A ``StandardScaler`` is registered under ``'num'`` for ``numeric_features``
    and a ``OneHotEncoder`` under ``'cat'`` for ``categoric_features``; either
    is skipped when its list is empty. Columns named in neither list are
    passed through unchanged.

    Returns the unfitted ``ColumnTransformer``.
    """
    transformers = []

    if numeric_features:
        transformers.append(('num', StandardScaler(), numeric_features))
        print('Used StandardScaler for numeric variables')

    if categoric_features:
        encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
        transformers.append(('cat', encoder, categoric_features))
        print('Used OneHotEncoder for categorical variables')

    return ColumnTransformer(transformers=transformers, remainder='passthrough')

def train_baseline_model(df, numeric_features, categorical_features):
    """Fit the baseline random-forest pipeline and score it on a hold-out split.

    ``df`` must contain a ``Churn`` column, which is used as the target. The
    data is split 80/20, stratified on the target, with a fixed seed.

    Returns
    -------
    model : the fitted ``Pipeline`` (steps ``'preprocessor'`` and ``'classifier'``)
    metrics : dict with keys ``'accuracy'``, ``'f1_score'`` and ``'roc_auc'``
    splits : tuple ``(X_train, X_test, y_train, y_test)``
    """
    target = df['Churn']
    features = df.drop('Churn', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target
    )

    column_prep = preprocessing_pipeline(numeric_features, categorical_features)
    forest = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, class_weight='balanced')
    model = Pipeline([('preprocessor', column_prep), ('classifier', forest)])
    model.fit(X_train, y_train)

    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba holds the probability of the positive class.
    churn_probability = model.predict_proba(X_test)[:, 1]

    metrics = dict(
        accuracy=accuracy_score(y_test, predicted_labels),
        f1_score=f1_score(y_test, predicted_labels),
        roc_auc=roc_auc_score(y_test, churn_probability),
    )
    return model, metrics, (X_train, X_test, y_train, y_test)

def _transformer_columns(preprocessor, step_name):
    """Return the column names a fitted preprocessor routed to ``step_name``.

    Returns ``None`` when the preprocessor exposes no fitted step with that
    name, or when the step's column selector is not a plain list of names.
    """
    for name, _transformer, columns in getattr(preprocessor, 'transformers_', ()):
        if name != step_name:
            continue
        if isinstance(columns, str):
            return [columns]
        try:
            names = list(columns)
        except TypeError:  # slice or callable selector: no names to report
            return None
        return names if all(isinstance(col, str) for col in names) else None
    return None


def save(model, X_train, X_test, y_train, y_test, feature_names):
    """Write the baseline datasets and feature metadata to the working directory.

    Files written:

    * ``baseline_training_data-V2.csv`` -- preprocessed training features.
    * ``baseline_test_data-V2.csv`` -- preprocessed test features plus ``Churn``.
    * ``baseline_raw_data-V2.csv`` -- unprocessed training features plus ``Churn``.
    * ``feature_metadata.json`` -- the returned ``feature_info`` dictionary.

    Parameters
    ----------
    model : fitted pipeline exposing ``named_steps['preprocessor']``
    X_train, X_test : pandas.DataFrame
    y_train, y_test : pandas.Series
    feature_names : list of str
        Original feature names to record in the metadata.

    Returns
    -------
    X_train_processed : the preprocessor's output for ``X_train``
    feature_info : dict, identical to what is written to ``feature_metadata.json``
    """
    preprocessor = model.named_steps['preprocessor']
    X_train_processed = preprocessor.transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Processed columns keep positional names (feature_0, feature_1, ...) so the
    # CSV headers stay compatible with artifacts written by earlier runs.
    feature_name_list = [f'feature_{i}' for i in range(X_train_processed.shape[1])]

    X_trainDF = pd.DataFrame(X_train_processed, columns=feature_name_list)
    X_testDF = pd.DataFrame(X_test_processed, columns=feature_name_list)

    X_trainDF.to_csv('baseline_training_data-V2.csv', index=False)

    testLabelsDF = X_testDF.copy()
    testLabelsDF['Churn'] = y_test.values
    testLabelsDF.to_csv('baseline_test_data-V2.csv', index=False)

    raw_train = X_train.copy()
    raw_train['Churn'] = y_train.values
    raw_train.to_csv('baseline_raw_data-V2.csv', index=False)

    # By the time X_train reaches this function its categorical columns may already
    # be integer codes, so dtypes cannot tell numeric from categorical. Ask the
    # fitted preprocessor which columns it scaled ('num') and encoded ('cat').
    numeric_cols = _transformer_columns(preprocessor, 'num')
    categorical_cols = _transformer_columns(preprocessor, 'cat')
    if numeric_cols is None and categorical_cols is None:
        # Preprocessor does not expose that split: fall back to dtype inference.
        numeric_cols = X_train.select_dtypes(include=[np.number]).columns
        categorical_cols = X_train.select_dtypes(include=['object']).columns
    numeric_lookup = set(numeric_cols) if numeric_cols is not None else set()
    categorical_lookup = set(categorical_cols) if categorical_cols is not None else set()

    feature_info = {
        'original_feature_names': feature_names,
        'processed_feature_names': feature_name_list,
        'n_original_features': len(feature_names),
        'n_processed_features': len(feature_name_list),
        'preprocessing_steps': {
            'numeric_features': [f for f in feature_names if f in numeric_lookup],
            'categorical_features': [f for f in feature_names if f in categorical_lookup]
        }
    }

    with open("feature_metadata.json", "w") as f:
        json.dump(feature_info, f, indent=2)
    print("Saved feature metadata")

    return X_train_processed, feature_info

def main():
    """Run the baseline experiment end to end and record it in MLflow.

    Loads and preprocesses the CSV at the notebook-level ``file_path``, trains
    the baseline pipeline, writes the dataset/metadata files via ``save``, then
    logs parameters, metrics, the model (registered as ``telco_churn_baseline``),
    the data artifacts and a feature-importance table to the tracking server at
    ``http://localhost:5000`` under the ``telco-baseline`` experiment.

    Returns ``(model, metrics, processed_feats, feature_info)``.
    """
    mlflow.set_tracking_uri("http://localhost:5000")
    mlflow.set_experiment("telco-baseline")

    with mlflow.start_run(run_name="baseline_model_V2"):
        df, numeric_features, categoric_features = load_and_prep_telco_data(file_path)
        model, metrics, data_splits = train_baseline_model(df, numeric_features, categoric_features)
        X_train, X_test, y_train, y_test = data_splits
        all_features = numeric_features + categoric_features
        processed_feats, feature_info = save(model, X_train, X_test, y_train, y_test, all_features)

        # Logged one at a time, in this order. The hyperparameter values mirror
        # the constants used when the classifier is built.
        run_params = {
            "model_type": "RandomForest",
            "n_estimators": 100,
            "max_depth": 10,
            "random_state": 42,
            "dataset_size": len(df),
            "n_features_original": len(all_features),
            "n_features_processed": feature_info['n_processed_features'],
            "n_numeric_features": len(numeric_features),
            "n_categorical_features": len(categoric_features),
            "churn_rate": df['Churn'].mean(),
            "train_size": len(X_train),
            "test_size": len(X_test),  # number of test rows, not the split fraction
        }
        for param_name, param_value in run_params.items():
            mlflow.log_param(param_name, param_value)

        for metric_name in metrics:
            mlflow.log_metric(metric_name, metrics[metric_name])

        model_signature = mlflow.models.infer_signature(X_train, y_train)
        mlflow.sklearn.log_model(
            model,
            "Churn-RF-baseline-V2",
            registered_model_name="telco_churn_baseline",
            signature=model_signature,
        )

        # These files were written to the working directory by save().
        data_artifacts = (
            "baseline_training_data-V2.csv",
            "baseline_test_data-V2.csv",
            "baseline_raw_data-V2.csv",
            "feature_metadata.json",
        )
        for artifact_file in data_artifacts:
            mlflow.log_artifact(artifact_file)
        print("\n Data artifacts logged")

        importances = model.named_steps['classifier'].feature_importances_
        importance_table = pd.DataFrame({
            'feature': feature_info['processed_feature_names'],
            'importance': importances,
        })
        importance_table = importance_table.sort_values('importance', ascending=False)
        importance_table.to_csv('baseline_featureImp.csv', index=False)
        mlflow.log_artifact('baseline_featureImp.csv')
        print('Logged feature importance')

        run_id = mlflow.active_run().info.run_id
        print(f'Experiment completed with run id: {run_id}.')

        return model, metrics, processed_feats, feature_info



# Run the baseline experiment and keep its results as notebook-level variables
# (fitted pipeline, metric dict, processed training features, feature metadata).
if __name__ == "__main__":
    model, metrics, processed_feats, feature_info = main()

Used StandardScaler for numeric variables
Used OneHotEncoder for categorical variables
Saved feature metadata


Registered model 'telco_churn_baseline' already exists. Creating a new version of this model...
2025/10/26 20:12:13 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: telco_churn_baseline, version 5
Created version '5' of model 'telco_churn_baseline'.



 Data artifacts logged
Logged feature importance
Experiment completed with run id: afc13c69403a4669b6415e3af2ce996d.


## Simulating drift

In [None]:
def load_artifact_minio(experiment_name, run_name):
    """Fetch the baseline training data and registered model from MLflow.

    Parameters
    ----------
    experiment_name : str or experiment object
        Name of the MLflow experiment to search. An object that already carries
        an ``experiment_id`` attribute is accepted as-is.
    run_name : str
        Name of the run to load. When empty or ``None`` the first run returned
        for the experiment is used.

    Returns
    -------
    data : pandas.DataFrame
        Contents of the run's ``baseline_training_data-V2.csv`` artifact.
    baseline_model
        Latest version of the registered ``telco_churn_baseline`` model.
    baseline_id : str
        Run id the artifact was downloaded from.

    Raises
    ------
    ValueError
        If the experiment does not exist or no matching run is found.
    """
    if isinstance(experiment_name, str):
        experiment = mlflow.get_experiment_by_name(experiment_name)
        if experiment is None:
            raise ValueError(f"No MLflow experiment named {experiment_name!r}")
    else:
        experiment = experiment_name

    # Restrict the search to the requested run; an empty filter matches every run.
    run_filter = f"tags.mlflow.runName = '{run_name}'" if run_name else ""
    runs = mlflow.search_runs(
        experiment_ids=[experiment.experiment_id],
        filter_string=run_filter,
    )
    if runs.empty:
        raise ValueError(
            f"No runs found in experiment {experiment.experiment_id!r} for run name {run_name!r}"
        )
    # Take the first row returned; per the MLflow documentation the default
    # ordering is most recent start time first.
    baseline_id = runs.iloc[0]['run_id']

    datapath = mlflow.artifacts.download_artifacts(
        run_id=baseline_id,
        artifact_path='baseline_training_data-V2.csv'
    )

    data = pd.read_csv(datapath)

    # The model is registered as "telco_churn_baseline"; "telco-baseline" is the
    # experiment name and does not resolve in the model registry.
    baseline_model = mlflow.sklearn.load_model("models:/telco_churn_baseline/latest")

    return data, baseline_model, baseline_id

In [None]:
# This cell is now retired: everything below sits inside one triple-quoted string,
# so none of it executes. It holds the earlier covariate- and concept-drift
# simulators, which read the baseline CSV directly from the object store.
# NOTE(review): if this code is ever revived, `drifted.loc(mask, col_shift)` calls
# `.loc` with parentheses instead of indexing it, and `1 - drifted[noise_mask]`
# subtracts from feature rows where flipping `synthetic_labels[noise_mask]` looks
# intended -- confirm before reuse.
'''

from pandas.core.arrays import categorical
import shap
import boto3
import os
from io import BytesIO

AWS_ACCESS_KEY_ID = os.getenv('MINIO_ACCESS_KEY')
AWS_SECRET_ACCESS_KEY = os.getenv('MINIO_SECRET_ACCESS_KEY')


def simulate_covariate(drift_strength: float):
    s3 = boto3.client('s3', endpoint_url='http://localhost:9000', aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY) #have to use localhost here
    obj = s3.get_object(Bucket='mlflow', Key='2/b10db814ad384b3ebe421587de03d728/artifacts/baseline_training_data-V2.csv') # use baseline data for data lineage, unfortunately need to point it in the right direction since I can't use the run_id
    baseline_data = pd.read_csv(BytesIO(obj['Body'].read()))

    drift_explanations = []
    drifted = baseline_data.copy()
    numeric_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
    numeric_columns = [col for col in numeric_columns if col in drifted.columns]

    for col in numeric_columns:
        if col == 'tenure':
            drifted[col] = drifted[col] + np.random.normal(5, 2, len(drifted)).astype('int64')
            drift_explanations.append(f'Increased Tenure')
        elif col == 'MonthlyCharges':
            drifted[col] = drifted[col] * (1+np.random.normal(0.15, 0.05, len(drifted)).astype('float64'))
            drift_explanations.append(f'Increased Monthly Charges / Inflation')
        elif col == 'TotalCharges':
            drifted[col] = drifted['tenure'] * drifted['MonthlyCharges'] + np.random.normal(0, 50, len(drifted))
            drift_explanations.append(f'Adjusted charges for changes in tenure and monthly charges')

    categorical_cols = np.random.choice([0,1,2,], size=len(drifted), p=[0.7,0.2,0.1])

    encoded = []
    for col in drifted.columns:
        if col not in numeric_columns:
            unique_vals = drifted[col].nunique()
            if 2 <= unique_vals <= 10: 
                encoded.append(col)

    if encoded:
        col_shift = np.random.choice(encoded)
        mask = np.random.random(len(drifted)) < drift_strength
        drifted.loc[mask, col_shift] = (drifted.loc(mask, col_shift) + 1) % drifted[col_shift].max()
        drift_explanations.append(f'Shift in {col_shift} distribution')

    return drifted, drift_explanations

            

def simulate_concept(drift_strength: float):
    s3 = boto3.client('s3', endpoint_url='http://localhost:9000', aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY) #have to use localhost here
    obj = s3.get_object(Bucket='mlflow', Key='2/b10db814ad384b3ebe421587de03d728/artifacts/baseline_training_data-V2.csv')
    baseline_data = pd.read_csv(BytesIO(obj['Body'].read()))

    drifted = baseline_data.copy()

    synthetic_labels = np.zeros(len(drifted))

    # Following rule-based logic, similar to what we are simulating above
    more_charges = drifted['MonthlyCharges'] > drifted['MonthlyCharges'].median()
    low_tenure = drifted['tenure'] < drifted['tenure'].median()
    synthetic_labels[(more_charges & low_tenure)] = 1

    # apparently need to add a noise mask
    noise_mask = np.random.random(len(drifted)) < drift_strength
    synthetic_labels[noise_mask] = 1 - drifted[noise_mask]

    explanation = ["Concept drift: Charges and Tenure now directly affects churn", 
        f'Added {drift_strength*100} label noise to simulate uncertainty']

    return drifted, synthetic_labels, explanation
    '''

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Show the column labels of `baseline_data`.
# NOTE(review): no active cell shown in this notebook assigns `baseline_data`
# (it only appears inside the retired string cell) -- confirm where it is defined,
# otherwise this cell fails on a fresh kernel.
baseline_data.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges'],
      dtype='object')

In [36]:
# Display the pairwise correlation matrix of the columns of `baseline_data`.
# NOTE(review): relies on a `baseline_data` variable that no active cell shown in
# this notebook assigns -- confirm where it is defined.
baseline_data.corr()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
gender,1.0,0.007095,-0.00515,0.008329,-0.003024,0.003666,-0.012235,0.003639,-0.022955,-0.013961,0.00463,-0.018022,-0.013591,-0.018821,-0.008318,-0.025967,0.005764,-0.022118,-0.00897
SeniorCitizen,0.007095,1.0,0.015729,-0.2133,0.01698,0.008483,0.143994,-0.033199,-0.113953,-0.009927,-0.019862,-0.152885,0.040783,0.043526,-0.142953,0.157326,-0.034873,0.220376,0.101996
Partner,-0.00515,0.015729,1.0,0.446276,0.373689,0.023485,0.151746,0.007331,0.152095,0.148506,0.16766,0.128177,0.140947,0.132637,0.294694,-0.014067,-0.160857,0.101317,0.315955
Dependents,0.008329,-0.2133,0.446276,1.0,0.154892,-0.004428,-0.011911,0.052915,0.144639,0.085048,0.074525,0.132809,0.046677,0.018039,0.240206,-0.116574,-0.045481,-0.114901,0.060534
tenure,-0.003024,0.01698,0.373689,0.154892,1.0,0.003653,0.349918,-0.034786,0.32214,0.376786,0.37382,0.321684,0.286577,0.300744,0.670011,0.010525,-0.373146,0.2567,0.829055
PhoneService,0.003666,0.008483,0.023485,-0.004428,0.003653,1.0,-0.018338,0.392659,-0.018954,0.022191,0.004724,-0.022236,0.05619,0.035936,-0.00138,0.005433,-0.00645,0.248911,0.111669
MultipleLines,-0.012235,0.143994,0.151746,-0.011911,0.349918,-0.018338,1.0,-0.103685,0.011646,0.130552,0.130892,0.020454,0.181295,0.188681,0.115106,0.15827,-0.183152,0.436575,0.458927
InternetService,0.003639,-0.033199,0.007331,0.052915,-0.034786,0.392659,-0.103685,1.0,-0.032926,0.034818,0.051899,-0.029863,0.103602,0.090085,0.093072,-0.139438,0.082943,-0.319716,-0.174073
OnlineSecurity,-0.022955,-0.113953,0.152095,0.144639,0.32214,-0.018954,0.011646,-0.032926,1.0,0.171442,0.162167,0.283377,0.042676,0.062637,0.378294,-0.161843,-0.089526,-0.055614,0.249384
OnlineBackup,-0.013961,-0.009927,0.148506,0.085048,0.376786,0.022191,0.130552,0.034818,0.171442,1.0,0.186642,0.197193,0.141195,0.133803,0.27849,-0.010509,-0.132885,0.126153,0.381308


In [None]:

# Scratch version of the preprocessing in `load_and_prep_telco_data`, applied to a
# `df` that is already in the kernel, deriving the feature lists from dtypes.
# NOTE(review): no visible cell assigns `df` at notebook level -- confirm where it
# comes from before relying on this cell.
if 'TotalCharges' in df.columns:
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
if 'Churn' in df.columns:
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
categorical_columns = df.select_dtypes(include=['object']).columns
# Record the categorical names BEFORE encoding: once the loop below has turned
# every object column into integer codes, dtype inspection can no longer find them.
categoric = [col for col in categorical_columns if col != 'customerID']
for col in categoric:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
if 'customerID' in df.columns:
    df = df.drop('customerID', axis=1)
# Numeric features are the numeric columns that are neither the target nor an
# encoded categorical.
numeric_features = [
    col for col in df.select_dtypes(include=[np.number]).columns.tolist()
    if col != 'Churn' and col not in categoric
]