In [1]:
# 📦 Imports
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# 📊 Load Data
df = pd.read_csv('../data/telco_churn.csv')
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.dropna(inplace=True)
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
df = df.drop(columns=['customerID'], errors='ignore')

# 🧱 Feature Separation
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_cols = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod'
]

X = df[numerical_cols + categorical_cols]
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 🔧 Preprocessing Pipelines
numeric_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# 📁 Set up MLflow
mlflow.set_tracking_uri("file:///D:/projects/churn_prediction_mlops/mlruns")


mlflow.set_experiment("Churn_Model_Comparison")

# --- RUN 1: XGBoost Default ---
with mlflow.start_run(run_name="XGBoost_Default"):
    xgb_clf = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
    ])
    xgb_clf.fit(X_train, y_train)
    y_pred = xgb_clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    mlflow.log_param("model_type", "XGBoost_Default")
    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(xgb_clf, "model")

# --- RUN 2: XGBoost Balanced ---
with mlflow.start_run(run_name="XGBoost_Balanced"):
    xgb_bal = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(scale_pos_weight=2, use_label_encoder=False, eval_metric='logloss'))
    ])
    xgb_bal.fit(X_train, y_train)
    y_pred = xgb_bal.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    mlflow.log_param("model_type", "XGBoost_Balanced")
    mlflow.log_param("scale_pos_weight", 2)
    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(xgb_bal, "model")

# --- RUN 3: Logistic Regression Balanced ---
with mlflow.start_run(run_name="Logistic_Regression_Balanced"):
    logreg = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(class_weight='balanced', max_iter=1000))
    ])
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    mlflow.log_param("model_type", "Logistic_Balanced")
    mlflow.log_param("class_weight", "balanced")
    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(logreg, "model")


2025/05/17 22:13:44 INFO mlflow.tracking.fluent: Experiment with name 'Churn_Model_Comparison' does not exist. Creating a new experiment.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
