In [8]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.ensemble import StackingClassifier
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import pickle

import warnings
warnings.filterwarnings('ignore')

In [9]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn 2.csv')
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# ML pipeline

In [6]:
# Define which columns need which transformations
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_features = [col for col in df.columns if df[col].dtype == 'object' and col not in ['Churn','customerID']]

# Create preprocessing steps for numeric features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Create preprocessing steps for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore',drop ='first'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features)
    ])

# Define the stacking classifier's base models and meta model
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('lgbm', LGBMClassifier(random_state=42, n_estimators=100)),
    ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, n_estimators = 300, max_depth = 10,
                          learning_rate=0.1, eval_metric='logloss'))
]
meta_model = LogisticRegression()
stacking = StackingClassifier(estimators=base_learners, final_estimator=meta_model, stack_method='auto')


# Create the pipeline
churn_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', stacking)
])

# Train the pipeline
X = df.drop(columns=['Churn', 'customerID'])
y = df['Churn'].map({'No':0,'Yes':1})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
churn_pipeline.fit(X_train, y_train)

# Evaluate the pipeline
from sklearn.metrics import classification_report
y_pred = churn_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

[LightGBM] [Info] Number of positive: 4138, number of negative: 4138
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001256 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2738
[LightGBM] [Info] Number of data points in the train set: 8276, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3310, number of negative: 3310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000594 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2738
[LightGBM] [Info] Number of data points in the train set: 6620, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [7]:
churn_pipeline

In [None]:
# Save model pipeline
model_filename = "stacking_pipeline_model.pkl"
with open(model_filename, "wb") as f:
    pickle.dump(churn_pipeline, f)
print(f"\nModel successfully saved as {model_filename}")



Model successfully saved as stacking_pipeline_model.pkl
