# Pipeline Construction


### 1. Import Libraries


In [13]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
import pandas as pd

### 2. Import Dataset

In [15]:
# Load the feature-engineered churn dataset. The path is relative, so it is
# resolved against the notebook's current working directory.
PROCESSED_DATA_CSV = '../data/processed/TelcoCustomerChurnPredictionFeatureEngineered.csv'
df = pd.read_csv(PROCESSED_DATA_CSV)

In [16]:
# Feature matrix: every column except the target.
X = df.drop(columns=["Churn"])

# Binary target: 1 for "Yes", 0 for "No".
# Series.map returns NaN for any label that is missing from the mapping, so fail
# fast here instead of handing NaN targets to the train/test split and the models.
Y = df["Churn"].map({"Yes": 1, "No": 0})
if Y.isna().any():
    unexpected_labels = sorted(df.loc[Y.isna(), "Churn"].astype(str).unique())
    raise ValueError(f"Unexpected values in 'Churn' column: {unexpected_labels}")

In [17]:
# Hold out 20% of the rows for testing. Stratifying on Y preserves the class
# proportions in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

In [18]:
# Column groups used by the preprocessing steps.
# NOTE(review): names such as 'tenure_binns' and 'isPayementReliable' are kept
# exactly as spelled — presumably they match the CSV headers; verify.

# Columns treated as unordered categories.
nominal_features = [
    "gender",
    "SeniorCitizen",
    "Partner",
    "PhoneService",
    "Dependents",
    "OnlineSecurity",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "PaperlessBilling",
    "PaymentMethod",
    "InternetService",
    "OnlineBackup",
    "MultipleLines",
    "tenure_binns",
    "PaymentMethodRisk",
    "isPayementReliable",
]

# Columns treated as ordered categories.
ordinal_features = ["Contract"]

# Numeric columns.
numerical_features = [
    "tenure",
    "MonthlyCharges",
    "TotalCharges",
    "ServiceAdoptionScore",
    "AvgMonthlyChargesPerService",
]

In [19]:
# Preprocessing definitions: one ColumnTransformer per model family.

# Explicit category order for the 'Contract' column, shortest term first.
contract_levels = ['Month-to-month', 'One year', 'Two year']

# Random forest: scale the numeric columns, one-hot encode the nominal columns
# and integer-encode the contract term in the order given above.
# NOTE(review): with drop='first' and handle_unknown='ignore' together, a category
# unseen during fit is encoded as all zeros, i.e. the same as the dropped first
# category — confirm this is intended.
rfc_transformers = [
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), nominal_features),
    ('ord', OrdinalEncoder(categories=[contract_levels]), ordinal_features),
]
preprocessor_rfc = ColumnTransformer(transformers=rfc_transformers)

# CatBoost variant: only the numeric columns are scaled; all remaining columns
# are passed through unchanged.
# NOTE(review): pipeline_cat does not currently include this transformer.
cat_transformers = [
    ('num', StandardScaler(), numerical_features),
]
preprocessor_cat = ColumnTransformer(
    transformers=cat_transformers,
    remainder='passthrough',
)

In [24]:
# Create pipeline with classifier

# Random forest: preprocessing and model are bundled so that each cross-validation
# fold fits the transformers on its own training portion only.
pipeline_rfc = Pipeline(steps=[
    ('preprocessor', preprocessor_rfc),
    ('classifier', RandomForestClassifier(random_state=42))
])

# CatBoost receives the raw DataFrame (there is no preprocessing step), so the
# categorical columns must be declared through `cat_features`. Without it,
# string-valued columns such as 'Contract' cannot be converted to numbers and
# fitting fails.
# NOTE(review): CatBoost expects categorical columns to hold strings or integers —
# confirm none of the listed columns is float-typed.
# NOTE(review): assumes X has no other non-numeric columns besides these — confirm.
pipeline_cat = Pipeline(steps=[
    ('classifier', CatBoostClassifier(
        cat_features=nominal_features + ordinal_features,
        random_seed=42,
        verbose=0,
        eval_metric='Accuracy'
    ))
])

In [25]:
# Cross-validation strategy: 5 stratified, shuffled folds with a fixed seed.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Baseline: per-fold accuracy of the untuned random-forest pipeline on the
# training partition.
scores_rfc = cross_val_score(
    pipeline_rfc,
    X_train,
    y_train,
    scoring='accuracy',
    cv=cv,
)
mean_accuracy_rfc = scores_rfc.mean()

print("Cross-validated Accuracy Scores for Random Forest:", scores_rfc)
print("Mean CV Accuracy for Random Forest:", mean_accuracy_rfc)

Cross-validated Accuracy Scores for Random Forest: [0.77639752 0.77462289 0.80124224 0.78527063 0.80106572]
Mean CV Accuracy for Random Forest: 0.787719798707961


In [27]:
# Hyperparameter search spaces.
# Keys use the `<step>__<parameter>` form so they address the 'classifier' step
# of each pipeline.

# Random forest grid: 2 * 3 * 2 * 2 = 24 combinations.
param_grid_rfc = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [5, 10, None],
    "classifier__min_samples_split": [2, 5],
    "classifier__max_features": ["sqrt", "log2"],
}

# CatBoost grid: 3 ** 5 = 243 combinations.
param_grid_cat = {
    "classifier__depth": [4, 6, 8],
    "classifier__learning_rate": [0.01, 0.05, 0.1],
    "classifier__iterations": [200, 500, 1000],
    "classifier__l2_leaf_reg": [1, 3, 5],
    "classifier__subsample": [0.7, 0.8, 1.0],
}

# Settings shared by both searches: the same CV splitter, accuracy as the
# selection metric, all available cores, and no progress output.
search_settings = dict(
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=False,
)

# Exhaustive search over the random-forest grid.
grid_search_rfc = GridSearchCV(
    estimator=pipeline_rfc,
    param_grid=param_grid_rfc,
    **search_settings,
)

# Exhaustive search over the CatBoost grid (constructed here, not fitted in this cell).
grid_search_cat = GridSearchCV(
    estimator=pipeline_cat,
    param_grid=param_grid_cat,
    **search_settings,
)

# Run the random-forest search on the training partition, then report the
# winning parameter combination and its mean cross-validated accuracy.
grid_search_rfc.fit(X_train, y_train)

best_params_rfc = grid_search_rfc.best_params_
best_score_rfc = grid_search_rfc.best_score_

print("Best Parameters in Random Forest:", best_params_rfc)
print("Best CV Accuracy in Random Forest:", best_score_rfc)


Best Parameters in Random Forest: {'classifier__max_depth': 10, 'classifier__max_features': 'log2', 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
Best CV Accuracy in Random Forest: 0.8001443654147117
