## Data Preparation for the Models

In the data preparation step, the main goal is to build an advanced pipeline that preprocesses the dataset and keeps it ready for the models.

### 1. Import Dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from Scripts.custom_transformers import (
                                        TotalChargesCleaner,
                                        NewCustomerFlagger,
                                        TenureSegmenter,
                                        ServiceUsageClassifier,
                                        BillingLevelCreator,
                                        BillingTenureSegmenter,
                                        EngagementScorer,
                                        EngagementSegmenter,
                                        BillingEngageSegmenter,
                                        TenureEngageSegmenter,
                                        MonthToMonthFlagger,
                                        CoreProtectionFlagger,
                                        HighRiskFinancialProfileFlagger,
                                        OverallRiskScorer,
                                        KMeansClusterer
                                        )

### 2. Load and Explore the Data

In [2]:
# Read the raw Telco churn export and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index — confirm whether it should be read with index_col=0 instead.
df = pd.read_csv(filepath_or_buffer='../Data/Raw/Telco_Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Encode the target as 0/1.
# The mapping also accepts already-encoded 0/1 values so that re-running this
# cell is a no-op; a plain {'Yes': 1, 'No': 0} map would turn every label into
# NaN on the second run.
churn_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
churn_encoded = df['Churn'].map(churn_codes)

# Fail loudly instead of silently carrying NaN labels into the split/model.
if churn_encoded.isna().any():
    unexpected_labels = df.loc[churn_encoded.isna(), 'Churn'].unique().tolist()
    raise ValueError(f"Unexpected values in 'Churn': {unexpected_labels}")

df['Churn'] = churn_encoded.astype('int64')

### 3. Split the Dataset into Train and Test Sets

In [5]:
# Work on a copy so the loaded frame `df` is left untouched by this step.
df_cp = df.copy()

# Separate the predictors from the binary churn target.
X = df_cp.drop(columns=['Churn'])
Y = df_cp['Churn']

# Hold out 20% of the rows for testing with a fixed seed for reproducibility.
# stratify=Y keeps the churn / non-churn ratio the same in both splits, so the
# test set is representative of the class balance seen during training.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)

### 4. Build Scikit-learn Pipelines

In [2]:
# Final column lists consumed by the scaling/encoding step.
# Names that are not raw CSV columns (e.g. 'EngagementScore', 'Cluster') are
# presumably added by the custom transformers earlier in the pipeline —
# verify against Scripts/custom_transformers.
numerical_features = [
    'tenure', 'MonthlyCharges', 'EngagementScore', 'OverallRiskScore', 'is_new_customer',
    'IsMonthToMonth', 'HasCoreProtection', 'HighRiskFinancialProfile',
]

categorical_features = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
    'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'TenureSegment',
    'ServiceUsage', 'BillingLevel', 'BillingTenureSegment', 'EngagementSegment', 'BillingEngageSegment',
    'TenureEngageSegment', 'Cluster',
]

features_to_drop = ['customerID', 'TotalCharges']

# Final preprocessor: scale the numeric columns, one-hot encode the categorical
# ones, and drop every column that is not listed in either group.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ],
    remainder='drop',
)

# Column dropper placed before the final preprocessor.
# A ColumnTransformer returns a NumPy array by default and, with pandas output,
# prefixes passthrough columns with 'remainder__'. Either would break the
# name-based column selection in `preprocessor`, so keep a DataFrame with the
# original column names.
feature_dropper = ColumnTransformer(
    transformers=[('drop_cols', 'drop', features_to_drop)],
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform='pandas')

# Assemble the full pipeline: feature-engineering steps first, then column
# dropping, then final scaling and encoding.
full_pipeline = Pipeline(
    steps=[
        ('total_charges_cleaner', TotalChargesCleaner()),
        ('new_customer_flagger', NewCustomerFlagger()),
        ('tenure_segmenter', TenureSegmenter()),
        ('service_usage_classifier', ServiceUsageClassifier()),
        ('billing_level_creator', BillingLevelCreator()),
        ('engagement_scorer', EngagementScorer()),
        ('engagement_segmenter', EngagementSegmenter()),
        ('core_protection_flagger', CoreProtectionFlagger()),
        ('billing_tenure_segmenter', BillingTenureSegmenter()),
        ('billing_engage_segmenter', BillingEngageSegmenter()),
        ('tenure_engage_segmenter', TenureEngageSegmenter()),
        ('month_to_month_flagger', MonthToMonthFlagger()),
        ('high_risk_profile_flagger', HighRiskFinancialProfileFlagger()),
        ('overall_risk_scorer', OverallRiskScorer()),
        ('kmeans_clusterer', KMeansClusterer(n_clusters=4)),
        # Feature dropping
        ('feature_dropper', feature_dropper),
        # Final scaling and encoding
        ('final_preprocessor', preprocessor),
    ]
)

print("Pipeline successfully created!")

print(full_pipeline)

Pipeline successfully created!
Pipeline(steps=[('total_charges_cleaner', TotalChargesCleaner()),
                ('new_customer_flagger', NewCustomerFlagger()),
                ('tenure_segmenter', TenureSegmenter()),
                ('service_usage_classifier', ServiceUsageClassifier()),
                ('billing_level_creator', BillingLevelCreator()),
                ('engagement_scorer', EngagementScorer()),
                ('engagement_segmenter', EngagementSegmenter()),
                ('core_prot...
                                                   'Dependents', 'PhoneService',
                                                   'MultipleLines',
                                                   'InternetService',
                                                   'OnlineSecurity',
                                                   'OnlineBackup',
                                                   'DeviceProtection',
                                                   'TechSupport