In [1]:
#Importing libraries
import mlflow
import numpy as np
import pandas as pd
from mlflow import sklearn
from sklearn.decomposition import PCA
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the balanced dataset from the CSV file and show its (rows, columns) shape
transformed_df = pd.read_csv("balanced_df.csv")
transformed_df.shape

(391866, 37)

In [3]:
# Pairwise correlations between every column of the loaded frame
correlation_matrix = transformed_df.corr()

# Correlation of each column with 'Churn', strongest (by absolute value) first;
# the 'Churn' row itself is removed when present
sorted_correlation = (
    correlation_matrix['Churn']
    .drop(labels='Churn', errors='ignore')
    .sort_values(ascending=False, key=np.abs)
)

# Columns explored with PCA in this cell
features = [
    'MonthlyCharges',
    'SupportTicketsPerMonth',
    'UserRating',
    'WatchlistSize',
    'AccountAge',
    'TotalCharges',
    'ViewingHoursPerWeek',
    'ContentDownloadsPerMonth',
    'AverageViewingDuration',
]
X_subset = transformed_df.loc[:, features]

# Standardize the selected columns (zero mean, unit variance per column)
scaler = StandardScaler()
X_std = scaler.fit_transform(X_subset)

# Fit a PCA that keeps every component and project the standardized data onto it
pca = PCA()
X_pca = pca.fit_transform(X_std)

In [4]:
# Separate the target column from the predictors.
# Note: this is a feature/target separation only; no train/test split happens here.
y = transformed_df['Churn']
X = transformed_df.drop(columns=['Churn'])

In [5]:
# Columns routed through the scale -> PCA branch of the preprocessing
# (described by the notebook as the features most correlated with the target)
most_correlated_features = [
    'MonthlyCharges',
    'SupportTicketsPerMonth',
    'UserRating',
    'WatchlistSize',
    'AccountAge',
    'TotalCharges',
    'ViewingHoursPerWeek',
    'ContentDownloadsPerMonth',
    'AverageViewingDuration',
]

# Standardize first, then project onto 9 principal components
pca_pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('pca', PCA(n_components=9)),
])

# Columns that are already encoded; they are listed by name so a later step
# can select them and pass them through without further transformation
encoded_features = [
    'SubscriptionTypeEncoded',
    'PaymentMethod_Bank transfer',
    'PaymentMethod_Credit card',
    'PaymentMethod_Electronic check',
    'PaymentMethod_Mailed check',
    'PaperlessBilling_No',
    'PaperlessBilling_Yes',
    'ContentType_Both',
    'ContentType_Movies',
    'ContentType_TV Shows',
    'MultiDeviceAccess_No',
    'MultiDeviceAccess_Yes',
    'DeviceRegistered_Computer',
    'DeviceRegistered_Mobile',
    'DeviceRegistered_TV',
    'DeviceRegistered_Tablet',
    'GenrePreference_Action',
    'GenrePreference_Comedy',
    'GenrePreference_Drama',
    'GenrePreference_Fantasy',
    'GenrePreference_Sci-Fi',
    'Gender_Female',
    'Gender_Male',
    'ParentalControl_No',
    'ParentalControl_Yes',
    'SubtitlesEnabled_No',
    'SubtitlesEnabled_Yes',
]

In [8]:
# Display the scale -> PCA sub-pipeline as this cell's output
pca_pipeline

In [6]:
# Identity branch: a FunctionTransformer built without a function returns its input unchanged
pass_through_pipeline = Pipeline(steps=[('identity', FunctionTransformer())])

# Route each column group to its branch; columns named in neither list are dropped
preprocess_pipeline = ColumnTransformer(
    [
        ('pca_features', pca_pipeline, most_correlated_features),
        ('encoded_features', pass_through_pipeline, encoded_features),
    ],
    remainder='drop',
)

# End-to-end estimator: preprocessing followed by a class-weighted random forest.
# No resampling step is included, even though the imblearn pipeline class is used.
pipeline = ImbPipeline(
    [
        ('preprocess', preprocess_pipeline),
        ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42)),
    ]
)

In [9]:
# Display the ColumnTransformer that routes columns to the PCA and pass-through branches
preprocess_pipeline

In [10]:
pipeline  # display the full pipeline: preprocessing followed by the classifier (no resampling step is defined)

In [7]:
# Train the pipeline and record it, together with its key settings, in one MLflow run
with mlflow.start_run():
    # Log the parameters before the (expensive) fit so they are attached to the
    # run as early as possible. The values are read from the objects the pipeline
    # is actually built from, rather than from the exploratory `features` list
    # and a hard-coded literal, so the log cannot drift from the trained model.
    # Note: "features_used" covers only the PCA input columns; the pass-through
    # encoded columns are not part of this parameter.
    mlflow.log_param("features_used", most_correlated_features)
    mlflow.log_param("PCA_components", pca_pipeline.named_steps["pca"].n_components)
    mlflow.log_param("classifier", "RandomForest")

    # Fit on the complete X / y.
    # NOTE(review): no held-out split or evaluation is visible in this notebook;
    # confirm that training on all rows is intended.
    pipeline.fit(X, y)

    # Store the fitted pipeline as a model artifact of this run
    mlflow.sklearn.log_model(pipeline, "churn_prediction_pipeline")
# The run ends when exiting the 'with' block