## Cleaning and Preprocessing Data

In [15]:
# import necessary packages
import pandas as pd
import numpy as np
from datetime import datetime
import mlflow

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector

from sklearn.pipeline import Pipeline

from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier

In [16]:
# Load the training split from disk into a pandas DataFrame
train_set = pd.read_csv(filepath_or_buffer='data/train_set.csv')

In [17]:
# Separate the target column from the feature columns:
# `y` is the 'Churn' column, `X` is every remaining column of the train set
y = train_set['Churn']
X = train_set.drop(columns=['Churn'])

In [18]:
# 1. Build the preprocessing step that every model pipeline reuses

# 1.1 Split the column names of X by dtype: 'object' columns are treated as
#     qualitative, columns of any other dtype as quantitative
qualitative_columns = selector(dtype_include=['object'])(X)
quantitative_columns = selector(dtype_exclude=['object'])(X)

# 1.2 One-hot encode the qualitative columns (drop='first' removes the first
#     level of each feature); all other columns are passed through untouched
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(drop='first'), qualitative_columns)],
    remainder='passthrough',
)

## Setting up MLFlow Experiments

In [19]:
# MLflow experiment configuration
# NOTE(review): experiment_path is not referenced by any cell visible here - confirm it is still needed
experiment_path = 'mlflow_experiments'
experiment_name = '01_churn_customers'

# Register the experiment the first time the notebook runs
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if not existing_experiment:
    mlflow.create_experiment(experiment_name)

# Make it the active experiment so that subsequent runs are recorded under it
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='file:///c:/Users/4YouSee/Desktop/personal_work/Customer_Churn/mlruns/750646704189071249', creation_time=1672789451297, experiment_id='750646704189071249', last_update_time=1672789451297, lifecycle_stage='active', name='01_churn_customers', tags={}>

In [20]:
# Build the current date as a zero-padded "YYYY/MM/DD" string.
# The clock is read exactly once so that year, month and day always come from
# the same instant (three separate reads could disagree around midnight).
today = datetime.today()
year = str(today.year)
month = f"{today.month:02d}"
day = f"{today.day:02d}"
date = f"{year}/{month}/{day}"

## Training and Model Selection

In [21]:
def run_classifier_models(X, y, cv, scoring):
    '''Cross-validate a fixed set of classifiers and log each result to MLflow.

    The models evaluated are RandomForestClassifier, DecisionTreeClassifier,
    SGDClassifier, SVC, LGBMClassifier and GaussianNB, all instantiated with
    their default arguments. Each model is placed in a pipeline after the
    module-level ``preprocessor`` and a ``StandardScaler``, evaluated with
    ``cross_validate``, and recorded as one MLflow run (named after the
    module-level ``date``) that stores the mean score over the training folds
    and the mean score over the validation folds.

    Nothing is returned; the results are only written to MLflow.

    :param X: (dataframe)
    Independent variables. Must expose ``.columns``, which is logged as the
    ``Features`` parameter of every run.

    :param y: (series or numpy array)
    Dependent variable.

    :param cv: (int)
    Cross-validation splitting strategy, forwarded to ``cross_validate``.

    :param scoring: (str)
    Metric used to score the folds. Only ``'accuracy'`` and ``'f1'`` are
    supported; the metrics are logged as ``Train_acc``/``Test_acc`` and
    ``Train_f1``/``Test_f1`` respectively.

    :raises ValueError: if ``scoring`` is not one of the supported metrics.
    '''
    # Resolve the metric-name suffix up front so that an unsupported metric
    # fails immediately instead of after cross-validating every model and
    # then silently logging nothing
    if scoring == 'accuracy':
        metric_suffix = 'acc'
    elif scoring == 'f1':
        metric_suffix = 'f1'
    else:
        raise ValueError(
            "scoring must be 'accuracy' or 'f1', got {!r}".format(scoring)
        )

    # 1. Instantiate the models
    models = (
        RandomForestClassifier(),
        DecisionTreeClassifier(),
        SGDClassifier(),
        SVC(),
        LGBMClassifier(),
        GaussianNB(),
    )

    # 2. Train and evaluate the models
    for model in models:
        pipe = Pipeline(
            steps=[('preprocessor', preprocessor),
                   ('scaling', StandardScaler()),
                   ('classifier', model)
                  ]
                )
        scores = cross_validate(pipe, X, y, return_train_score=True,
                                scoring=scoring, cv=cv)

        # Average the per-fold scores
        mean_train_score = np.mean(scores['train_score'])
        mean_test_score = np.mean(scores['test_score'])

        # 3. Track the experiment. The context manager ends the run even when
        #    one of the logging calls raises, so no run is left active
        with mlflow.start_run(run_name=date):
            mlflow.log_param('Date', date)
            mlflow.log_param('Features', X.columns)
            mlflow.log_param('Pre-processing', preprocessor)
            # `model` is the same object as the pipeline's last step
            mlflow.log_param('ML model', model)

            mlflow.log_metric('Train_' + metric_suffix, mean_train_score)
            mlflow.log_metric('Test_' + metric_suffix, mean_test_score)

In [22]:
# Evaluate every classifier with 5-fold cross-validation, scored by F1
run_classifier_models(X, y, cv=5, scoring='f1')

In [23]:
# End the MLflow run that is currently active.
# NOTE(review): presumably a safety net for a run left open by a failed or
# interrupted training cell - confirm it is still needed
mlflow.end_run()

In [24]:
# Launch the MLflow UI from the notebook through a shell escape.
# NOTE(review): the recorded output is only ^C, so this cell appears to have
# been stopped by hand - confirm whether it should stay in the notebook
!mlflow ui

^C


## Improve the Model