In [69]:
import numpy as np
import pandas as pd
from core import PACKAGE_ROOT
from pathlib import Path
import warnings
import cmd
import textwrap
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
import mlflow
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import optuna

In [70]:
DATASET_DIR = PACKAGE_ROOT / 'data'
unused_fields = ['id', 'CustomerId', 'Surname', 'EstimatedSalary']
input_columns = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember']

In [71]:
def load_dataset(file_name: str) -> pd.DataFrame:
    dataframe = pd.read_csv(Path(f"{DATASET_DIR}/{file_name}"))
    return dataframe

def pre_pipeline_preparation(data_frame: pd.DataFrame) -> pd.DataFrame:
    # Drop unnecessary fields
    for field in unused_fields:
        if field in data_frame.columns:
            data_frame.drop(labels = field, axis=1, inplace=True)    

    return data_frame

In [72]:
#reading data files
train_data = load_dataset('train.csv')
test_data = load_dataset('test.csv')

In [73]:
# Drop unnecessary fields
train_data = pre_pipeline_preparation(train_data)
test_data = pre_pipeline_preparation(test_data)

In [74]:
train_data.columns

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'Exited'],
      dtype='object')

In [75]:
preprocesser = ColumnTransformer([
    ('onehotencoder', OneHotEncoder(), ['Geography', 'Gender']),
    ('minmaxscaler', MinMaxScaler(), ['CreditScore', 'Tenure', 'NumOfProducts'])
])

In [76]:
pipe = Pipeline([
    ('preprocessor', preprocesser),
    ('model_rf', RandomForestRegressor(n_estimators = 100, 
                                       max_depth = 5,
                                      random_state = 1))
])

In [77]:
pipe.fit(train_data[['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember']], train_data['Exited'])

In [78]:
pipe.predict(test_data[input_columns])

array([0.06361574, 0.37088072, 0.06308033, ..., 0.03222362, 0.37221097,
       0.39555481])

In [79]:
X_train, X_test, y_train, y_test = train_test_split(train_data[input_columns], train_data.Exited, test_size=0.25, random_state=1)

In [80]:
import dagshub
dagshub.init(repo_owner='sumanthegdegithub', repo_name='churn_prediction', mlflow=True)

In [81]:
def get_or_create_experiment(experiment_name):

    if experiment := mlflow.get_experiment_by_name(experiment_name):
        return experiment.experiment_id
    else:
        return mlflow.create_experiment(experiment_name)


In [89]:
experiment_id = get_or_create_experiment("Churn Prediction")

In [90]:
mlflow.set_experiment(experiment_id=experiment_id)

<Experiment: artifact_location='mlflow-artifacts:/346482be85e9401b895c1a2e0c4c92ac', creation_time=1714403965430, experiment_id='4', last_update_time=1714403965430, lifecycle_stage='active', name='Churn Prediction', tags={}>

In [91]:
def objective(trial):
    with mlflow.start_run(nested=True):
        params = {
            'max_depth': trial.suggest_int('max_depth', 1, 9),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
            'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),
            'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
            'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
            'eval_metric': 'mlogloss',
            'use_label_encoder': False
        }

        # Fit the model
        pipe = Pipeline([
            ('preprocessor', preprocesser),
            ('model_rf', XGBClassifier(**params))
        ])
        pipe.fit(X_train, y_train)

        # Make predictions
        y_pred = pipe.predict(X_test)

        # Evaluate predictions
        accuracy = accuracy_score(y_test, y_pred)
        
        mlflow.log_params(params)
        mlflow.log_metric("accuracy", accuracy)
        
        
        return accuracy


In [92]:
run_name = 'first_run'

In [93]:
# Initiate the parent run and call the hyperparameter tuning child run logic
with mlflow.start_run(experiment_id=experiment_id, run_name=run_name, nested=True):
    # Initialize the Optuna study
    study = optuna.create_study(direction="maximize")

    # Execute the hyperparameter optimization trials.
    # Note the addition of the `champion_callback` inclusion to control our logging
    study.optimize(objective, n_trials=30)

    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_acc", study.best_value)

    # Log tags
    mlflow.set_tags(
        tags={
            "project": "churn_prediction",
            "optimizer_engine": "optuna",
            "model_family": "xgboost",
            "feature_set_version": 1,
        }
    )

    # Log a fit model instance
    pipe = Pipeline([
            ('preprocessor', preprocesser),
            ('model_rf', XGBClassifier(**study.best_params))
        ])
    pipe.fit(X_train, y_train)

    # Make predictions
    y_pred = pipe.predict(X_test)

    # Evaluate predictions
    accuracy = accuracy_score(y_test, y_pred)
    # Log the residuals plot

    artifact_path = "model"

    mlflow.sklearn.log_model(
        sk_model=pipe,
        artifact_path=artifact_path,
        input_example=X_train.iloc[[0]],
        metadata={"model_data_version": 1},
    )

    # Get the logged model uri so that we can load it from the artifact store
    model_uri = mlflow.get_artifact_uri(artifact_path)


[I 2024-04-29 20:49:37,358] A new study created in memory with name: no-name-e09e7c7d-a944-41f7-b1b0-d8460aa9f44e
[I 2024-04-29 20:49:44,343] Trial 0 finished with value: 0.8050122397537507 and parameters: {'max_depth': 2, 'learning_rate': 0.025672460402941775, 'n_estimators': 122, 'min_child_weight': 6, 'gamma': 3.74922751058321e-08, 'subsample': 0.3946833392174229, 'colsample_bytree': 0.3296859526498477, 'reg_alpha': 5.657517854505451e-08, 'reg_lambda': 0.2973204844211783}. Best is trial 0 with value: 0.8050122397537507.
[I 2024-04-29 20:49:50,392] Trial 1 finished with value: 0.8140284543978283 and parameters: {'max_depth': 2, 'learning_rate': 0.4876605248509711, 'n_estimators': 94, 'min_child_weight': 7, 'gamma': 0.38884809613858357, 'subsample': 0.3506847539345285, 'colsample_bytree': 0.28750564278930263, 'reg_alpha': 0.2606003744731174, 'reg_lambda': 1.8506758047733113e-08}. Best is trial 1 with value: 0.8140284543978283.
[I 2024-04-29 20:49:55,827] Trial 2 finished with value: 0