#### Checking Python version

In [None]:
!python -V

This notebook is a renamed copy of `persistency_base_model - modified data3.ipynb` and contains the same code. It uses the new data file `master_data - modified3.csv`, which has been renamed to `master_data_final2.csv` in this folder.

#### Importing libraries

In [None]:
import os
import warnings
warnings.simplefilter("ignore", UserWarning)

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

from datetime import datetime as dt
import pickle

import matplotlib.pyplot as plt
from matplotlib.figure import Figure
%matplotlib inline
import seaborn as sns

from feature_engine import encoding as ce
from feature_engine import imputation as mdi
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
import sklearn.metrics as metrics

import mysql.connector
import mlflow

In [None]:
# Set the database connection parameters.
# Each value can be overridden through an environment variable so credentials
# do not have to be edited into the notebook; the fallbacks reproduce the
# original local-development settings.
# NOTE(review): the password fallback is still a secret kept in source control -
# rotate it and remove the default once PERSISTENCY_DB_PASSWORD is set everywhere.
db_name = os.environ.get("PERSISTENCY_DB_NAME", "PersistencyDB")
user_name = os.environ.get("PERSISTENCY_DB_USER", "root")
password = os.environ.get("PERSISTENCY_DB_PASSWORD", "persistency_dna")
host_name = os.environ.get("PERSISTENCY_DB_HOST", "127.0.0.1")
port_number = int(os.environ.get("PERSISTENCY_DB_PORT", "3306"))

In [None]:
# Build the MySQL connection URI from the connection parameters and register it
# as the MLflow tracking backend.
# NOTE(review): the password is interpolated verbatim; URL-encode it if it ever
# contains reserved characters such as '@' or '/'.
tracking_uri = f"mysql://{user_name}:{password}@{host_name}:{port_number}/{db_name}"
mlflow.set_tracking_uri(tracking_uri)

#### Setting the mlflow experiment (the tracking uri is set in the cell above)

In [27]:
# Alternative local backend (disabled): mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Select the experiment that the hyperparameter-search runs below are logged under.
mlflow.set_experiment("persistency-prediction-experiment")

#### Importing data

In [None]:
# Location of the input CSV (joined with os.path.join inside load_data).
INPUT_FILEPATH = 'data'
INPUT_FILENAME = 'master_data_final2.csv'

# Column used as the DataFrame index.
INDEX = 'policy_number'
# Columns parsed as dates (load_data reads them day-first).
DATE_COLS = ['proposal_received_date', 'policy_issue_date', 'agent_dob', 'agent_doj']
# Strings treated as missing values when reading the CSV.
NA_VALUES = ['', 'NA', 'N/A', 'NULL', 'null', '?', '*', '#N/A', '#VALUE!']
DTYPE_DICT = {'zipcode': 'str', 'agent_code': 'str'} ## These columns should be string

In [None]:
def load_data(INPUT_FILEPATH, INPUT_FILENAME):
    """Read the input CSV into a DataFrame.

    The file at ``INPUT_FILEPATH/INPUT_FILENAME`` is parsed with the
    module-level settings: ``INDEX`` as index column, ``NA_VALUES`` as missing
    markers, ``DATE_COLS`` parsed as day-first dates and ``DTYPE_DICT`` as
    explicit column dtypes.
    """
    csv_path = os.path.join(INPUT_FILEPATH, INPUT_FILENAME)
    read_options = dict(
        index_col=INDEX,
        na_values=NA_VALUES,
        parse_dates=DATE_COLS,
        dayfirst=True,
        dtype=DTYPE_DICT,
    )
    return pd.read_csv(csv_path, **read_options)

In [None]:
input_df = load_data(INPUT_FILEPATH, INPUT_FILENAME)
input_df.head(10)

#### Checking data info

In [None]:
input_df.info()

In [None]:
input_df.columns

#### Target variable distribution

In [None]:
input_df['lapse'].value_counts()/len(input_df)*100

#### Creating feature: time_to_issue

In [None]:
def create_time_to_issue(df) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``time_to_issue`` column added.

    ``time_to_issue`` is the number of whole days between
    ``proposal_received_date`` and ``policy_issue_date`` (both must be
    datetime columns). The input frame is not modified, matching ``filter_df``,
    so the raw frame stays intact no matter how often the cell is re-run.
    """
    days_to_issue = (df['policy_issue_date'] - df['proposal_received_date']).dt.days
    return df.assign(time_to_issue=days_to_issue)

In [None]:
input_df = create_time_to_issue(input_df)
input_df['time_to_issue'].describe()

#### Creating feature: prem_to_income_ratio

In [None]:
def create_prem_to_income_ratio(df) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``prem_to_income_ratio`` column added.

    The ratio is ``annual_premium / income``; rows whose ``income`` is exactly
    0 get a ratio of 0 instead of an infinite value. The input frame is not
    modified, matching ``filter_df``.
    """
    ratio = np.where(df['income'] == 0, 0, df['annual_premium'] / df['income'])
    return df.assign(prem_to_income_ratio=ratio)

In [None]:
input_df = create_prem_to_income_ratio(input_df)
input_df['prem_to_income_ratio'].describe()

## --------------------------------------------------------------------------------------------------------------------------------

The objective of this exercise is to create a demo-ready solution (not necessarily the best possible one, given the augmented data). Hence, we will not dive deep into EDA and hypothesis testing. Instead, we will focus on building the ML product using different technologies.

## --------------------------------------------------------------------------------------------------------------------------------

#### Columns to remove

In [None]:
COLS_TO_REM = ['proposal_received_date','policy_issue_date', 'zipcode', 'county', 'state', 'agent_code', 'agent_dob', 'agent_doj']

- `proposal_received_date`: Derived `time_to_issue` from this column
- `policy_issue_date`: Derived `time_to_issue` from this column
- `zipcode`: Too many values, high cardinality
- `county`: Too many values, high cardinality
- `state`: Too many values, high cardinality
- `agent_code`: Id column
- `agent_dob`: Derived `agent_age` from this column
- `agent_doj`: Derived `agent_tenure_days` from this column

In [None]:
def filter_df(df, COLS_TO_REM) -> pd.DataFrame:
    """Return ``df`` without the columns listed in ``COLS_TO_REM``.

    The input frame is left unchanged; pandas raises ``KeyError`` if any of
    the listed columns is absent.
    """
    return df.drop(columns=COLS_TO_REM)

In [None]:
temp_df = filter_df(input_df, COLS_TO_REM)
temp_df.shape

In [None]:
input_df.shape

## Feature Engineering Steps

#### Missing Value Imputation

Only one column, `agent_persistency`, has missing values. We impute them with the median.

In [None]:
agent_persistency_missing_perc = round(temp_df['agent_persistency'].isnull().mean()*100,2)

print(f'Total missing percentage of column agent_persistency is: {agent_persistency_missing_perc}% ')

In [None]:
MISSING_COL = ['agent_persistency']

#### One-Hot Encoding

In [None]:
ONE_HOT_COLS = ['owner_gender', 'marital_status', 'smoker', 'medical', 'education', 'occupation', 'payment_freq',  
                'agent_status', 'agent_education']

#### Normalisation

We will normalise the columns using `StandardScaler` because the features are on different scales.

#### Creating train test split

In [None]:
temp_df.columns

In [None]:
FEATURES = ['owner_age', 'owner_gender', 'marital_status', 'num_nominee', 'smoker',
       'medical', 'education', 'occupation', 'experience', 'income',
       'negative_zipcode', 'family_member', 'existing_num_policy',
       'has_critical_health_history', 'policy_term', 'payment_freq',
       'annual_premium', 'sum_insured', 'agent_status', 'agent_education',
       'agent_age', 'agent_tenure_days', 'agent_persistency',
       'last_6_month_submissions', 'average_premium', 'is_reinstated',
       'prev_persistency', 'num_complaints', 'target_completion_perc',
       'has_contacted_in_last_6_months', 'credit_score',
       'time_to_issue', 'prem_to_income_ratio']

TARGET = 'lapse'

In [None]:
# 70/30 split, stratified on the target so both parts keep the same class
# proportions; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(temp_df[FEATURES],
                                                    temp_df[TARGET],
                                                    test_size=0.3,
                                                    random_state = 786, 
                                                    shuffle = True,
                                                    stratify = temp_df[TARGET])

X_train.shape, X_test.shape

### Creating data transformation pipeline

In [None]:
# Preprocessing-only pipeline: impute -> one-hot encode -> scale.
model_input_pipe = Pipeline([
    
    # Fill missing values in MISSING_COL with the column median.
    ('imputer_num', mdi.MeanMedianImputer(imputation_method = 'median', variables = MISSING_COL )), 
    
    # One-hot encode the categorical columns listed in ONE_HOT_COLS.
    ('onehot_encoder', ce.OneHotEncoder(top_categories=None,
                                        variables= ONE_HOT_COLS,
                                        drop_last=True)),
    
    # Standardise every column produced by the previous steps.
    ('normalisation', StandardScaler())
    
    # No estimator step here: the classifier is fitted separately on the transformed arrays.
])

In [None]:
X_train_trf = model_input_pipe.fit_transform(X_train)

In [None]:
X_train_trf

In [None]:
# train = xgb.DMatrix(X_train_trf, label = y_train)
X_test_trf = model_input_pipe.transform(X_test)
# valid = xgb.DMatrix(X_test_trf, label = y_test) 

### Running xgboost with hyperopt and tracking using mlflow

In [None]:
def objective(params):
    """Hyperopt objective: fit one XGBoost classifier and log it to MLflow.

    Trains ``xgb.XGBClassifier(**params)`` on ``X_train_trf``/``y_train``,
    scores it on ``X_test_trf``/``y_test`` and records the parameters and
    metrics in a new MLflow run.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``xgb.XGBClassifier``, sampled by hyperopt.

    Returns
    -------
    dict
        ``{"loss": -recall, "status": STATUS_OK}``; hyperopt minimises the
        loss, so this maximises recall.

    NOTE(review): the same hold-out set is used here for model selection and
    later for the final evaluation, so the reported metrics are optimistic.
    """
    with mlflow.start_run():
        mlflow.set_tag("developer", "tanmoy")
        mlflow.set_tag("model", "xgboost-sklearn hyperparam")
        mlflow.set_tag("type", "experiment")

        mlflow.log_params(params)

        xgboost_model = xgb.XGBClassifier(**params).fit(X_train_trf, y_train)

        y_pred = xgboost_model.predict(X_test_trf)
        # ROC AUC is a ranking metric: it needs the positive-class score, not
        # the thresholded labels (which collapse the curve to a single point).
        y_score = xgboost_model.predict_proba(X_test_trf)[:, 1]

        run_metrics = {
            "accuracy": accuracy_score(y_test, y_pred),
            "recall": recall_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred),
            "f1_score": f1_score(y_test, y_pred),
            "roc_auc": roc_auc_score(y_test, y_score),
        }
        mlflow.log_metrics(run_metrics)

        return {"loss": -run_metrics["recall"], 'status': STATUS_OK}

In [None]:
# Hyperparameter search space. hp.loguniform(label, low, high) samples
# exp(uniform(low, high)), so the bounds below are natural-log values.
search_space =  {
    # integer tree depth between 4 and 100
    'max_depth' : scope.int(hp.quniform('max_depth', 4, 100, 1)),
    # exp(-3) ~ 0.05 up to exp(0) = 1
    'learning_rate' : hp.loguniform('learning_rate', -3, 0),
    # exp(-1) ~ 0.37 up to exp(3) ~ 20.1
    'min_child_weight' : hp.loguniform('min_child_weight', -1, 3),
    # exp(-5) ~ 0.0067 up to exp(-1) ~ 0.37
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    # exp(-6) ~ 0.0025 up to exp(-1) ~ 0.37
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    # fixed (not searched) settings
    'objective' : 'binary:logistic',
    'seed' : 786
}

# Run 30 TPE-guided evaluations of `objective`; each one logs an MLflow run.
# NOTE(review): no `rstate` is passed, so the sampled configurations presumably
# differ between executions - confirm whether that is intended.
best_result = fmin(

    fn = objective,
    space = search_space,
    algo = tpe.suggest,
    max_evals = 30,
    trials = Trials()
)

### Finalising model 
- Running with best model params (best model chosen in terms of highest recall)
- Autologging along with customised metrics logging
- storing preprocessor `model_input_pipe` as an artifact
- storing model as an artifact

In [None]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

MLFLOW_TRACKING_URI = tracking_uri
client = MlflowClient(tracking_uri= MLFLOW_TRACKING_URI)

# Resolve the tuning experiment by name instead of assuming the backend
# assigned it id '1' (ids depend on creation order in the tracking database).
tuning_experiment = client.get_experiment_by_name("persistency-prediction-experiment")
if tuning_experiment is None:
    raise RuntimeError("Experiment 'persistency-prediction-experiment' not found in the tracking store.")

# Best hyperparameter-search run, ranked by recall.
runs = client.search_runs(
    experiment_ids= [tuning_experiment.experiment_id],
    filter_string= "tags.model = 'xgboost-sklearn hyperparam'",
    run_view_type= ViewType.ACTIVE_ONLY,
    max_results= 1,
    order_by= ["metrics.recall DESC"]
)

# Fail loudly instead of hitting a NameError (or a stale `run`) further down.
if not runs:
    raise RuntimeError("No 'xgboost-sklearn hyperparam' runs found; run the hyperopt search first.")

run = runs[0]
print(f"run id: {run.info.run_id}, recall: {run.data.metrics['recall']:.4f}")

run.info.run_id

In [None]:
BEST_PARAMS = run.data.params
RUN_ID = run.info.run_id

In [None]:
bucket_name = "my-gcs-bucket-name"
artifact_path = "my-artifact-path"
experiment_name = "my-experiment-name"

# An experiment's artifact location can only be chosen when it is created, so
# create it explicitly the first time. MLflow addresses Google Cloud Storage
# with the `gs://` scheme.
if mlflow.get_experiment_by_name(experiment_name) is None:
    mlflow.create_experiment(experiment_name,
                             artifact_location=f"gs://{bucket_name}/{artifact_path}")

mlflow.set_experiment(experiment_name)

In [None]:
# MLflow returns logged params as strings; restore the numeric types before
# handing them to XGBoost.
INT_PARAMS = {'max_depth', 'seed'}
FLOAT_PARAMS = {'learning_rate', 'min_child_weight', 'reg_alpha', 'reg_lambda'}

best_params = {}
for param_name, param_value in BEST_PARAMS.items():
    if param_name in INT_PARAMS:
        best_params[param_name] = int(float(param_value))
    elif param_name in FLOAT_PARAMS:
        best_params[param_name] = float(param_value)
    else:
        best_params[param_name] = param_value

# open() does not create missing directories.
os.makedirs("models", exist_ok=True)

# start_run() takes no artifact_location argument (passing one raises
# TypeError); artifacts go to the location of the currently set experiment.
# The context manager also ends the run, so no explicit end_run() is needed.
with mlflow.start_run():
    mlflow.xgboost.autolog()
    mlflow.set_tag("developer", "tanmoy")
    mlflow.set_tag("model", "xgboost-sklearn")
    mlflow.set_tag("type", "xgboost-sklearn final")

    booster = xgb.XGBClassifier(**best_params)
    xgboost_model = booster.fit(X_train_trf,y_train)

    y_pred = xgboost_model.predict(X_test_trf)
    # ROC AUC needs the positive-class score rather than thresholded labels.
    y_score = xgboost_model.predict_proba(X_test_trf)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", accuracy)

    recall = recall_score(y_test, y_pred)
    mlflow.log_metric("recall", recall)

    precision = precision_score(y_test, y_pred)
    mlflow.log_metric("precision", precision)

    f1 = f1_score(y_test, y_pred)
    mlflow.log_metric("f1_score", f1)

    roc_auc = roc_auc_score(y_test, y_score)
    mlflow.log_metric("roc_auc", roc_auc)

    # Persist the fitted preprocessing pipeline next to the model so the same
    # transformation can be applied at prediction time.
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(model_input_pipe, f_out)

    mlflow.log_artifact("models/preprocessor.b", artifact_path = "preprocessor")
    mlflow.xgboost.log_model(xgboost_model, artifact_path= "model_mlflow")

#### Run Name: `amazing-hound-63`
#### Run Id: `b878a2ba0b834edea0a44cf6935f4dc0`

#### Load model locally and make predictions

In [None]:
# The final model is logged under the experiment selected with
# mlflow.set_experiment("my-experiment-name"), so look that experiment up by
# name rather than assuming id '1'.
final_experiment = client.get_experiment_by_name("my-experiment-name")
if final_experiment is None:
    raise RuntimeError("Experiment 'my-experiment-name' not found in the tracking store.")

runs = client.search_runs(
    experiment_ids= [final_experiment.experiment_id],
    filter_string= "tags.type = 'xgboost-sklearn final'",
    run_view_type= ViewType.ACTIVE_ONLY,
    max_results= 1,
    order_by= ["metrics.recall DESC"]
)

# Without this guard an empty result would silently reuse the `run` left over
# from the hyperparameter-search lookup, which has no logged model.
if not runs:
    raise RuntimeError("No 'xgboost-sklearn final' run found; train the final model first.")

run = runs[0]
print(f"run id: {run.info.run_id}, recall: {run.data.metrics['recall']:.4f}")

RUN_ID = run.info.run_id

In [None]:
# Set the uri.
# A `runs:/` URI is resolved through the tracking server, so it works whatever
# the experiment id or artifact store is (a hardcoded 'mlruns/1/...' path only
# works for a local store and experiment id 1).
logged_model = f"runs:/{RUN_ID}/model_mlflow"

# Load model as a PyFuncModel
loaded_model = mlflow.pyfunc.load_model(logged_model)
print(loaded_model)

# load as a xgboost model
xgboost_model = mlflow.xgboost.load_model(logged_model)
print(xgboost_model)

In [None]:
# Predict class probabilities for the transformed hold-out set.
predictions = xgboost_model.predict_proba(X_test_trf)

# Keep the second column (the probability that was read as predictions[i][1]),
# taken with one vectorised slice instead of a Python loop.
predicted_proba = predictions[:, 1]

# Show a short preview rather than dumping every value.
predicted_proba[:10]

#### Checking the distribution of `1` and `0` in the predicted vs actual data

In [None]:
comp_df = pd.DataFrame({'y_test': y_test, 'predicted_proba': predicted_proba})
comp_df

In [None]:
sns.displot(data= comp_df, x = "predicted_proba", hue="y_test", kind="kde")

In [None]:
# Names of the columns produced by the preprocessing pipeline.
model_final_features = model_input_pipe.get_feature_names_out(input_features= FEATURES)

# One positional label (f0, f1, ...) per transformed column.
out_feature_list = ['f' + str(position) for position in range(len(model_final_features))]

out_feature_list

In [None]:
# Feature importances of the loaded model, indexed by positional label.
# Column name fixed from the misspelled 'importantce' so it matches the
# 'importance' column used by plot_feature_importance.
importance_df = pd.DataFrame(xgboost_model.feature_importances_, index = out_feature_list, columns= ['importance'])
importance_df

In [None]:
def plot_feature_importance():
    """Draw and save a horizontal bar chart of the model's feature importances.

    Reads the module-level ``model_input_pipe``, ``FEATURES`` and
    ``xgboost_model``. Bars are ordered from most to least important and the
    figure is written to ``xgboost-sklearn_feature_imporance.png`` in the
    working directory. Returns ``None``.
    """
    feature_names = model_input_pipe.get_feature_names_out(input_features= FEATURES)
    positional_labels = ['f' + str(position) for position in range(len(feature_names))]

    # Pair every transformed feature name with its importance via the shared
    # positional index (f0, f1, ...).
    names_df = pd.DataFrame(data = feature_names, index = positional_labels, columns= ['feature_names'])
    scores_df = pd.DataFrame(xgboost_model.feature_importances_, index = positional_labels, columns= ['importance'])
    plot_df = names_df.join(scores_df, how = 'inner')

    sns.set(rc={"figure.figsize":(20, 15)})

    ranked_df = plot_df.sort_values(by = 'importance', ascending= False)
    axes = sns.barplot(data = ranked_df, y = "feature_names", x = "importance", orient = 'h')
    axes.get_figure().savefig('xgboost-sklearn_feature_imporance.png')

In [None]:
plot_feature_importance()

In [None]:
# import pyarrow as pa
# import pyarrow.parquet as pq
# import pickle

# with open("models/X_train_trf.b", "wb") as f_out:
#             pickle.dump(X_train_trf, f_out)

# feat_df_table = pa.Table.from_pandas(feat_df)
# pq.write_table(feat_df_table, 'models/feat_df.parquet')

In [None]:
# Names of the columns produced by the preprocessing pipeline.
model_final_features = model_input_pipe.get_feature_names_out(input_features= FEATURES)

# One positional label (f0, f1, ...) per transformed column.
out_feature_list = ['f' + str(position) for position in range(len(model_final_features))]

# Lookup table: positional label -> transformed feature name.
feat_df = pd.DataFrame(data = model_final_features, index = out_feature_list, columns= ['feature_names'])
feat_df['feature_names'].to_list()

In [None]:
import streamlit as st
st.set_page_config(layout="wide")

import lime
import lime.lime_tabular

In [None]:
# Row of the transformed hold-out set to explain.
i = 10

# class_names must follow the column order of predict_proba (column 0 first),
# so class '0' comes before class '1'; the reverse order swaps the labels in
# the rendered explanation.
# random_state makes LIME's perturbation sampling repeatable.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(training_data = X_train_trf,
                                                feature_names = feat_df['feature_names'].to_list(), 
                                                class_names=['0','1'],
                                                verbose = True,
                                                mode = 'classification',
                                                kernel_width=3,
                                                random_state = 786
                                                )

# LIME calls this with perturbed samples and expects class probabilities back.
predict_fn_xgboost = lambda x: xgboost_model.predict_proba(x).astype(float)
exp = lime_explainer.explain_instance(X_test_trf[i],predict_fn_xgboost, num_features= 10)
exp.show_in_notebook(show_all=False)

In [None]:
import shap

In [None]:
# Names of the columns produced by the preprocessing pipeline.
model_final_features = model_input_pipe.get_feature_names_out(input_features= FEATURES)

# One positional label (f0, f1, ...) per transformed column.
out_feature_list = ['f' + str(position) for position in range(len(model_final_features))]

# Lookup table: positional label -> transformed feature name.
feat_df = pd.DataFrame(data = model_final_features, index = out_feature_list, columns= ['feature_names'])

# Transformed training matrix as a DataFrame with readable column names.
transformed_column_names = feat_df['feature_names'].to_list()
temp_fea_df = pd.DataFrame(X_train_trf, columns= transformed_column_names)
temp_fea_df

In [None]:
type(temp_fea_df)

In [None]:
# Build a SHAP tree explainer for the loaded XGBoost model and compute SHAP
# values for every row of the transformed training frame.
explainer = shap.TreeExplainer(xgboost_model)
shap_values = explainer.shap_values(temp_fea_df)
# Baseline value reported by the explainer; passed to the waterfall plot.
expected_value = explainer.expected_value

In [None]:
# Waterfall plot of the SHAP values for row 79 of the transformed training
# frame, limited to 15 displayed features and saved as a 300-dpi JPEG.
# NOTE(review): `shap.plots._waterfall` is a private module path (leading
# underscore) and may change between shap releases - confirm on upgrade.
explainer_img = shap.plots._waterfall.waterfall_legacy(expected_value, shap_values[79], features = temp_fea_df.loc[79,:], feature_names=temp_fea_df.columns, max_display=15, show=False)
plt.tight_layout()
explainer_img.savefig('shap_feature_importance.jpg', dpi = 300)

In [None]:
# Serialise the explainer object to disk in the working directory.
with open("xgb_explainer.b", "wb") as explainer_file:
    pickle.dump(explainer, explainer_file)

In [None]:
type(shap_values[79])

In [None]:
temp_fea_df.loc[79,:]

In [None]:
expected_value