#### Checking Python version

In [None]:
# Report the interpreter this kernel is actually running. `!python -V` asks
# whichever `python` is first on PATH, which can be a different installation
# from the one executing the notebook.
import sys
sys.version

This notebook is a renamed copy of `persistency_base_model - modified data2.ipynb`; the code is the same.

#### Importing libraries

In [None]:
import os
import warnings
warnings.simplefilter("ignore", UserWarning)

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

from datetime import datetime as dt

import matplotlib.pyplot as plt
from matplotlib.figure import Figure
%matplotlib inline

from feature_engine import encoding as ce
from feature_engine import imputation as mdi
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, accuracy_score
import sklearn.metrics as metrics

#### Importing mlflow and setting tracking uri

In [None]:
import mlflow

# Tracking backend and experiment under which every run in this notebook is recorded.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
MLFLOW_EXPERIMENT_NAME = "persistency-prediction-experiment"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

#### Importing data

In [None]:
# Directory and file name of the input CSV (joined with os.path.join when loading).
INPUT_FILEPATH = 'data'
INPUT_FILENAME = 'master_data_final.csv'

# Column used as the DataFrame index when the CSV is read.
INDEX = 'policy_number'

# Columns passed to `parse_dates` so they are read as dates.
DATE_COLS = [
    'proposal_received_date',
    'policy_issue_date',
    'agent_dob',
    'agent_doj',
]

# Strings that are treated as missing values when the CSV is read.
NA_VALUES = [
    '', 'NA', 'N/A', 'NULL', 'null',
    '?', '*', '#N/A', '#VALUE!',
]

# These columns should be read as strings.
DTYPE_DICT = dict(zipcode='str', agent_code='str')

In [None]:
def load_data(INPUT_FILEPATH, INPUT_FILENAME):
    """Read the input CSV into a DataFrame.

    Args:
        INPUT_FILEPATH: Directory that contains the file.
        INPUT_FILENAME: Name of the CSV file inside that directory.

    Returns:
        The DataFrame produced by ``pd.read_csv``. The index column, missing-value
        markers, date columns and dtype overrides are taken from the module-level
        ``INDEX``, ``NA_VALUES``, ``DATE_COLS`` and ``DTYPE_DICT``; dates are parsed
        day-first.
    """
    csv_path = os.path.join(INPUT_FILEPATH, INPUT_FILENAME)
    read_options = {
        'index_col': INDEX,
        'na_values': NA_VALUES,
        'parse_dates': DATE_COLS,
        'dayfirst': True,
        'dtype': DTYPE_DICT,
    }
    return pd.read_csv(csv_path, **read_options)

In [None]:
input_df = load_data(INPUT_FILEPATH, INPUT_FILENAME)

In [None]:
input_df.head(10)

#### Checking data info

In [None]:
input_df.info()

In [None]:
input_df.columns

#### Target variable distribution

In [None]:
input_df['lapse'].value_counts()/len(input_df)*100

#### Creating feature: time_to_issue

In [None]:
def create_time_to_issue(df) -> pd.DataFrame:
    """Add ``time_to_issue``: whole days from proposal receipt to policy issue.

    The input frame is left untouched; a new frame carrying the extra column is
    returned, so re-running the calling cell always starts from the same input.

    Args:
        df: Frame with datetime columns ``policy_issue_date`` and
            ``proposal_received_date``.

    Returns:
        A copy of ``df`` with an additional ``time_to_issue`` column
        (``policy_issue_date - proposal_received_date`` in days).
    """
    elapsed = df['policy_issue_date'] - df['proposal_received_date']
    # `assign` builds a new frame instead of writing into the caller's object.
    return df.assign(time_to_issue=elapsed.dt.days)

In [None]:
input_df = create_time_to_issue(input_df)
input_df['time_to_issue'].describe()

#### Creating feature: prem_to_income_ratio

In [None]:
def create_prem_to_income_ratio(df) -> pd.DataFrame:
    """Add ``prem_to_income_ratio``: annual premium divided by income.

    Rows whose income is exactly 0 get a ratio of 0 rather than the result of a
    division by zero. The input frame is left untouched; a new frame carrying
    the extra column is returned.

    Args:
        df: Frame with numeric columns ``income`` and ``annual_premium``.

    Returns:
        A copy of ``df`` with an additional ``prem_to_income_ratio`` column.
    """
    income = df['income']
    ratio = np.where(income == 0, 0, df['annual_premium'] / income)
    # `assign` builds a new frame instead of writing into the caller's object.
    return df.assign(prem_to_income_ratio=ratio)

In [None]:
input_df = create_prem_to_income_ratio(input_df)
input_df['prem_to_income_ratio'].describe()

## --------------------------------------------------------------------------------------------------------------------------------

The objective of this exercise is to create a demo-able solution (not necessarily the best possible one, given the augmented data). Hence, we will not dive deep into EDA and hypothesis testing; instead, we will focus on building the ML product using different technologies.

## --------------------------------------------------------------------------------------------------------------------------------

#### Columns to remove

In [None]:
# Columns dropped from the frame before modelling.
COLS_TO_REM = [
    'proposal_received_date',
    'policy_issue_date',
    'zipcode',
    'county',
    'state',
    'agent_code',
    'agent_dob',
    'agent_doj',
]

- `proposal_received_date`: Derived `time_to_issue` from this column
- `policy_issue_date`: Derived `time_to_issue` from this column
- `zipcode`: Too many values, high cardinality
- `county`: Too many values, high cardinality
- `state`: Too many values, high cardinality
- `agent_code`: Id column
- `agent_dob`: Derived `agent_age` from this column
- `agent_doj`: Derived `agent_tenure_days` from this column

In [None]:
def filter_df(df, COLS_TO_REM) -> pd.DataFrame:
    """Return ``df`` without the columns listed in ``COLS_TO_REM``.

    Args:
        df: Frame to filter; it is not modified.
        COLS_TO_REM: Column labels to drop. Every label must exist in ``df``.

    Returns:
        A new frame with the listed columns removed.
    """
    return df.drop(columns=COLS_TO_REM)

In [None]:
temp_df = filter_df(input_df, COLS_TO_REM)
temp_df.shape

In [None]:
input_df.shape

## Feature Engineering Steps

#### Missing Value Imputation

Only one column, `agent_persistency`, has missing values. We impute them with the median.

In [None]:
# Fraction of rows with no agent_persistency value, expressed as a percentage (2 d.p.).
missing_share = temp_df['agent_persistency'].isnull().mean()
agent_persistency_missing_perc = round(missing_share * 100, 2)

print(f'Total missing percentage of column agent_persistency is: {agent_persistency_missing_perc}% ')

In [None]:
# Columns whose missing values are median-imputed by the pipeline's `imputer_num` step.
MISSING_COL = ['agent_persistency']

#### One-Hot Encoding

In [None]:
# Categorical columns that are one-hot encoded by the transformation pipeline.
ONE_HOT_COLS = [
    'owner_gender',
    'marital_status',
    'smoker',
    'medical',
    'education',
    'occupation',
    'payment_freq',
    'agent_status',
    'agent_education',
]

#### Normalisation

We will standardise the columns using `StandardScaler` because the features are on different scales.

#### Creating train test split

In [None]:
temp_df.columns

In [None]:
# Model inputs, one per line. The order is kept exactly as-is because it fixes
# the column order of the matrices built from this list.
FEATURES = [
    'owner_age',
    'owner_gender',
    'marital_status',
    'num_nominee',
    'smoker',
    'medical',
    'education',
    'occupation',
    'experience',
    'income',
    'negative_zipcode',
    'family_member',
    'existing_num_policy',
    'has_critical_health_history',
    'policy_term',
    'payment_freq',
    'annual_premium',
    'sum_insured',
    'agent_status',
    'agent_education',
    'agent_age',
    'agent_tenure_days',
    'agent_persistency',
    'last_6_month_submissions',
    'average_premium',
    'is_reinstated',
    'prev_persistency',
    'num_complaints',
    'target_completion_perc',
    'has_contacted_in_last_6_months',
    'credit_score',
    # Engineered earlier in this notebook.
    'time_to_issue',
    'prem_to_income_ratio',
]

# Label column.
TARGET = 'lapse'

In [None]:
# Stratified 70/30 split so both partitions keep the same class balance of TARGET.
feature_frame = temp_df[FEATURES]
target_series = temp_df[TARGET]

X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    target_series,
    test_size=0.3,
    random_state=786,
    shuffle=True,
    stratify=target_series,
)

X_train.shape, X_test.shape

### Creating data transformation pipeline

In [None]:
# Preprocessing steps, applied in this order: median imputation -> one-hot
# encoding -> standard scaling. No estimator is attached to the pipeline.
median_imputer = mdi.MeanMedianImputer(imputation_method='median',
                                       variables=MISSING_COL)

one_hot_encoder = ce.OneHotEncoder(top_categories=None,
                                   variables=ONE_HOT_COLS,
                                   drop_last=True)

standard_scaler = StandardScaler()

model_input_pipe = Pipeline(steps=[
    ('imputer_num', median_imputer),
    ('onehot_encoder', one_hot_encoder),
    ('normalisation', standard_scaler),
])

In [None]:
X_train_trf = model_input_pipe.fit_transform(X_train)

In [None]:
X_train_trf

In [None]:
# Transform the held-out features with the pipeline fitted on the training split
# (transform only, no re-fit).
X_test_trf = model_input_pipe.transform(X_test)

# Wrap both partitions as DMatrix objects for the native xgboost training API.
train = xgb.DMatrix(data=X_train_trf, label=y_train)
valid = xgb.DMatrix(data=X_test_trf, label=y_test)

### Running xgboost with hyperopt and tracking using mlflow

In [None]:
def objective(params):
    """Train one XGBoost model for a hyperopt trial and record it in MLflow.

    Reads the module-level ``train`` and ``valid`` DMatrix objects and the
    ``y_test`` labels.

    Args:
        params: Parameter dict for this trial; logged to MLflow and passed
            unchanged to ``xgb.train``.

    Returns:
        dict: ``{"loss": -accuracy, "status": STATUS_OK}``. The accuracy is
        negated because hyperopt minimises the loss.
    """
    # NOTE(review): `valid` is built from the test split, so early stopping, the
    # hyperparameter search and the reported accuracy all use the same data --
    # confirm this is acceptable for the demo.
    with mlflow.start_run():
        mlflow.set_tag("developer", "tanmoy")
        mlflow.set_tag("model", "xgboost hyperparam")

        mlflow.log_params(params)

        booster = xgb.train(params = params,
                            dtrain = train,
                            num_boost_round = 1000,
                            evals = [(valid, "validation")],
                            early_stopping_rounds = 50)

        # With the native API the returned booster still holds the rounds trained
        # after the best one, and predict() uses every tree by default. Restrict
        # scoring to the trees up to and including the best iteration.
        best_range = (0, booster.best_iteration + 1)
        y_pred = booster.predict(valid, iteration_range = best_range).round()

        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        return {"loss": -accuracy, 'status': STATUS_OK}

In [None]:
# Hyperopt search space. The two loguniform entries sample exp(uniform(low, high));
# 'objective' and 'seed' are fixed values passed through to xgboost unchanged.
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'binary:logistic',
    'seed': 786,
}

# NOTE(review): fmin is not given an `rstate`, so the sequence of sampled
# configurations is not reproducible between runs -- consider seeding it.
trials = Trials()

best_result = fmin(fn=objective,
                   space=search_space,
                   algo=tpe.suggest,
                   max_evals=20,
                   trials=trials)