# Mortality Prediction

## Initializing the Python environment

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# from sklearn.model_selection import train_test_split

%matplotlib inline

## Initialize the client library

ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the client library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.

Get your code snippet:

1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).

2. In the left sidebar, navigate to **Model Inventory** and click **+ Register new model**.

3. Enter the model details, making sure to select **Binary classification** as the template and **Marketing/Sales - Attrition/Churn Management** as the use case, and click **Continue**. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))

4. Go to **Getting Started** and click **Copy snippet to clipboard**.

Next, replace this placeholder with your own code snippet:

In [None]:
# Replace with your code snippet
# Placeholder connection cell: the "..." values below are stand-ins and must be
# replaced by the snippet copied from the ValidMind UI (see the steps above).
import validmind as vm

vm.init(
    api_host="https://api.prod.validmind.ai/api/v1/tracking",
    api_key="...",
    api_secret="...",
    model="..."
)

## Load the Demo Dataset

In [None]:
# Columns to pull from the ILEC sample extract; all other columns are skipped
ilec_columns = [
    'Observation_Year', 'Gender', 'Smoker_Status',
    'Insurance_Plan', 'Duration', 'Attained_Age', 'SOA_Guaranteed_Level_Term_Period',
    'Face_Amount_Band', 'Preferred_Class',
    'Number_Of_Deaths', 'Policies_Exposed',
    'SOA_Anticipated_Level_Term_Period', 'SOA_Post_level_Term_Indicator',
    'Expected_Death_QX2015VBT_by_Policy',
    'Issue_Age', 'Issue_Year',
]

# Read the sample file, restricted to the columns listed above
sample_df = pd.read_csv(
    './Data/ILEC 2009-16 20200123 sample_small.csv',
    usecols=ilec_columns,
)

# Derived mortality rate: deaths per policy exposed
sample_df['mort'] = sample_df['Number_Of_Deaths'].div(sample_df['Policies_Exposed'])

sample_df.head()

In [None]:
# Filter pipeline: keep only the rows that satisfy every criterion below.
keep_rows = (
    (sample_df.Expected_Death_QX2015VBT_by_Policy != 0)
    & (sample_df.Smoker_Status != 'Unknown')
    # the leading space in ' Term' is part of the value being matched
    & (sample_df.Insurance_Plan == ' Term')
    # notna() replaces negating isna() with unary minus on a boolean mask
    & sample_df.Preferred_Class.notna()
    & (sample_df.Attained_Age >= 18)
    & (sample_df.Issue_Year >= 1980)
    & (sample_df.SOA_Post_level_Term_Indicator == "Within Level Term")
    & (sample_df.SOA_Anticipated_Level_Term_Period != "Unknown")
    # rows whose mortality rate is NaN or >= 1 are dropped by this comparison
    & (sample_df.mort < 1)
)
df = sample_df[keep_rows]

print(f'Count: {df.shape[0]}')
print()

# Summary statistics of the filtered data
df.describe()

In [None]:
from validmind.test_suites import register_test_suite
from validmind.vm_models import TestPlan, TestSuite


class TabularDataQualityExtra(TestPlan):
    """
    Expanded test plan for data quality on tabular datasets.

    Lists the data-validation tests (correlation, outlier, scatter and
    distribution plots/tables) by their fully qualified ValidMind test IDs.
    """

    # Key under which this plan is registered below
    name = "tabular_data_quality_extra"
    tests = [
        "validmind.data_validation.FeatureTargetCorrelationPlot",
        "validmind.data_validation.IQROutliersBarPlot",
        "validmind.data_validation.IQROutliersTable",
        "validmind.data_validation.ScatterPlot",
        "validmind.data_validation.TabularCategoricalBarPlots",
        "validmind.data_validation.TabularNumericalHistograms",
    ]


class CustomTabularDataset(TestSuite):
    """
    Test suite for tabular datasets.

    Currently runs only the `tabular_data_quality_extra` plan defined above;
    the two commented-out entries are left in place as optional additions.
    """

    # Key under which this suite is registered below
    name = "custom_tabular_dataset"

    test_suites = [
        # "tabular_dataset_description",
        # "tabular_data_quality",
        "tabular_data_quality_extra",
    ]


# Register both classes under the same keys as their `name` attributes so
# they can be referenced by name (e.g. from `vm.run_test_suite`).
register_test_suite("tabular_data_quality_extra", TabularDataQualityExtra)
register_test_suite("custom_tabular_dataset", CustomTabularDataset)

In [None]:
# Wrap the filtered frame as a ValidMind dataset, with `mort` as the target column
vm_dataset = vm.init_dataset(dataset=df, target_column="mort")

# Run the custom data-quality suite registered above, with fail_fast enabled
tabular_suite = vm.run_test_suite(
    "custom_tabular_dataset",
    dataset=vm_dataset,
    fail_fast=True,
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Categorical predictors that get one-hot encoded
cat_vars = [
    'Observation_Year',
    'Gender',
    'Smoker_Status',
    'Face_Amount_Band',
    'Preferred_Class',
    'SOA_Anticipated_Level_Term_Period',
]

# Fit the encoder on the categorical columns and densify the encoded matrix.
# NOTE(review): no `drop=` is passed, so every category level gets its own
# column; combined with the 'Const' intercept added in a later cell the design
# matrix is presumably rank-deficient -- confirm this is intended.
onehot = preprocessing.OneHotEncoder()
results = onehot.fit_transform(df[cat_vars]).toarray()
cat_vars_encoded = list(onehot.get_feature_names_out())

# Append the indicator columns to the original frame, aligned on its index
encoded_df = pd.DataFrame(data=results, columns=cat_vars_encoded, index=df.index)
df = pd.concat([df, encoded_df], axis=1)

In [None]:
# Target variable: observed number of deaths
Y = ['Number_Of_Deaths']

# Predictors (aka input variables): encoded categoricals, two numeric
# covariates and an explicit intercept column
X = cat_vars_encoded + ['Attained_Age', 'Duration', 'Const']

train_df, test_df = train_test_split(df, test_size=0.20, random_state=42)

# Keep the training exposure aside before the column subset below drops it
policies_exposed = train_df['Policies_Exposed']

# Add the constant column with assign(), which builds new frames instead of
# writing into the objects returned by the split (in-place column assignment
# there can trigger pandas' SettingWithCopyWarning), then keep only the
# model columns.
train_df = train_df.assign(Const=1)[X + Y]
test_df = test_df.assign(Const=1)[X + Y]

print(f'Train size: {train_df.shape[0]}, test size: {test_df.shape[0]}')

### GLM modeling 101

In a generalized linear model (GLM), each outcome Y of the dependent variables is assumed to be generated from a particular distribution in an exponential family, a large class of probability distributions that includes the normal, binomial, Poisson and gamma distributions, among others. The mean, $μ$, of the distribution depends on the independent variables, X, through

$$\operatorname{E}(\mathbf{Y} \mid \mathbf{X}) = \boldsymbol{\mu} = g^{-1}(\mathbf{X}\boldsymbol{\beta})$$

where:

- $E(Y|X)$ is the expected value of $Y$ conditional on $X$
- $Xβ$ is the linear predictor, a linear combination of unknown parameters $β$
- $g$ is the link function.

### Model 1: Poisson distribution with log link on count

<i> Target Variable </i> = [Number_Of_Deaths]

<i> Input Variables </i> =  [Observation_Year, Gender, Smoker_Status, Face_Amount_Band, Preferred_Class, Attained_Age, Duration, SOA_Anticipated_Level_Term_Period]

As the <i> target variable</i> is a count measure, we will fit GLM with Poisson distribution and log link. 

Although the target variable is a count, what we actually fit the Poisson model to is the mortality rate (count/exposure), which is achieved by using the log of the exposure as an offset. This is common practice; see
https://en.wikipedia.org/wiki/Poisson_regression

In [None]:
# Poisson family with a log link, matching the count nature of the target
# (number of deaths). The log of the exposure is supplied as the offset.
# NOTE(review): the exposure is also passed as freq_weights in addition to
# the offset -- confirm that using it in both places is intended.
model = sm.GLM(
    endog=train_df[Y],
    exog=train_df[X],
    # `links.Log` is the class name; the lowercase `links.log` alias is deprecated
    family=sm.families.Poisson(sm.families.links.Log()),
    freq_weights=policies_exposed,
    # vectorized log of the exposure (same values as the element-wise apply)
    offset=np.log(policies_exposed),
)
res = model.fit()
res.summary()

In [None]:
# Register the train and test splits for Model 1; the target is the death count
vm_train_ds = vm.init_dataset(
    dataset=train_df,
    target_column="Number_Of_Deaths",
)
vm_test_ds = vm.init_dataset(
    dataset=test_df,
    target_column="Number_Of_Deaths",
)

# Wrap the fitted GLM results together with both datasets
vm_model_1 = vm.init_model(model=res, train_ds=vm_train_ds, test_ds=vm_test_ds)

In [None]:
class RegressionTestsExtra(TestPlan):
    """
    Extra regression test plan.

    Contains a single statsmodels test, the regression coefficients plot,
    referenced by its fully qualified ValidMind test ID.
    """

    # Key under which this plan is registered below
    name = "regression_extra"
    tests = ["validmind.model_validation.statsmodels.RegressionCoeffsPlot"]


class RegressionSuite(TestSuite):
    """
    Custom suite for regression models.

    Combines the extra plan defined above with two suites referenced by
    name only (`regression_model_description`, `regression_models_evaluation`).
    """

    # Key under which this suite is registered below
    name = "custom_regression_suite"

    test_suites = [
        "regression_extra",
        "regression_model_description",
        "regression_models_evaluation",
    ]


# Register both classes under the same keys as their `name` attributes
register_test_suite("regression_extra", RegressionTestsExtra)
register_test_suite("custom_regression_suite", RegressionSuite)

In [None]:
# Run the custom regression suite for Model 1; the model is passed both as
# `model` and as the single entry of `models`
suite_results = vm.run_test_suite(
    "custom_regression_suite", model=vm_model_1, models=[vm_model_1]
)