# Mortality Prediction

## Initializing the Python environment

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# from sklearn.model_selection import train_test_split

%matplotlib inline

## Initializing the ValidMind Client Library

Log in to the ValidMind platform with your registered email address, and navigate to the Documentation Projects page.

### Creating a new Documentation Project 

***(Note: if a documentation project has already been created, you can skip this section and head directly to "Finding the project API key and secret".)***

Clicking on "Create a new project" allows you to register a new documentation project for our demo model.

Select the model for this demo (for example, "Mortality Prediction") from the Model drop-down, and "Initial Validation" as Type. Finally, click on "Create Project".

### Finding the project API key and secret 

In the "Client Integration" page of the newly created project, you will find the initialization code that allows the client library to associate documentation and tests with the appropriate project. The initialization code configures the following arguments: 

* api_host: Location of the ValidMind API.
* api_key: Account API key.
* api_secret: Account Secret key.
* project: The project identifier. The `project` argument is mandatory since it allows the library to associate all data collected with a specific account project.


The code snippet can be copied and pasted directly in the cell below to initialize the ValidMind Developer Framework when run:  

In [2]:
## Replace the code below with the code snippet from your project ##
# The "..." values are placeholders: paste the api_key, api_secret and project
# identifier shown on your project's "Client Integration" page.
# NOTE(review): avoid committing real credentials together with the notebook;
# consider loading them from environment variables instead.

import validmind as vm

# Connect the client library to the ValidMind API for the given project.
vm.init(
    api_host = "https://api.prod.validmind.ai/api/v1/tracking",
    api_key = "...",
    api_secret = "...",
    project = "..."
)

2023-09-07 23:35:43,997 - INFO(validmind.api_client): Connected to ValidMind. Project: Mortality Prediction - Initial Validation (clm9hieni03zn7y8hy3l7cink)


## Load the Demo Dataset

In [3]:
# Load the ILEC sample extract, reading only the columns this notebook uses.
ilec_columns = [
    'Observation_Year', 'Gender', 'Smoker_Status',
    'Insurance_Plan', 'Duration', 'Attained_Age', 'SOA_Guaranteed_Level_Term_Period',
    'Face_Amount_Band', 'Preferred_Class',
    'Number_Of_Deaths', 'Policies_Exposed',
    'SOA_Anticipated_Level_Term_Period', 'SOA_Post_level_Term_Indicator',
    'Expected_Death_QX2015VBT_by_Policy',
    'Issue_Age', 'Issue_Year',
]
sample_df = pd.read_csv(
    './Data/ILEC 2009-16 20200123 sample_small.csv',
    usecols=ilec_columns,
)

# Target variable: observed mortality rate = deaths per policy exposed.
sample_df['mort'] = sample_df['Number_Of_Deaths'].div(sample_df['Policies_Exposed'])

# Preview the first rows.
sample_df.head()

Unnamed: 0,Observation_Year,Gender,Smoker_Status,Insurance_Plan,Issue_Age,Duration,Attained_Age,Face_Amount_Band,Issue_Year,Preferred_Class,SOA_Anticipated_Level_Term_Period,SOA_Guaranteed_Level_Term_Period,SOA_Post_level_Term_Indicator,Number_Of_Deaths,Policies_Exposed,Expected_Death_QX2015VBT_by_Policy,mort
0,2009,Female,NonSmoker,Perm,0,2,1,250000-499999,2008,,N/A (Not Term),N/A (Not Term),N/A (Not Term),0,70.098636,0.009814,0.0
1,2009,Female,NonSmoker,Perm,0,3,2,25000-49999,2007,,N/A (Not Term),N/A (Not Term),N/A (Not Term),0,521.52878,0.046938,0.0
2,2009,Female,NonSmoker,Perm,0,5,4,1-9999,2004,,N/A (Not Term),N/A (Not Term),N/A (Not Term),0,5.199693,0.000312,0.0
3,2009,Female,NonSmoker,Perm,0,5,4,50000-99999,2005,,N/A (Not Term),N/A (Not Term),N/A (Not Term),0,418.654782,0.025119,0.0
4,2009,Female,NonSmoker,Perm,0,14,13,25000-49999,1995,,N/A (Not Term),N/A (Not Term),N/A (Not Term),0,121.073952,0.009686,0.0


In [4]:
# Filter pipeline: build a boolean row mask from `sample_df` and keep only the
# rows that satisfy every condition below.
keep_rows = (
    (sample_df['Expected_Death_QX2015VBT_by_Policy'] != 0)
    & (sample_df['Smoker_Status'] != 'Unknown')
    # NOTE(review): the literal ' Term' has a leading space; the recorded run
    # matched 122,719 rows with it, so it presumably mirrors the raw values — confirm.
    & (sample_df['Insurance_Plan'] == ' Term')
    # `.notna()` is the idiomatic negation of `.isna()`; unary minus on a
    # boolean mask is rejected by NumPy for plain boolean arrays.
    & sample_df['Preferred_Class'].notna()
    & (sample_df['Attained_Age'] >= 18)
    & (sample_df['Issue_Year'] >= 1980)
    & (sample_df['SOA_Post_level_Term_Indicator'] == "Within Level Term")
    & (sample_df['SOA_Anticipated_Level_Term_Period'] != "Unknown")
    & (sample_df['mort'] < 1)
)

# Explicit copy so later column assignments on `df` never refer back to
# `sample_df` (avoids SettingWithCopyWarning and keeps `sample_df` untouched).
df = sample_df.loc[keep_rows].copy()

print(f'Count: {df.shape[0]}')
print()

# describe data
df.describe()

Count: 122719



Unnamed: 0,Observation_Year,Issue_Age,Duration,Attained_Age,Issue_Year,Preferred_Class,Number_Of_Deaths,Policies_Exposed,Expected_Death_QX2015VBT_by_Policy,mort
count,122719.0,122719.0,122719.0,122719.0,122719.0,122719.0,122719.0,122719.0,122719.0,122719.0
mean,2014.093857,42.226151,7.951466,49.177617,2006.64871,2.038902,0.018441,12.486255,0.01929384,0.001639
std,1.410709,12.771479,4.795012,13.329959,4.890626,0.964722,0.14809,28.893192,0.05366449,0.023072
min,2012.0,18.0,1.0,18.0,1984.0,1.0,0.0,0.002732,2.192e-07,0.0
25%,2013.0,32.0,4.0,39.0,2003.0,1.0,0.0,0.838798,0.0007785208,0.0
50%,2014.0,42.0,7.0,49.0,2007.0,2.0,0.0,2.632876,0.003332713,0.0
75%,2015.0,52.0,12.0,59.0,2011.0,3.0,0.0,10.816382,0.01476548,0.0
max,2016.0,84.0,30.0,91.0,2016.0,4.0,6.0,641.906968,2.273771,0.981233


In [5]:
from validmind.test_plans import register_test_plan
from validmind.test_suites import register_test_suite
from validmind.vm_models import TestPlan, TestSuite

# Custom test plan grouping six data-validation tests (feature/target
# correlation, IQR outliers, scatter plot, categorical bar plots and numerical
# histograms) so they can be run together on a tabular dataset.
class TabularDataQualityExtra(TestPlan):
    """
    Expanded test plan for data quality on tabular datasets
    """

    # Same ID that is passed to register_test_plan(...) and listed in the
    # test suite's `test_plans`.
    name = "tabular_data_quality_extra"
    # Fully qualified IDs of the tests that make up this plan.
    tests = [
        "validmind.data_validation.FeatureTargetCorrelationPlot",
        "validmind.data_validation.IQROutliersBarPlot",
        "validmind.data_validation.IQROutliersTable",
        "validmind.data_validation.ScatterPlot",
        "validmind.data_validation.TabularCategoricalBarPlots",
        "validmind.data_validation.TabularNumericalHistograms",
    ]

# Custom test suite that currently runs a single test plan,
# "tabular_data_quality_extra".
class CustomTabularDataset(TestSuite):
    """
    Test suite for tabular datasets.
    """

    # Same ID that is passed to register_test_suite(...) and to vm.run_test_suite(...).
    name = "custom_tabular_dataset"

    # IDs of the test plans executed by this suite. The two commented-out
    # entries are disabled (presumably built-in ValidMind plans — confirm);
    # uncomment them to include those plans as well.
    test_plans = [
        # "tabular_dataset_description",
        # "tabular_data_quality",
        "tabular_data_quality_extra",
    ]

# Register the custom plan and suite under their string IDs so that they can be
# referenced by name (the suite lists the plan ID; vm.run_test_suite takes the suite ID).
register_test_plan("tabular_data_quality_extra", TabularDataQualityExtra)
register_test_suite("custom_tabular_dataset", CustomTabularDataset)

2023-09-07 23:35:45,809 - INFO(validmind.test_plans): Registered test plan: tabular_data_quality_extra
2023-09-07 23:35:45,814 - INFO(validmind.test_suites): Registered test suite: custom_tabular_dataset


In [6]:
# Wrap the filtered DataFrame as a ValidMind dataset, with `mort` as the target column.
vm_dataset = vm.init_dataset(
    dataset=df,
    target_column="mort",
)

# Run the suite registered as "custom_tabular_dataset" against that dataset.
# NOTE(review): fail_fast=True presumably stops at the first failing test — confirm
# against the ValidMind documentation.
tabular_suite = vm.run_test_suite("custom_tabular_dataset", dataset=vm_dataset, fail_fast=True)

2023-09-07 23:35:45,865 - INFO(validmind.client): Pandas dataset detected. Initializing VM Dataset instance...


HBox(children=(Label(value='Running test suite...'), IntProgress(value=0, max=12)))


The figure layout has changed to tight



VBox(children=(HTML(value='<h2>Test Suite Results: <i style="color: #DE257E">Custom Tabular Dataset</i></h2><h…

In [7]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Categorical predictors to one-hot encode.
cat_vars = ['Observation_Year',
     'Gender',
     'Smoker_Status',
     'Face_Amount_Band',
     'Preferred_Class',
     'SOA_Anticipated_Level_Term_Period']

# Fit the encoder and build one indicator column per category level.
# NOTE(review): every level is kept and a constant column is added in a later
# cell, so the resulting design matrix is rank-deficient — confirm this is intended.
onehot = preprocessing.OneHotEncoder()
results = onehot.fit_transform(df[cat_vars]).toarray()
cat_vars_encoded = list(onehot.get_feature_names_out())
encoded_df = pd.DataFrame(data = results, columns = cat_vars_encoded, index = df.index)

# Drop indicator columns left over from a previous execution of this cell
# before appending the fresh ones. Without this, re-running the cell would
# add duplicate columns to `df` and duplicate predictors downstream.
df = pd.concat([df.drop(columns = cat_vars_encoded, errors = 'ignore'), encoded_df], axis = 1)

In [8]:
# Target Variable
Y = ['Number_Of_Deaths']

# Predictors (aka Input Variables)
X = cat_vars_encoded + ['Attained_Age', 'Duration',  'Const']

train_df, test_df = train_test_split(df, test_size=0.20, random_state=42)
policies_exposed = train_df['Policies_Exposed']

# add constant variable
train_df['Const'] = 1
test_df['Const'] = 1

train_df = train_df[X + Y]
test_df = test_df[X + Y]
 
print(f'Train size: {train_df.shape[0]}, test size: {test_df.shape[0]}')

Train size: 98175, test size: 24544


### GLM modeling 101

In a generalized linear model (GLM), each outcome Y of the dependent variables is assumed to be generated from a particular distribution in an exponential family, a large class of probability distributions that includes the normal, binomial, Poisson and gamma distributions, among others. The mean, $μ$, of the distribution depends on the independent variables, X, through

<center>${\displaystyle \operatorname {E} (\mathbf {Y} |\mathbf {X} )={\boldsymbol {\mu }}=g^{-1}(\mathbf {X} {\boldsymbol {\beta }})}$</center>

${\displaystyle \operatorname {E} (\mathbf {Y} |\mathbf {X} )={\boldsymbol {\mu }}=g^{-1}(\mathbf {X} {\boldsymbol {\beta }})}$

where:

- $E(Y|X)$ is the expected value of $Y$ conditional on $X$
- $Xβ$ is the linear predictor, a linear combination of unknown parameters $β$
- $g$ is the link function.

### Model 1: Poisson distribution with log link on count

<i> Target Variable </i> = [Number_Of_Deaths]

<i> Input Variables </i> =  [Observation_Year, Gender, Smoker_Status, Face_Amount_Band, Preferred_Class, Attained_Age, Duration, SOA_Anticipated_Level_Term_Period]

As the <i>target variable</i> is a count measure, we will fit a GLM with a Poisson distribution and a log link.

Although the target variable is a count, what we effectively fit the Poisson model to is the mortality rate (count/exposure), by including log(exposure) as an offset. This is common practice; see
https://en.wikipedia.org/wiki/Poisson_regression

In [9]:
# Poisson family with a log link: the response (Number_Of_Deaths) is a count,
# and the log(exposure) offset makes the fit a model of deaths per policy exposed.
model = sm.GLM(
    endog=train_df[Y],
    exog=train_df[X],
    # `links.Log` is the canonical class name; the lowercase `links.log`
    # alias is deprecated in recent statsmodels releases.
    family=sm.families.Poisson(sm.families.links.Log()),
    # NOTE(review): exposure is used both as frequency weights and as the
    # offset, which may double-count it — confirm this is intended.
    freq_weights=policies_exposed,
    # Vectorised log of the exposure (same values as applying np.log row by row).
    offset=np.log(policies_exposed),
)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,Number_Of_Deaths,No. Observations:,98175.0
Model:,GLM,Df Residuals:,1225386.5
Model Family:,Poisson,Df Model:,26.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-289020.0
Date:,"Thu, 07 Sep 2023",Deviance:,395650.0
Time:,23:36:30,Pearson chi2:,1210000.0
No. Iterations:,100,Pseudo R-squ. (CS):,0.6453
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Observation_Year_2012,-0.5422,0.008,-67.262,0.000,-0.558,-0.526
Observation_Year_2013,-0.6406,0.008,-79.634,0.000,-0.656,-0.625
Observation_Year_2014,-0.6724,0.008,-85.265,0.000,-0.688,-0.657
Observation_Year_2015,-0.9193,0.009,-106.417,0.000,-0.936,-0.902
Observation_Year_2016,-0.7501,0.008,-91.862,0.000,-0.766,-0.734
Gender_Female,-1.9709,0.013,-151.749,0.000,-1.996,-1.945
Gender_Male,-1.5538,0.013,-116.925,0.000,-1.580,-1.528
Smoker_Status_NonSmoker,-2.2554,0.016,-138.057,0.000,-2.287,-2.223
Smoker_Status_Smoker,-1.2692,0.019,-67.638,0.000,-1.306,-1.232


In [10]:
# Initialize training and testing datasets for model A
vm_train_ds = vm.init_dataset(dataset=train_df, target_column="Number_Of_Deaths")
vm_test_ds = vm.init_dataset(dataset=test_df, target_column="Number_Of_Deaths")

vm_model_1 = vm.init_model(
    model=res,
    train_ds=vm_train_ds,
    test_ds=vm_test_ds
)

2023-09-07 23:36:31,052 - INFO(validmind.client): Pandas dataset detected. Initializing VM Dataset instance...
2023-09-07 23:36:31,615 - INFO(validmind.client): Pandas dataset detected. Initializing VM Dataset instance...


In [11]:
# Custom test plan containing a single statsmodels model-validation test,
# the regression coefficients plot.
class RegressionTestsExtra(TestPlan):
    """
    Expanded test plan for regression models
    """

    # Same ID that is passed to register_test_plan(...) and listed in the
    # regression suite's `test_plans`.
    name = "regression_extra"
    # Fully qualified IDs of the tests that make up this plan.
    tests = [
        "validmind.model_validation.statsmodels.RegressionCoeffsPlot",
    ]

# Custom test suite combining the "regression_extra" plan with two further
# plans referenced by ID only.
class RegressionSuite(TestSuite):
    """
    Test suite for regression models.
    """

    # Same ID that is passed to register_test_suite(...) and to vm.run_test_suite(...).
    name = "custom_regression_suite"

    # IDs of the test plans executed by this suite. "regression_extra" is
    # defined in this notebook; the other two are not defined here
    # (presumably built-in ValidMind plans — confirm).
    test_plans = [
        "regression_extra",
        "regression_model_description",
        "regression_models_evaluation",
    ]

# Register the custom regression plan and suite under their string IDs so
# they can be referenced by name when running the suite.
register_test_plan("regression_extra", RegressionTestsExtra)
register_test_suite("custom_regression_suite", RegressionSuite)

2023-09-07 23:36:32,764 - INFO(validmind.test_plans): Registered test plan: regression_extra
2023-09-07 23:36:32,769 - INFO(validmind.test_suites): Registered test suite: custom_regression_suite


In [12]:
# Run the suite registered as "custom_regression_suite" for Model 1.
# The same model is passed both as `model` and as a one-element `models` list;
# presumably different tests in the suite read one or the other — confirm.
suite_results = vm.run_test_suite(
    "custom_regression_suite",
    model = vm_model_1,
    models = [vm_model_1]
)

HBox(children=(Label(value='Running test suite...'), IntProgress(value=0, max=10)))

VBox(children=(HTML(value='<h2>Test Suite Results: <i style="color: #DE257E">Custom Regression Suite</i></h2><…