# Veritas Fairness Assessment - Life Insurance Underwriting Study (sample code)
This notebook includes samples of code used in the analysis conducted during the life insurance underwriting case study.

It is applicable to insurance underwriting datasets including a life insurance dataset available on
[kaggle](https://www.kaggle.com/c/prudential-life-insurance-assessment/data)


## License

Written by Sankarshan Mridha (Swiss Re) and Laura Alvarez (Accenture) as an extension to Phase 1 Credit Scoring Use Case code https://github.com/veritas-project/phase1/tree/main/credit_scoring 

Contact email: Veritas@mas.gov.sg


Copyright © 2021 Monetary Authority of Singapore

Licensed under the Apache License, Version 2.0 (the "License"); you may not use
this file except in compliance with the License. You may obtain a copy of the
License at http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed
under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

## Imports

In [None]:
# Core Packages
%matplotlib inline 
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport


SEED = 123

In [None]:
# High-res plots
%config InlineBackend.figure_format = 'retina'

In [None]:
import warnings
# NOTE(review): called without a category or module argument, this hides every
# warning raised for the rest of the session (deprecations, numerical
# warnings, ...). Consider narrowing the filter if any of those matter.
warnings.filterwarnings('ignore') 

## Load Data

Please modify the following cell so that the file path points to your copy of the dataset.

In [None]:
# Read the underwriting dataset into the working frame used by every later cell.
# The path is relative, so it resolves against the current working directory.
all_data = pd.read_csv('../dataset.csv')

### Feature engineering

In [None]:
# NOTE: this cell is order-dependent and cannot be re-run on its own output:
# it reads 'Ins_Age' and the Medical_Keyword_* columns, which are renamed /
# dropped further down. Re-execute the data-loading cell before re-running it.

# Interaction feature: BMI multiplied by age.
all_data['BMI_Age'] = all_data['BMI'] * all_data['Ins_Age']

# Row-wise sum over all Medical_Keyword_* columns. This must happen before the
# prefix-based drop below, which removes every column starting with 'Medical'.
med_keyword_columns = all_data.columns[all_data.columns.str.startswith('Medical_Keyword_')]
all_data['Med_Keywords_Count'] = all_data[med_keyword_columns].sum(axis=1)

# Readable names for the source columns that are kept for the analysis.
mapper = {
    'index': 'Insured ID',
    'InsuredInfo_6': 'Gender',
    'InsuredInfo_1': 'Race',
    'InsuredInfo_4': 'Nationality',
    'Family_Hist_1': 'Marital Status',
    'InsuredInfo_3': 'Occupation Type',
    'Employment_Info_2': 'Occupation Industry',
    'Wt': 'Weight',
    'Ht': 'Height',
    'Medical_History_4': 'Smoker Status',
    'Ins_Age': 'Age at Policy Inception',
    'Insurance_History_3': 'No. of Life Policies',
    'Insurance_History_2': 'No. of Accident Policies',
    'Insurance_History_7': 'No. of CI Policies',
    'Product_Info_3': 'Duration in force for Medical Plan'
}

# Rename first: the mapped columns then no longer carry one of the prefixes
# below and therefore survive the drop. Reassigning (instead of inplace=True)
# keeps the step explicit about which frame it produces.
all_data = all_data.rename(columns=mapper)

# Drop columns we do not have confidence in mapping to
# (each prefix is listed once; the original tuple repeated 'Insurance').
drop_columns = ('Medical', 'Family', 'Insurance', 'Product', 'Employment', 'InsuredInfo')
mask = all_data.columns.str.startswith(drop_columns)
all_data = all_data.loc[:, ~mask]
all_data.head()

### Binary outcome labels

In [None]:
# Collapse the ordinal Response into a three-valued Risk label:
#   Response in (0, 2]  ->  0
#   Response in (2, 6]  -> -1   (removed in a later step)
#   Response in (6, 8]  ->  1
all_data['Risk'] = pd.cut(
    all_data['Response'],
    bins=[0, 2, 6, 8],
    labels=[0, -1, 1],
)
# pd.cut returns a categorical column; cast it to plain integers.
all_data['Risk'] = all_data['Risk'].astype(int)
all_data['Risk'].value_counts()

In [None]:
# Keep only the rows with a binary label, i.e. drop rows where Risk == -1
# (the filter is on the derived Risk column, not on the raw Response).
# NOTE(review): the cells visible after this one keep using all_data rather
# than df — confirm which frame the analysis is meant to use.
df = all_data.loc[all_data['Risk']!= -1].reset_index(drop=True)

## Data exploration

### Pandas profiling report

In [None]:
# Build a pandas-profiling report for all_data (which still contains the
# Risk == -1 rows) and write it to output.html.
prof = ProfileReport(all_data)
prof.to_file(output_file='output.html')

In [None]:
# Show the report built in the previous cell inline in the notebook.
prof.to_notebook_iframe()

## Representation

####  Code corresponding to section 2.7.2.1 Examine Data for Unintended Bias in Veritas Document 4 FEAT Principles Assessment Case Studies

### Representation by Gender

In [None]:
# Number of customers per Gender code, drawn as a bar chart.
(
    all_data['Gender']
    .value_counts()
    .plot.bar(
        xlabel='Gender',
        ylabel='# of Customers',
        title='Life Insurance Underwriting',
        color='#FF9933',
    )
);

In [None]:
# (rows, columns) of the working frame.
all_data.shape

In [None]:
# Relative frequency of each Gender code.
all_data['Gender'].value_counts(normalize=True)

#### Difference in representation

In [None]:
# Compute the Gender distribution once and reuse it for both printed figures.
# NOTE(review): the first message treats Gender code 2 as "Female" — confirm
# this against the data dictionary.
gender_share = all_data['Gender'].value_counts(normalize=True)
half_of_code_2 = gender_share.loc[2] * 0.5
share_gap = gender_share.loc[2] - gender_share.loc[1]
print('50% of Female representation is:', round(half_of_code_2, 2))
print('Difference in representation is:', round(share_gap, 2))

### Representation by Race

In [None]:
# Number of customers per Race code, drawn as a bar chart.
(
    all_data['Race']
    .value_counts()
    .plot.bar(
        xlabel='Race',
        ylabel='# of Customers',
        title='Life Insurance Underwriting',
        color='#FF9933',
    )
);

In [None]:
# Relative frequency of each Race code.
all_data['Race'].value_counts(normalize=True)

### Representation by Race binary

In [None]:
# Binary race grouping: Race code 1 -> 'Majority', every other value -> 'Other'.
all_data['race_bin'] = np.where(all_data['Race'].eq(1), 'Majority', 'Other')

In [None]:
# Number of customers in each binary race group, drawn as a bar chart.
(
    all_data['race_bin']
    .value_counts()
    .plot.bar(
        xlabel='Race binary',
        ylabel='# of Customers',
        title='Life Insurance Underwriting',
        color='#FF9933',
    )
);

In [None]:
# Relative frequency of the 'Majority' and 'Other' groups.
all_data['race_bin'].value_counts(normalize=True)

In [None]:
# Absolute number of rows in the 'Majority' and 'Other' groups.
all_data['race_bin'].value_counts()

In [None]:
# Compute the binary-race distribution once and reuse it for both printed figures.
race_bin_share = all_data['race_bin'].value_counts(normalize=True)
majority_share = race_bin_share.loc['Majority']
other_share = race_bin_share.loc['Other']
print('50% of Majority representation is:', round(majority_share * 0.5, 2))
print('Difference in representation is:', round(majority_share - other_share, 2))

## Target Distribution

####  Code corresponding to section 2.7.2.1 Examine Data for Unintended Bias in Veritas Document 4 FEAT Principles Assessment Case Studies

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
def display_prevalence(df, outcome, prot_feature, denominator_level):
    """Plot observed outcome proportions per group next to a prevalence ratio.

    Left panel: stacked bars showing, for every level of ``prot_feature``, the
    share of rows with ``outcome == 1`` ("Proportion Positive") and with
    ``outcome == 0`` ("Proportion Negative"). Shares are normalised per level
    over *all* outcome values present in ``df``, so the two bars only add up
    to 1 when ``outcome`` holds nothing but 0 and 1.

    Right panel: a single bar with the prevalence ratio
    ``P(outcome == 1 | numerator level) / P(outcome == 1 | denominator_level)``
    drawn over a grey band spanning 0.8 to 1.2. The y-axis is fixed to
    [0, 2], so larger ratios are cut off by the axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the ``outcome`` and ``prot_feature`` columns.
    outcome : str
        Name of the outcome column; it must contain the values 0 and 1.
    prot_feature : str
        Name of the column that defines the groups.
    denominator_level
        Level of ``prot_feature`` used as the denominator of the ratio. The
        numerator is the first *other* level in crosstab column order; with
        more than two levels the remaining ones are ignored for the ratio.

    Raises
    ------
    KeyError
        If ``denominator_level`` is not a level of ``prot_feature``, or if
        ``outcome`` lacks the value 0 or 1.
    IndexError
        If ``prot_feature`` has no level other than ``denominator_level``.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    prop = pd.crosstab(df[outcome], df[prot_feature], normalize='columns')
    levels = list(prop.columns.values)

    # Validate the grouping before any figure is built, so a bad argument
    # fails immediately with an explicit message instead of half-way through.
    if denominator_level not in levels:
        raise KeyError(
            f"denominator_level {denominator_level!r} is not a level of "
            f"{prot_feature!r}; available levels: {levels}"
        )
    other_levels = [lev for lev in levels if lev != denominator_level]
    if not other_levels:
        raise IndexError(
            f"{prot_feature!r} has no level other than {denominator_level!r}; "
            "a prevalence ratio needs two groups"
        )
    numerator_level = other_levels[0]

    positive_share = list(prop.loc[1, :])
    negative_share = list(prop.loc[0, :])
    prevalence_ratio = prop.loc[1, numerator_level] / prop.loc[1, denominator_level]

    fig = make_subplots(rows=1, cols=2, subplot_titles=("Proportions of Observed Outcomes", "Prevalence Ratio"))

    # Left panel: stacked positive / negative proportions per level.
    fig.add_trace(
        go.Bar(name='Proportion Positive', x=levels,
               y=positive_share,
               text=[round(x, 2) for x in positive_share],
               textposition='auto',
               marker_color='mediumaquamarine'),
        row=1, col=1)
    fig.add_trace(
        go.Bar(name='Proportion Negative', x=levels,
               y=negative_share,
               text=[round(x, 2) for x in negative_share],
               textposition='auto',
               marker_color='lightslategrey'),
        row=1, col=1)
    fig.update_layout(barmode='stack', template='plotly_white',
                      title='Observed Data', width=1000, height=400)

    # Right panel: zero-height bars at x=0 and x=1 frame the single ratio bar
    # at x=0.5 on the fixed [0, 1] x-range.
    fig.add_trace(
        go.Bar(x=[0, 0.5, 1],
               y=[0, prevalence_ratio, 0],
               width=0.2,
               name='Prevalence Ratio',
               marker_color='orange'),
        row=1, col=2)

    fig.update_yaxes(range=[0, 2], row=1, col=2)
    fig.update_xaxes(range=[0, 1], showticklabels=False, row=1, col=2)

    # Grey band between 0.8 and 1.2 on the ratio panel.
    fig.update_layout(
        shapes=[
            dict(
                type="rect",
                xref='x2',
                yref="y2",
                x0=0,
                y0=0.8,
                x1=1,
                y1=1.2,
                fillcolor='lightgrey',
                opacity=0.5,
                layer="below",
                line_width=0,
            )])

    fig.show()


In [None]:
# Relative frequency of each Risk label in all_data
# (all_data has not been filtered, so the -1 label is still included).
all_data['Risk'].value_counts(normalize=True)

In [None]:
# Number of customers per Risk label, drawn as a bar chart.
# NOTE(review): the x-axis is labelled 'Response' although the plotted column
# is Risk — confirm the intended label.
(
    all_data['Risk']
    .value_counts()
    .plot.bar(
        xlabel='Response',
        ylabel='# of Customers',
        title='Life Insurance Underwriting',
        color='#FF9933',
    )
);


### Target by Gender

In [None]:
# Distribution of Risk within each Gender code (every column sums to 1).
prop_gender = pd.crosstab(
    index=all_data['Risk'],
    columns=all_data['Gender'],
    normalize='columns',
)
prop_gender

In [None]:
# Outcome proportions by Gender plus the prevalence ratio; Gender code 2 is
# passed as the denominator level of the ratio.
display_prevalence(all_data, 'Risk', 'Gender',2)

### Target by Race

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
def display_prop_positive(df, outcome, prot_feature):
    """Show stacked positive / negative outcome proportions per group.

    For every level of ``prot_feature`` the chart stacks the share of rows
    with ``outcome == 1`` ("Proportion Positive") and the share with
    ``outcome == 0`` ("Proportion Negative"). Shares are normalised per level
    over all outcome values present in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the ``outcome`` and ``prot_feature`` columns.
    outcome : str
        Name of the outcome column; it must contain the values 0 and 1.
    prot_feature : str
        Name of the column that defines the groups.

    Returns
    -------
    None
        The figure is rendered with ``show()``.
    """
    shares = pd.crosstab(df[outcome], df[prot_feature], normalize='columns')
    groups = list(shares.columns.values)

    # (trace name, outcome value, bar colour), in stacking order.
    bar_specs = (
        ('Proportion Positive', 1, 'mediumaquamarine'),
        ('Proportion Negative', 0, 'lightslategrey'),
    )

    figure = make_subplots(rows=1, cols=1)
    for trace_name, outcome_value, colour in bar_specs:
        heights = list(shares.loc[outcome_value, :])
        figure.add_trace(
            go.Bar(
                name=trace_name,
                x=groups,
                y=heights,
                text=[round(height, 2) for height in heights],
                textposition='auto',
                marker_color=colour,
            ),
            row=1,
            col=1,
        )

    figure.update_layout(
        barmode='stack',
        template='plotly_white',
        title='Observed Data',
        width=1000,
        height=500,
    )

    figure.show()


In [None]:
# Distribution of Risk within each Race code (every column sums to 1).
prop_race = pd.crosstab(
    index=all_data['Risk'],
    columns=all_data['Race'],
    normalize='columns',
)
prop_race

In [None]:
# Stacked outcome proportions for every Race code; display_prop_positive draws
# the proportions chart only, without a prevalence ratio.
display_prop_positive(all_data, 'Risk', 'Race')

### Target by Race binary

In [None]:
# Outcome proportions by binary race group plus the prevalence ratio;
# 'Majority' is passed as the denominator level of the ratio.
display_prevalence(all_data, 'Risk', 'race_bin','Majority')