# ML Zoomcamp Mid-Term Project 
### Travel Insurance Classification Model

The purpose of this model is to predict whether a customer bought the travel insurance package during the introductory offering held in 2019.

## Data Preparation

In [1]:
# import python libraries
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [20]:
# load the dataset from the CSV file (path is relative to the notebook's working directory)
df = pd.read_csv('TravelInsurancePrediction.csv')
# preview the first rows to check that the file was parsed as expected
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,0,31,Government Sector,Yes,400000,6,1,No,No,0
1,1,31,Private Sector/Self Employed,Yes,1250000,7,0,No,No,0
2,2,34,Private Sector/Self Employed,Yes,500000,4,1,No,No,1
3,3,28,Private Sector/Self Employed,Yes,700000,3,1,No,No,0
4,4,28,Private Sector/Self Employed,Yes,700000,8,1,Yes,No,0


In [21]:
# check the size of the dataframe: (number of rows, number of columns)
df.shape

(1987, 10)

In [23]:
# Drop the 'Unnamed: 0' column (its values appear to duplicate the row index,
# so it carries no information for the model).
# errors='ignore' keeps this cell safe to re-run: `del df['Unnamed: 0']` raised a
# KeyError on a second execution because the column was already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [25]:
# preview the first three rows to confirm the remaining columns
df.head(3)

Unnamed: 0,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,31,Government Sector,Yes,400000,6,1,No,No,0
1,31,Private Sector/Self Employed,Yes,1250000,7,0,No,No,0
2,34,Private Sector/Self Employed,Yes,500000,4,1,No,No,1


In [26]:
# statistical summary (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,Age,AnnualIncome,FamilyMembers,ChronicDiseases,TravelInsurance
count,1987.0,1987.0,1987.0,1987.0,1987.0
mean,29.650226,932763.0,4.752894,0.277806,0.357323
std,2.913308,376855.7,1.60965,0.44803,0.479332
min,25.0,300000.0,2.0,0.0,0.0
25%,28.0,600000.0,4.0,0.0,0.0
50%,29.0,900000.0,5.0,0.0,0.0
75%,32.0,1250000.0,6.0,1.0,1.0
max,35.0,1800000.0,9.0,1.0,1.0


In [28]:
# check that there are no null values: each column's non-null count should equal the number of rows
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987 entries, 0 to 1986
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Age                  1987 non-null   int64 
 1   Employment Type      1987 non-null   object
 2   GraduateOrNot        1987 non-null   object
 3   AnnualIncome         1987 non-null   int64 
 4   FamilyMembers        1987 non-null   int64 
 5   ChronicDiseases      1987 non-null   int64 
 6   FrequentFlyer        1987 non-null   object
 7   EverTravelledAbroad  1987 non-null   object
 8   TravelInsurance      1987 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 139.8+ KB


In [29]:
def _to_snake(text):
    """Lower-case a pandas Index/Series of strings and replace spaces with underscores."""
    return text.str.lower().str.replace(' ', '_')


# normalise the column names
df.columns = _to_snake(df.columns)

# names of the text (object-dtype) columns
string_columns = df.dtypes[df.dtypes == 'object'].index.tolist()

# apply the same normalisation to the values of every text column
for name in string_columns:
    df[name] = _to_snake(df[name])

In [31]:
# import sklearn library to split the dataset into train and test set
from sklearn.model_selection import train_test_split

In [32]:
# hold out 20% of the rows as the test set; the remaining 80% is the full training set
df_train_full, df_test = train_test_split(df, test_size = .2, random_state = 1)
# split the full training set again: 33% becomes the validation set, the rest is used for training
df_train, df_val = train_test_split(df_train_full, test_size = .33, random_state = 11)

In [33]:
# extract the target column (travelinsurance) as arrays for the train and validation sets
y_train = df_train['travelinsurance'].values
y_val = df_val['travelinsurance'].values

In [34]:
# remove the target column (travelinsurance) from the train and validation feature frames
for features in (df_train, df_val):
    del features['travelinsurance']

## Exploratory Data Analysis

In [35]:
# number of customers per class: 0 = did not buy the travel insurance package, 1 = bought it
df_train_full['travelinsurance'].value_counts()

0    1024
1     565
Name: travelinsurance, dtype: int64

In [36]:
# overall purchase rate: the mean of the 0/1 target over the full training set
avg = df_train_full['travelinsurance'].mean()
round(avg, 3)

0.356

In [69]:
# flag which feature columns of the training set hold text (object dtype)
is_text = df_train.dtypes == 'object'

# categorical features: the object-dtype columns
categorical = is_text[is_text].index.tolist()

# numerical features: every remaining column
numerical = is_text[~is_text].index.tolist()

## Feature Importance

In [70]:
# import IPython display library
from IPython.display import display

In [71]:
# for every categorical feature, compare each group's purchase rate with the overall rate:
#   mean = purchase rate within the group
#   diff = group rate minus the overall rate
#   risk = group rate relative to the overall rate, minus 1
for col in categorical:
    group_rate = df_train_full.groupby(col)['travelinsurance'].mean()
    df_group = pd.DataFrame({
        'mean': group_rate,
        'diff': group_rate - avg,
        'risk': group_rate / avg - 1,
    })
    display(df_group)

Unnamed: 0_level_0,mean,diff,risk
employment_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
government_sector,0.241901,-0.113669,-0.319681
private_sector/self_employed,0.402309,0.04674,0.13145


Unnamed: 0_level_0,mean,diff,risk
graduateornot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.323529,-0.03204,-0.090109
yes,0.361214,0.005644,0.015874


Unnamed: 0_level_0,mean,diff,risk
frequentflyer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.297992,-0.057578,-0.161931
yes,0.563953,0.208384,0.586057


Unnamed: 0_level_0,mean,diff,risk
evertravelledabroad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.258165,-0.097405,-0.27394
yes,0.768977,0.413407,1.162662


## Feature Importance: Mutual Information

In [72]:
# import mutual info score library
from sklearn.metrics import mutual_info_score

In [73]:
def calculate_mi(series):
    """Score one feature against the target with mutual information.

    Parameters
    ----------
    series : pandas.Series
        Feature values to score. They are compared against
        ``df_train_full.travelinsurance`` (a notebook-level global), so the
        series is expected to come from ``df_train_full``.

    Returns
    -------
    The value returned by ``mutual_info_score`` for the feature and the target.
    """
    target = df_train_full['travelinsurance']
    return mutual_info_score(series, target)

# score every categorical feature and rank them from most to least informative
df_mi = (
    df_train_full[categorical]
    .apply(calculate_mi)
    .sort_values(ascending=False)
    .to_frame(name='MI')
)

display(df_mi.head())

Unnamed: 0,MI
evertravelledabroad,0.085536
frequentflyer,0.025265
employment_type,0.012052
graduateornot,0.000399


## Feature Importance: Correlation

In [74]:
# correlation of each numerical feature with the travelinsurance target (full training set)
df_train_full[numerical].corrwith(df_train_full.travelinsurance).to_frame('correlation')

Unnamed: 0,correlation
age,0.052488
annualincome,0.39661
familymembers,0.082207
chronicdiseases,0.032062


In [75]:
# mean of each numerical feature, split by target class (0 = did not buy, 1 = bought)
df_train_full.groupby(by = 'travelinsurance')[numerical].mean()

Unnamed: 0_level_0,age,annualincome,familymembers,chronicdiseases
travelinsurance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,29.551758,827294.9,4.678711,0.265625
1,29.872566,1137788.0,4.952212,0.295575


In [76]:
# DictVectoriser for encoding and transforming categorical variables
from sklearn.feature_extraction import DictVectorizer

In [86]:
# model features: the categorical columns followed by the numerical columns
columns = [*categorical, *numerical]

# one dict per training row, mapping column name -> value
train_dicts = df_train[columns].to_dict(orient='records')

In [91]:
# DictVectorizer turns the list of row dicts into a feature matrix;
# sparse=False makes it return a dense array instead of a sparse matrix
dv = DictVectorizer(sparse=False)

# learn the feature mapping from the training records and encode them
X_train = dv.fit_transform(train_dicts)

# build the validation records from the same columns as the training records
# (val_dicts was never defined before, so the transform below raised a NameError)
val_dicts = df_val[columns].to_dict(orient='records')

# encode the validation set with the mapping learned on the training set (no refit)
X_val = dv.transform(val_dicts)