## Data Preparation

In [1]:
# import Python libraries
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# read the travel insurance CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer='TravelInsurancePrediction.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,0,31,Government Sector,Yes,400000,6,1,No,No,0
1,1,31,Private Sector/Self Employed,Yes,1250000,7,0,No,No,0
2,2,34,Private Sector/Self Employed,Yes,500000,4,1,No,No,1
3,3,28,Private Sector/Self Employed,Yes,700000,3,1,No,No,0
4,4,28,Private Sector/Self Employed,Yes,700000,8,1,Yes,No,0


In [3]:
# check the size of the dataframe as a (rows, columns) tuple
df.shape

(1987, 10)

In [4]:
# Drop the 'Unnamed: 0' column (its values mirror the row index in the preview,
# so it is presumably an index saved with the CSV -- confirm against the data source).
# errors='ignore' keeps this cell re-runnable: `del df['Unnamed: 0']` raises
# KeyError on a second execution once the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
# preview the first three rows to confirm the 'Unnamed: 0' column is gone
df.head(n=3)

Unnamed: 0,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,31,Government Sector,Yes,400000,6,1,No,No,0
1,31,Private Sector/Self Employed,Yes,1250000,7,0,No,No,0
2,34,Private Sector/Self Employed,Yes,500000,4,1,No,No,1


In [6]:
# statistical summary (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,Age,AnnualIncome,FamilyMembers,ChronicDiseases,TravelInsurance
count,1987.0,1987.0,1987.0,1987.0,1987.0
mean,29.650226,932763.0,4.752894,0.277806,0.357323
std,2.913308,376855.7,1.60965,0.44803,0.479332
min,25.0,300000.0,2.0,0.0,0.0
25%,28.0,600000.0,4.0,0.0,0.0
50%,29.0,900000.0,5.0,0.0,0.0
75%,32.0,1250000.0,6.0,1.0,1.0
max,35.0,1800000.0,9.0,1.0,1.0


In [7]:
# check that there are no null values (compare the non-null counts with the
# number of entries) and inspect the dtype of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987 entries, 0 to 1986
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Age                  1987 non-null   int64 
 1   Employment Type      1987 non-null   object
 2   GraduateOrNot        1987 non-null   object
 3   AnnualIncome         1987 non-null   int64 
 4   FamilyMembers        1987 non-null   int64 
 5   ChronicDiseases      1987 non-null   int64 
 6   FrequentFlyer        1987 non-null   object
 7   EverTravelledAbroad  1987 non-null   object
 8   TravelInsurance      1987 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 139.8+ KB


In [8]:
# normalise the column labels: lower-case them, then turn spaces into underscores
lowered_labels = df.columns.str.lower()
df.columns = lowered_labels.str.replace(' ', '_')

# collect the names of all text (object-dtype) columns
string_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']

# apply the same normalisation to the values stored in each text column
for col in string_columns:
    lowered_values = df[col].str.lower()
    df[col] = lowered_values.str.replace(' ', '_')

## Exploratory Data Analysis

In [13]:
# number of customers per class: 0 = did not buy the travel insurance package, 1 = bought it
df['travelinsurance'].value_counts()

0    1277
1     710
Name: travelinsurance, dtype: int64

In [14]:
# overall mean of the 0/1 travelinsurance target, i.e. the share of customers who bought it
avg = df['travelinsurance'].mean()
round(avg, 3)

0.357

In [15]:
# names of the categorical (object-dtype) columns
categorical = [name for name, dtype in df.dtypes.items() if dtype == 'object']

# names of the numerical feature columns (the travelinsurance target is left out)
numerical = ['age', 'annualincome', 'familymembers', 'chronicdiseases']

## Feature Importance

In [16]:
# import IPython display library
from IPython.display import display

In [17]:
# loof through the entire categorical dataset to determine the churn and risk rate
for col in categorical:
    df_group = df.groupby(by=col).travelinsurance.agg(['mean'])
    df_group['diff'] = df_group['mean'] - avg
    df_group['risk'] = (df_group['mean'] / avg) - 1
    display(df_group)

Unnamed: 0_level_0,mean,diff,risk
employment_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
government_sector,0.245614,-0.111709,-0.312627
private_sector/self_employed,0.402258,0.044936,0.125757


Unnamed: 0_level_0,mean,diff,risk
graduateornot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.335593,-0.021729,-0.060812
yes,0.361111,0.003789,0.010603


Unnamed: 0_level_0,mean,diff,risk
frequentflyer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.3,-0.057323,-0.160423
yes,0.573141,0.215819,0.603989


Unnamed: 0_level_0,mean,diff,risk
evertravelledabroad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.256378,-0.100944,-0.282502
yes,0.784211,0.426888,1.194685


## Feature Importance: Mutual Information

In [18]:
# import mutual info score library
from sklearn.metrics import mutual_info_score

In [19]:
# helper used to score each categorical column against the target
def calculate_mi(series):
    """Return the mutual information score between `series` and the target.

    The target is read from the notebook-level frame as `df.travelinsurance`.
    """
    target = df.travelinsurance
    return mutual_info_score(series, target)

# score every categorical column, rank from most to least informative,
# and present the result as a one-column table named 'MI'
df_mi = (
    df[categorical]
    .apply(calculate_mi)
    .sort_values(ascending=False)
    .to_frame(name='MI')
)

display(df_mi.head())

Unnamed: 0,MI
evertravelledabroad,0.091751
frequentflyer,0.025983
employment_type,0.011351
graduateornot,0.000181


## Feature Importance: Correlation

In [20]:
# correlation of each numerical column with the travelinsurance target
target = df['travelinsurance']
df[numerical].corrwith(target).to_frame(name='correlation')

Unnamed: 0,correlation
age,0.06106
annualincome,0.396763
familymembers,0.079909
chronicdiseases,0.01819


In [21]:
# mean of every numerical column within each travelinsurance class (0 and 1)
df.groupby('travelinsurance')[numerical].agg('mean')

Unnamed: 0_level_0,age,annualincome,familymembers,chronicdiseases
travelinsurance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,29.517619,821299.9,4.657009,0.271731
1,29.888732,1133239.0,4.925352,0.288732
