# U.S. Medical Insurance Costs

In [124]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from scipy.stats import ttest_ind
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [97]:
# Read the insurance dataset from disk, then preview its column index
# followed by the first rows of the table.
insurance = pd.read_csv("insurance.csv")
print(insurance.columns, insurance.head(), sep="\n")

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')
   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


In [98]:
# Materialise every column of the DataFrame as a plain Python list so that
# list-only helpers (for example ``list.count``) can be used on them.
age, sex, bmi, children, smoker, region, charges = (
    list(insurance[column_name])
    for column_name in ("age", "sex", "bmi", "children", "smoker", "region", "charges")
)

In [99]:
# Age summary: mean (shown to one decimal place), minimum and maximum.
avg_age = np.mean(age)
youngest_age, oldest_age = np.min(age), np.max(age)

# ``!s`` forces str() so the numbers render exactly as with string concatenation.
print(f"Average age of insured individual is: {np.round(avg_age, 1)!s}")

print(f"Youngest age of insured individuals is: {youngest_age!s}")
print(f"Oldest age of insured individuals is: {oldest_age!s}")

Average age of insured individual is: 39.2
Youngest age of insured individuals is: 18
Oldest age of insured individuals is: 64


In [100]:
# Show the distinct region labels present in the data.
print(pd.unique(insurance["region"]))


def _region_summary(region_name):
    """Return (row count, matching rows, mean charges) for one region label.

    The count is taken from the module-level ``region`` list; the rows and the
    mean come from filtering the ``insurance`` DataFrame on the same label.
    """
    region_rows = insurance[insurance["region"] == region_name]
    return region.count(region_name), region_rows, np.mean(region_rows["charges"])


num_southwest, sw, avg_sw_charges = _region_summary("southwest")
num_southeast, se, avg_se_charges = _region_summary("southeast")
num_northwest, nw, avg_nw_charges = _region_summary("northwest")
num_northeast, ne, avg_ne_charges = _region_summary("northeast")

# Reporting order: Southeast, Southwest, Northwest, Northeast.
for region_label, n_insured, mean_charges in (
    ("Southeast", num_southeast, avg_se_charges),
    ("Southwest", num_southwest, avg_sw_charges),
    ("Northwest", num_northwest, avg_nw_charges),
    ("Northeast", num_northeast, avg_ne_charges),
):
    print("Number of insured individuals in the " + region_label + ": " + str(n_insured))
    print("Average charges in the " + region_label + " are: $" + str(np.round(mean_charges, 2)))

['southwest' 'southeast' 'northwest' 'northeast']
Number of insured individuals in the Southeast: 364
Average charges in the Southeast are: $14735.41
Number of insured individuals in the Southwest: 325
Average charges in the Southwest are: $12346.94
Number of insured individuals in the Northwest: 325
Average charges in the Northwest are: $12417.58
Number of insured individuals in the Northeast: 324
Average charges in the Northeast are: $13406.38


In [101]:
# Split the rows on the smoker flag and compare mean charges between the groups.
smoke = insurance.loc[insurance["smoker"] == "yes"]
non_smoke = insurance.loc[insurance["smoker"] == "no"]

avg_smoke_charges = np.mean(smoke["charges"])
avg_non_smoke_charges = np.mean(non_smoke["charges"])
smoke_diff = avg_smoke_charges - avg_non_smoke_charges

# Each message is followed by its dollar amount rounded to cents.
for message, amount in (
    ("Average insurance charges for non-smokers are: $", avg_non_smoke_charges),
    ("Average insurance charges for smokers are: $", avg_smoke_charges),
    ("The difference between average insurance charges of smokers and non-smokers is: $", smoke_diff),
):
    print(message + str(np.round(amount, 2)))

Average insurance charges for non-smokers are: $8434.27
Average insurance charges for smokers are: $32050.23
The difference between average insurance charges of smokers and non-smokers is: $23615.96


In [102]:
# Mean age of the individuals who report at least one child.
has_children = insurance.loc[insurance["children"] >= 1]
age_w_children = np.mean(has_children["age"])

# ``!s`` forces str() so the number renders exactly as with string concatenation.
print(f"The average age of an insured individual who has children is: {np.round(age_w_children, 1)!s}")

The average age of an insured individual who has children is: 39.8


In [126]:
# --- Numeric predictors: Pearson correlation with charges -------------------
# pearsonr returns the correlation coefficient and its two-sided p-value.
corr_age, pval_age = pearsonr(insurance["age"], insurance["charges"])
corr_bmi, pval_bmi = pearsonr(insurance["bmi"], insurance["charges"])
corr_children, pval_children = pearsonr(insurance["children"], insurance["charges"])

print("The correlation between age and insurance charges is: " + str(np.round(corr_age, 3)))
print("The p-value between age and insurance charges is: " + str(pval_age))
print("The correlation between bmi and insurance charges is: " + str(np.round(corr_bmi, 3)))
print("The p-value between bmi and insurance charges is: " + str(pval_bmi))
print("The correlation between children and insurance charges is: " + str(np.round(corr_children, 3)))
print("The p-value between children and insurance charges is: " + str(np.round(pval_children, 5)))

# --- Binary predictors: two-sample t-test on charges ------------------------
# NOTE(review): ttest_ind defaults to equal_var=True (Student's t-test). If the
# groups have clearly different variances or sizes, Welch's test
# (equal_var=False) may be the better choice - confirm before relying on these.
female_charges = insurance.charges[insurance.sex == "female"]
male_charges = insurance.charges[insurance.sex == "male"]
tstat_sex, pval_sex = ttest_ind(female_charges, male_charges)
smoke_charges = insurance.charges[insurance.smoker == "yes"]
no_smoke_charges = insurance.charges[insurance.smoker == "no"]
tstat_smoker, pval_smoker = ttest_ind(no_smoke_charges, smoke_charges)
print("The p-value between sex and insurance charges is: " + str(np.round(pval_sex, 3)))
print("The p-value between smoker status and insurance charges is: " + str(pval_smoker))

# --- Region (four groups): one-way ANOVA, then Tukey HSD --------------------
sw_charges = insurance.charges[insurance.region == "southwest"]
se_charges = insurance.charges[insurance.region == "southeast"]
nw_charges = insurance.charges[insurance.region == "northwest"]
ne_charges = insurance.charges[insurance.region == "northeast"]
fstat_region, pval_region = f_oneway(sw_charges, se_charges, nw_charges, ne_charges)
# Report the omnibus ANOVA result so the pairwise table below can be read in
# context; previously it was computed but never shown.
print("The ANOVA F-statistic between region and insurance charges is: " + str(np.round(fstat_region, 3)))
print("The ANOVA p-value between region and insurance charges is: " + str(np.round(pval_region, 3)))
# Pairwise comparison of mean charges between every pair of regions.
tukey_region_results = pairwise_tukeyhsd(insurance.charges, insurance.region, alpha=0.05)
print(tukey_region_results)

The correlation between age and insurance charges is: 0.299
The p-value between age and insurance charges is: 4.8866933317203816e-29
The correlation between bmi and insurance charges is: 0.198
The p-value between bmi and insurance charges is: 2.459085535117846e-13
The correlation between children and insurance charges is: 0.068
The p-value between children and insurance charges is: 0.01285
The p-value between sex and insurance charges is: 0.036
The p-value between smoker status and insurance charges is: 8.271435842179102e-283
       Multiple Comparison of Means - Tukey HSD, FWER=0.05       
  group1    group2   meandiff  p-adj    lower      upper   reject
-----------------------------------------------------------------
northeast northwest  -988.8091 0.7002 -3428.9473  1451.329  False
northeast southeast  1329.0269 0.4754 -1044.9543 3703.0081  False
northeast southwest -1059.4471 0.6584 -3499.5853  1380.691  False
northwest southeast  2317.8361 0.0583    -54.212 4689.8842  False
northw

The variables with by far the strongest association with insurance charges in this dataset are (in order of statistical significance, smallest p-value first):

    1. Smoking Status
    2. Age
    3. BMI