# U.S. Medical Insurance Costs

In [1]:
# data file in format of: age,sex,bmi,children,smoker,region,charges

import csv
# Holds the CSV rows (one dict per row) once the loading cell has run.
records = []

In [2]:
# opening and placing all data into a list of dictionaries

# Rebuild the list from scratch so that re-running this cell cannot append
# duplicate rows. newline="" is the mode the csv module expects for files
# it reads, so embedded newlines inside quoted fields are handled correctly.
with open("insurance.csv", newline="") as insurance_file:
    records = list(csv.DictReader(insurance_file))

In [3]:
# function that splits the dataset into one subset per distinct value of a category
def separate_sets(record, category):
    """Group the entries of ``record`` by their value for ``category``.

    Args:
        record: Iterable of mappings (for example rows from ``csv.DictReader``).
            It is traversed exactly once, so one-shot iterables work too.
        category: Key looked up in every entry.

    Returns:
        dict mapping each distinct ``entry[category]`` value to the list of
        entries carrying that value. Keys appear in first-seen order and each
        list keeps the entries in their original order.

    Raises:
        KeyError: If an entry has no ``category`` key.
    """
    groups = {}
    for entry in record:
        # setdefault creates the bucket the first time a value is seen.
        groups.setdefault(entry[category], []).append(entry)
    return groups

In [4]:
# function to calculate the average of a specific category across a list of record dictionaries

def average_calc(contain, category):
    """Print and return the mean of ``category`` over the entries in ``contain``.

    Every ``entry[category]`` is converted with ``float`` before being summed,
    and the mean is rounded to two decimal places. An empty ``contain`` raises
    ``ZeroDivisionError``.
    """
    total = 0
    for entry in contain:
        total += float(entry[category])
    avg = round(total / len(contain), 2)
    print("The average", category, "in this dataset is", avg)
    return avg

In [5]:
# test out average function on categories that can be used
# One average_calc call per column, evaluated and printed in the listed order.
avg_age, avg_cost, avg_bmi, avg_kids = (
    average_calc(records, column)
    for column in ("age", "charges", "bmi", "children")
)

The average age in this dataset is 39.21
The average charges in this dataset is 13270.42
The average bmi in this dataset is 30.66
The average children in this dataset is 1.09


In [6]:
# function to calculate what percentage of their combined total each of two subsets makes up
def calc_percent(data1, title1, data2, title2, main):
    """Print and return the share each of two groups has of their combined size.

    Both percentages are relative to ``len(data1) + len(data2)`` and rounded
    to two decimal places. ``title1`` and ``title2`` only label the printed
    message. ``main`` is accepted but never read. If both groups are empty
    the division raises ``ZeroDivisionError``.

    Returns:
        Tuple ``(percent1, percent2)``.
    """
    size1, size2 = len(data1), len(data2)
    combined = size1 + size2

    percent1 = round(size1 / combined * 100, 2)
    percent2 = round(size2 / combined * 100, 2)
    print("This dataset contains", percent1, "percent", title1, "and", percent2, "percent", title2)
    return percent1, percent2

In [7]:
#determine difference in costs based on sex
sex = separate_sets(records, "sex")
men_percent, women_percent = calc_percent(sex["male"], "men", sex["female"], "women", records)
print("Dataset: only males")
men_avg = average_calc(sex["male"], "charges")
print("Dataset: only females")
women_avg = average_calc(sex["female"], "charges")
# Round the gap before printing: subtracting two floats can leave binary
# artefacts (e.g. 1387.1700000000001) in the dollar amount.
print("On average men pay", "$" + str(round(men_avg - women_avg, 2)), "more for insurance.")


This dataset contains 50.52 percent men and 49.48 percent women
Dataset: only males
The average charges in this dataset is 13956.75
Dataset: only females
The average charges in this dataset is 12569.58
On average men pay $1387.17 more for insurance.


In [8]:
# Look and compare differences for smokers and non-smokers
smoke = separate_sets(records, "smoker")

smoker_percent, non_smoker_percent = calc_percent(smoke["yes"], "smokers", smoke["no"], "non smokers", records)

print("Dataset: smokers")
smoker_avg = average_calc(smoke["yes"], "charges")
print("Dataset: non-smokers")
non_smoker_avg = average_calc(smoke["no"], "charges")
# Round the gap before printing: subtracting two floats can leave binary
# artefacts in the dollar amount.
print("On average smokers pay", "$" + str(round(smoker_avg - non_smoker_avg, 2)), "more for insurance.")

# Split of men and women among smokers only. The "whole" these two groups
# come from is the smoker subset, so that is what is passed as the last argument.
sex_smoker = separate_sets(smoke["yes"], "sex")
m, f = calc_percent(sex_smoker["male"], "men", sex_smoker["female"], "women", smoke["yes"])

This dataset contains 20.48 percent smokers and 79.52 percent non smokers
Dataset: smokers
The average charges in this dataset is 32050.23
Dataset: non-smokers
The average charges in this dataset is 8434.27
On average smokers pay $23615.96 more for insurance.
This dataset contains 58.03 percent men and 41.97 percent women


In [9]:
#function to classify entries into set categories based on a scale

def classify(scale, record, category):
    """Count how many entries fall into each labelled range of ``scale``.

    Args:
        scale: Mapping of exclusive upper bound -> label. An entry belongs to
            the label of the smallest bound that its value is strictly below.
            The bounds may be listed in any order.
        record: Iterable of mappings.
        category: Key whose value is converted with ``float`` for comparison.

    Returns:
        dict mapping every label in ``scale`` to the number of matching
        entries, in the order the labels appear in ``scale``. Entries whose
        value is not below any bound are not counted.
    """
    counts = {label: 0 for label in scale.values()}
    # Test bounds from smallest to largest so each entry lands in the
    # tightest matching range even when ``scale`` was not written in
    # ascending order.
    bounds = sorted(scale)
    for entry in record:
        value = float(entry[category])  # convert once per entry, not once per bound
        for bound in bounds:
            if value < bound:
                counts[scale[bound]] += 1
                break
    return counts

In [10]:
# Each scale maps an upper bound to the label used for values below it.
bmi_scale = {
    18.5: "underweight",
    25.0: "normal",
    30: "overweight",
    35: "obese",
    float("inf"): "extremely obese",
}
age_scale = {
    21: "child",
    30: "young adult",
    40: "adult",
    50: "middle aged",
    60: "old",
    float("inf"): "elderly",
}
children = {
    1: "no children",
    2: "single child",
    3: "two children",
    float("inf"): "more than two children",
}

# Tally the records per range, then show the three tallies in the same order.
bmi_ranges = classify(bmi_scale, records, "bmi")
age_ranges = classify(age_scale, records, "age")
num_children = classify(children, records, "children")

print(bmi_ranges)
print(age_ranges)
print(num_children)

{'underweight': 20, 'normal': 225, 'overweight': 386, 'obese': 391, 'extremely obese': 316}
{'child': 166, 'young adult': 251, 'adult': 257, 'middle aged': 279, 'old': 271, 'elderly': 114}
{'no children': 574, 'single child': 324, 'two children': 240, 'more than two children': 200}


In [11]:
# function to count how many records fall under each value listed in a scale

def count_class(scale, record, category):
    """Count the entries of ``record`` per value listed in ``scale``.

    Args:
        scale: Iterable of the category values to count. A value listed more
            than once is still counted once per matching entry.
        record: Iterable of mappings.
        category: Key looked up in every entry.

    Returns:
        dict mapping every value in ``scale`` to the number of entries whose
        ``entry[category]`` equals it. Entries with a value that is not in
        ``scale`` are ignored.
    """
    counts = {value: 0 for value in scale}
    for entry in record:
        value = entry[category]
        # Dict membership replaces a linear scan of ``scale`` per entry.
        if value in counts:
            counts[value] += 1
    return counts

# Distinct regions in first-seen order (dict keys keep insertion order),
# followed by the number of records in each.
regions = list(dict.fromkeys(row["region"] for row in records))
region_nums = count_class(regions, records, "region")
print(region_nums)

{'southwest': 325, 'southeast': 364, 'northwest': 325, 'northeast': 324}


In [12]:
areas = separate_sets(records, "region")
print("Datasets in order: Southwest, Southeast, Northwest, Northeast")
# The averages are computed, and therefore printed, in the order announced above.
avg_southwest, avg_southeast, avg_northwest, avg_northeast = (
    average_calc(areas[region_name], "charges")
    for region_name in ("southwest", "southeast", "northwest", "northeast")
)

Datasets in order: Southwest, Southeast, Northwest, Northeast
The average charges in this dataset is 12346.94
The average charges in this dataset is 14735.41
The average charges in this dataset is 12417.58
The average charges in this dataset is 13406.38


In [13]:
child = separate_sets(records, "children")
# Walk the child counts that actually occur, in numeric order, rather than
# assuming every count from 0 to len(child) - 1 is present in the data.
for key in sorted(child, key=int):
    print("Num of children:", key)
    average = average_calc(child[key], "charges")
# .get keeps this working when no record has 4 or 5 children.
large_families = len(child.get("4", [])) + len(child.get("5", []))
print("Percent of data with 4 or 5 kids:", round(large_families / len(records) * 100, 2))

Num of children: 0
The average charges in this dataset is 12365.98
Num of children: 1
The average charges in this dataset is 12731.17
Num of children: 2
The average charges in this dataset is 15073.56
Num of children: 3
The average charges in this dataset is 15355.32
Num of children: 4
The average charges in this dataset is 13850.66
Num of children: 5
The average charges in this dataset is 8786.04
Percent of data with 4 or 5 kids: 3.21


In [4]:
#trying pandas
# NOTE(review): this import sits mid-notebook and the cell's execution count
# is out of sequence with the cells above; consider moving the import to the
# first cell and re-running the notebook top to bottom.
import pandas as pd
# Load the same CSV file into a DataFrame.
df1 = pd.read_csv("insurance.csv")
# Last expression of the cell, so the notebook renders the first rows.
df1.head()


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
