In [35]:
# Data Insurance Portfolio Project

# Using Python and CSV to analyze and create predictions
import csv

# newline='' is what the csv module documentation asks for when handing a file
# object to a reader: it lets the reader do its own newline handling so that
# line breaks embedded in quoted fields are parsed correctly.
with open('insurance.csv', newline='') as insurance_data:
    insurance_dict = csv.DictReader(insurance_data)

    # Group each category of data into individual lists.
    # Numeric columns are converted as they are read; the remaining
    # columns are kept as the raw strings found in the file.
    age_list = []
    sex_list = []
    bmi_list = []
    children_list = []
    smoker_list = []
    region_list = []
    charges_list = []
    for row in insurance_dict:
        age_list.append(int(row['age']))
        sex_list.append(row['sex'])
        bmi_list.append(float(row['bmi']))
        children_list.append(int(row['children']))
        smoker_list.append(row['smoker'])
        region_list.append(row['region'])
        charges_list.append(float(row['charges']))

    # import functions created for linear regression project
    # to use in predictive data analysis
    
    # Calculate y value with given linear regression data
    def get_y(m, b, x):
        """Return the y value of the line with slope m and intercept b at x."""
        slope_term = m * x
        return slope_term + b
    
    # Calculate error value of one point
    def calculate_error(m, b, point):
        """Return the absolute vertical distance between a point and a line.

        point is indexed as (x, y); the line is y = m*x + b.
        """
        actual_y = point[1]
        predicted_y = get_y(m, b, point[0])
        return abs(actual_y - predicted_y)

    # Calculate all error of each points combined
    def calculate_all_error(m, b, points):
        """Return the sum of calculate_error(m, b, point) over every point.

        An empty collection of points yields 0.
        """
        return sum(calculate_error(m, b, point) for point in points)

    # Brute-force search for the slope (m) and intercept (b) with the smallest total error
    def calculate_best_error(datapoints, span):
        """Grid-search the line y = m*x + b with the smallest total error.

        Candidate slopes are m = i * 0.1 for i in range(-span, span) and
        candidate intercepts are b = j * 0.1 for j in range(span), so only
        non-negative intercepts are tried. Every (m, b) pair is scored with
        calculate_all_error against all datapoints, i.e. 2 * span * span
        full passes over the data -- this grows quickly with span.

        Parameters:
            datapoints: iterable of (x, y) points; it is iterated once per
                candidate pair, so it must be re-iterable (e.g. a list).
            span: integer controlling the size of the search grid.

        Returns:
            [best_m, best_b]. Ties keep the first pair encountered because
            the comparison is strict. If no candidate beats infinity (for
            example span <= 0), [0, 0] is returned.
        """
        possible_ms = [m * .1 for m in range(-span, span)]
        possible_bs = [b * .1 for b in range(span)]
        smallest_error = float("inf")
        best_m = 0
        best_b = 0
        for m in possible_ms:
            for b in possible_bs:
                current_error = calculate_all_error(m, b, datapoints)
                if current_error < smallest_error:
                    best_m = m
                    best_b = b
                    smallest_error = current_error
        # The leftover debug print("test") was removed so that calling this
        # function no longer writes to stdout.
        return [best_m, best_b]

    # basic data understanding
    # finding the average of listed data based on data type
    def average(data):
        """Summarise a column of data according to the type of its first item.

        * Numeric data (first item is int or float): returns a one-element
          list holding the arithmetic mean, e.g. [2.0].
        * String data (first item is str): returns a list of single-entry
          dicts mapping each distinct value to its share of the column,
          in order of first appearance, e.g. [{'yes': 0.2}, {'no': 0.8}].
        * Empty data: returns [] instead of failing on data[0].
        * Any other first-item type: returns None.

        Only the first item is inspected to choose the branch.
        """
        if len(data) == 0:
            return []

        first = data[0]
        if isinstance(first, (int, float)):
            return [sum(data) / len(data)]

        if isinstance(first, str):
            # Count every value in one pass; dicts keep insertion order, so
            # the result is ordered by first appearance in the column.
            counts = {}
            for word in data:
                counts[word] = counts.get(word, 0) + 1
            total = len(data)
            return [{word: count / total} for word, count in counts.items()]

        # Unsupported element type: nothing to average.
        return None

    # testing the average function
    # Print the summary of every column, one line per column.
    for column in (
        region_list,
        charges_list,
        smoker_list,
        children_list,
        bmi_list,
        sex_list,
        age_list,
    ):
        print(average(column))

    # transforming string listed data into numbered for easy math
    # Map each categorical value to an integer code; any value that is not
    # listed in the code table falls back to the given default code.
    def _encode(values, codes, default):
        return [codes.get(value, default) for value in values]

    sex_list_numbered = _encode(sex_list, {'male': 0}, 1)
    smoker_list_numbered = _encode(smoker_list, {'no': 0}, 1)
    region_list_numbered = _encode(
        region_list,
        {'northeast': 0, 'southeast': 1, 'southwest': 2},
        3,
    )
    
    # Calculating comparisons between data sets to find any relations
    # or statistically interesting observations

    # smokers over charges
    # Failed test that took too much time to calculate correctly
    #smoker_charges_data = calculate_best_error(list(zip(smoker_list_numbered,charges_list)),5000)
    #print(smoker_charges_data)

    # Second test to compare avg of charges for smokers versus non-smokers
    # Pair each smoker flag with its charge, then partition the charges by flag.
    smokers_charges_list = list(zip(smoker_list_numbered, charges_list))
    smokers_only_charges = [
        charge for flag, charge in smokers_charges_list if flag == 1
    ]
    non_smokers_charges = [
        charge for flag, charge in smokers_charges_list if flag != 1
    ]

    # Create reusable function of above code to split data between compared data type
    def split_data_average(first_list, second_list):
        """Average the values of second_list grouped by the labels in first_list.

        The two lists are paired element by element with zip, so any surplus
        elements in the longer list are ignored. Each distinct label that
        appears in a pair gets the arithmetic mean of its paired values.

        Parameters:
            first_list: group label for each record; labels must be hashable
                and mutually sortable.
            second_list: numeric value for each record.

        Returns:
            dict mapping each label to the mean of its values, with keys
            inserted in ascending label order. Empty input gives {}.
        """
        # Collect values per label in a single pass. Building the groups from
        # the zipped pairs (rather than from first_list alone) guarantees that
        # every group has at least one value, so the division below cannot
        # hit an empty group when second_list is the shorter list.
        grouped = {}
        for label, value in zip(first_list, second_list):
            grouped.setdefault(label, []).append(value)

        # Emit the averages in ascending label order.
        return {
            label: sum(grouped[label]) / len(grouped[label])
            for label in sorted(grouped)
        }

    # Find average charges of smokers versus non smokers
    # by hand and with reusable function
    # Hand-split smoker / non-smoker averages first ...
    for charges in (smokers_only_charges, non_smokers_charges):
        print(average(charges))

    # ... then the same comparison via the reusable function, followed by
    # the other (group labels, values) pairings.
    comparisons = (
        (smoker_list_numbered, charges_list),
        (sex_list_numbered, charges_list),
        (children_list, charges_list),
        (children_list, smoker_list_numbered),
        (sex_list_numbered, children_list),
        (region_list_numbered, charges_list),
        (sex_list_numbered, bmi_list),
        (region_list_numbered, smoker_list_numbered),
        (region_list_numbered, children_list),
    )
    for labels, values in comparisons:
        print(split_data_average(labels, values))

    # Most noticeable difference in price was smoker vs non-smoker
    # Percentage of smokers decreases significantly at 4+ children
    # Unclear if statistically different, but west coast charges were both lower than east
    # Unclear if statistically different, but west coast smoker percentages were lower than east

    # Overall thoughts: I tried to use the linear regression project code here but couldn't figure out how to parse through
    # the charges list in a timely manner (I assume I would need a large range to find an accurate linear regression equation).
    # Instead I decided to simply compare averages of specific categories against another category.
    # I had a ton of fun
    
    

[{'southwest': 0.2428998505231689}, {'southeast': 0.27204783258594917}, {'northwest': 0.2428998505231689}, {'northeast': 0.242152466367713}]
[13270.422265141257]
[{'yes': 0.20478325859491778}, {'no': 0.7952167414050823}]
[1.0949177877429]
[30.663396860986538]
[{'female': 0.4947683109118087}, {'male': 0.5052316890881914}]
[39.20702541106129]
[32050.23183153285]
[8434.268297856199]
{0: 8434.268297856199, 1: 32050.23183153285}
{0: 13956.751177721886, 1: 12569.57884383534}
{0: 12365.975601635882, 1: 12731.171831635793, 2: 15073.563733958328, 3: 15355.31836681528, 4: 13850.656311199999, 5: 8786.035247222222}
{0: 0.20034843205574912, 1: 0.1882716049382716, 2: 0.22916666666666666, 3: 0.2484076433121019, 4: 0.12, 5: 0.05555555555555555}
{0: 1.1153846153846154, 1: 1.0740181268882176}
{0: 13406.3845163858, 1: 14735.411437609895, 2: 12346.93737729231, 3: 12417.575373969228}
{0: 30.943128698224832, 1: 30.377749244713023}
{0: 0.20679012345679013, 1: 0.25, 2: 0.17846153846153845, 3: 0.17846153846153