In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Anchor all paths on the parent of the current working directory, then
# derive the raw and processed data folders from a shared `data` root.
working_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(working_dir, os.pardir))
data_root = os.path.join(parent_dir, 'data')
raw_data_dir = os.path.join(data_root, 'raw')
processed_data_dir = os.path.join(data_root, 'processed')

In [3]:
# Load the raw insurance dataset from the raw-data folder.
raw_csv_path = os.path.join(raw_data_dir, 'insurance.csv')
df = pd.read_csv(raw_csv_path)

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Age,Diabetes,BloodPressureProblems,AnyTransplants,AnyChronicDiseases,Height,Weight,KnownAllergies,HistoryOfCancerInFamily,NumberOfMajorSurgeries,PremiumPrice
0,45,0,0,0,0,155,57,0,0,0,25000
1,60,1,0,0,0,180,73,0,0,0,29000
2,36,1,1,0,0,158,59,0,0,1,23000
3,52,1,1,0,1,183,93,0,0,2,28000
4,38,0,0,0,1,166,88,0,0,1,23000


### Major Surgery conducted

In [5]:
# Binary flag: 0 when the recorded number of major surgeries is exactly 0,
# 1 for any other value.
had_major_surgery = df['NumberOfMajorSurgeries'] != 0
df['MajorSurgeryDone'] = had_major_surgery.astype('int64')

### BMI

In [6]:
def calculate_bmi(weight, height):
    """Return the body-mass index: weight divided by squared height in metres.

    Parameters
    ----------
    weight : number or array-like
        Body weight (presumably kilograms; confirm against the dataset).
    height : number or array-like
        Body height in centimetres; it is divided by 100 to get metres.

    Returns
    -------
    Same kind as the inputs (scalar or element-wise array/Series).
    """
    return weight / (height / 100) ** 2

In [7]:
# BMI for every row, computed in a single vectorized call. calculate_bmi only
# uses arithmetic operators, so it works element-wise on whole columns and
# avoids the slow row-by-row DataFrame.apply(axis=1).
df['BMI'] = calculate_bmi(df['Weight'], df['Height'])

### BMI Category

In [8]:
# BMI cut-offs for the categories below. With right=False every interval is
# closed on the left and open on the right:
#   [0, 18.5)  [18.5, 25)  [25, 30)  [30, 35)  [35, inf)
# so the edges must be the round thresholds 25 / 30 / 35. Edges of
# 24.9 / 29.9 / 34.9 would push e.g. a BMI of 24.95 into 'Overweight'.
bins = [0, 18.5, 25, 30, 35, np.inf]
labels = ['Underweight', 'Normal weight', 'Overweight', 'Obesity Class 1', 'Obesity Class 2/3']

df['BMI_Category'] = pd.cut(df['BMI'], bins=bins, labels=labels, right=False)

### Age bin

In [9]:
# Age bands as (lower edge, label) pairs; the last band has no upper limit.
age_bands = [
    (18, 'Young Adult'),
    (25, 'Adult'),
    (40, 'Middle Aged Adults'),
    (55, 'Senior'),
]
age_bins = [lower_edge for lower_edge, _band_label in age_bands] + [np.inf]
age_labels = [band_label for _lower_edge, band_label in age_bands]

# right=False: each band includes its lower edge and excludes its upper edge.
# Ages below the first edge (18) fall outside every band and become NaN.
df['Age_Category'] = pd.cut(x=df['Age'], bins=age_bins, right=False, labels=age_labels)

### Save data to CSV files

In [10]:
# Inspect the first five rows, now including the engineered columns,
# before anything is written to disk.
df.head(n=5)

Unnamed: 0,Age,Diabetes,BloodPressureProblems,AnyTransplants,AnyChronicDiseases,Height,Weight,KnownAllergies,HistoryOfCancerInFamily,NumberOfMajorSurgeries,PremiumPrice,MajorSurgeryDone,BMI,BMI_Category,Age_Category
0,45,0,0,0,0,155,57,0,0,0,25000,0,23.725286,Normal weight,Middle Aged Adults
1,60,1,0,0,0,180,73,0,0,0,29000,0,22.530864,Normal weight,Senior
2,36,1,1,0,0,158,59,0,0,1,23000,1,23.634033,Normal weight,Adult
3,52,1,1,0,1,183,93,0,0,2,28000,1,27.770313,Overweight,Middle Aged Adults
4,38,0,0,0,1,166,88,0,0,1,23000,1,31.934969,Obesity Class 1,Adult


In [11]:
# EDA export: the full frame, including the categorical BMI / age bands.
# Create the output folder first; to_csv raises OSError when the target
# directory does not exist (e.g. on a fresh checkout).
os.makedirs(processed_data_dir, exist_ok=True)
eda_csv_path = os.path.join(processed_data_dir, 'insurance_feature_engineered_for_eda.csv')
df.to_csv(eda_csv_path, index=False)

In [12]:
# Model export: the same data without the two categorical band columns.
# `axis=1` is omitted because it is redundant once `columns=` is given.
# Create the output folder first; to_csv raises OSError when the target
# directory does not exist (e.g. on a fresh checkout).
os.makedirs(processed_data_dir, exist_ok=True)
model_csv_path = os.path.join(processed_data_dir, 'insurance_feature_engineered_for_model.csv')
df.drop(columns=['BMI_Category', 'Age_Category']).to_csv(model_csv_path, index=False)