In [29]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import LabelEncoder

#### Description
Create another feature called **BMI_group** that groups people based on their **BMI**. The groups should be as follows:<br>
**Underweight**: BMI is less than 18.5.<br>
**Normal**: BMI is 18.5 to 24.9.<br>
**Overweight**: BMI is 25 to 29.9.<br>
**Obese**: BMI is 30 or more.<br>
The grouping is based on WHO standards.<br>

The output should show the first five rows of the resulting dataframe.

In [3]:
# Load the raw insurance records and print a column/dtype/null-count summary.
INSURANCE_CSV = "insurance.csv"
df = pd.read_csv(INSURANCE_CSV)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [4]:
# Bucket BMI into the WHO groups described above:
#   Underweight: bmi < 18.5
#   Normal:      18.5 <= bmi < 25
#   Overweight:  25 <= bmi < 30
#   Obese:       bmi >= 30
# pd.cut defaults to right-closed intervals, which would put a BMI of exactly
# 18.5, 25.0 or 30.0 into the *lower* group. right=False makes each interval
# left-closed ([lo, hi)) so the boundary values land in the group the
# description assigns them to. The last edge is open-ended so no value of
# 30 or more can fall outside the bins and become NaN.
bins = [0, 18.5, 25.0, 30.0, np.inf]
slot = ["Underweight", "Normal", "Overweight", "Obese"]
df["BMI_group"] = pd.cut(df["bmi"], bins=bins, labels=slot, right=False)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,BMI_group
0,19,female,27.9,0,yes,southwest,16884.924,Overweight
1,18,male,33.77,1,no,southeast,1725.5523,Obese
2,28,male,33.0,3,no,southeast,4449.462,Obese
3,33,male,22.705,0,no,northwest,21984.47061,Normal
4,32,male,28.88,0,no,northwest,3866.8552,Overweight


In [5]:
# Replace each categorical column with integer codes.
# The same encoder instance is re-fitted for every column, so after this cell
# `le` only holds the classes of the last column encoded ("region").
# LabelEncoder numbers classes in sorted order, so the BMI_group codes follow
# alphabetical order of the labels rather than the underweight -> obese scale.
le = LabelEncoder()
for column in ("sex", "BMI_group", "smoker", "region"):
    df[column] = le.fit_transform(df[column])
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,BMI_group
0,19,0,27.9,0,1,3,16884.924,2
1,18,1,33.77,1,0,2,1725.5523,1
2,28,1,33.0,3,0,2,4449.462,1
3,33,1,22.705,0,0,1,21984.47061,0
4,32,1,28.88,0,0,1,3866.8552,2


In [6]:
# Pearson correlation between the encoded smoker flag and each numeric feature.
# The value is looked up by label with .loc instead of `[...][1]`: an integer
# key on a string-labelled Series is treated as a position only through a
# deprecated fallback in recent pandas releases.
for feature in ("bmi", "age", "charges"):
    print(df[["smoker", feature]].corr().loc[feature, "smoker"])

0.003750425904980336
-0.02501875153628481
0.7872514304984697


In [40]:
# Read training data and forward-fill gaps: each missing value takes the value
# from the previous row in the same column. `.ffill()` is the supported
# spelling; `fillna(method='ffill')` is deprecated in recent pandas.
train = pd.read_csv("insurance_training.csv").ffill()
# Read test data
# NOTE(review): the test set is not forward-filled like the training set --
# confirm it contains no missing values before predicting on it.
test = pd.read_csv("insurance_test.csv")

In [44]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model that predicts `charges` from every other
# training column, then score the test set.
# The feature columns are named once and used to build both matrices, so the
# test columns are guaranteed to be in the same order as the ones the model was
# fitted on. Both matrices are plain ndarrays, which avoids scikit-learn's
# "X has feature names, but LinearRegression was fitted without feature names"
# warning that is raised when fitting on an array and predicting on a DataFrame.
feature_columns = train.columns.drop('charges')

X_train_lm = train[feature_columns].values
y_train_lm = train['charges'].values
lm = LinearRegression()

lm.fit(X_train_lm, y_train_lm)

X_test_lm = test[feature_columns].values
y_test_pred = lm.predict(X_test_lm)

In [45]:
# Display the charges predicted by the linear model for the test rows.
y_test_pred

array([ 8601.50473149,  7274.0550904 , 36650.25371263,  9461.54577199,
       26838.88000245, 11214.59490109,   746.9499201 , 17092.57028857,
         579.72239046, 10942.16587621, 28489.43359378,  9429.71984003,
        4679.2047214 , 38319.42074502, 40504.42895757, 37314.93027878,
       15365.14630871, 35978.44072843,  8766.60385558, 31252.30803423,
        4069.81000419, 10104.46153229,  2297.11224037,  6851.68116752,
       11221.45974852, 12920.74242631, 14651.27688438,  6109.69106861,
        9357.2435692 ,  2366.47613256,  8983.7468948 , 13236.3512562 ,
        4692.03994244,  4073.80666927,  4678.16136223, 13122.40336412,
        1410.05140545,  8712.04312702, 33967.50330118, 32375.92406848,
        3147.13427597,  4343.89811781, 14296.02834853, 11323.40414164,
        8739.55326901, 11507.11462267,  5053.56495766,  3598.8440298 ,
       35406.56241993,  9065.24732559, 15920.16977013,  1864.2208657 ,
       12547.47699634,  1393.06474673, 13820.60530594, 12710.48258577,
      