# Import Libraries

In [None]:
# First we'll import the basic libraries that we'll use in this project
import pandas as pd
import numpy as np
from numpy import cov
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms

import ipywidgets as widgets
from ipywidgets import interact

# Dataset Description

In [None]:
# Read the insurance CSV into a DataFrame and preview its first five rows
data = pd.read_csv(filepath_or_buffer='/content/insurance.csv')
data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
# Print the column names, non-null counts, dtypes and memory usage of the raw dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


Based on `info()`, we can conclude that our dataset consists of 1338 rows and 7 columns. So far it seems that there are no missing values, but we will verify this later.

# Data Cleaning

## Handling Duplicated Values

In [None]:
# keep=False flags every copy of a duplicated row, so all occurrences are displayed
duplicate_mask = data.duplicated(keep=False)
data.loc[duplicate_mask]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
195,19,male,30.59,0,no,northwest,1639.5631
581,19,male,30.59,0,no,northwest,1639.5631


In [None]:
# Keep only the first occurrence of each duplicated row ("first" is the default for `keep`)
data = data.drop_duplicates()
# Re-run the duplicate check; an empty frame confirms that no duplicates remain
data.loc[data.duplicated(keep=False)]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges


**We have handled the duplicated value!**

## Handling the Missing Value

In [None]:
# Count the missing entries in each column (`isnull` is an alias of `isna`)
data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

**Wow, there are no missing values!**

## Handling the Inconsistent format

In [None]:
# List the distinct `age` values to check for inconsistently formatted entries
data["age"].unique()

array([19, 18, 28, 33, 32, 31, 46, 37, 60, 25, 62, 23, 56, 27, 52, 30, 34,
       59, 63, 55, 22, 26, 35, 24, 41, 38, 36, 21, 48, 40, 58, 53, 43, 64,
       20, 61, 44, 57, 29, 45, 54, 49, 47, 51, 42, 50, 39])

In [None]:
# List the distinct `sex` labels to check for inconsistent spelling or casing
data["sex"].unique()

array(['female', 'male'], dtype=object)

In [None]:
# List the distinct `bmi` values to check for inconsistently formatted entries
data["bmi"].unique()

array([27.9  , 33.77 , 33.   , 22.705, 28.88 , 25.74 , 33.44 , 27.74 ,
       29.83 , 25.84 , 26.22 , 26.29 , 34.4  , 39.82 , 42.13 , 24.6  ,
       30.78 , 23.845, 40.3  , 35.3  , 36.005, 32.4  , 34.1  , 31.92 ,
       28.025, 27.72 , 23.085, 32.775, 17.385, 36.3  , 35.6  , 26.315,
       28.6  , 28.31 , 36.4  , 20.425, 32.965, 20.8  , 36.67 , 39.9  ,
       26.6  , 36.63 , 21.78 , 30.8  , 37.05 , 37.3  , 38.665, 34.77 ,
       24.53 , 35.2  , 35.625, 33.63 , 28.   , 34.43 , 28.69 , 36.955,
       31.825, 31.68 , 22.88 , 37.335, 27.36 , 33.66 , 24.7  , 25.935,
       22.42 , 28.9  , 39.1  , 36.19 , 23.98 , 24.75 , 28.5  , 28.1  ,
       32.01 , 27.4  , 34.01 , 29.59 , 35.53 , 39.805, 26.885, 38.285,
       37.62 , 41.23 , 34.8  , 22.895, 31.16 , 27.2  , 26.98 , 39.49 ,
       24.795, 31.3  , 38.28 , 19.95 , 19.3  , 31.6  , 25.46 , 30.115,
       29.92 , 27.5  , 28.4  , 30.875, 27.94 , 35.09 , 29.7  , 35.72 ,
       32.205, 28.595, 49.06 , 27.17 , 23.37 , 37.1  , 23.75 , 28.975,
      

In [None]:
# List the distinct `children` counts to check for unexpected values
data["children"].unique()

array([0, 1, 3, 2, 5, 4])

In [None]:
# List the distinct `smoker` labels to check for inconsistent spelling or casing
data["smoker"].unique()

array(['yes', 'no'], dtype=object)

In [None]:
# List the distinct `region` labels to check for inconsistent spelling or casing
data["region"].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [None]:
# List the distinct `charges` values to check for inconsistently formatted entries
data["charges"].unique()

array([16884.924 ,  1725.5523,  4449.462 , ...,  1629.8335,  2007.945 ,
       29141.3603])

In [None]:
# Preview the first ten rows of the cleaned dataset
data.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [None]:
# Re-check row counts and dtypes after the duplicate row has been removed
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1337 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1337 non-null   int64  
 1   sex       1337 non-null   object 
 2   bmi       1337 non-null   float64
 3   children  1337 non-null   int64  
 4   smoker    1337 non-null   object 
 5   region    1337 non-null   object 
 6   charges   1337 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 83.6+ KB


# Descriptive Analytics

In [None]:
# Summary statistics for every column; include="all" also reports
# count/unique/top/freq for the non-numeric columns
data.describe(include="all")

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
count,1337.0,1337,1337.0,1337.0,1337,1337,1337.0
unique,,2,,,2,4,
top,,male,,,no,southeast,
freq,,675,,,1063,364,
mean,39.222139,,30.663452,1.095737,,,13279.121487
std,14.044333,,6.100468,1.205571,,,12110.359656
min,18.0,,15.96,0.0,,,1121.8739
25%,27.0,,26.29,0.0,,,4746.344
50%,39.0,,30.4,1.0,,,9386.1613
75%,51.0,,34.7,2.0,,,16657.71745


Based on `describe()`, we can conclude that:
- Our oldest respondent is 64 years old, and the average age is about 39
- Slightly more than 50% of our respondents are male (675 of 1337)
- The highest `bmi` among our respondents is about 53
- Half of our respondents have at most 1 child (the median of `children` is 1), while a few have as many as 5
- About 80% of our respondents are non-`smoker`s (1063 of 1337)
- About 27% of our respondents live in the Southeast `region` (364 of 1337), which is the most common region, and
- The average insurance `charges` of our respondents is 13279.121487.

# Analysis

In [None]:
# Maximum, minimum and mean BMI for smokers versus non-smokers
bmi_by_smoker = data.groupby("smoker")[["bmi"]]
bmi_by_smoker.agg(["max", "min", "mean"]).reset_index()

Unnamed: 0_level_0,smoker,bmi,bmi,bmi
Unnamed: 0_level_1,Unnamed: 1_level_1,max,min,mean
0,no,53.13,15.96,30.651853
1,yes,52.58,17.195,30.708449


In [None]:
# Age statistics for every (smoker, sex) combination
age_by_smoker_sex = data.groupby(["smoker", "sex"])[["age"]]
age_by_smoker_sex.agg(["max", "min", "mean", "sum"]).reset_index()

Unnamed: 0_level_0,smoker,sex,age,age,age,age
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,max,min,mean,sum
0,no,female,64,18,39.691042,21711
1,no,male,64,18,39.100775,20176
2,yes,female,64,18,38.608696,4440
3,yes,male,64,18,38.446541,6113


In [None]:
# Insurance charge statistics for smokers versus non-smokers
charges_by_smoker = data.groupby("smoker")[["charges"]]
charges_by_smoker.agg(["max", "min", "mean", "sum"]).reset_index()

Unnamed: 0_level_0,smoker,charges,charges,charges,charges
Unnamed: 0_level_1,Unnamed: 1_level_1,max,min,mean,sum
0,no,36910.60803,1121.8739,8440.660307,8972422.0
1,yes,63770.42801,12829.4551,32050.231832,8781764.0


## Discrete Variable

In [None]:
# Insurance charge statistics per sex. "sum" is included so this cell matches
# the other charges-by-category summaries and the four-column output shown below.
data[["sex", "charges"]].groupby(["sex"]).agg(["max", "min", "mean", "sum"]).reset_index()

Unnamed: 0_level_0,sex,charges,charges,charges,charges
Unnamed: 0_level_1,Unnamed: 1_level_1,max,min,mean,sum
0,female,63770.42801,1607.5101,12569.578844,8321061.0
1,male,62592.87309,1121.8739,13974.998864,9433124.0


In [None]:
# Insurance charge statistics for each region
charges_by_region = data.groupby("region")[["charges"]]
charges_by_region.agg(["max", "min", "mean", "sum"]).reset_index()

Unnamed: 0_level_0,region,charges,charges,charges,charges
Unnamed: 0_level_1,Unnamed: 1_level_1,max,min,mean,sum
0,northeast,58571.07448,1694.7964,13406.384516,4343669.0
1,northwest,60021.39897,1621.3402,12450.840844,4034072.0
2,southeast,63770.42801,1121.8739,14735.411438,5363690.0
3,southwest,52590.82939,1241.565,12346.937377,4012755.0


Covariance is not easy to interpret qualitatively, so we need to compute the correlation instead.

## Correlation

In [None]:
# Pairwise correlation between the numeric columns only.
# `sex`, `smoker` and `region` are object columns: calling corr() on the whole
# frame emits a deprecation warning on pandas 1.5 and raises on pandas >= 2.0,
# so the numeric columns are selected explicitly first.
data.select_dtypes(include="number").corr()

  data.corr()


Unnamed: 0,age,bmi,children,charges
age,1.0,0.109344,0.041536,0.298308
bmi,0.109344,1.0,0.012755,0.198401
children,0.041536,0.012755,1.0,0.067389
charges,0.298308,0.198401,0.067389,1.0
