In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
pd.set_option('display.max_columns',None)

In [3]:
df = pd.read_csv('../data/insurance.csv')

In [4]:
df.head(5) #display head (top 5 rows)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
df.tail(5) #display tail (last 5 rows)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


In [6]:
print(f"Shape: ",df.shape) #get total shape of dataset, total rows and columns
print("Number of Columns:", df.shape[1])
print("Number of Rows:", df.shape[0])

Shape:  (1338, 7)
Number of Columns: 7
Number of Rows: 1338


In [7]:
df.info() #quick info about data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [8]:
df.describe().transpose() #statistics for numerical datatypes

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1338.0,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
bmi,1338.0,30.663397,6.098187,15.96,26.29625,30.4,34.69375,53.13
children,1338.0,1.094918,1.205493,0.0,0.0,1.0,2.0,5.0
charges,1338.0,13270.422265,12110.011237,1121.8739,4740.28715,9382.033,16639.912515,63770.42801


In [9]:
df.isna().sum() #number of missing values per column

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [10]:
df.dropna() #drop rows with any NA values

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [11]:
print("Number of Duplicates: ", df.duplicated().sum())

Number of Duplicates:  1


In [12]:
df.drop_duplicates() #drop rows with duplicate values

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [13]:
df.nunique() #number of unique values in each column

age           47
sex            2
bmi          548
children       6
smoker         2
region         4
charges     1337
dtype: int64

In [14]:
df.columns #show all cloumns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [15]:
numerical_features = [feature for feature in df.columns if df[feature].dtype != 'O']
categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']

print('Numerical Features : {} : {}'.format(len(numerical_features), numerical_features))
print('Categorical Features : {} : {}'.format(len(categorical_features), categorical_features))

Numerical Features : 4 : ['age', 'bmi', 'children', 'charges']
Categorical Features : 3 : ['sex', 'smoker', 'region']


In [16]:
#get unique values in categorical columns
for column in categorical_features:
    unique_values = df[column].unique()
    print(f"Unique values in column '{column}': {unique_values}")

Unique values in column 'sex': ['female' 'male']
Unique values in column 'smoker': ['yes' 'no']
Unique values in column 'region': ['southwest' 'southeast' 'northwest' 'northeast']


In [17]:
# new calculated column bmi_range
df['bmi_range'] = 'normal'
df.loc[(df['bmi'] < 18.5), 'bmi_range'] = 'underweight'
df.loc[(df['bmi'] > 25.0) & (df['bmi'] < 29.9), 'bmi_range'] = 'overweight'
df.loc[(df['bmi'] > 30.0), 'bmi_range'] = 'obese'
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,bmi_range
0,19,female,27.9,0,yes,southwest,16884.924,overweight
1,18,male,33.77,1,no,southeast,1725.5523,obese
2,28,male,33.0,3,no,southeast,4449.462,obese
3,33,male,22.705,0,no,northwest,21984.47061,normal
4,32,male,28.88,0,no,northwest,3866.8552,overweight


In [18]:
# new calculated column age_range
df['age_range'] = 'senior'
df.loc[(df['age'] > 0) & (df['age'] <= 30), 'age_range'] = 'youngster'
df.loc[(df['age'] > 30) & (df['age'] <= 62), 'age_range'] = 'middle-aged'
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,bmi_range,age_range
0,19,female,27.9,0,yes,southwest,16884.924,overweight,youngster
1,18,male,33.77,1,no,southeast,1725.5523,obese,youngster
2,28,male,33.0,3,no,southeast,4449.462,obese,youngster
3,33,male,22.705,0,no,northwest,21984.47061,normal,middle-aged
4,32,male,28.88,0,no,northwest,3866.8552,overweight,middle-aged


In [19]:
young_df = df[df['age_range'] == 'youngster']
young_overweight_percent = (young_df[young_df['bmi_range'] == 'overweight']['age_range'].count() / len(young_df)) * 100
young_obese_percent = (young_df[young_df['bmi_range'] == 'obese']['age_range'].count() / len(young_df)) * 100
young_normal_percent = (young_df[young_df['bmi_range'] == 'normal']['age_range'].count() / len(young_df)) * 100
young_underweight_percent = (young_df[young_df['bmi_range'] == 'underweight']['age_range'].count() / len(young_df)) * 100
print("Young & Normal: {}%".format(round(young_normal_percent,2)))
print("Young & Underweight: {}%".format(round(young_underweight_percent,2)))
print("Young & Overweight: {}%".format(round(young_overweight_percent,2)))
print("Young & Obsese: {}%".format(round(young_obese_percent,2)))

Young & Normal: 21.85%
Young & Underweight: 2.48%
Young & Overweight: 26.8%
Young & Obsese: 48.87%
