In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

# Loading data into Pandas


In [2]:

# Load the medical examination records into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv('medical-examination.csv')

In [3]:
# Preview the first 4 rows
df.head(4)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1


In [4]:
# Preview the last 4 rows
df.tail(4)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1
69999,99999,20540,1,170,72.0,120,80,2,1,0,0,1,0


In [5]:
# Dimensions of the frame as (n_rows, n_columns)
df.shape

(70000, 13)

In [6]:
# Optionally use `id` as the index.
# Left disabled: the positional lookup df.iloc[2, 3] further down returns `height`
# only while `id` remains an ordinary column.
#df = df.set_index('id')

In [7]:
# List the column names
df.columns

Index(['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio'],
      dtype='object')

In [8]:
# Select a single column; the result is a Series
df['age'] # equivalent attribute access: df.age

0        18393
1        20228
2        18857
3        17623
4        17474
         ...  
69995    19240
69996    22601
69997    19066
69998    22431
69999    20540
Name: age, Length: 70000, dtype: int64

In [9]:
# Select several columns at once by passing a list of names (only the first row is shown)
df[['age','gender']].head(1)

Unnamed: 0,age,gender
0,18393,2


In [10]:
# Data type of each column
df.dtypes

id               int64
age              int64
gender           int64
height           int64
weight         float64
ap_hi            int64
ap_lo            int64
cholesterol      int64
gluc             int64
smoke            int64
alco             int64
active           int64
cardio           int64
dtype: object

In [11]:
# Summary of the frame: index range, per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
id             70000 non-null int64
age            70000 non-null int64
gender         70000 non-null int64
height         70000 non-null int64
weight         70000 non-null float64
ap_hi          70000 non-null int64
ap_lo          70000 non-null int64
cholesterol    70000 non-null int64
gluc           70000 non-null int64
smoke          70000 non-null int64
alco           70000 non-null int64
active         70000 non-null int64
cardio         70000 non-null int64
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


In [12]:
# Count the null values in each column
df.isnull().sum()

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [13]:
# Number of columns that contain at least one null value
df.isnull().any().sum()

0

### read each row

In [14]:
# First three rows, selected by integer position
df.iloc[:3]

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1


In [15]:
# Show the row labelled 0 as a Series. Because `weight` is float64 while the other
# columns are int64, the values are upcast and the Series is displayed as float64.
df.loc[0,:]

id                 0.0
age            18393.0
gender             2.0
height           168.0
weight            62.0
ap_hi            110.0
ap_lo             80.0
cholesterol        1.0
gluc               1.0
smoke              0.0
alco               0.0
active             1.0
cardio             0.0
Name: 0, dtype: float64

In [16]:
# Read a single cell by label: the height (165) in the row labelled 2, i.e. the third row
df.loc[2,'height'] # positional equivalent: df.iloc[2,3]

165

In [17]:
# Same cell by integer position: row 2, column 3 (height)
df.iloc[2,3]

165

## create new column 

In [18]:
# Derive a new column from `height` scaled down by 100
# (presumably centimetres -> metres, going by the column name; confirm the source units)
df['Heights(M)'] = df['height'] / 100

In [19]:
# Preview the frame; the derived Heights(M) column appears as the last column
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,Heights(M)
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,1.68
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,1.56
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,1.65
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,1.69
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,1.56


In [20]:
# Remove the original `height` column from the dataset.
# The result is assigned back to `df` instead of using inplace=True, and the
# `columns=` keyword states the axis explicitly instead of relying on axis=1.
df = df.drop(columns=['height'])

In [21]:
# Preview the frame to confirm that the `height` column is gone
df.head()

Unnamed: 0,id,age,gender,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,Heights(M)
0,0,18393,2,62.0,110,80,1,1,0,0,1,0,1.68
1,1,20228,1,85.0,140,90,3,1,0,0,1,1,1.56
2,2,18857,1,64.0,130,70,3,1,0,0,0,1,1.65
3,3,17623,2,82.0,150,100,1,1,0,0,1,1,1.69
4,4,17474,1,56.0,100,60,1,1,0,0,0,0,1.56


In [22]:
# Relabel the gender codes: 2 -> 'M' (male) and 1 -> 'F' (female).
# Series.replace builds a new column, so no string is written into the existing
# int64 column through .loc (an incompatible-dtype assignment that recent pandas
# versions warn about). Any value other than 1 or 2 is left unchanged.
df['gender'] = df['gender'].replace({1: 'F', 2: 'M'})

In [23]:
# Relabel gender code 1 as 'F' (a no-op when no row still holds the code 1)
df.loc[(df['gender'] == 1), 'gender'] = "F"

In [24]:
# Preview the frame; `gender` now shows the labels M / F
df.head()

Unnamed: 0,id,age,gender,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,Heights(M)
0,0,18393,M,62.0,110,80,1,1,0,0,1,0,1.68
1,1,20228,F,85.0,140,90,3,1,0,0,1,1,1.56
2,2,18857,F,64.0,130,70,3,1,0,0,0,1,1.65
3,3,17623,M,82.0,150,100,1,1,0,0,1,1,1.69
4,4,17474,F,56.0,100,60,1,1,0,0,0,0,1.56


In [25]:
# Encode gender numerically: 'M' -> 1 and 'F' -> 0.
# An explicit, vectorised mapping replaces the row-wise lambda whose `else 1` branch
# coded every value other than "F" as male. A value missing from the mapping now
# becomes NaN, so unexpected labels are visible instead of being silently coded as 1.
gender_codes = {'F': 0, 'M': 1}
df['gender'] = df['gender'].map(gender_codes)

In [26]:
# Preview the frame; `gender` is numeric again (M -> 1, F -> 0)
df.head()

Unnamed: 0,id,age,gender,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,Heights(M)
0,0,18393,1,62.0,110,80,1,1,0,0,1,0,1.68
1,1,20228,0,85.0,140,90,3,1,0,0,1,1,1.56
2,2,18857,0,64.0,130,70,3,1,0,0,0,1,1.65
3,3,17623,1,82.0,150,100,1,1,0,0,1,1,1.69
4,4,17474,0,56.0,100,60,1,1,0,0,0,0,1.56


In [27]:
# Preview again (the frame has not changed since the previous cell)
df.head()

Unnamed: 0,id,age,gender,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,Heights(M)
0,0,18393,1,62.0,110,80,1,1,0,0,1,0,1.68
1,1,20228,0,85.0,140,90,3,1,0,0,1,1,1.56
2,2,18857,0,64.0,130,70,3,1,0,0,0,1,1.65
3,3,17623,1,82.0,150,100,1,1,0,0,1,1,1.69
4,4,17474,0,56.0,100,60,1,1,0,0,0,0,1.56


In [28]:
# lets analyze age ,,, lets take only last two digits of the age 
df['age'] = df['age'] % 100

In [29]:
df.head()

Unnamed: 0,id,age,gender,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,Heights(M)
0,0,93,1,62.0,110,80,1,1,0,0,1,0,1.68
1,1,28,0,85.0,140,90,3,1,0,0,1,1,1.56
2,2,57,0,64.0,130,70,3,1,0,0,0,1,1.65
3,3,23,1,82.0,150,100,1,1,0,0,1,1,1.69
4,4,74,0,56.0,100,60,1,1,0,0,0,0,1.56


In [None]:
# Display the frame after all of the transformations above
df