In [1]:
import pandas as pd

# Load the CSV file (path is relative to the working directory) into a DataFrame.
df = pd.read_csv("new_data.csv")
# Preview the first five rows to confirm the file was parsed as expected.
df.head()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110.0,130,409.1
1,60,117.0,145,479.0
2,60,103.0,135,340.0
3,45,109.0,175,282.4
4,45,117.0,148,406.0


In [7]:
df.describe() # returns count, mean, std, min, quartiles and max for each numeric column

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
count,169.0,167.0,169.0,164.0
mean,63.798817,107.628743,133.686391,377.426829
std,42.308755,14.52065,17.763388,265.960859
min,15.0,80.0,45.0,50.3
25%,45.0,100.0,124.0,252.725
50%,60.0,105.0,131.0,320.7
75%,60.0,111.0,141.0,387.6
max,300.0,159.0,184.0,1860.4


In [9]:
# Summary statistics reporting the 30th, 50th and 70th percentiles
# in place of the default quartiles.
custom_percentiles = [0.3, 0.5, 0.7]
df.describe(percentiles=custom_percentiles)

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
count,169.0,167.0,169.0,164.0
mean,63.798817,107.628743,133.686391,377.426829
std,42.308755,14.52065,17.763388,265.960859
min,15.0,80.0,45.0,50.3
30%,45.0,100.0,126.0,270.76
50%,60.0,105.0,131.0,320.7
70%,60.0,110.0,138.6,379.3
max,300.0,159.0,184.0,1860.4


In [10]:
df.describe().T  # transposed summary: one row per column of df, one column per statistic

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Duration,169.0,63.798817,42.308755,15.0,45.0,60.0,60.0,300.0
Pulse,167.0,107.628743,14.52065,80.0,100.0,105.0,111.0,159.0
Maxpulse,169.0,133.686391,17.763388,45.0,124.0,131.0,141.0,184.0
Calories,164.0,377.426829,265.960859,50.3,252.725,320.7,387.6,1860.4


In [4]:
df.shape # tuple of (number of rows, number of columns)

(169, 4)

In [12]:
df.shape[0]  # first element of the shape tuple: the number of rows

169

In [13]:
df.shape[1]  # second element of the shape tuple: the number of columns

4

In [14]:
# column labels only, returned as a pandas Index object
df.columns 

Index(['Duration', 'Pulse', 'Maxpulse', 'Calories'], dtype='object')

In [15]:
# the column Index can be converted into a plain Python list with list()
list(df.columns)

['Duration', 'Pulse', 'Maxpulse', 'Calories']

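<b>Note:</b> the isnull() method returns a boolean DataFrame of the same shape as the original, with True wherever a value is null (missing) and False elsewhere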

In [16]:
# element-wise null check: True where a value is missing, False otherwise
df.isnull()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
164,False,False,False,False
165,False,False,False,False
166,False,False,False,False
167,False,False,False,False


In [17]:
# summing the boolean mask column by column counts the null values in each column
df.isnull().sum()

Duration    0
Pulse       2
Maxpulse    0
Calories    5
dtype: int64

In [18]:
# a second sum() adds the per-column counts into the total number of null values
df.isnull().sum().sum()

7

<h3 style="color:blue;"> Slicing and Extracting Data in pandas </h3>

<h4 style="color:blue;"> Working with columns </h4>

In [5]:
# first five rows, shown again as a reference for the selections that follow
df.head()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110.0,130,409.1
1,60,117.0,145,479.0
2,60,103.0,135,340.0
3,45,109.0,175,282.4
4,45,117.0,148,406.0


In [7]:
# select a single column by its label; the result is a pandas Series
df['Maxpulse']

0      130
1      145
2      135
3      175
4      148
      ... 
164    140
165    145
166    145
167    150
168    150
Name: Maxpulse, Length: 169, dtype: int64

In [8]:
# select two or more columns by passing a list of labels; the result is a DataFrame
selected_columns = ['Duration', 'Pulse']
df[selected_columns]

Unnamed: 0,Duration,Pulse
0,60,110.0
1,60,117.0
2,60,103.0
3,45,109.0
4,45,117.0
...,...,...
164,60,105.0
165,60,110.0
166,60,115.0
167,75,120.0


<h4 style="color:blue;"> Working with rows </h4>

In [10]:
# select a single row: build a boolean mask that is True only at index label 56
is_row_56 = df.index == 56
df[is_row_56]

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
56,60,118.0,121,413.0


<b>NOTE:</b> two or more rows can be selected by using the .isin() method on the index instead of the == operator

In [22]:
# select several rows using [ ]: keep the rows whose index label is one of 5, 6, 7, 8, 9
wanted_labels = range(5, 10)
df[df.index.isin(wanted_labels)]

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
5,60,102.0,45,300.0
6,60,110.0,136,374.0
7,45,104.0,134,253.3
8,30,109.0,133,195.1
9,60,98.0,124,269.0


<h4 style="color:blue;"> Using .loc[] and .iloc[] to fetch rows </h4>


In [23]:
# label-based lookup of the row whose index label is 1; a single label yields a Series
df.loc[1]

Duration     60.0
Pulse       117.0
Maxpulse    145.0
Calories    479.0
Name: 1, dtype: float64

<b>REMEMBER:</b> when given a single label, .loc[] returns a pandas Series instead of a DataFrame; a slice or a list of labels returns a DataFrame

In [24]:
# label-based slice: with .loc both the start label and the stop label are included
df.loc[10:20]

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
10,60,103.0,147,329.3
11,60,100.0,120,250.7
12,60,,128,345.3
13,60,104.0,132,379.3
14,60,104.0,132,379.3
15,60,104.0,132,379.3
16,60,100.0,120,300.0
17,45,90.0,112,
18,60,103.0,123,323.0
19,45,97.0,125,243.0


In [12]:
# position-based lookup: the row at integer position 10 (positions start at 0)
df.iloc[10]

Duration     60.0
Pulse       103.0
Maxpulse    147.0
Calories    329.3
Name: 10, dtype: float64

In [66]:
# position-based slice: rows at positions 10 up to, but not including, 30
df.iloc[10:30]

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
10,60,103.0,147,329.3
11,60,100.0,120,250.7
12,60,,128,345.3
13,60,104.0,132,379.3
14,45,104.0,132,379.3
15,60,104.0,132,379.3
16,60,100.0,120,300.0
17,45,90.0,112,
18,60,103.0,123,323.0
19,45,97.0,125,243.0


In [13]:
# pick specific rows by passing a list of index labels to .loc
row_labels = [10, 20, 30, 45]
df.loc[row_labels]

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
10,60,103.0,147,329.3
20,52,108.0,131,364.2
30,60,92.0,115,243.0
45,60,99.0,119,273.0


In [27]:
# pick specific rows by passing a list of integer positions to .iloc
row_positions = [10, 20, 30]
df.iloc[row_positions]

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
10,60,103.0,147,329.3
20,52,108.0,131,364.2
30,60,92.0,115,243.0


In [19]:
# select rows and columns in one call: row labels 100 through 110, two named columns
columns_wanted = ['Duration', 'Maxpulse']
df.loc[100:110, columns_wanted]

Unnamed: 0,Duration,Maxpulse
100,20,112
101,90,110
102,90,100
103,90,100
104,30,108
105,30,128
106,180,120
107,30,120
108,90,120
109,210,184


In [18]:
# Disabled export, kept for reference. df1 is only created by the later
# `df1 = df.copy()` cell, so run that cell first before re-enabling this line.
# df1.to_json("modified.json")

In [21]:
# position-based selection: rows at positions 5-9 and the first two columns
df.iloc[5:10, :2]

Unnamed: 0,Duration,Pulse
5,60,102.0
6,60,110.0
7,45,104.0
8,30,109.0
9,60,98.0


In [2]:
# first ten rows
df.head(10)

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110.0,130,409.1
1,60,117.0,145,479.0
2,60,103.0,135,340.0
3,45,109.0,175,282.4
4,45,117.0,148,406.0
5,60,102.0,45,300.0
6,60,110.0,136,374.0
7,45,104.0,134,253.3
8,30,109.0,133,195.1
9,60,98.0,124,269.0


In [5]:
# Overwrite Duration with 12 in every row where it currently equals 60.
# The assignment goes through .loc only and must not be bound back to `df`:
# a chained `df = df.loc[...] = 12` assigns its targets left to right, so `df`
# becomes the integer 12 before the .loc target is evaluated, which raises
# AttributeError and discards the DataFrame.
df.loc[df['Duration'] == 60, 'Duration'] = 12

In [6]:
# display the DataFrame to confirm that Duration values of 60 were replaced by 12
df

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,12,110.0,130,409.1
1,12,117.0,145,479.0
2,12,103.0,135,340.0
3,45,109.0,175,282.4
4,45,117.0,148,406.0
...,...,...,...,...
164,12,105.0,140,290.8
165,12,110.0,145,300.0
166,12,115.0,145,310.2
167,75,120.0,150,320.4


In [78]:
# position-based selection: rows at positions 5-9 and the first four columns
# NOTE(review): the saved output of this cell appears to come from an earlier
# kernel state (its values differ from df.head(10)); re-run to refresh it.
df.iloc[5:10, :4]

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
5,60,102.0,148,300.0
6,80,110.0,136,374.0
7,45,104.0,134,253.3
8,30,109.0,133,195.1
9,60,98.0,124,269.0


<h4 style="color:blue;"> Conditional slicing </h4>

pandas lets you filter data by conditions over row/column values


In [7]:
# keep only the rows whose Duration equals 12
df[df['Duration'] == 12]

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,12,110.0,130,409.1
1,12,117.0,145,479.0
2,12,103.0,135,340.0
5,12,102.0,45,300.0
6,12,110.0,136,374.0
...,...,...,...,...
157,12,100.0,120,270.4
158,12,114.0,150,382.8
164,12,105.0,140,290.8
165,12,110.0,145,300.0


In [83]:
# show 'Pulse', 'Maxpulse' and 'Calories' for the rows whose 'Duration' is greater than 75
long_sessions = df['Duration'] > 75
df.loc[long_sessions, ['Pulse', 'Maxpulse', 'Calories']]

Unnamed: 0,Pulse,Maxpulse,Calories
6,110.0,136,374.0
51,123.0,146,643.1
60,108.0,160,1376.0
61,110.0,137,1034.4
62,109.0,135,853.0
65,90.0,130,800.4
66,105.0,135,873.4
67,107.0,130,816.0
69,108.0,143,1500.2
70,97.0,129,1115.0


In [8]:
# work on a copy so that the column renames applied to df1 do not touch df
df1 = df.copy()

In [9]:
# preview the first five rows of the copy
df1.head()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,12,110.0,130,409.1
1,12,117.0,145,479.0
2,12,103.0,135,340.0
3,45,109.0,175,282.4
4,45,117.0,148,406.0


In [10]:
# rename a column by mapping its old label to the new one;
# rename() returns a new DataFrame, so rebind df1 instead of using inplace=True
df1 = df1.rename(columns={'Calories': 'Cal'})
df1.head()

Unnamed: 0,Duration,Pulse,Maxpulse,Cal
0,12,110.0,130,409.1
1,12,117.0,145,479.0
2,12,103.0,135,340.0
3,45,109.0,175,282.4
4,45,117.0,148,406.0


In [11]:
# replace every column label at once; names are matched to columns by position,
# so the list must contain exactly one name per existing column
df1.columns = ['Duration', 'Pulse', 'Max', 'Calories']
df1.head()

Unnamed: 0,Duration,Pulse,Max,Calories
0,12,110.0,130,409.1
1,12,117.0,145,479.0
2,12,103.0,135,340.0
3,45,109.0,175,282.4
4,45,117.0,148,406.0
