## Introduction to Pandas

####  Import Pandas

In [1]:
import pandas as pd

#### Getting Data Into Pandas

Data Source: https://data.covid19india.org/

In [2]:
covid_df = pd.read_csv('india_covid_districts.csv')
covid_df

Unnamed: 0,Date,State,District,Confirmed,Recovered,Deceased,Other,Tested
0,2020-04-26,Andaman and Nicobar Islands,Unknown,33,11,0,0,
1,2020-04-26,Andhra Pradesh,Anantapur,53,14,4,0,
2,2020-04-26,Andhra Pradesh,Chittoor,73,13,0,0,
3,2020-04-26,Andhra Pradesh,East Godavari,39,12,0,0,
4,2020-04-26,Andhra Pradesh,Guntur,214,29,8,0,
...,...,...,...,...,...,...,...,...
344121,2021-10-09,West Bengal,Purba Bardhaman,41530,41027,195,0,
344122,2021-10-09,West Bengal,Purba Medinipur,62804,62114,395,0,
344123,2021-10-09,West Bengal,Purulia,19370,19234,113,0,
344124,2021-10-09,West Bengal,South 24 Parganas,99604,97731,1316,0,


In [3]:
covid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344126 entries, 0 to 344125
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Date       344126 non-null  object 
 1   State      344126 non-null  object 
 2   District   344126 non-null  object 
 3   Confirmed  344126 non-null  int64  
 4   Recovered  344126 non-null  int64  
 5   Deceased   344126 non-null  int64  
 6   Other      344126 non-null  int64  
 7   Tested     258862 non-null  float64
dtypes: float64(1), int64(4), object(3)
memory usage: 21.0+ MB


#### Specifying datetime fields

In [4]:
date_time_cols = ['Date']
covid_df = pd.read_csv('india_covid_districts.csv', parse_dates = date_time_cols)
covid_df

Unnamed: 0,Date,State,District,Confirmed,Recovered,Deceased,Other,Tested
0,2020-04-26,Andaman and Nicobar Islands,Unknown,33,11,0,0,
1,2020-04-26,Andhra Pradesh,Anantapur,53,14,4,0,
2,2020-04-26,Andhra Pradesh,Chittoor,73,13,0,0,
3,2020-04-26,Andhra Pradesh,East Godavari,39,12,0,0,
4,2020-04-26,Andhra Pradesh,Guntur,214,29,8,0,
...,...,...,...,...,...,...,...,...
344121,2021-10-09,West Bengal,Purba Bardhaman,41530,41027,195,0,
344122,2021-10-09,West Bengal,Purba Medinipur,62804,62114,395,0,
344123,2021-10-09,West Bengal,Purulia,19370,19234,113,0,
344124,2021-10-09,West Bengal,South 24 Parganas,99604,97731,1316,0,


In [5]:
covid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344126 entries, 0 to 344125
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   Date       344126 non-null  datetime64[ns]
 1   State      344126 non-null  object        
 2   District   344126 non-null  object        
 3   Confirmed  344126 non-null  int64         
 4   Recovered  344126 non-null  int64         
 5   Deceased   344126 non-null  int64         
 6   Other      344126 non-null  int64         
 7   Tested     258862 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(4), object(2)
memory usage: 21.0+ MB


In [6]:
type(covid_df)

pandas.core.frame.DataFrame

In [7]:
type(covid_df['Confirmed'])

pandas.core.series.Series

In [8]:
covid_df.index

RangeIndex(start=0, stop=344126, step=1)

In [9]:
covid_df.columns

Index(['Date', 'State', 'District', 'Confirmed', 'Recovered', 'Deceased',
       'Other', 'Tested'],
      dtype='object')

### Changing the index of a DataFrame

In [10]:
state_idx = covid_df.set_index('State')
state_idx

Unnamed: 0_level_0,Date,District,Confirmed,Recovered,Deceased,Other,Tested
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Andaman and Nicobar Islands,2020-04-26,Unknown,33,11,0,0,
Andhra Pradesh,2020-04-26,Anantapur,53,14,4,0,
Andhra Pradesh,2020-04-26,Chittoor,73,13,0,0,
Andhra Pradesh,2020-04-26,East Godavari,39,12,0,0,
Andhra Pradesh,2020-04-26,Guntur,214,29,8,0,
...,...,...,...,...,...,...,...
West Bengal,2021-10-09,Purba Bardhaman,41530,41027,195,0,
West Bengal,2021-10-09,Purba Medinipur,62804,62114,395,0,
West Bengal,2021-10-09,Purulia,19370,19234,113,0,
West Bengal,2021-10-09,South 24 Parganas,99604,97731,1316,0,


In [11]:
state_idx.info()

<class 'pandas.core.frame.DataFrame'>
Index: 344126 entries, Andaman and Nicobar Islands to West Bengal
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   Date       344126 non-null  datetime64[ns]
 1   District   344126 non-null  object        
 2   Confirmed  344126 non-null  int64         
 3   Recovered  344126 non-null  int64         
 4   Deceased   344126 non-null  int64         
 5   Other      344126 non-null  int64         
 6   Tested     258862 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(4), object(1)
memory usage: 21.0+ MB


### Pandas Series

In [12]:
state_idx['District']

State
Andaman and Nicobar Islands              Unknown
Andhra Pradesh                         Anantapur
Andhra Pradesh                          Chittoor
Andhra Pradesh                     East Godavari
Andhra Pradesh                            Guntur
                                     ...        
West Bengal                      Purba Bardhaman
West Bengal                      Purba Medinipur
West Bengal                              Purulia
West Bengal                    South 24 Parganas
West Bengal                       Uttar Dinajpur
Name: District, Length: 344126, dtype: object

### Inspecting the top n rows with head()

In [13]:
covid_df.head()

Unnamed: 0,Date,State,District,Confirmed,Recovered,Deceased,Other,Tested
0,2020-04-26,Andaman and Nicobar Islands,Unknown,33,11,0,0,
1,2020-04-26,Andhra Pradesh,Anantapur,53,14,4,0,
2,2020-04-26,Andhra Pradesh,Chittoor,73,13,0,0,
3,2020-04-26,Andhra Pradesh,East Godavari,39,12,0,0,
4,2020-04-26,Andhra Pradesh,Guntur,214,29,8,0,


In [14]:
covid_df.head(20)

Unnamed: 0,Date,State,District,Confirmed,Recovered,Deceased,Other,Tested
0,2020-04-26,Andaman and Nicobar Islands,Unknown,33,11,0,0,
1,2020-04-26,Andhra Pradesh,Anantapur,53,14,4,0,
2,2020-04-26,Andhra Pradesh,Chittoor,73,13,0,0,
3,2020-04-26,Andhra Pradesh,East Godavari,39,12,0,0,
4,2020-04-26,Andhra Pradesh,Guntur,214,29,8,0,
5,2020-04-26,Andhra Pradesh,Krishna,177,29,8,0,
6,2020-04-26,Andhra Pradesh,Kurnool,279,31,9,0,
7,2020-04-26,Andhra Pradesh,Prakasam,56,23,0,0,
8,2020-04-26,Andhra Pradesh,S.P.S. Nellore,72,23,2,0,
9,2020-04-26,Andhra Pradesh,Srikakulam,3,0,0,0,


### Frequency Distribution with value_counts

In [15]:
covid_df['State'].value_counts()

Uttar Pradesh                               39738
Madhya Pradesh                              27409
Tamil Nadu                                  21173
Bihar                                       20115
Rajasthan                                   19594
Maharashtra                                 19115
Gujarat                                     18044
Karnataka                                   16290
Odisha                                      16083
Chhattisgarh                                14918
Arunachal Pradesh                           12944
West Bengal                                 12581
Jharkhand                                   12497
Haryana                                     11872
Punjab                                      11695
Jammu and Kashmir                           10596
Andhra Pradesh                               7950
Kerala                                       7449
Uttarakhand                                  6790
Himachal Pradesh                             6225


### Sorting

In [16]:
covid_df.sort_values(by = ['State', 'District'], ascending = [False, True])

Unnamed: 0,Date,State,District,Confirmed,Recovered,Deceased,Other,Tested
17447,2020-05-29,West Bengal,Alipurduar,4,0,0,0,
18053,2020-05-30,West Bengal,Alipurduar,4,0,0,0,
18661,2020-05-31,West Bengal,Alipurduar,4,0,0,0,
19293,2020-06-01,West Bengal,Alipurduar,5,0,0,0,
19925,2020-06-02,West Bengal,Alipurduar,5,0,0,0,
...,...,...,...,...,...,...,...,...
340831,2021-10-05,Andaman and Nicobar Islands,Unknown,7626,7488,129,0,
341490,2021-10-06,Andaman and Nicobar Islands,Unknown,7627,7488,129,0,
342149,2021-10-07,Andaman and Nicobar Islands,Unknown,7629,7489,129,0,
342808,2021-10-08,Andaman and Nicobar Islands,Unknown,7629,7490,129,0,


### Handling Duplicates

In [17]:
pd.Series(list(set(covid_df['State'])))

0                                   Chandigarh
1                                    Telangana
2                                          Goa
3                               Andhra Pradesh
4                                  Maharashtra
5                                  Uttarakhand
6                                    Jharkhand
7                                        Assam
8                                      Mizoram
9                                    Rajasthan
10                              Madhya Pradesh
11                                     Manipur
12                                  Puducherry
13                                     Tripura
14    Dadra and Nagar Haveli and Daman and Diu
15                                      Kerala
16                                       Delhi
17                                  Tamil Nadu
18                                   Meghalaya
19                           Jammu and Kashmir
20                               Uttar Pradesh
21           

In [18]:
covid_df['State'].drop_duplicates()

0                      Andaman and Nicobar Islands
1                                   Andhra Pradesh
13                               Arunachal Pradesh
14                                           Assam
15                                           Bihar
37                                      Chandigarh
38                                    Chhattisgarh
43                                           Delhi
44                                             Goa
45                                         Gujarat
75                                Himachal Pradesh
81                                         Haryana
101                                      Jharkhand
111                              Jammu and Kashmir
128                                      Karnataka
149                                         Kerala
163                                         Ladakh
165                                    Maharashtra
198                                      Meghalaya
199                            

In [19]:
covid_df.drop_duplicates(subset = ['State'])['State']

0                      Andaman and Nicobar Islands
1                                   Andhra Pradesh
13                               Arunachal Pradesh
14                                           Assam
15                                           Bihar
37                                      Chandigarh
38                                    Chhattisgarh
43                                           Delhi
44                                             Goa
45                                         Gujarat
75                                Himachal Pradesh
81                                         Haryana
101                                      Jharkhand
111                              Jammu and Kashmir
128                                      Karnataka
149                                         Kerala
163                                         Ladakh
165                                    Maharashtra
198                                      Meghalaya
199                            

### Slicing datasets

#### Keeping Relevant Columns

In [20]:
covid_df[['Date','State', 'District', 'Confirmed']]

Unnamed: 0,Date,State,District,Confirmed
0,2020-04-26,Andaman and Nicobar Islands,Unknown,33
1,2020-04-26,Andhra Pradesh,Anantapur,53
2,2020-04-26,Andhra Pradesh,Chittoor,73
3,2020-04-26,Andhra Pradesh,East Godavari,39
4,2020-04-26,Andhra Pradesh,Guntur,214
...,...,...,...,...
344121,2021-10-09,West Bengal,Purba Bardhaman,41530
344122,2021-10-09,West Bengal,Purba Medinipur,62804
344123,2021-10-09,West Bengal,Purulia,19370
344124,2021-10-09,West Bengal,South 24 Parganas,99604


In [21]:
# This will give an error
# covid_df['Date','State', 'District', 'Confirmed']

#### Selecting Relevant rows based on conditions

In [22]:
covid_df[(covid_df['State'] == 'Goa' )| (covid_df['State'] == 'Maharashtra')]

Unnamed: 0,Date,State,District,Confirmed,Recovered,Deceased,Other,Tested
44,2020-04-26,Goa,Unknown,7,7,0,0,
165,2020-04-26,Maharashtra,Ahmednagar,36,22,2,0,
166,2020-04-26,Maharashtra,Akola,29,7,1,0,
167,2020-04-26,Maharashtra,Amravati,20,4,1,0,
168,2020-04-26,Maharashtra,Aurangabad,50,22,5,0,2431.0
...,...,...,...,...,...,...,...,...
343776,2021-10-09,Maharashtra,Solapur,209186,202607,5496,109,629266.0
343777,2021-10-09,Maharashtra,Thane,605606,589964,11387,35,1229625.0
343778,2021-10-09,Maharashtra,Wardha,57330,55945,1217,165,52365.0
343779,2021-10-09,Maharashtra,Washim,41645,41000,637,3,


In [23]:
(covid_df['State'] == 'Goa' )| (covid_df['State'] == 'Maharashtra')

0         False
1         False
2         False
3         False
4         False
          ...  
344121    False
344122    False
344123    False
344124    False
344125    False
Name: State, Length: 344126, dtype: bool

### Aggregations using groupby

In [24]:
covid_df.groupby(['State', 'District']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Confirmed,Recovered,Deceased,Other,Tested
State,District,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Andaman and Nicobar Islands,Unknown,4498.633459,4313.900376,65.554511,0.028195,
Andhra Pradesh,Anantapur,76374.099624,73880.941729,582.597744,0.000000,598965.305118
Andhra Pradesh,Chittoor,104237.832707,99080.593985,868.842105,0.000000,601131.647638
Andhra Pradesh,East Godavari,134164.802632,127500.163534,638.676692,0.000000,728669.651575
Andhra Pradesh,Foreign Evacuees,417.854871,398.214712,0.000000,0.000000,
...,...,...,...,...,...,...
West Bengal,Purba Medinipur,25330.105263,24131.372180,217.161654,0.000000,
West Bengal,Purulia,9112.686627,8720.061876,53.207585,0.000000,
West Bengal,South 24 Parganas,42702.684211,40581.541353,642.810150,0.000000,
West Bengal,Unknown,168.400000,52.333333,13.000000,0.000000,


In [25]:
covid_df.groupby(['State', 'District'], as_index = False).mean()

Unnamed: 0,State,District,Confirmed,Recovered,Deceased,Other,Tested
0,Andaman and Nicobar Islands,Unknown,4498.633459,4313.900376,65.554511,0.028195,
1,Andhra Pradesh,Anantapur,76374.099624,73880.941729,582.597744,0.000000,598965.305118
2,Andhra Pradesh,Chittoor,104237.832707,99080.593985,868.842105,0.000000,601131.647638
3,Andhra Pradesh,East Godavari,134164.802632,127500.163534,638.676692,0.000000,728669.651575
4,Andhra Pradesh,Foreign Evacuees,417.854871,398.214712,0.000000,0.000000,
...,...,...,...,...,...,...,...
676,West Bengal,Purba Medinipur,25330.105263,24131.372180,217.161654,0.000000,
677,West Bengal,Purulia,9112.686627,8720.061876,53.207585,0.000000,
678,West Bengal,South 24 Parganas,42702.684211,40581.541353,642.810150,0.000000,
679,West Bengal,Unknown,168.400000,52.333333,13.000000,0.000000,


In [26]:
covid_df.groupby(['State', 'District'], as_index = False).agg({'Confirmed' : 'mean'})

Unnamed: 0,State,District,Confirmed
0,Andaman and Nicobar Islands,Unknown,4498.633459
1,Andhra Pradesh,Anantapur,76374.099624
2,Andhra Pradesh,Chittoor,104237.832707
3,Andhra Pradesh,East Godavari,134164.802632
4,Andhra Pradesh,Foreign Evacuees,417.854871
...,...,...,...
676,West Bengal,Purba Medinipur,25330.105263
677,West Bengal,Purulia,9112.686627
678,West Bengal,South 24 Parganas,42702.684211
679,West Bengal,Unknown,168.400000


In [27]:
covid_df.groupby(['State', 'District'], as_index = False).agg({'Confirmed' : 'mean', 'Deceased' : 'max'})

Unnamed: 0,State,District,Confirmed,Deceased
0,Andaman and Nicobar Islands,Unknown,4498.633459,129
1,Andhra Pradesh,Anantapur,76374.099624,1093
2,Andhra Pradesh,Chittoor,104237.832707,1925
3,Andhra Pradesh,East Godavari,134164.802632,1286
4,Andhra Pradesh,Foreign Evacuees,417.854871,0
...,...,...,...,...
676,West Bengal,Purba Medinipur,25330.105263,395
677,West Bengal,Purulia,9112.686627,113
678,West Bengal,South 24 Parganas,42702.684211,1316
679,West Bengal,Unknown,168.400000,48


### Creating Calculated Fields

In [28]:
covid_df['Recovery Rate'] = covid_df['Recovered'] / covid_df['Confirmed'] 
covid_df

Unnamed: 0,Date,State,District,Confirmed,Recovered,Deceased,Other,Tested,Recovery Rate
0,2020-04-26,Andaman and Nicobar Islands,Unknown,33,11,0,0,,0.333333
1,2020-04-26,Andhra Pradesh,Anantapur,53,14,4,0,,0.264151
2,2020-04-26,Andhra Pradesh,Chittoor,73,13,0,0,,0.178082
3,2020-04-26,Andhra Pradesh,East Godavari,39,12,0,0,,0.307692
4,2020-04-26,Andhra Pradesh,Guntur,214,29,8,0,,0.135514
...,...,...,...,...,...,...,...,...,...
344121,2021-10-09,West Bengal,Purba Bardhaman,41530,41027,195,0,,0.987888
344122,2021-10-09,West Bengal,Purba Medinipur,62804,62114,395,0,,0.989013
344123,2021-10-09,West Bengal,Purulia,19370,19234,113,0,,0.992979
344124,2021-10-09,West Bengal,South 24 Parganas,99604,97731,1316,0,,0.981196
