## Data Source

https://api.covid19india.org/

## Import libraries

In [1]:
# to get web contents
import requests
# to parse json contents
import json
# to parse csv files
import csv

# for numerical operations
import numpy as np
# to store and analysis data in dataframes
import pandas as pd

## Get data

### df_1 - Till Apr 19

In [2]:
# df_1 - Till Apr 19
# ==================

# get response from the web page
response = requests.get('https://api.covid19india.org/raw_data1.json')
# get contents from the response
content = response.content
# parse the json file
parsed = json.loads(content)
# keys
parsed.keys()

dict_keys(['raw_data'])

In [3]:
# save to df
df_1 = pd.DataFrame(parsed['raw_data'])

# shape of the dataframe
print(df_1.shape)

# # list of columns
print(df_1.columns)

# # first few rows
# df_1.head(2)

(17364, 21)
Index(['agebracket', 'backupnotes', 'contractedfromwhichpatientsuspected',
       'currentstatus', 'dateannounced', 'detectedcity', 'detecteddistrict',
       'detectedstate', 'estimatedonsetdate', 'gender', 'nationality', 'notes',
       'numcases', 'patientnumber', 'source1', 'source2', 'source3',
       'statecode', 'statepatientnumber', 'statuschangedate',
       'typeoftransmission'],
      dtype='object')


### df_2 - Till Apr 26

In [4]:
# df_2 - Till Apr 26
# ==================

# get response from the web page
response = requests.get('https://api.covid19india.org/raw_data2.json')

# get contents from the response
content = response.content

# parse the json file
parsed = json.loads(content)

# keys
parsed.keys()

dict_keys(['raw_data'])

In [5]:
# save to df
df_2 = pd.DataFrame(parsed['raw_data'])

# shape of the dataframe
print(df_2.shape)

# # list of columns
print(df_2.columns)

# # first few rows
# df_2.head(2)

(10819, 21)
Index(['agebracket', 'backupnotes', 'contractedfromwhichpatientsuspected',
       'currentstatus', 'dateannounced', 'detectedcity', 'detecteddistrict',
       'detectedstate', 'estimatedonsetdate', 'gender', 'nationality', 'notes',
       'numcases', 'patientnumber', 'source1', 'source2', 'source3',
       'statecode', 'statepatientnumber', 'statuschangedate',
       'typeoftransmission'],
      dtype='object')


### df_3 - Live

In [6]:
# df_3 - Live
# ===========

# get response from the web page
response = requests.get('https://api.covid19india.org/raw_data3.json')

# get contents from the response
content = response.content

# parse the json file
parsed = json.loads(content)

# keys
parsed.keys()

dict_keys(['raw_data'])

In [7]:
# save to df
df_3 = pd.DataFrame(parsed['raw_data'])

# shape of the dataframe
print(df_3.shape)

# # list of columns
print(df_3.columns)

# # first few rows
# df_3.head(2)

(10012, 20)
Index(['agebracket', 'contractedfromwhichpatientsuspected', 'currentstatus',
       'dateannounced', 'detectedcity', 'detecteddistrict', 'detectedstate',
       'entryid', 'gender', 'nationality', 'notes', 'numcases',
       'patientnumber', 'source1', 'source2', 'source3', 'statecode',
       'statepatientnumber', 'statuschangedate', 'typeoftransmission'],
      dtype='object')


In [8]:
# np.setdiff1d(df_1.columns, df_3.columns)

### full data

In [9]:
# full data
# =========

# drop unwanted rows
# df_1 = df_1.drop('_dnp34', axis = 1)
df_3 = df_3.drop(['entryid', 'numcases'], axis = 1)

# rename columns
# df_3 = df_3.rename({'entryid' : 'patientnumber'})

# columns
df_3.columns

Index(['agebracket', 'contractedfromwhichpatientsuspected', 'currentstatus',
       'dateannounced', 'detectedcity', 'detecteddistrict', 'detectedstate',
       'gender', 'nationality', 'notes', 'patientnumber', 'source1', 'source2',
       'source3', 'statecode', 'statepatientnumber', 'statuschangedate',
       'typeoftransmission'],
      dtype='object')

In [10]:
# df_3[['entryid', 'patientnumber']]

In [11]:
# concatenate data
df = pd.concat([df_1, df_2, df_3])

# shape of the data
df.shape

(38195, 21)

In [12]:
# list of columns
df.columns

Index(['agebracket', 'backupnotes', 'contractedfromwhichpatientsuspected',
       'currentstatus', 'dateannounced', 'detectedcity', 'detecteddistrict',
       'detectedstate', 'estimatedonsetdate', 'gender', 'nationality', 'notes',
       'numcases', 'patientnumber', 'source1', 'source2', 'source3',
       'statecode', 'statepatientnumber', 'statuschangedate',
       'typeoftransmission'],
      dtype='object')

In [13]:
# first 3 rows of the dataframe
df.head(3)

Unnamed: 0,agebracket,backupnotes,contractedfromwhichpatientsuspected,currentstatus,dateannounced,detectedcity,detecteddistrict,detectedstate,estimatedonsetdate,gender,...,notes,numcases,patientnumber,source1,source2,source3,statecode,statepatientnumber,statuschangedate,typeoftransmission
0,20.0,Student from Wuhan,,Recovered,30/01/2020,Thrissur,Thrissur,Kerala,,F,...,Travelled from Wuhan,1,1,https://twitter.com/vijayanpinarayi/status/122...,https://weather.com/en-IN/india/news/news/2020...,,KL,KL-TS-P1,14/02/2020,Imported
1,,Student from Wuhan,,Recovered,02/02/2020,Alappuzha,Alappuzha,Kerala,,,...,Travelled from Wuhan,1,2,https://www.indiatoday.in/india/story/kerala-r...,https://weather.com/en-IN/india/news/news/2020...,,KL,KL-AL-P1,14/02/2020,Imported
2,,Student from Wuhan,,Recovered,03/02/2020,Kasaragod,Kasaragod,Kerala,,,...,Travelled from Wuhan,1,3,https://www.indiatoday.in/india/story/kerala-n...,https://twitter.com/ANI/status/122422148580539...,https://weather.com/en-IN/india/news/news/2020...,KL,KL-KS-P1,14/02/2020,Imported


In [14]:
# creating patient id column from patient number
# ===============================================

df['p_id'] = df['patientnumber'].apply(lambda x : 'P'+str(x))
df.columns

Index(['agebracket', 'backupnotes', 'contractedfromwhichpatientsuspected',
       'currentstatus', 'dateannounced', 'detectedcity', 'detecteddistrict',
       'detectedstate', 'estimatedonsetdate', 'gender', 'nationality', 'notes',
       'numcases', 'patientnumber', 'source1', 'source2', 'source3',
       'statecode', 'statepatientnumber', 'statuschangedate',
       'typeoftransmission', 'p_id'],
      dtype='object')

## Rearrange and rename columns

In [15]:
# order of columns
cols = ['patientnumber', 'p_id', 'statepatientnumber', 
        'dateannounced', 'agebracket', 'gender', 
        'detectedcity', 'detecteddistrict', 'detectedstate', 'statecode', 'nationality',
        'typeoftransmission', 'contractedfromwhichpatientsuspected',
        'statuschangedate', 'currentstatus', 'estimatedonsetdate',
        'source1', 'source2', 'source3', 'notes', 'backupnotes']

# rearrange columns
df = df[cols]

# rename columns
df.columns = ['patient_number', 'p_id', 'state_patient_number', 
              'date_announced', 'age_bracket', 'gender', 
              'detected_city', 'detected_district', 'detected_state', 'state_code', 'nationality',
              'type_of_transmission', 'contracted_from_which_patient_suspected',
              'status_change_date', 'current_status', 'estimated_onset_date',
              'source1', 'source2', 'source3', 'notes', 'backup_notes']

# dataframe shape
df.shape

(38195, 21)

In [16]:
# first 3 rows of the dataframe
df.head(3)

Unnamed: 0,patient_number,p_id,state_patient_number,date_announced,age_bracket,gender,detected_city,detected_district,detected_state,state_code,...,type_of_transmission,contracted_from_which_patient_suspected,status_change_date,current_status,estimated_onset_date,source1,source2,source3,notes,backup_notes
0,1,P1,KL-TS-P1,30/01/2020,20.0,F,Thrissur,Thrissur,Kerala,KL,...,Imported,,14/02/2020,Recovered,,https://twitter.com/vijayanpinarayi/status/122...,https://weather.com/en-IN/india/news/news/2020...,,Travelled from Wuhan,Student from Wuhan
1,2,P2,KL-AL-P1,02/02/2020,,,Alappuzha,Alappuzha,Kerala,KL,...,Imported,,14/02/2020,Recovered,,https://www.indiatoday.in/india/story/kerala-r...,https://weather.com/en-IN/india/news/news/2020...,,Travelled from Wuhan,Student from Wuhan
2,3,P3,KL-KS-P1,03/02/2020,,,Kasaragod,Kasaragod,Kerala,KL,...,Imported,,14/02/2020,Recovered,,https://www.indiatoday.in/india/story/kerala-n...,https://twitter.com/ANI/status/122422148580539...,https://weather.com/en-IN/india/news/news/2020...,Travelled from Wuhan,Student from Wuhan


## Missing values

In [17]:
# no. of empty values in each column
# ==================================

print(df.shape, '\n')

for i in df.columns:
    print(i, '\t', df[df[i]==''].shape[0])

(38195, 21) 

patient_number 	 296
p_id 	 0
state_patient_number 	 30061
date_announced 	 0
age_bracket 	 30666
gender 	 28228
detected_city 	 35424
detected_district 	 7066
detected_state 	 7
state_code 	 7
nationality 	 35486
type_of_transmission 	 35205
contracted_from_which_patient_suspected 	 36316
status_change_date 	 10415
current_status 	 0
estimated_onset_date 	 28183
source1 	 733
source2 	 34242
source3 	 37712
notes 	 9422
backup_notes 	 27822


In [18]:
# no. of non-empty values in each column
# ===================================

print(df.shape, '\n')

for i in df.columns:
    print(i, '\t', df[df[i]!=''].shape[0])

(38195, 21) 

patient_number 	 37899
p_id 	 38195
state_patient_number 	 8134
date_announced 	 38195
age_bracket 	 7529
gender 	 9967
detected_city 	 2771
detected_district 	 31129
detected_state 	 38188
state_code 	 38188
nationality 	 2709
type_of_transmission 	 2990
contracted_from_which_patient_suspected 	 1879
status_change_date 	 27780
current_status 	 38195
estimated_onset_date 	 10012
source1 	 37462
source2 	 3953
source3 	 483
notes 	 28773
backup_notes 	 10373


In [19]:
# replacing empty strings with np.nan
# ==================================-

print(df.shape)

df = df.replace(r'', np.nan, regex=True)
df.isna().sum()

(38195, 21)


patient_number                               296
p_id                                           0
state_patient_number                       30061
date_announced                                 0
age_bracket                                30666
gender                                     28228
detected_city                              35424
detected_district                           7066
detected_state                                 7
state_code                                     7
nationality                                35486
type_of_transmission                       35205
contracted_from_which_patient_suspected    36316
status_change_date                         10415
current_status                                 0
estimated_onset_date                       38195
source1                                      733
source2                                    34242
source3                                    37712
notes                                       9422
backup_notes        

In [20]:
# droping empty rows (row with just row number but without patient entry
# ======================================================================

print(df.shape)

# df.dropna(subset=['detected_state'], inplace=True)

print(df.shape)
df.isna().sum()

(38195, 21)
(38195, 21)


patient_number                               296
p_id                                           0
state_patient_number                       30061
date_announced                                 0
age_bracket                                30666
gender                                     28228
detected_city                              35424
detected_district                           7066
detected_state                                 7
state_code                                     7
nationality                                35486
type_of_transmission                       35205
contracted_from_which_patient_suspected    36316
status_change_date                         10415
current_status                                 0
estimated_onset_date                       38195
source1                                      733
source2                                    34242
source3                                    37712
notes                                       9422
backup_notes        

## Save data

In [21]:
# save to csv`
df.to_csv('patients_data.csv', index=False)

# State tested data

In [22]:
# get response from the web page
response = requests.get('https://api.covid19india.org/state_test_data.json')

# get contents from the response
content = response.content

# parse the json file
parsed = json.loads(content)

# keys
parsed.keys()

dict_keys(['states_tested_data'])

In [23]:
# save data in a dataframe
th = pd.DataFrame(parsed['states_tested_data'])

# first few rows
th.tail(3)

Unnamed: 0,coronaenquirycalls,cumulativepeopleinquarantine,negative,numcallsstatehelpline,numicubeds,numisolationbeds,numventilators,populationncp2019projection,positive,source1,...,testpositivityrate,testspermillion,testsperthousand,totalpeoplecurrentlyinquarantine,totalpeoplereleasedfromquarantine,totaltested,unconfirmed,updatedon,_d88ul,_dkvya
1120,,,,,907,,392,96906000.0,1394,https://www.wbhealth.gov.in/uploaded_files/cor...,...,2.22%,,,8908,22696,62837,,14/05/2020,,
1121,,,,,907,,392,,1407,https://www.wbhealth.gov.in/uploaded_files/cor...,...,2.02%,,,8882,24389,69543,,15/05/2020,,
1122,,,,,907,,392,,1452,https://www.wbhealth.gov.in/uploaded_files/cor...,...,1.88%,,,9667,25804,77288,,16/05/2020,,


In [24]:
th.columns

Index(['coronaenquirycalls', 'cumulativepeopleinquarantine', 'negative',
       'numcallsstatehelpline', 'numicubeds', 'numisolationbeds',
       'numventilators', 'populationncp2019projection', 'positive', 'source1',
       'source2', 'state', 'tagpeopleinquarantine', 'tagtotaltested',
       'testpositivityrate', 'testspermillion', 'testsperthousand',
       'totalpeoplecurrentlyinquarantine', 'totalpeoplereleasedfromquarantine',
       'totaltested', 'unconfirmed', 'updatedon', '_d88ul', '_dkvya'],
      dtype='object')

In [25]:
len(th.columns)

24

In [None]:
# save to csv`
th.to_csv('tests_latest_state_level.csv', index=False)

## Zones

In [None]:
# get response from the web page
response = requests.get('https://api.covid19india.org/zones.json')

# get contents from the response
content = response.content

# parse the json file
parsed = json.loads(content)

# keys
parsed.keys()

In [None]:
zo = pd.DataFrame(parsed['zones'])
zo.head(3)

In [None]:
# save to csv`
zo.to_csv('zones.csv', index=False)

## National level daily

In [None]:
response = requests.get('https://api.covid19india.org/data.json')
content = response.content
parsed = json.loads(content)
parsed.keys()

In [None]:
national = pd.DataFrame(parsed['cases_time_series'])
national.head()

In [None]:
national.columns

In [None]:
national = national[['date', 'totalconfirmed', 'totaldeceased', 'totalrecovered', 
                     'dailyconfirmed', 'dailydeceased', 'dailyrecovered']]
national.head()

In [None]:
# save to csv`
national.to_csv('nation_level_daily.csv', index=False)

## National level latest

In [None]:
state_level = pd.DataFrame(parsed['statewise'])
state_level.head()

In [None]:
state_level.columns

In [None]:
state_level = state_level[['state', 'statecode', 'lastupdatedtime',  
                           'confirmed', 'active', 'deaths', 'recovered',
                           'deltaconfirmed', 'deltadeaths', 'deltarecovered', 'statenotes']]
state_level.head()

In [None]:
# save to csv`
state_level.to_csv('state_level_latest.csv', index=False)

## National level tests daily

In [None]:
tested_daily = pd.DataFrame(parsed['tested'])

In [None]:
tested_daily.head()

In [None]:
tested_daily.columns

In [None]:
tested_daily = tested_daily[['updatetimestamp', 'totalsamplestested', 'totalindividualstested', 
                             'totalpositivecases', 'testsperconfirmedcase', 
                             'individualstestedperconfirmedcase',  'testpositivityrate', 
                             'testsconductedbyprivatelabs', 'positivecasesfromsamplesreported', 
                             'samplereportedtoday','source']]
tested_daily.head()

In [None]:
# save to csv`
tested_daily.to_csv('tests_daily.csv', index=False)

# District level latest

In [None]:
response = requests.get('https://api.covid19india.org/v2/state_district_wise.json')
content = response.content
parsed = json.loads(content)
len(parsed)

In [None]:
# parsed

In [None]:
dfs = []

for i in parsed:
#     print(i['state'])
    state_name = i['state']
    state_code = i['statecode']
    
#     df = pd.DataFrame()

    df = pd.DataFrame(i['districtData'])
    df['state name'] = state_name
    df['state code'] = state_code
    
    dfs.append(df)

In [None]:
district_level = pd.concat(dfs)
district_level.head()

In [None]:
delta_confirmed = []
delta_deceased = []
delta_recovered = []

for i in district_level['delta'].values:
    delta_confirmed.append(i['confirmed'])
    delta_deceased.append(i['deceased'])
    delta_recovered.append(i['recovered'])
    
district_level['delta_confirmed'] = delta_confirmed
district_level['delta_deceased'] = delta_deceased
district_level['delta_recovered'] = delta_recovered

district_level = district_level.drop('delta', axis=1)

In [None]:
district_level.columns

In [None]:
district_level = district_level[['state name', 'state code', 'district', 
                                 'confirmed', 'active', 'deceased', 'recovered',
                                 'delta_confirmed', 'delta_deceased',
                                 'delta_recovered', 'notes']]
district_level.head()

In [None]:
# save to csv`
district_level.to_csv('district_level_latest.csv', index=False)

## State wise Daily

In [None]:
# response = requests.get('https://api.covid19india.org/states_daily.json')
# content = response.content
# parsed = json.loads(content)

# df = pd.DataFrame(parsed['states_daily'])

In [None]:
# df = df.melt(id_vars = ['date', 'status'], 
#              value_vars = ['an', 'ap', 'ar', 'as', 'br', 'ch', 'ct', 'dd', 
#                     'dl', 'dn', 'ga', 'gj', 'hp', 'hr', 'jh', 'jk', 
#                     'ka', 'kl', 'la', 'ld', 'mh', 'ml', 'mn', 'mp',
#                     'mz', 'nl', 'or', 'pb', 'py', 'rj', 'sk', 'tg', 
#                     'tn', 'tr', 'tt', 'up', 'ut', 'wb'], 
#              var_name='state', value_name='count')

# df = df.set_index(['date', 'state'])

# df = df.pivot(columns = 'status').reset_index()

# df.columns = df.columns.droplevel(0)
# df.columns.name = ''

# df.columns = ['Date', 'State', 'Confirmed', 'Deceased', 'Recovered']
# df.head()

In [None]:
# response = requests.get('https://api.covid19india.org/csv/')
# parsed = response.content.decode('utf-8')
# parsed

# df = pd.DataFrame(parsed, sep=',')
# df.head()

In [None]:
# pd.DataFrame('http://api.covid19india.org/states_daily_csv/confirmed.csv')

In [None]:
# pd.read_csv('https://api.covid19india.org/csv/')

## States Daily changes

In [None]:
# response = requests.get('https://api.covid19india.org/states_daily.json')
# content = response.content
# parsed = json.loads(content)

In [None]:
# pd.DataFrame(parsed['states_daily'])

In [None]:
# state_wise = pd.DataFrame(parsed['statewise'])
# state_wise.head()

In [None]:
# tested = pd.DataFrame(parsed['tested'])
# tested.head()

## District wise

In [None]:
# response = requests.get('https://api.covid19india.org/state_district_wise.json')
# content = response.content
# parsed = json.loads(content)
# parsed.keys()

In [None]:
# pd.DataFrame(parsed['Goa'])

In [None]:
# parsed['Goa'].keys()

In [None]:
# pd.DataFrame(parsed['Goa']['districtData'])

In [None]:
# pd.DataFrame(parsed)

In [None]:
# pd.DataFrame(parsed[1]['districtData'])

## Travel history (no more updated)

In [None]:
# response = requests.get('https://api.covid19india.org/travel_history.json')
# content = response.content
# parsed = json.loads(content)
# parsed.keys()

In [None]:
# th = pd.DataFrame(parsed['travel_history'])
# th.head()