- Shows different methods of gathering the data.

In [None]:
import subprocess
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
# Allow up to 500 rows to be shown when a DataFrame is displayed.
pd.set_option(
    'display.max_rows',
    500,
)

# Data Understanding

- RKI, webscrape (webscraping) https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html
- Johns Hopkins University (GitHub) https://github.com/CSSEGISandData/COVID-19.git
- REST API services to retrieve data https://npgeo-corona-npgeo-de.hub.arcgis.com/

# GITHUB csv data

- git clone/pull https://github.com/CSSEGISandData/COVID-19.git

In [None]:
# Update the local clone of the COVID-19 data repository.
# `git pull` is executed through the shell from inside the repository folder;
# both output streams are captured so they can be printed below.
repo_dir = os.path.dirname('E:/ads_covid-19/data/raw/COVID-19/')
git_pull = subprocess.Popen(
    "git pull",
    shell=True,
    cwd=repo_dir,
    stderr=subprocess.PIPE,
    stdout=subprocess.PIPE,
)
out, error = git_pull.communicate()  # blocks until the command has finished

# The captured streams are bytes objects, so str() shows them in b'...' form.
print("Error : " + str(error))
print("out : " + str(out))

In [None]:
# Read the global confirmed-cases time series from the local repository clone.
data_path = 'E:/ads_covid-19/data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
pd_raw = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Preview the first rows of the raw time-series table.
pd_raw.head()

# Web scraping

- Data from Robert Koch-Institut website 

In [None]:
# Download the RKI case-numbers page.
# The timeout keeps the cell from hanging indefinitely if the server does not
# answer, and raise_for_status() stops here on an HTTP error instead of letting
# a later cell parse an error page.
page = requests.get(
    "https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html",
    timeout=30,
)
page.raise_for_status()

In [None]:
# Parse the downloaded HTML with the 'html.parser' backend.
soup = BeautifulSoup(markup=page.content, features='html.parser')

In [None]:
# Locate the first <table> element, which is expected to hold the data table.
html_table = soup.find('table')
if html_table is None:
    # find() returns None when nothing matches; fail here with a clear message
    # rather than with an AttributeError in the next cell.
    raise ValueError("No <table> element found on the RKI page; the page layout may have changed.")

In [None]:
# Collect every <tr> element, i.e. each row of the table.
all_rows = html_table.find_all(name='tr')

In [None]:
# Empty container that will receive one list of cell texts per table row.
final_data_list = list()

In [None]:
# Extract the text of every <td> cell, one list per table row.
# The list is rebuilt from scratch on each run, so re-executing this cell
# cannot append the same rows a second time.
# Rows without any <td> cell yield an empty list.
final_data_list = [
    [each_col.get_text(strip=True) for each_col in row.find_all('td')]
    for row in all_rows
]

In [None]:
# Labels for the positional columns of the scraped table.
rki_column_names = {
    0: 'state',
    1: 'cases',
    2: 'changes',
    3: 'cases in the past 7 days',
    4: '7-day incidence',
    5: 'deaths',
}

# Build the frame, drop rows that contain missing values, then label the columns.
# All values are still in German number notation
# (decimal separator ',' and thousands separator '.').
scraped_table = pd.DataFrame(final_data_list)
pd_daily_status = scraped_table.dropna().rename(columns=rki_column_names)

In [None]:
# Preview the first rows of the scraped RKI table.
pd_daily_status.head()

In [None]:
# Store the scraped table as a semicolon-separated file without the index column.
# The values themselves are prepared later, in the notebook 'data_preparation'.
rki_csv_path = 'E:/ads_covid-19/data/raw/RKI/RKI_data.csv'
pd_daily_status.to_csv(rki_csv_path, index=False, sep=';')

# REST API calls

- REST API calls are a robust method for extracting the data.

- https://npgeo-corona-npgeo-de.hub.arcgis.com/

In [None]:
## data request for Germany
# The timeout keeps the cell from hanging indefinitely if the service does not
# answer, and raise_for_status() stops here on an HTTP error instead of letting
# a later cell try to decode an error response.
data = requests.get(
    'https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json',
    timeout=30,
)
data.raise_for_status()

In [None]:
# Decode the JSON body of the response into Python objects.
response_body = data.content
json_object = json.loads(response_body)

In [None]:
# Check which Python type the decoded response has.
type(json_object)

In [None]:
# List the top-level keys of the decoded response.
json_object.keys()

In [None]:
# Extract the data from the dict.
# Every entry of 'features' carries its record under the 'attributes' key;
# collect those records into a flat list.
full_list = [feature['attributes'] for feature in json_object['features']]

In [None]:
# Turn the list of attribute records into a DataFrame and preview its first rows.
pd_full_list = pd.DataFrame(data=full_list)
pd_full_list.head()

In [None]:
# Store the table as a semicolon-separated file.
# NOTE(review): unlike the other CSV exports in this notebook, index=False is
# not passed here, so the row index is written as well - confirm that the
# downstream reader expects that column.
ger_state_csv_path = 'E:/ads_covid-19/data/raw/NPGEO/GER_state_data.csv'
pd_full_list.to_csv(ger_state_csv_path, sep=';')

# API access via REST service, e.g. USA data

- Example of a REST-conformant interface

www.smartable.ai

In [None]:
# Request headers for the Smartable API.
# The subscription key is taken from the SMARTABLE_SUBSCRIPTION_KEY environment
# variable when it is set, so the credential does not have to live in the notebook.
# NOTE(review): the literal below is kept only as a fallback so that existing
# setups keep working; it is exposed in the source and should be rotated and
# then removed.
headers = {
    'Cache-Control': 'no-cache',
    'Subscription-Key': os.environ.get(
        'SMARTABLE_SUBSCRIPTION_KEY',
        '28ee4219700f48718be78b057beb7eb4',
    ),
}

# The timeout keeps the cell from hanging indefinitely if the service does not answer.
response = requests.get(
    'https://api.smartable.ai/coronavirus/stats/US',
    headers=headers,
    timeout=30,
)
print(response)
# Stop on an HTTP error so that later cells do not store an error payload as data.
response.raise_for_status()

In [None]:
# Decode the response body and keep an indented copy of it on disk.
US_dict = json.loads(response.content)
us_json_path = 'E:/ads_covid-19/data/raw/SMARTABLE/US_data.json'
with open(us_json_path, mode='w') as outfile:
    json.dump(US_dict, outfile, indent=2)

In [None]:
# Serialise the whole dictionary to an indented JSON string and print it.
us_json_text = json.dumps(US_dict, indent=2)
print(us_json_text)

# Individual States of US

In [None]:
US_dict['stats']['breakdowns'][0]  # inspect the first breakdown entry to understand the data in the dict

In [None]:
# Extract data of interest from dict.
# Each breakdown entry is flattened into one record: the fields of its
# 'location' dict plus the entry's own items at positions 1 to 6.
# NOTE(review): the [1:7] slice selects items by position and therefore relies
# on the key order of the API response - confirm which keys it is meant to pick.

full_list_US_country = []
for each_dict in US_dict['stats']['breakdowns']:
    # Work on a copy so that the nested 'location' dict inside US_dict is not
    # modified by update().
    flatten_dict = dict(each_dict['location'])
    flatten_dict.update(dict(list(each_dict.items())[1:7]))
    full_list_US_country.append(flatten_dict)

In [None]:
# Store the flattened records as a semicolon-separated file without the index column.
us_breakdown_frame = pd.DataFrame(full_list_US_country)
us_breakdown_frame.to_csv(
    'E:/ads_covid-19/data/raw/SMARTABLE/full_list_US_country.csv',
    index=False,
    sep=';',
)