In [1]:
import subprocess
import os

import pandas as pd

import requests
from bs4 import BeautifulSoup

import json

# Allow up to 500 rows to be rendered when a frame is displayed.
pd.set_option("display.max_rows", 500)



![CRISP_DM](../reports/figures/CRISP_DM.png)

# Data Understanding

* RKI, via web scraping: https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html
* Johns Hopkins (GitHub) https://github.com/CSSEGISandData/COVID-19.git
* REST API services to retrieve data https://npgeo-corona-npgeo-de.hub.arcgis.com/

# Johns Hopkins data

In [20]:
# Run `git pull` inside the local COVID-19 checkout to refresh the raw data.
# os.path.dirname() drops the trailing slash, so the working directory is
# '../data/raw/COVID-19'.
repo_dir = os.path.dirname('../data/raw/COVID-19/')
git_pull = subprocess.Popen(
    "git pull",
    cwd=repo_dir,
    shell=True,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)

# communicate() waits for the process to end and returns the captured
# (stdout, stderr) pair as bytes.
out, error = git_pull.communicate()

print(f"Error : {error}")
print(f"out : {out}")

Error : b''
out : b'Already up to date.\n'


In [3]:
# Location of the global confirmed-cases time series inside the local COVID-19 checkout.
data_path = (
    '../data/raw/COVID-19/csse_covid_19_data/'
    'csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
)

# Load the CSV as-is; no cleaning happens in this cell.
pd_raw = pd.read_csv(filepath_or_buffer=data_path)

In [4]:
# Display the loaded frame (the row display limit was raised to 500 in the first cell).
pd_raw

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,8/9/20,8/10/20,8/11/20,8/12/20,8/13/20,8/14/20,8/15/20,8/16/20,8/17/20,8/18/20
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,37054,37162,37269,37345,37424,37431,37551,37596,37599,37599
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,6411,6536,6676,6817,6971,7117,7260,7380,7499,7654
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,35160,35712,36204,36699,37187,37664,38133,38583,39025,39444
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,955,963,963,977,981,989,989,989,1005,1005
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,1672,1679,1735,1762,1815,1852,1879,1906,1935,1966
5,,Antigua and Barbuda,17.0608,-61.7964,0,0,0,0,0,0,...,92,92,92,92,92,93,93,93,93,93
6,,Argentina,-38.4161,-63.6167,0,0,0,0,0,0,...,246499,253868,260911,268574,276072,282437,289100,294569,299126,305966
7,,Armenia,40.0691,45.0382,0,0,0,0,0,0,...,40410,40433,40593,40794,41023,41299,41495,41663,41701,41846
8,Australian Capital Territory,Australia,-35.4735,149.0124,0,0,0,0,0,0,...,113,113,113,113,113,113,113,113,113,113
9,New South Wales,Australia,-33.8688,151.2093,0,0,0,0,3,4,...,3875,3897,3915,3927,3936,3945,3950,3957,3959,3966


# Web scraping with Python

In [5]:
# Download the RKI case-numbers page.
# - The URL literal carries no leading whitespace.
# - A timeout keeps the cell from blocking forever if the server does not answer.
# - raise_for_status() stops here on an HTTP error instead of letting a later
#   cell try to parse an error page.
page = requests.get(
    'https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html',
    timeout=30,
)
page.raise_for_status()

In [6]:
# Parse the downloaded bytes with Python's built-in HTML parser.
soup = BeautifulSoup(markup=page.content, features='html.parser')

#To retrieve the entire text of the page from the above link

In [7]:
# Show the text content of the parsed page as a single string (quick sanity check).
soup.get_text()

'\n\n\n\n\nRKI  -  Coronavirus SARS-CoV-2 - COVID-19: Fallzahlen in Deutschland und weltweit\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n//<![CDATA[\n      // i18n\n        var PRINT_PAGE_TEXT = \'Seite drucken\';\n        var PRINT_TOOLTIP = \'Artikel drucken (öffnet Dialog)\';\n        var SCALE_IMG = \'Schriftgröße\';\n        var SCALE_IMG_LARGE = \'Schriftgröße vergrößern\';\n        var SCALE_IMG_NORMAL = \'Schriftgröße verkleinern\';\n        var SHOW_MORE = \'mehr anzeigen\';\nwindow.LABEL_MOREDETAILS = \'mehr anzeigen\';\nwindow.LABEL_LESSDETAILS = \'weniger anzeigen\'; \n      //]]>\n    \n\n\n\n\n\nNavigation und Service\nSpringe direkt zu:\n\nInhalt\nHauptmenu\nSuche\n\n\n\n\n\n\n\nServicemenü\nKon\xadtaktIn\xadhaltHil\xadfeIm\xadpres\xads\xadumDa\xadten\xadschut\xadz\xader\xadklä\xadrungRSSEnglish\n\n\n\n\nGebärdensprache\n\n\nLeichte Sprache\n\n\n\n\nSuche\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nKontakt\nNavigation\n\n\n\nZielgruppeneinstiege\n\n\n\n\n

In [8]:
# First <table> element found in the parsed page.
html_table = soup.find('table')

In [9]:
# Every <tr> (row) element inside that table.
all_rows = html_table.find_all('tr')

In [10]:
# Holds one list of cell texts per table row (populated in the next cell).
final_data_list = []

In [11]:
# Build one list of stripped cell texts per table row.
#
# The list is rebuilt from scratch in this cell, so running the cell again does
# not append the same rows a second time.
# Rows that contain no <td> cells produce an empty list.
final_data_list = [
    [each_col.get_text(strip=True) for each_col in rows.find_all('td')]
    for rows in all_rows
]
        

In [12]:
# Readable names for the positional columns of the scraped table.
# NOTE(review): in the recorded output the 'comment' column holds numeric values
# that look like death counts — confirm these labels against the current layout
# of the RKI table.
column_names = {
    0: 'state',
    1: 'Cases',
    2: 'Changes',
    3: 'cases_per_100k',
    4: 'fatal',
    5: 'comment',
}

# Rows with any missing cell are dropped before the columns are renamed.
scraped_frame = pd.DataFrame(final_data_list)
pd_daily_status = scraped_frame.dropna().rename(columns=column_names)

In [13]:
# Preview the first rows of the scraped per-state table.
pd_daily_status.head()

Unnamed: 0,state,Cases,Changes,cases_per_100k,fatal,comment
2,Baden-Württem­berg,38.968,228,852.0,77,1.86
3,Bayern,53.707,409,1.406,108,2.631
4,Berlin,10.394,75,476.0,127,224.0
5,Branden­burg,3.728,20,66.0,26,169.0
6,Bremen,1.865,14,46.0,67,56.0


## REST API calls

In [14]:
# Query the ArcGIS feature service for the per-state case data (all fields, JSON).
# - A timeout keeps the cell from blocking forever if the service does not answer.
# - raise_for_status() stops here on an HTTP error instead of letting the JSON
#   decoding in the next cell fail on an error body.
data = requests.get(
    'https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json',
    timeout=30,
)
data.raise_for_status()

In [15]:
# Decode the raw response body into Python objects.
json_object = json.loads(data.content)

In [16]:
# Check the top-level container type of the decoded payload.
type(json_object)

dict

In [17]:
# List the top-level keys of the payload.
json_object.keys()

dict_keys(['objectIdFieldName', 'uniqueIdField', 'globalIdFieldName', 'geometryProperties', 'geometryType', 'spatialReference', 'fields', 'features'])

In [18]:
# Collect the 'attributes' dict of every entry under the payload's 'features' key.
full_list = [feature['attributes'] for feature in json_object['features']]

In [19]:
# Tabulate the collected attribute dicts: one row per feature, one column per key.
pd.DataFrame(full_list)

Unnamed: 0,AGS_TXT,Aktualisierung,Death,Fallzahl,GlobalID,LAN_ew_AGS,LAN_ew_BEZ,LAN_ew_EWZ,LAN_ew_GEN,OBJECTID,OBJECTID_1,Shape__Area,Shape__Length,faelle_100000_EW
0,1,1597788000000,160,3825,fc5ba936-c95c-432c-8a33-9eb2f30b660f,1,Land,2896712,Schleswig-Holstein,15,1,45737310000.0,2881496.0,132.046265
1,2,1597788000000,264,5927,0f3e860c-5181-4d3f-a421-1d51f50315ea,2,Freie und Hansestadt,1841179,Hamburg,6,2,2089396000.0,418800.2,321.913296
2,3,1597788000000,656,15562,3fd77024-c29b-4843-9be8-682ad48e60c9,3,Land,7982448,Niedersachsen,9,3,129983600000.0,4008988.0,194.952726
3,4,1597788000000,56,1865,4132268b-54de-4327-ac1e-760e915112f1,4,Freie Hansestadt,682986,Bremen,5,4,1119157000.0,335717.7,273.065627
4,5,1597788000000,1783,55375,561d658f-3ee5-46e3-bc95-3528c6558ab9,5,Land,17932651,Nordrhein-Westfalen,10,5,87829360000.0,2648673.0,308.794277
5,6,1597788000000,528,13842,93277ac4-e8fc-48c7-8940-028dc2ed66af,6,Land,6265809,Hessen,7,6,52359130000.0,2148244.0,220.913213
6,7,1597788000000,242,8334,e9b4296f-9be2-4e53-9a58-ccf1396cb03d,7,Land,4084844,Rheinland-Pfalz,11,7,47838770000.0,1774430.0,204.022479
7,8,1597788000000,1860,38968,80394ddf-c6a4-4a6e-be8e-0259a81b22a9,8,Land,11069533,Baden-Württemberg,1,8,81517320000.0,2544320.0,352.029304
8,9,1597788000000,2631,53707,1ff920f4-62cd-4a4f-b8c9-f042f2a3e00a,9,Freistaat,13076721,Bayern,2,9,163485500000.0,3898618.0,410.706935
9,10,1597788000000,174,3017,e3396a6f-8a30-4fdf-8df7-def77dd38bea,10,Land,990509,Saarland,12,10,6060692000.0,562678.9,304.590872


## REST API calls, e.g. for US data