In [18]:
import pandas as pd
# Let pandas render up to 500 rows before it truncates a displayed frame.
pd.options.display.max_rows = 500

![CRISP_DM](../reports/figures/CRISP_DM.png)

# Data Understanding

* RKI, via web scraping: https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html
* Johns Hopkins University (GitHub): https://github.com/CSSEGISandData/COVID-19.git
* REST API services to retrieve data: https://npgeo-corona-npgeo-de.hub.arcgis.com/

# Johns Hopkins data

In [19]:
# Relative path to the confirmed-cases (global) time-series CSV.
data_path = '../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
# Parse the CSV into the raw, untouched frame.
pd_raw = pd.read_csv(filepath_or_buffer=data_path)

In [20]:
# Preview only the first rows: with display.max_rows raised to 500, a bare
# `pd_raw` would dump the whole table into the notebook output.
pd_raw.head(10)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,7/20/20,7/21/20,7/22/20,7/23/20,7/24/20,7/25/20,7/26/20,7/27/20,7/28/20,7/29/20
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,35526,35615,35727,35928,35981,36036,36157,36263,36368,36471
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,4171,4290,4358,4466,4570,4637,4763,4880,4997,5105
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,23691,24278,24872,25484,26159,26764,27357,27973,28615,29229
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,884,884,889,889,897,897,897,907,907,918
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,749,779,812,851,880,916,932,950,1000,1078
5,,Antigua and Barbuda,17.0608,-61.7964,0,0,0,0,0,0,...,76,76,76,76,82,82,82,86,86,91
6,,Argentina,-38.4161,-63.6167,0,0,0,0,0,0,...,130774,136118,141900,148027,153520,158334,162526,167416,173355,178996
7,,Armenia,40.0691,45.0382,0,0,0,0,0,0,...,34981,35254,35693,36162,36613,36996,37317,37390,37629,37937
8,Australian Capital Territory,Australia,-35.4735,149.0124,0,0,0,0,0,0,...,113,113,113,113,113,113,113,113,113,113
9,New South Wales,Australia,-33.8688,151.2093,0,0,0,0,3,4,...,3599,3614,3633,3640,3654,3668,3685,3699,3718,3736


# Web scraping with Python

In [21]:
import requests
from bs4 import BeautifulSoup

In [22]:
# Fetch the RKI case-numbers page.
# - The URL no longer carries a stray leading space.
# - The timeout keeps the cell from hanging indefinitely on a stalled connection.
# - raise_for_status() fails fast on an HTTP error instead of handing an error
#   page to the HTML parser.
page = requests.get(
    'https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html',
    timeout=30,
)
page.raise_for_status()

In [23]:
# Build the parse tree from the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(markup=page.content, features='html.parser')

#Retrieve the text of the entire page fetched from the link above

In [24]:
# Show only the beginning of the page text; the full text is very long and
# would flood the notebook output.
soup.get_text()[:1000]

'\n\n\n\n\nRKI  -  Coronavirus SARS-CoV-2 - COVID-19: Fallzahlen in Deutschland und weltweit\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n//<![CDATA[\n      // i18n\n        var PRINT_PAGE_TEXT = \'Seite drucken\';\n        var PRINT_TOOLTIP = \'Artikel drucken (öffnet Dialog)\';\n        var SCALE_IMG = \'Schriftgröße\';\n        var SCALE_IMG_LARGE = \'Schriftgröße vergrößern\';\n        var SCALE_IMG_NORMAL = \'Schriftgröße verkleinern\';\n        var SHOW_MORE = \'mehr anzeigen\';\nwindow.LABEL_MOREDETAILS = \'mehr anzeigen\';\nwindow.LABEL_LESSDETAILS = \'weniger anzeigen\'; \n      //]]>\n    \n\n\n\n\n\nNavigation und Service\nSpringe direkt zu:\n\nInhalt\nHauptmenu\nSuche\n\n\n\n\n\n\n\nServicemenü\nKon\xadtaktIn\xadhaltHil\xadfeIm\xadpres\xads\xadumDa\xadten\xadschut\xadz\xader\xadklä\xadrungRSSEnglish\n\n\n\n\nGebärdensprache\n\n\nLeichte Sprache\n\n\n\n\nSuche\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nKontakt\nNavigation\n\n\n\nZielgruppeneinstiege\n\n\n\n\n

In [25]:
# Locate the first <table> element of the page.
html_table = soup.find('table')
# find() returns None when nothing matches; stop here with a clear message
# instead of failing later with an AttributeError on None.
if html_table is None:
    raise ValueError('No <table> element found in the downloaded page')

In [26]:
# Every <tr> element inside the located table, in document order.
all_rows = html_table.find_all(name='tr')

In [27]:
# Start with an empty container for the extracted table rows.
final_data_list = list()

In [28]:
# Collect the stripped text of every <td> cell, one inner list per table row.
# Rows without <td> cells yield an empty list.
# Rebinding the name (rather than appending to an existing list) keeps this
# cell idempotent: re-running it no longer duplicates rows.
final_data_list = [
    [each_col.get_text(strip=True) for each_col in rows.find_all('td')]
    for rows in all_rows
]

In [41]:
# Turn the scraped rows into a frame, drop rows containing missing values
# and give the positional columns 0..5 readable labels.
# NOTE(review): the displayed outputs suggest the last three labels are
# misaligned with the scraped table (the values under 'comment' look like the
# death counts reported by the REST service) — confirm against the page
# before relying on 'cases_per_100k', 'fatal' or 'comment'.
pd_daily_status = (
    pd.DataFrame(final_data_list)
    .dropna()
    .rename(columns=dict(enumerate(
        ['state', 'Cases', 'Changes', 'cases_per_100k', 'fatal', 'comment']
    )))
)

In [42]:
# Preview the first five scraped rows.
pd_daily_status.head(n=5)

Unnamed: 0,state,Cases,Changes,cases_per_100k,fatal,comment
2,Baden-Württem­berg,38.48,58,598,54,1.859
3,Bayern,52.888,41,980,75,2.631
4,Berlin,10.238,42,522,139,224.0
5,Branden­burg,3.697,2,51,20,169.0
6,Bremen,1.841,6,38,56,56.0


## REST API calls

In [47]:
# Query the ArcGIS feature service (all rows, all fields, JSON format).
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() fails fast on an HTTP error instead of
# passing an error body on to the JSON parsing step.
data = requests.get(
    'https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json',
    timeout=30,
)
data.raise_for_status()

In [50]:
import json

In [51]:
# Decode the raw response body into Python objects.
json_object = json.loads(s=data.content)

In [52]:
# Inspect the top-level container type of the decoded response.
type(json_object)

dict

In [53]:
# List the top-level keys; the records themselves are read from 'features' below.
json_object.keys()

dict_keys(['objectIdFieldName', 'uniqueIdField', 'globalIdFieldName', 'geometryProperties', 'geometryType', 'spatialReference', 'fields', 'features'])

In [56]:
# Keep only the 'attributes' dict of every entry in 'features'.
# A comprehension replaces the manual append loop, the unused enumerate index
# and the redundant [:] copy of the list.
full_list = [each_dict['attributes'] for each_dict in json_object['features']]

In [57]:
# Render the attribute records as a table, one row per record.
pd.DataFrame(data=full_list)

Unnamed: 0,AGS_TXT,Aktualisierung,Death,Fallzahl,GlobalID,LAN_ew_AGS,LAN_ew_BEZ,LAN_ew_EWZ,LAN_ew_GEN,OBJECTID,OBJECTID_1,Shape__Area,Shape__Length,faelle_100000_EW
0,1,1597528800000,158,3782,fc5ba936-c95c-432c-8a33-9eb2f30b660f,1,Land,2896712,Schleswig-Holstein,15,1,45737310000.0,2881496.0,130.561823
1,2,1597528800000,264,5878,0f3e860c-5181-4d3f-a421-1d51f50315ea,2,Freie und Hansestadt,1841179,Hamburg,6,2,2089396000.0,418800.2,319.251958
2,3,1597528800000,656,15384,3fd77024-c29b-4843-9be8-682ad48e60c9,3,Land,7982448,Niedersachsen,9,3,129983600000.0,4008988.0,192.722834
3,4,1597528800000,56,1841,4132268b-54de-4327-ac1e-760e915112f1,4,Freie Hansestadt,682986,Bremen,5,4,1119157000.0,335717.7,269.551645
4,5,1597528800000,1777,54390,561d658f-3ee5-46e3-bc95-3528c6558ab9,5,Land,17932651,Nordrhein-Westfalen,10,5,87829360000.0,2648673.0,303.301503
5,6,1597528800000,526,13433,93277ac4-e8fc-48c7-8940-028dc2ed66af,6,Land,6265809,Hessen,7,6,52359130000.0,2148244.0,214.385724
6,7,1597528800000,242,8121,e9b4296f-9be2-4e53-9a58-ccf1396cb03d,7,Land,4084844,Rheinland-Pfalz,11,7,47838770000.0,1774430.0,198.808082
7,8,1597528800000,1859,38480,80394ddf-c6a4-4a6e-be8e-0259a81b22a9,8,Land,11069533,Baden-Württemberg,1,8,81517320000.0,2544320.0,347.620807
8,9,1597528800000,2631,52888,1ff920f4-62cd-4a4f-b8c9-f042f2a3e00a,9,Freistaat,13076721,Bayern,2,9,163485500000.0,3898618.0,404.443897
9,10,1597528800000,174,3000,e3396a6f-8a30-4fdf-8df7-def77dd38bea,10,Land,990509,Saarland,12,10,6060692000.0,562678.9,302.874583
