In [1]:
import subprocess
import os

import pandas as pd

import requests
from bs4 import BeautifulSoup

import json


pd.set_option('display.max_rows', 500)

# 2.1 Data Understanding

The following three options available for extracting data:
- RKI, webscrape [webscrapping] [COVID-19 : Case numbers in Germany and worldwide](https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html)

- John Hopkins, (GITHUB) [COVID-19 Data Repository by John Hopkins](https://github.com/CSSEGISandData/COVID-19.git)

- REST API Services to retrieve data [NPGEO Corona Hub 2020](https://npgeo-corona-npgeo-de.hub.arcgis.com/)

## 2.1.1 Jhon Hopkin GITHUB csv data

Clonning data from Johnhopkins GITHUB page
- using command 'git clone/pull' https://github.com/CSSEGISandData/COVID-19.git

In [2]:
# To Automate the Github data pulling
git_pull = subprocess.Popen( "git pull" ,
                         cwd = os.path.dirname( r'C:\Users\SurabhiD\ads_covid_19\data\raw\new-covid' ),
                         #cwd = os.path.dirname( '../Users/SurabhiD/ads_covid_19/data/raw/new-covid' ),
                         shell = True,
                         stdout = subprocess.PIPE,
                         stderr = subprocess.PIPE )
(out, error) = git_pull.communicate()


print("Error : " + str(error))
print("out : " + str(out))

Error : b'fatal: not a git repository (or any of the parent directories): .git\n'
out : b''


In [3]:
data_path='../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
pd_raw=pd.read_csv(data_path)

In [4]:
pd_raw

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,7/6/20,7/7/20,7/8/20,7/9/20,7/10/20,7/11/20,7/12/20,7/13/20,7/14/20,7/15/20
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,33190,33384,33594,33908,34194,34366,34451,34455,34740,34994
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,2964,3038,3106,3188,3278,3371,3454,3571,3667,3752
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,16404,16879,17348,17808,18242,18712,19195,19689,20216,20770
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,855,855,855,855,855,855,855,858,861,862
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,346,386,386,396,458,462,506,525,541,576
5,,Antigua and Barbuda,17.0608,-61.7964,0,0,0,0,0,0,...,70,70,70,73,74,74,74,74,74,74
6,,Argentina,-38.4161,-63.6167,0,0,0,0,0,0,...,80447,83426,87030,90693,94060,97509,100166,103265,106910,111146
7,,Armenia,40.0691,45.0382,0,0,0,0,0,0,...,28936,29285,29820,30346,30903,31392,31969,32151,32490,33005
8,Australian Capital Territory,Australia,-35.4735,149.0124,0,0,0,0,0,0,...,108,111,112,113,113,113,113,113,113,113
9,New South Wales,Australia,-33.8688,151.2093,0,0,0,0,3,4,...,3433,3440,3453,3467,3474,3478,3492,3505,3517,3527


## 2.1.1 Webscrapping

- RKI, webscrape (webscraping) [Robert-koch website fälle](https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html)

In [5]:
page = requests.get("https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html")

In [6]:
soup = BeautifulSoup(page.content, 'html.parser')

In [7]:
#soup.get_text()

In [8]:
html_table=soup.find('table')

In [9]:
all_rows=html_table.find_all('tr')

In [10]:
final_data_list=[]

In [11]:
for pos,rows in enumerate(all_rows):
#print(pos)
    col_list=[each_col.get_text(strip=True) for each_col in rows.find_all('td')]
    final_data_list.append(col_list)
#print(rows)
#print(col_list)

In [12]:
pd_daily_status = pd.DataFrame(final_data_list).dropna().rename(columns={0:'State',
                                                    1:'Cases',
                                                    2:'Changes',
                                                    3:'Cases_per_100K',
                                                    4: 'Fatal',
                                                    5:'Comment'})

In [13]:
pd_daily_status.head()

Unnamed: 0,State,Cases,Changes,Cases_per_100K,Fatal,Comment
2,Baden-Württem­berg,40.358,185,1.45,131,1.863
3,Bayern,55.414,353,1.835,140,2.634
4,Berlin,10.786,58,416.0,111,226.0
5,Branden­burg,3.795,8,70.0,28,169.0
6,Bremen,1.913,8,62.0,91,56.0


In [14]:
pd_daily_status.to_csv('../data/raw/RKI/RKI_data.csv',sep=';', index = False) 
# Data will be prepared in notebook 'data_preparation'

# 2.3.1 REST API Calls 

REST API Services to retrieve data [NPGEO Corona Hub 2020](https://npgeo-corona-npgeo-de.hub.arcgis.com/)

In [15]:
#Data Request for Country: Germany
data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')

In [16]:
#data.content

In [17]:
json_object=json.loads(data.content)

In [18]:
type(json_object)

dict

In [19]:
json_object.keys()

dict_keys(['objectIdFieldName', 'uniqueIdField', 'globalIdFieldName', 'geometryProperties', 'serverGens', 'geometryType', 'spatialReference', 'fields', 'features'])

In [20]:
#json_object

In [21]:
full_list=[]
for pos,each_dict in enumerate (json_object['features'][:]):
    full_list.append(each_dict['attributes'])

In [22]:
pd_full_list=pd.DataFrame(full_list)
pd_full_list.head()

Unnamed: 0,OBJECTID_1,LAN_ew_AGS,LAN_ew_GEN,LAN_ew_BEZ,LAN_ew_EWZ,OBJECTID,Fallzahl,Aktualisierung,AGS_TXT,GlobalID,faelle_100000_EW,Shape__Area,Shape__Length,Death
0,1,1,Schleswig-Holstein,Land,2896712,15,3927,1598306400000,1,fc5ba936-c95c-432c-8a33-9eb2f30b660f,135.567499,45737310000.0,2881496.0,160
1,2,2,Hamburg,Freie und Hansestadt,1841179,6,6083,1598306400000,2,0f3e860c-5181-4d3f-a421-1d51f50315ea,330.386128,2089396000.0,418800.2,265
2,3,3,Niedersachsen,Land,7982448,9,16191,1598306400000,3,3fd77024-c29b-4843-9be8-682ad48e60c9,202.832515,129983600000.0,4008988.0,661
3,4,4,Bremen,Freie Hansestadt,682986,5,1913,1598306400000,4,4132268b-54de-4327-ac1e-760e915112f1,280.093589,1119157000.0,335717.7,56
4,5,5,Nordrhein-Westfalen,Land,17932651,10,57213,1598306400000,5,561d658f-3ee5-46e3-bc95-3528c6558ab9,319.043738,87829360000.0,2648673.0,1802


In [23]:
pd_full_list.to_csv('../data/raw/NPGEO/GER_state_data.csv',sep=';')

In [24]:
pd_full_list.shape[0]

16

# 2.4.1 API access via REST service, e.g. India Data

example of REST conform interface (Registration Required)  
[Smartable.ai](https://smartable.ai/)

In [25]:
url_endpoint='https://api.smartable.ai/coronavirus/stats/IN'
headers={
    'Cache-Control':'no-cache',
    'Subscription-Key': '096aaad7799c49b3825e698f67e67e2c',
}
response = requests.get(url_endpoint,headers=headers)

In [26]:
print(response)

<Response [200]>


In [27]:
IN_dict = json.loads(response.content) # imports string
with open('../data/raw/smartable/IN_data.txt', 'w') as outfile:
    json.dump(IN_dict, outfile, indent=2)

In [28]:
print(json.dumps(IN_dict,indent=2)) #string dump

{
  "location": {
    "long": 78.0,
    "countryOrRegion": "India",
    "provinceOrState": null,
    "county": null,
    "isoCode": "IN",
    "lat": 21.0
  },
  "updatedDateTime": "2020-08-25T22:00:34.6092863Z",
  "stats": {
    "totalConfirmedCases": 3231754,
    "newlyConfirmedCases": 66873,
    "totalDeaths": 59612,
    "newDeaths": 1066,
    "totalRecoveredCases": 2467252,
    "newlyRecoveredCases": 64151,
    "history": [
      {
        "date": "2020-01-22T00:00:00",
        "confirmed": 0,
        "deaths": 0,
        "recovered": 0
      },
      {
        "date": "2020-01-23T00:00:00",
        "confirmed": 0,
        "deaths": 0,
        "recovered": 0
      },
      {
        "date": "2020-01-24T00:00:00",
        "confirmed": 0,
        "deaths": 0,
        "recovered": 0
      },
      {
        "date": "2020-01-25T00:00:00",
        "confirmed": 0,
        "deaths": 0,
        "recovered": 0
      },
      {
        "date": "2020-01-26T00:00:00",
        "confirmed": 0,
  

### 2.4.1.1 Individual States India

In [29]:
IN_dict['stats']['breakdowns'][0]

{'location': {'long': 78.0,
  'countryOrRegion': 'India',
  'provinceOrState': None,
  'county': None,
  'isoCode': 'IN',
  'lat': 21.0},
 'totalConfirmedCases': 3231754,
 'newlyConfirmedCases': 66873,
 'totalDeaths': 59612,
 'newDeaths': 1066,
 'totalRecoveredCases': 2467252,
 'newlyRecoveredCases': 64151}

In [30]:
full_list_IN_country=[]
for pos,each_dict in enumerate (IN_dict['stats']['breakdowns'][:]):
    flatten_dict=each_dict['location']
    flatten_dict.update(dict(list(IN_dict['stats']['breakdowns'][pos].items())[1: 7]) 
    )
    full_list_IN_country.append(flatten_dict)

In [31]:
pd.DataFrame(full_list_IN_country).to_csv('../data/raw/smartable/full_list_IN_country.csv',sep=';',index=False)