![](CRISP_DM.png)

In [None]:
# importing required packages
import subprocess
import pandas as pd
import requests
import os
from bs4 import BeautifulSoup
import json
# Let pandas display up to 200 rows of a DataFrame before the output is truncated
pd.set_option('display.max_rows', 200)

In [None]:
# Make the project root the working directory so that the relative paths used
# further down (data/raw/...) resolve and the notebook can run top to bottom.
# When the kernel was started inside the 'notebooks' folder, step up to its
# parent directory instead of jumping to a machine-specific absolute path.
# NOTE(review): assumes 'notebooks' sits directly below the project root - confirm.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir(os.path.dirname(os.getcwd()))

'Your base path for this project is: ' + os.path.basename(os.getcwd())

## 1.1 Data Extraction & Understanding
* Three options are available for extracting the data; each is explained briefly below:
    * Johns Hopkins University (GitHub): https://github.com/CSSEGISandData/COVID-19.git
    * RKI (web scraping): https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html
    * REST API services to retrieve data: https://npgeo-corona-npgeo-de.hub.arcgis.com/

### 1.1.1 Johns Hopkins GitHub dataset
* Cloning the data from the Johns Hopkins GitHub page
    * using the command `git clone https://github.com/CSSEGISandData/COVID-19.git` (first download) or `git pull` (to update an existing clone)

In [None]:
# Update the local clone of the Johns Hopkins repository with `git pull`.
# The command is passed as an argument list, so no shell is involved, and the
# pipes are opened in text mode so that the captured output prints as readable
# text instead of a bytes repr (b'...').
git_pull = subprocess.Popen(['git', 'pull'],
                            cwd=os.path.dirname('data/raw/COVID-19/'),
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            universal_newlines=True)
(out, error) = git_pull.communicate()

# git writes progress information to stderr even on success, so the exit code
# (not the presence of stderr text) is what signals a failed pull.
if git_pull.returncode != 0:
    print("git pull failed with exit code " + str(git_pull.returncode))

print("Error : " + str(error))
print("out : " + str(out))

In [None]:
# Read the global confirmed-cases time series from the local Johns Hopkins clone.
# The path is relative to the working directory (the project root), the same
# base the `git pull` cell uses for 'data/raw/COVID-19/'. A leading '/' would
# make it an absolute path from the drive/file-system root and miss the clone.
data_path = 'data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
pd_raw = pd.read_csv(data_path)
pd_raw.head()

### 1.1.2 Web scraping
+ RKI (web scraping): [Robert Koch Institute case numbers (Fallzahlen)](https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html)

In [None]:
# Download the RKI "Fallzahlen" page.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# page flow into the HTML parsing below.
page = requests.get("https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html",
                    timeout=30)
page.raise_for_status()

In [None]:
# Parse the downloaded HTML and collect every row (<tr>) of the first <table>.
soup_table = BeautifulSoup(page.content, 'html.parser')
html_table = soup_table.find('table')  # find() returns only the first table in the document
if html_table is None:
    # Fail with a clear message instead of an AttributeError on None below
    raise ValueError('No <table> element found in the downloaded page; '
                     'the page layout may have changed.')
all_rows = html_table.find_all('tr')

In [None]:
# One inner list per table row, holding the stripped text of each data cell (<td>).
# Rows without any <td> (e.g. rows made only of header cells) become empty lists.
absolute_data_list = [
    [each_col.get_text(strip=True) for each_col in row.find_all('td')]
    for row in all_rows
]

In [None]:
# Column labels for the scraped table, listed in positional order (0..5)
rki_column_names = ['state', 'cases', 'changes', 'cases_per_100k', 'fatal', 'comment']
rki_column_map = dict(enumerate(rki_column_names))

# Build the frame from the scraped rows, drop every row that contains a missing
# value, then replace the positional column labels with the names above.
scraped_rows = pd.DataFrame(absolute_data_list)
complete_rows = scraped_rows.dropna()
pd_daily_status = complete_rows.rename(columns=rki_column_map)
pd_daily_status.head()

### 1.1.3 REST API
* REST API services to retrieve data: [NPGEO website](https://npgeo-corona-npgeo-de.hub.arcgis.com/)

In [None]:
# Query the ArcGIS feature service for the German federal states
# (where=1=1 selects every record, outFields=* every field, f=json the format).
# A timeout prevents the cell from hanging indefinitely, and raise_for_status()
# surfaces HTTP errors here rather than as a JSON parsing failure later on.
data = requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json',
                    timeout=30)
data.raise_for_status()

In [None]:
# Decode the body of the REST response with the json package
response_body = data.content
json_object = json.loads(response_body)
# Show which Python type the decoded payload has
type(json_object)

In [None]:
# List the top-level keys of the decoded JSON response
json_object.keys()

In [None]:
# Collect the 'attributes' entry of every element under json_object['features']
# into one flat list (one entry per feature).
full_list = [each_dict['attributes'] for each_dict in json_object['features']]

In [None]:
# Convert the collected 'attributes' entries in full_list into a pandas DataFrame
# and preview the first rows
pd_full_list=pd.DataFrame(full_list)
pd_full_list.head()

In [None]:
# Save the DataFrame as a semicolon-separated CSV file.
# The path is relative to the working directory (the project root) rather than a
# user-specific absolute path, and the target folder is created when missing so
# the cell also works on a fresh checkout.
npgeo_csv_path = 'data/raw/NPGEO/GER_state_data.csv'
os.makedirs(os.path.dirname(npgeo_csv_path), exist_ok=True)
pd_full_list.to_csv(npgeo_csv_path, sep=';')

In [None]:
# Descriptive statistics (pandas describe) for pd_full_list
pd_full_list.describe()

### 1.1.4 Additional API access via a REST service, e.g. the India dataset
* Example of a REST-conformant interface (important: registration is required)
* [Smartable](https://smartable.ai/)

In [None]:
# Request the coronavirus statistics for India (country code IN) from Smartable.
url_endpoint = 'https://api.smartable.ai/coronavirus/stats/IN'

# The subscription key is a credential and must not be stored in the notebook:
# it is read from the SMARTABLE_SUBSCRIPTION_KEY environment variable instead.
# NOTE(review): a key that was committed earlier should be treated as leaked and
# rotated in the Smartable account.
subscription_key = os.environ.get('SMARTABLE_SUBSCRIPTION_KEY')
if not subscription_key:
    raise RuntimeError('Set the SMARTABLE_SUBSCRIPTION_KEY environment variable '
                       'to your Smartable subscription key before running this cell.')

headers = {
    'Cache-Control': 'no-cache',  # was misspelled as 'mo-cache', which is not a valid directive
    'Subscription-Key': subscription_key}

# Timeout avoids hanging forever; raise_for_status() reports rejected keys or
# other HTTP errors here instead of as a confusing parsing error later.
response = requests.get(url_endpoint, headers=headers, timeout=30)
response.raise_for_status()

In [None]:
# Inspect the raw body of the response
response.content

In [None]:
# Decode the JSON response for India and keep a pretty-printed copy on disk.
# The path is relative to the working directory (the project root) instead of a
# user-specific absolute path.
# NOTE(review): the file name 'IN_data.ext' is kept unchanged in case other code
# reads it; the content is JSON text, so a '.json' extension may be intended.
IN_dict = json.loads(response.content)
with open('data/raw/IN_data.ext', 'w') as outfile:
    json.dump(IN_dict, outfile, indent=2)

In [None]:
# Load the decoded India response (IN_dict) into a DataFrame and preview the first rows
df_4 = pd.DataFrame(IN_dict)
df_4.head()

#### 1.1.4.1 Individual states of India

In [None]:
# Look at the first entry under IN_dict['stats']['breakdowns'] to see its structure
IN_dict['stats']['breakdowns'][0]

In [None]:
# Flatten every breakdown entry into a single dict: start from its 'location'
# fields and add the entry's items at positions 1..6 (the slice [1:7]).
# NOTE(review): the slice relies on the key order of the API response, with
# 'location' presumably being the first key - confirm against the payload.
full_list_IN_country = []
for each_dict in IN_dict['stats']['breakdowns']:
    # Work on a copy so the nested 'location' dict inside IN_dict is not
    # modified as a side effect of building the flat records.
    flatten_dict = dict(each_dict['location'])
    flatten_dict.update(list(each_dict.items())[1:7])
    full_list_IN_country.append(flatten_dict)

In [None]:
# Put the flattened records from full_list_IN_country into a DataFrame
df_india = pd.DataFrame(full_list_IN_country)

In [None]:
# Save the flattened India records as a semicolon-separated CSV without the index.
# The path is relative to the working directory (the project root) rather than a
# user-specific absolute path, and the target folder is created when missing.
# NOTE(review): the file name says 'US' although the records come from the India
# endpoint; it is kept unchanged in case other code reads this file - confirm.
smartable_csv_path = 'data/raw/SMARTABLE/full_list_US_country.csv'
os.makedirs(os.path.dirname(smartable_csv_path), exist_ok=True)
pd.DataFrame(full_list_IN_country).to_csv(smartable_csv_path, sep=';', index=False)