In [1]:
import os
from urllib import request

import pandas as pd

In [2]:
# Raw CSV locations in the CSSEGISandData/COVID-19 GitHub repository, keyed by case type.
# NOTE(review): confirm these URLs still resolve -- upstream file names may have changed.
URLS = {
    "Confirmed":
        'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv',
    "Recovered":
        'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv',
    "Deaths":
        'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv',
}

# urlretrieve writes to the given path but does not create missing directories,
# so make sure ./input/ exists before the first download.
os.makedirs('./input', exist_ok=True)

# Fetch each file to ./input/<case type>_cases.csv (re-downloaded on every run).
for url_key, url in URLS.items():
    print("- Downloading", url_key, "...")
    request.urlretrieve(url, './input/{}_cases.csv'.format(url_key))
print("- Done!")

# Load the three wide-format tables (one row per location, one column per date).
confirmed = pd.read_csv('./input/Confirmed_cases.csv')
recovered = pd.read_csv('./input/Recovered_cases.csv')
deaths = pd.read_csv('./input/Deaths_cases.csv')

# NOTE(review): `recv_df` is not referenced in the visible cells; the alias is
# retained only in case other code still uses it.
recv_df = recovered

confirmed.head()

- Downloading Confirmed ...
- Downloading Recovered ...
- Downloading Deaths ...
- Done!


Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,3/9/20,3/10/20,3/11/20,3/12/20,3/13/20,3/14/20,3/15/20,3/16/20,3/17/20,3/18/20
0,,Thailand,15.0,101.0,2,3,5,7,8,8,...,50,53,59,70,75,82,114,147,177,212
1,,Japan,36.0,138.0,2,1,2,2,4,4,...,511,581,639,639,701,773,839,825,878,889
2,,Singapore,1.2833,103.8333,0,1,3,3,4,5,...,150,160,178,178,200,212,226,243,266,313
3,,Nepal,28.1667,84.25,0,0,0,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,,Malaysia,2.5,112.5,0,0,0,3,4,4,...,117,129,149,149,197,238,428,566,673,790


In [3]:
# Identifier columns shared by the three wide-format frames; every other column is a date.
ID_COLUMNS = ['Province/State', 'Country/Region', 'Lat', 'Long']

# Getting all dates: drop the identifier columns by name instead of by position,
# so a missing identifier column raises instead of being treated as a date.
all_dates = confirmed.columns.drop(ID_COLUMNS)


def melt_cases(wide_frame, value_name):
    """Reshape one wide frame (one column per date) into long format.

    Parameters
    ----------
    wide_frame : pandas.DataFrame
        Frame holding ID_COLUMNS plus every column listed in ``all_dates``.
    value_name : str
        Name of the column that receives the per-date values.

    Returns
    -------
    pandas.DataFrame
        ID_COLUMNS, a 'Date' column holding the former column labels, and
        a ``value_name`` column.

    Raises
    ------
    KeyError
        If ``wide_frame`` lacks one of ID_COLUMNS or one of ``all_dates``.
    """
    return wide_frame.melt(id_vars=ID_COLUMNS, value_vars=all_dates,
                           var_name='Date', value_name=value_name)


new_confirmed = melt_cases(confirmed, 'Confirmed')
new_recovered = melt_cases(recovered, 'Recovered')
new_deaths = melt_cases(deaths, 'Deaths')

# The concat below pairs rows by index only, so it is correct only when all three
# frames list the same locations and dates in the same order. Check that explicitly
# instead of silently attaching counts to the wrong location.
ALIGNMENT_KEYS = ['Province/State', 'Country/Region', 'Date']
for case_type, melted in (('Recovered', new_recovered), ('Deaths', new_deaths)):
    assert new_confirmed[ALIGNMENT_KEYS].equals(melted[ALIGNMENT_KEYS]), (
        "{} rows are not aligned with Confirmed rows".format(case_type))

clean_data = pd.concat([new_confirmed, new_recovered['Recovered'], new_deaths['Deaths']], axis=1)

# NOTE(review): a filter that dropped county-level rows (Province/State containing ',')
# to avoid double counting was disabled here; confirm it is no longer needed.
# clean_data = clean_data[clean_data['Province/State'].str.contains(',')!=True]

clean_data.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Recovered,Deaths
0,,Thailand,15.0,101.0,1/22/20,2,0,0
1,,Japan,36.0,138.0,1/22/20,2,0,0
2,,Singapore,1.2833,103.8333,1/22/20,0,0,0
3,,Nepal,28.1667,84.25,1/22/20,0,0,0
4,,Malaysia,2.5,112.5,1/22/20,0,0,0


In [4]:
# Shorten the 'Country/Region' header to 'Country'; rebinding the name instead of
# mutating in place keeps this cell safe to re-run.
clean_data = clean_data.rename(columns={'Country/Region': 'Country'})

In [5]:
# Show the long-format frame after the column rename (the rendered table elides the middle rows).
clean_data

Unnamed: 0,Province/State,Country,Lat,Long,Date,Confirmed,Recovered,Deaths
0,,Thailand,15.0000,101.0000,1/22/20,2,0,0
1,,Japan,36.0000,138.0000,1/22/20,2,0,0
2,,Singapore,1.2833,103.8333,1/22/20,0,0,0
3,,Nepal,28.1667,84.2500,1/22/20,0,0,0
4,,Malaysia,2.5000,112.5000,1/22/20,0,0,0
...,...,...,...,...,...,...,...,...
26329,Aruba,Netherlands,12.5186,-70.0358,3/18/20,4,0,0
26330,,Zambia,-15.4167,28.2833,3/18/20,2,0,0
26331,,Djibouti,11.8251,42.5903,3/18/20,1,0,0
26332,,"Gambia, The",13.4432,-15.3101,3/18/20,1,0,0


In [6]:
# Collapse province-level rows into one row per (Country, Date) by summing the counts.
country_totals = (
    clean_data
    .groupby(["Country", "Date"], sort=False)[['Confirmed', 'Deaths', 'Recovered']]
    .sum()
    .reset_index()
)

# 'Date' holds m/d/yy strings, so a plain sort is lexicographic ('3/10/20' would come
# before '3/2/20'). Order rows chronologically within each country using a parsed
# helper column, and leave the 'Date' strings themselves untouched.
clean_data = (
    country_totals
    .assign(_date_order=pd.to_datetime(country_totals["Date"], format="%m/%d/%y"))
    .sort_values(["Country", "_date_order"])
    .drop(columns="_date_order")
    .reset_index(drop=True)
)

In [7]:
# Replace selected country labels with the names used from here on.
country_renames = {
    'US': 'United States',
    'Czechia': 'Czech Republic',
    'Korea, South': 'South Korea',
}

# Assign the result back instead of calling replace(..., inplace=True) on the selected
# column: that chained in-place form warns in recent pandas and, with Copy-on-Write
# enabled, no longer updates clean_data at all.
clean_data["Country"] = clean_data["Country"].replace(country_renames)

In [9]:
# Write the per-country table to disk without the row index.
clean_data.to_csv(
    path_or_buf="./input/covid_19_clear.csv",
    index=False,
)