In [5]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import re

Data-importing functions:
1. GET_csse_covid_19_time_series()
2. GET_csse_covid_19_daily_reports(region)
3. GET_shanghai_data()


Gather Cross Sectional Data

In [5]:
# user pass in variable 'region'
# if region = 'global', gather global daily reports
# if region = 'us', gather united states daily reports
def GET_csse_covid_19_daily_reports(region, max_lookback_days=30):
    '''
    Fetch the two most recent CSSE COVID-19 daily reports.

    Starting from today's date and walking backwards one day at a time, the
    first report that can be downloaded becomes the "latest" one and the next
    older report that can be downloaded becomes the "previous" one.

    :param region: 'global' for the world-wide reports, 'us' for the United
        States reports. Any other value is inserted verbatim as the
        directory-name suffix of the report URL.
    :param max_lookback_days: maximum number of calendar days (today
        included) to try before giving up.
    :return: tuple ``(latest_data, prev_data)`` of DataFrames.
    :raises RuntimeError: if fewer than two reports could be downloaded
        within ``max_lookback_days`` days.
    '''
    print('reading [cross sectional] data ......')
    # Translate the user-facing region name into the directory-name suffix.
    suffix = {'global': '', 'us': '_us'}.get(region, region)
    url_template = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports{}/{}.csv'

    today = datetime.now()
    reports = []
    for offset in range(max_lookback_days):
        # Each attempt uses a different day, so the two reports can never be
        # the same file.
        day = today - timedelta(days=offset)
        url = url_template.format(suffix, day.strftime('%m-%d-%Y'))
        try:
            reports.append(pd.read_csv(url))
        except OSError:
            # A missing report (HTTP 404) or a network failure surfaces as an
            # OSError subclass; fall back to the previous day.
            continue
        if len(reports) == 2:
            break
    else:
        raise RuntimeError(
            'could not download two daily reports for region {!r} within the '
            'last {} days'.format(region, max_lookback_days))

    latest_data, prev_data = reports
    print('finish reading')
    return latest_data, prev_data


In [None]:
# Download the latest and previous daily reports, once for the global feed
# and once for the US feed (network access required).
latest_data_global, prev_data_global = GET_csse_covid_19_daily_reports('global')
latest_data_us, prev_data_us = GET_csse_covid_19_daily_reports('us')

Gather Time Series Data

In [10]:
def GET_csse_covid_19_time_series():
    '''
    Download the five CSSE COVID-19 time-series tables.

    :return: tuple of DataFrames in this fixed order: confirmed US,
        confirmed global, deaths US, deaths global, recovered global.
    '''
    print('reading [time series] data ......')
    base_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'
    # The order here is the order of the returned tuple; callers unpack it
    # positionally.
    table_names = (
        'time_series_covid19_confirmed_US',
        'time_series_covid19_confirmed_global',
        'time_series_covid19_deaths_US',
        'time_series_covid19_deaths_global',
        'time_series_covid19_recovered_global',
    )
    tables = tuple(pd.read_csv(base_url + table + '.csv') for table in table_names)
    print('finish reading')
    return tables

In [11]:
# Unpack the five time-series tables in the order the loader returns them.
time_series_covid19_confirmed_US, time_series_covid19_confirmed_global, time_series_covid19_deaths_US, time_series_covid19_deaths_global, time_series_covid19_recovered_global = GET_csse_covid_19_time_series()

reading [time series] data ......
finish reading


Gather Shanghai Data (recent 10 days)

In [8]:
def GET_shanghai_data(data_name = 'ts_shanghai_covid'):
    '''
    Download the Shanghai report file and parse its free-text ``detail``
    column into daily case counts.

    Each detail string is normalised so that its date reads ``YYYY-M-D``;
    the date and the first two integers that follow it are then extracted.
    Strings without a date followed by two integers are skipped, as are
    missing values.

    :param data_name: base name (without ``.csv``) of the file to download.
    :return: DataFrame indexed by date (ascending) with integer columns
        ``local_daily_positive_cases`` and ``local_asymptomatic_cases``.
    '''
    print('reading [shanghai] data ......')
    url = f'https://gitee.com/gzjzg/whale-pkg/raw/master/{data_name}.csv'
    # Select the column once; drop missing entries so the string methods
    # below never see NaN.
    details = pd.read_csv(url, encoding='gbk')['detail'].dropna().astype(str).sort_values()

    columns = ['local_daily_positive_cases', 'local_asymptomatic_cases']
    # group 1: date, group 3: first count, group 5: second count
    pattern = re.compile(r'(\d{4}-\d{1,2}-\d{1,2})(.*?)(\d+)(.*?)(\d+)')

    dates = []
    rows = []
    for text in details:
        # Rewrite Chinese / slash-separated dates as YYYY-M-D for the regex.
        text = text.replace("年", "-").replace("月", "-").replace("日", " ").replace("/", "-").strip()
        match = pattern.search(text)
        if match is None:
            # No usable information on this line.
            continue
        dates.append(pd.to_datetime(match.group(1)))
        rows.append([int(match.group(3)), int(match.group(5))])

    # Build the frame in one go instead of appending row by row.
    cases = pd.DataFrame(rows, columns=columns, index=pd.DatetimeIndex(dates)).astype(int)
    print('finish reading')
    return cases.sort_index()

In [9]:
# Load the Shanghai table and display it. The recorded output of this cell
# shows the download failing with HTTP 404.
ts_shanghai_covid = GET_shanghai_data()
ts_shanghai_covid

reading [shanghai] data ......


HTTPError: HTTP Error 404: Not Found

In [1]:
def ts_process_CHINA(time_series_covid19_confirmed_global):
    '''
    Daily new confirmed cases for each Chinese province.

    :param time_series_covid19_confirmed_global: global confirmed-cases
        table with 'Province/State', 'Country/Region', 'Lat', 'Long' columns
        followed by one column per date.
    :return: DataFrame with one row per date and one column per province
        (the 'Unknown' province is excluded). The first row holds the first
        date's values as given in the input; every later row holds the
        difference from the previous date. Columns are ordered by the last
        row's value, largest first.
    '''
    source = time_series_covid19_confirmed_global
    # Keep the Chinese rows and drop the non-date metadata columns, then
    # transpose so that dates become rows.
    is_china = source['Country/Region'] == 'China'
    keep_columns = ~source.columns.isin(['Country/Region', 'Lat', 'Long'])
    by_date = source.loc[is_china, keep_columns].T

    # The first transposed row carries the province names: promote it to the
    # column labels and remove it from the data.
    by_date = by_date.rename(columns=by_date.iloc[0]).drop(by_date.index[0])

    # Remove column 'Unknown'.
    by_date = by_date.loc[:, by_date.columns != 'Unknown']

    # diff() leaves the first date empty, so keep the original first-day
    # values and stack the remaining differences underneath.
    firstday = pd.DataFrame(by_date.iloc[0]).T
    daily_new = by_date.diff().iloc[1:, :]

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    result = pd.concat([firstday, daily_new])
    # Sort columns by the last day's new confirmed cases.
    result = result.sort_values(by=result.last_valid_index(), axis=1, ascending=False)

    return result

In [40]:
# Per-province daily new confirmed cases for China; the bare name on the last
# line renders the table as the cell output.
df = ts_process_CHINA(time_series_covid19_confirmed_global)
df

Unnamed: 0,Shanghai,Hong Kong,Jilin,Guangdong,Fujian,Zhejiang,Heilongjiang,Yunnan,Guangxi,Sichuan,...,Ningxia,Hubei,Inner Mongolia,Hunan,Guizhou,Gansu,Tianjin,Tibet,Xinjiang,Chongqing
1/22/20,,,,,,,,,,,...,,,,,,,,,,
1/23/20,7,2,1,6,4,17,2,1,3,3,...,0,0,0,5,2,2,0,0,2,3
1/24/20,4,0,2,21,5,16,2,3,18,7,...,1,105,1,15,0,0,4,0,0,18
1/25/20,13,3,1,25,8,19,5,6,0,13,...,1,212,6,19,1,2,2,0,1,30
1/26/20,7,3,0,33,17,42,6,5,13,16,...,1,297,0,26,1,3,4,0,1,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4/9/22,1015,2535,242,17,12,16,1,1,5,6,...,0,1,0,0,0,0,1,0,0,0
4/10/22,917,1921,187,22,9,9,0,2,4,10,...,0,0,0,0,0,0,0,0,0,0
4/11/22,998,1407,171,20,7,11,1,1,5,7,...,0,1,1,0,0,0,0,0,0,0
4/12/22,1190,1433,233,26,8,12,2,1,5,1,...,0,1,0,2,0,0,1,0,0,0


In [12]:
def ts_process_US(time_series_covid19_confirmed_US):
    '''
    Daily new confirmed cases for each US state / territory.

    :param time_series_covid19_confirmed_US: US confirmed-cases table with a
        'Province_State' column, identifier / coordinate columns, and one
        numeric column per date.
    :return: DataFrame with one row per date and one column per
        'Province_State' value, holding the difference from the previous
        date. The first row is NaN because it has no predecessor.
    '''
    # Aggregate the rows of each state. numeric_only=True keeps text columns
    # out of the sum; otherwise, on pandas >= 2.0, they are concatenated and
    # later break diff().
    per_state = time_series_covid19_confirmed_US.groupby(['Province_State']).sum(numeric_only=True)
    # Drop the summed identifier / coordinate columns, then put dates on rows.
    metadata_columns = ['UID', 'code3', 'FIPS', 'Lat', 'Long_']
    per_date = per_state.loc[:, ~per_state.columns.isin(metadata_columns)].T
    # New confirmed cases = day-over-day difference of the totals.
    return per_date.diff()

In [14]:
# Display the per-state daily new confirmed cases for the US.
ts_process_US(time_series_covid19_confirmed_US)

Province_State,Alabama,Alaska,American Samoa,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,Diamond Princess,...,Tennessee,Texas,Utah,Vermont,Virgin Islands,Virginia,Washington,West Virginia,Wisconsin,Wyoming
1/22/20,,,,,,,,,,,...,,,,,,,,,,
1/23/20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1/24/20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1/25/20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1/26/20,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4/13/22,156.0,1303.0,10.0,2777.0,127.0,939.0,1656.0,691.0,151.0,0.0,...,277.0,-10061.0,0.0,310.0,35.0,1217.0,4695.0,25.0,860.0,0.0
4/14/22,188.0,0.0,92.0,0.0,89.0,7074.0,1800.0,407.0,226.0,0.0,...,320.0,3450.0,740.0,346.0,36.0,1094.0,0.0,130.0,901.0,0.0
4/15/22,220.0,0.0,0.0,0.0,75.0,4602.0,1831.0,1193.0,245.0,0.0,...,453.0,684.0,0.0,274.0,0.0,1538.0,2794.0,117.0,873.0,0.0
4/16/22,0.0,0.0,0.0,0.0,88.0,0.0,0.0,0.0,0.0,0.0,...,455.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0
