# Data Processing and analysis

In [4]:
# import packages

# data processing
import pandas as pd
import numpy as np
from datetime import timedelta, datetime


import re

# data visualization
from tabulate import tabulate
import plotly.graph_objs as go
from plotly.graph_objs import Bar, Layout
from plotly import offline
import matplotlib.pyplot as plt
import seaborn as sns
# Set the style and the default figure size in ONE call. Every sns.set() call
# re-applies all theme defaults, so a separate sns.set(rc=...) issued afterwards
# would silently reset the style from "whitegrid" back to the default "darkgrid".
sns.set(style="whitegrid", rc={'figure.figsize': (11.7, 8.27)})

# Keep these after sns.set(), which applies its own theme-level rc settings.
plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font so Chinese labels render correctly
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly

# change text color
import colorama
from colorama import Fore, Style

# IPython display helpers

from IPython.display import IFrame

## data processing
First, load the data using the functions we wrote in `getdata1.py`.

time series data:
- time_series_covid19_confirmed_US
- time_series_covid19_confirmed_global
- time_series_covid19_deaths_US
- time_series_covid19_deaths_global
- time_series_covid19_recovered_global

cross sectional data:
- csse_covid_19_daily_reports
- csse_covid_19_daily_reports_us

In [5]:
from getdata1 import *

# Cross-sectional daily reports: each call is unpacked into a latest_* / prev_* pair.
latest_data_global, prev_data_global = GET_csse_covid_19_daily_reports_my('global')
latest_data_us, prev_data_us = GET_csse_covid_19_daily_reports_my('us')

# Time series tables, unpacked in the order the loader returns them.
(
    time_series_covid19_confirmed_US,
    time_series_covid19_confirmed_global,
    time_series_covid19_deaths_US,
    time_series_covid19_deaths_global,
    time_series_covid19_recovered_global,
) = GET_csse_covid_19_time_series_my()

# Shanghai time series; includes the trend of new asymptomatic cases in
# Shanghai over the last 10 days.
ts_shanghai_covid = GET_shanghai_data_my()


reading [cross sectional] data ......
finish reading
reading [cross sectional] data ......
finish reading
reading [time series] data ......
finish reading
reading [shanghai] data ......
finish reading


### 1. Extract the time series data for China from the global data

ts_process_CHINA_my

In [66]:
def ts_process_CHINA_my(time_series_covid19_confirmed_global):
    '''Daily new cases per Chinese province, derived from the global time series.

    Parameters
    ----------
    time_series_covid19_confirmed_global : pandas.DataFrame
        Wide table with 'Province/State', 'Country/Region', 'Lat' and 'Long'
        columns; every other column is treated as one date of counts.
        Assumes 'Province/State' is the first column left once
        'Country/Region', 'Lat' and 'Long' are removed.

    Returns
    -------
    pandas.DataFrame
        One row per date, one column per province (the 'Unknown' column is
        removed). The first row keeps the first date's values unchanged; each
        later row is the difference from the previous date. Columns are
        ordered by the value in the last row, largest first.
    '''
    source = time_series_covid19_confirmed_global

    # Keep only the rows for China and drop the country / coordinate columns,
    # then transpose so that dates run down the rows.
    is_china = source['Country/Region'] == 'China'
    keep_cols = ~source.columns.isin(['Country/Region', 'Lat', 'Long'])
    cn = source.loc[is_china, keep_cols].T

    # After the transpose the first row holds the province names: promote it
    # to the column header and remove it from the data.
    cn = cn.rename(columns=cn.iloc[0]).drop(cn.index[0])

    # Remove the 'Unknown' province column.
    cn = cn.loc[:, cn.columns != 'Unknown']

    # Keep the first day's values as-is and difference every later day.
    # pd.concat is used because DataFrame.append was removed in pandas 2.0.
    first_day = cn.iloc[[0]]
    daily_new = cn.diff().iloc[1:, :]
    result = pd.concat([first_day, daily_new])

    # Sort columns by the last day's new confirmed cases.
    result = result.sort_values(by=result.last_valid_index(), axis=1, ascending=False)

    return result

### 2. we get data from United States (in states)
We use a single function, `ts_process_US_my(ts_US, death=False, clip=False)`:
- if death = False, we use time_series_covid19_confirmed_US
- otherwise, we use time_series_covid19_deaths_US

Since the daily increase can be negative, pass `clip=True` to floor such values at 0.

In [20]:
# take a look at the datasets we will use
# time_series_covid19_confirmed_US.head()

In [21]:
# time_series_covid19_deaths_US.head()

In [16]:
def ts_process_US_my(ts_US, death = False, clip = False):
    '''Daily new confirmed/death cases per US state.

    Parameters
    ----------
    ts_US : pandas.DataFrame
        Wide table with a 'Province_State' column. After grouping, the
        'UID', 'code3', 'FIPS', 'Lat' and 'Long_' columns are discarded and
        every remaining numeric column is treated as one date of counts.
    death : bool, default False
        When True, the 'Population' column is discarded as well.
    clip : bool, default False
        When True, negative values in the result are replaced by 0.

    Returns
    -------
    pandas.DataFrame
        One row per date, one column per state. The first row keeps the first
        date's per-state totals unchanged; each later row is the difference
        from the previous date.
    '''
    non_date_cols = ['UID', 'code3', 'FIPS', 'Lat', 'Long_']
    if death:
        non_date_cols.append('Population')

    # Sum all rows of a state. numeric_only=True keeps text columns out of the
    # result: since pandas 2.0 the default would concatenate the strings and
    # keep them, and they would then turn into rows that break .diff() below.
    by_state = ts_US.groupby(['Province_State']).sum(numeric_only=True)
    by_state = by_state.loc[:, ~by_state.columns.isin(non_date_cols)].T

    # Keep the first day's values as-is and difference every later day.
    # pd.concat is used because DataFrame.append was removed in pandas 2.0.
    first_day = by_state.iloc[[0]]
    daily_new = pd.concat([first_day, by_state.diff().iloc[1:, :]])

    if clip:
        daily_new = daily_new.clip(lower=0)
    return daily_new

### 3. only extract China and US from global cross sectional data
can use datasets:
- latest_data_global, prev_data_global
- latest_data_us, prev_data_us

In [46]:
# Preview the first rows of the US cross-sectional data.
latest_data_us.head()

Unnamed: 0,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,Total_Test_Results,People_Hospitalized,Case_Fatality_Ratio,UID,ISO3,Testing_Rate,Hospitalization_Rate
0,Alabama,US,2022-04-21 04:31:50,32.3182,-86.9023,1298626,19524,,,1.0,26485.355947,7543861.0,,1.503435,84000001.0,USA,153856.340318,
1,Alaska,US,2022-04-21 04:31:50,61.3707,-152.4044,251506,1248,,,2.0,34380.113322,,,0.496211,84000002.0,USA,,
2,American Samoa,US,2022-04-21 04:31:50,-14.271,-170.132,5663,21,,,60.0,10177.746626,,,0.370828,16.0,ASM,,
3,Arizona,US,2022-04-21 04:31:50,33.7298,-111.4312,2019174,29852,,,4.0,27740.795528,19077981.0,,1.478426,84000004.0,USA,262106.371219,
4,Arkansas,US,2022-04-21 04:31:50,34.9697,-92.3731,834553,11360,,,5.0,27654.314197,5101244.0,,1.361208,84000005.0,USA,169038.280816,


In [48]:
def daily_process_my(daily_data, country = 'China'):
    '''Reduce a daily report to its key columns, indexed by province/state.

    Parameters
    ----------
    daily_data : pandas.DataFrame
        Daily report containing at least a 'Province_State' column (and a
        'Country_Region' column when country == 'China').
    country : str, default 'China'
        When 'China', only rows whose 'Country_Region' is 'China' are kept and
        rows whose 'Province_State' is 'Unknown' are removed. Any other value
        (e.g. 'us') leaves the rows unfiltered.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Province_State', holding whichever of 'Last_Update',
        'Confirmed', 'Deaths', 'Incident_Rate' and 'Case_Fatality_Ratio' are
        present, in their original column order.
    '''
    if country == 'China':
        in_country = daily_data['Country_Region'] == country
        # The placeholder province is spelled 'Unknown'; the previous
        # comparison against 'Unkown' never matched, so the row was kept.
        known_province = daily_data['Province_State'] != 'Unknown'
        daily_data = daily_data.loc[in_country & known_province]

    wanted = ['Province_State', 'Last_Update', 'Confirmed', 'Deaths',
              'Incident_Rate', 'Case_Fatality_Ratio']
    daily_data = daily_data.loc[:, daily_data.columns.isin(wanted)]

    # Set the province as index by name, so the result does not depend on
    # 'Province_State' being the first of the selected columns.
    return daily_data.set_index('Province_State')

In [49]:
# Process the US data; country='us' skips the China-only row filtering.
daily_process_my(latest_data_us, country='us')

Unnamed: 0_level_0,Last_Update,Confirmed,Deaths,Incident_Rate,Case_Fatality_Ratio
Province_State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama,2022-04-21 04:31:50,1298626,19524,26485.355947,1.503435
Alaska,2022-04-21 04:31:50,251506,1248,34380.113322,0.496211
American Samoa,2022-04-21 04:31:50,5663,21,10177.746626,0.370828
Arizona,2022-04-21 04:31:50,2019174,29852,27740.795528,1.478426
Arkansas,2022-04-21 04:31:50,834553,11360,27654.314197,1.361208
California,2022-04-21 04:31:50,9172767,89838,23215.011213,0.979399
Colorado,2022-04-21 04:31:50,1371521,12030,23816.354839,0.877128
Connecticut,2022-04-21 04:31:50,749783,10825,21030.088181,1.443751
Delaware,2022-04-21 04:31:50,260735,2896,26775.995005,1.110706
Diamond Princess,2022-04-21 04:31:50,49,0,,0.0
