## Background 

In [1]:
## Explanation of Covid-19 and what we plan to do here

## Import Libraries

In [2]:
import os
import sys
import re
import numpy as np
import pandas as pd
from collections import defaultdict
import datetime as dt
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
from plotly import tools
import plotly.offline as py
import plotly.express as px
import plotly.graph_objs as go
import calmap 
import folium
import warnings
# Suppress every warning for tidier cell output.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below, so API breakage can go unnoticed until it becomes an error.
warnings.filterwarnings('ignore')
# Enable plotly's offline rendering inside the notebook.
py.init_notebook_mode(connected=True)
#color = sns.color_palette()
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [3]:
## Check files 
# !ls ../data/csse_covid_19_data/csse_covid_19_time_series

In [4]:
# Settings: directory (relative to the notebook's working directory) that
# contains the time-series CSV files read in the "Import data" section.
file_path = "../data/csse_covid_19_data/csse_covid_19_time_series"

### Links:
[1] https://www.kaggle.com/khoongweihao/covid-19-novel-coronavirus-eda-forecasting-cases

[2] https://www.kaggle.com/sudalairajkumar/covid-19-analysis-of-usa

## Import data

In [5]:
# Read the confirmed, deaths and recovered time-series tables from `file_path`.
confirmed_csv = os.path.join(file_path, "time_series_19-covid-Confirmed.csv")
deaths_csv = os.path.join(file_path, "time_series_19-covid-Deaths.csv")
recovered_csv = os.path.join(file_path, "time_series_19-covid-Recovered.csv")

df_confirmed = pd.read_csv(confirmed_csv)
df_deaths = pd.read_csv(deaths_csv)
df_recovered = pd.read_csv(recovered_csv)

In [6]:
# Give the two location columns shorter names on all three tables.
column_aliases = {'Country/Region': 'Country', 'Province/State': 'State'}
for frame in (df_confirmed, df_recovered, df_deaths):
    # In-place so the existing df_* names keep pointing at the renamed frames.
    frame.rename(columns=column_aliases, inplace=True)

In [7]:
# Show the first three rows of the (still wide-format) confirmed table.
print("Snippet of our data sample ")
df_confirmed.head(3)

Snippet of our data sample 


Unnamed: 0,State,Country,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,3/7/20,3/8/20,3/9/20,3/10/20,3/11/20,3/12/20,3/13/20,3/14/20,3/15/20,3/16/20
0,,Thailand,15.0,101.0,2,3,5,7,8,8,...,50,50,50,53,59,70,75,82,114,147
1,,Japan,36.0,138.0,2,1,2,2,4,4,...,461,502,511,581,639,639,701,773,839,825
2,,Singapore,1.2833,103.8333,0,1,3,3,4,5,...,138,150,150,160,178,178,200,212,226,243


## Data Processing 

In [8]:
# Reshape each wide table (one column per date) into long format: one row per
# location and date, with the count stored in a column named after the metric.
location_cols = ['State', 'Country', 'Lat', 'Long']

df_confirmed_processed = df_confirmed.melt(id_vars=location_cols,
                                           var_name='Date',
                                           value_name='Confirmed')

df_recovered_processed = df_recovered.melt(id_vars=location_cols,
                                           var_name='Date',
                                           value_name='Recovered')

df_deaths_processed = df_deaths.melt(id_vars=location_cols,
                                     var_name='Date',
                                     value_name='Deaths')

In [9]:
# Report the (rows, columns) shape of each long-format table.
confirmed_shape = df_confirmed_processed.shape
recovered_shape = df_recovered_processed.shape
deaths_shape = df_deaths_processed.shape

print(f"Reshaped table for confirmed cases is of dimension : {confirmed_shape}")
print(f"Reshaped table for recovered cases is of dimension : {recovered_shape}")
print(f"Reshaped table for death cases is of dimension : {deaths_shape}")

Reshaped table for confirmed cases is of dimension : (25465, 6)
Reshaped table for recovered cases is of dimension : (25465, 6)
Reshaped table for death cases is of dimension : (25465, 6)


In [10]:
# Inspect the column dtypes of the long-format confirmed table.
df_confirmed_processed.dtypes

State         object
Country       object
Lat          float64
Long         float64
Date          object
Confirmed      int64
dtype: object

In [11]:
# Build the master table: join confirmed -> recovered -> deaths on the shared
# location/date key so every row carries all three counts.
merge_keys = ['State', 'Country', 'Lat', 'Long', 'Date']
confirmed_recovered = df_confirmed_processed.merge(df_recovered_processed, on=merge_keys)
df_master = confirmed_recovered.merge(df_deaths_processed, on=merge_keys)

We can create a derived column called 'Active' to track how many infections are still ongoing. It is computed as:
**Confirmed - Recovered - Deaths**

In [12]:
# Derive the 'Active' column: confirmed cases not yet recovered or deceased.
df_master['Active'] = (
    df_master['Confirmed']
    .sub(df_master['Recovered'])
    .sub(df_master['Deaths'])
)

In [13]:
# Convert the date column to datetime. The values originate from the CSV column
# headers (e.g. '1/22/20'), so the month/day/two-digit-year layout is stated
# explicitly instead of relying on pandas' format inference.
df_master['Date'] = pd.to_datetime(df_master['Date'], format='%m/%d/%y')

In [14]:
# Preview the first four rows of the master table.
df_master.head(4)

Unnamed: 0,State,Country,Lat,Long,Date,Confirmed,Recovered,Deaths,Active
0,,Thailand,15.0,101.0,2020-01-22,2,0,0,2
1,,Japan,36.0,138.0,2020-01-22,2,0,0,2
2,,Singapore,1.2833,103.8333,2020-01-22,0,0,0,0
3,,Nepal,28.1667,84.25,2020-01-22,0,0,0,0


In [15]:
# Report the (rows, columns) shape of the master table.
print(f"Master table is of dimension : {df_master.shape}")

Master table is of dimension : (25465, 9)


## Visualization

In [16]:
# color palette used by the charts in this section
cnf = '#393e46' # confirmed - dark grey
dth = '#ff2e63' # death - red
rec = '#21bf73' # recovered - green
act = '#fe9801' # active case - orange

### What is the latest tally of Covid-19 globally?

In [17]:
# Global totals per date, then keep only the row for the most recent date.
# The metric columns are selected with a list: indexing a groupby with a bare
# tuple of names was deprecated in pandas 1.0 and raises in pandas 2.x.
temp = df_master.groupby('Date')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()
temp = temp[temp['Date'] == temp['Date'].max()].reset_index(drop=True)
temp.style.background_gradient(cmap='Set1')

Unnamed: 0,Date,Confirmed,Deaths,Recovered,Active
0,2020-03-16 00:00:00,181546,7126,78088,96332


In [18]:
# Treemap of the latest global tally, split into Active / Deaths / Recovered.
tm = temp.melt(id_vars="Date", value_vars=['Active', 'Deaths', 'Recovered'])
# Bind every category to its palette colour by name. A positional colour
# sequence is not tied to the category names, so the previous list painted the
# tiles with the wrong colours (e.g. 'Active' in the 'recovered' colour).
fig = px.treemap(tm, path=["variable"], values="value", height=400, width=600,
                 color="variable",
                 color_discrete_map={'Active': act, 'Deaths': dth, 'Recovered': rec})
fig.show()

At this juncture, roughly half of the confirmed cases worldwide are still active. It looks like we are still in the early stages of this pandemic!

### How have cases of Covid-19 trended globally?

In [19]:
# Daily global totals for each metric (one row per date).
# Columns are selected with a list; tuple-style selection on a groupby is
# rejected by pandas 2.x.
temp = df_master.groupby('Date')[['Confirmed', 'Recovered', 'Deaths', 'Active']].sum().reset_index()

In [20]:
# Global time series: one line per metric, coloured with the shared palette.
fig = go.Figure()
for metric, colour in (('Confirmed', cnf),
                       ('Recovered', rec),
                       ('Deaths', dth),
                       ('Active', act)):
    fig.add_trace(go.Scatter(x=temp['Date'],
                             y=temp[metric],
                             mode='lines+markers',
                             name=metric,
                             marker=dict(color=colour)))
fig.update_layout(title='Global Trend for Covid-19 Cases')
fig.show()

We notice that the global number of confirmed cases rose exponentially in January and the first week of February. The rate of increase
generally slowed around the second week of February, after which we notice a secondary wave of infections. 


### How have cases of Covid-19 trended in China as opposed to Rest of the World?

In [21]:
# Daily totals for China only (summed over its provinces).
# Columns are selected with a list; tuple-style selection on a groupby is
# rejected by pandas 2.x.
is_china = df_master['Country'] == 'China'
temp = df_master[is_china].groupby('Date')[['Confirmed', 'Recovered', 'Deaths', 'Active']].sum().reset_index()

In [22]:
# China time series: one line per metric, coloured with the shared palette.
fig = go.Figure()
for metric, colour in (('Confirmed', cnf),
                       ('Recovered', rec),
                       ('Deaths', dth),
                       ('Active', act)):
    fig.add_trace(go.Scatter(x=temp['Date'],
                             y=temp[metric],
                             mode='lines+markers',
                             name=metric,
                             marker=dict(color=colour)))
fig.update_layout(title='China Trend for Covid-19 Cases')
fig.show()

In [23]:
# Daily totals for every country except China, plotted as one line per metric.
# Columns are selected with a list; tuple-style selection on a groupby is
# rejected by pandas 2.x.
is_row = df_master['Country'] != 'China'
temp = df_master[is_row].groupby('Date')[['Confirmed', 'Recovered', 'Deaths', 'Active']].sum().reset_index()

fig = go.Figure()
# One trace per metric instead of four copy-pasted Scatter definitions.
for metric, colour in (('Confirmed', cnf),
                       ('Recovered', rec),
                       ('Deaths', dth),
                       ('Active', act)):
    fig.add_trace(go.Scatter(x=temp['Date'],
                             y=temp[metric],
                             mode='lines+markers',
                             name=metric,
                             marker=dict(color=colour)))
fig.update_layout(title='Rest of World Trend for Covid-19 Cases')
fig.show()

We notice that China has largely contained its outbreak, with recovered cases closely approaching the confirmed count. On the other hand, 
the rest of the world is only in the initial phase, with daily confirmed numbers still increasing at an exponential rate. 

### Can we see the trends for some European Countries in the Covid-19 outbreak?

In [24]:
# Daily totals for Italy, plotted as one line per metric.
# Columns are selected with a list; tuple-style selection on a groupby is
# rejected by pandas 2.x.
is_italy = df_master['Country'] == 'Italy'
temp = df_master[is_italy].groupby('Date')[['Confirmed', 'Recovered', 'Deaths', 'Active']].sum().reset_index()

fig = go.Figure()
# One trace per metric instead of four copy-pasted Scatter definitions.
for metric, colour in (('Confirmed', cnf),
                       ('Recovered', rec),
                       ('Deaths', dth),
                       ('Active', act)):
    fig.add_trace(go.Scatter(x=temp['Date'],
                             y=temp[metric],
                             mode='lines+markers',
                             name=metric,
                             marker=dict(color=colour)))
fig.update_layout(title='Italy Trend for Covid-19 Cases')
fig.show()

In [25]:
# Daily totals for Spain, plotted as one line per metric.
# Columns are selected with a list; tuple-style selection on a groupby is
# rejected by pandas 2.x.
is_spain = df_master['Country'] == 'Spain'
temp = df_master[is_spain].groupby('Date')[['Confirmed', 'Recovered', 'Deaths', 'Active']].sum().reset_index()

fig = go.Figure()
# One trace per metric instead of four copy-pasted Scatter definitions.
for metric, colour in (('Confirmed', cnf),
                       ('Recovered', rec),
                       ('Deaths', dth),
                       ('Active', act)):
    fig.add_trace(go.Scatter(x=temp['Date'],
                             y=temp[metric],
                             mode='lines+markers',
                             name=metric,
                             marker=dict(color=colour)))
fig.update_layout(title='Spain Trend for Covid-19 Cases')
fig.show()

### How has the trend been for countries with strict contact tracing and early government interventions (Singapore and Taiwan)?

In [26]:
# Daily totals for Singapore, plotted as one line per metric.
# Columns are selected with a list; tuple-style selection on a groupby is
# rejected by pandas 2.x.
is_singapore = df_master['Country'] == 'Singapore'
temp = df_master[is_singapore].groupby('Date')[['Confirmed', 'Recovered', 'Deaths', 'Active']].sum().reset_index()

fig = go.Figure()
# One trace per metric instead of four copy-pasted Scatter definitions.
for metric, colour in (('Confirmed', cnf),
                       ('Recovered', rec),
                       ('Deaths', dth),
                       ('Active', act)):
    fig.add_trace(go.Scatter(x=temp['Date'],
                             y=temp[metric],
                             mode='lines+markers',
                             name=metric,
                             marker=dict(color=colour)))
fig.update_layout(title='Singapore Trend for Covid-19 Cases')
fig.show()

In [27]:
# Daily totals for Taiwan (listed as 'Taiwan*' in the Country column),
# plotted as one line per metric.
# Columns are selected with a list; tuple-style selection on a groupby is
# rejected by pandas 2.x.
is_taiwan = df_master['Country'] == 'Taiwan*'
temp = df_master[is_taiwan].groupby('Date')[['Confirmed', 'Recovered', 'Deaths', 'Active']].sum().reset_index()

fig = go.Figure()
# One trace per metric instead of four copy-pasted Scatter definitions.
for metric, colour in (('Confirmed', cnf),
                       ('Recovered', rec),
                       ('Deaths', dth),
                       ('Active', act)):
    fig.add_trace(go.Scatter(x=temp['Date'],
                             y=temp[metric],
                             mode='lines+markers',
                             name=metric,
                             marker=dict(color=colour)))
fig.update_layout(title='Taiwan Trend for Covid-19 Cases')
fig.show()

### How did the virus progress from China to rest of the world?

In [28]:
## Extract the top 20 countries by confirmed cases on the latest available date
temp = df_master.copy()
temp = temp[temp['Date'] == temp['Date'].max()].reset_index(drop=True)
# Columns are selected with a list; tuple-style selection on a groupby is
# rejected by pandas 2.x.
top20_df = (
    temp.groupby(['Country'])[['Confirmed', 'Recovered', 'Deaths', 'Active']]
    .sum()
    .sort_values(by='Confirmed', ascending=False)
    .head(20)
)
# Country names (the groupby index) of the 20 largest outbreaks.
top20_countries = top20_df.index.values

In [29]:
# Aggregate at country, date level and keep only the top 20 countries.
# Columns are selected with a list; tuple-style selection on a groupby is
# rejected by pandas 2.x.
temp = df_master.groupby(['Country', 'Date'])[['Confirmed', 'Recovered', 'Deaths', 'Active']].sum().reset_index()
is_top20 = temp['Country'].isin(top20_countries)
# .copy() so later column assignments act on an independent frame rather than
# on a filtered slice of the aggregate.
temp = temp[is_top20].copy()

In [30]:
# Prepare the animation key: order rows chronologically and render the date as
# a 'YYYY-MM-DD' string.
# - The sorted frame is assigned back; previously the sort result was discarded.
# - No `format` is passed: 'Date' is already datetime on the first run, and a
#   fixed '%m/%d/%Y' pattern would reject the ISO strings if this cell is re-run.
temp = temp.sort_values(by=["Date", "Country"]).reset_index(drop=True)
temp["Date"] = pd.to_datetime(temp["Date"]).dt.strftime("%Y-%m-%d")

In [31]:
# Animated bar chart of deaths per country, one frame per date.
# px.bar builds the figure itself, so no empty go.Figure() is created first.
fig = px.bar(temp,
             x="Country",
             y="Deaths",
             color="Country",
             animation_frame="Date",
             animation_group="Country",
             title="Time Lapse of Covid-19 Deaths")
fig.show()

In [32]:
# Animated bar chart of active cases per country, one frame per date.
# px.bar builds the figure itself, so no empty go.Figure() is created first.
fig = px.bar(temp,
             x="Country",
             y="Active",
             color="Country",
             animation_frame="Date",
             animation_group="Country",
             title="Time Lapse of Covid-19 Active Cases")
fig.show()