In [None]:
import os
import warnings

import numpy as np
import pandas as pd
import folium
from folium import plugins
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
from tqdm.notebook import tqdm as tqdm

pio.templates.default = "plotly_dark"
warnings.filterwarnings('ignore')

**Cleaned Data**

Here we visualize the data, thanks to the author of the following dataset:
[COVID-19 Complete Dataset (Updated every 24hrs)](https://www.kaggle.com/imdevskp/corona-virus-report) - Day to day country wise no. of cases (Doesn't have County/State/Province level data)

In [None]:
# Location of the "covid_19_clean_complete.csv" file. The default is the
# original author's local copy; set the COVID19_DATA_PATH environment variable
# to run the notebook on another machine without editing this cell.
DATA_PATH = os.environ.get(
    'COVID19_DATA_PATH',
    '/Users/thientrangbui/Dropbox/Kaggle competion/covid19-global-forecasting-week-1/covid_19_clean_complete.csv',
)

# Parse 'Date' while loading so later cells can group and filter on timestamps.
cleaned_data = pd.read_csv(DATA_PATH, parse_dates=['Date'])
cleaned_data.head()

In [None]:
# (number of rows, number of columns) of the loaded dataset
cleaned_data.shape

In [None]:
# Recompute the number of active cases from the confirmed / deaths / recovered
# counts so the four case columns are consistent with each other.
cases = ['Confirmed', 'Deaths', 'Recovered', 'Active']
base_cases = ['Confirmed', 'Deaths', 'Recovered']

# Fill missing values BEFORE deriving 'Active'. If the subtraction ran first,
# a single missing component would make 'Active' NaN, and filling that NaN
# with 0 afterwards would silently report zero active cases for the row.
cleaned_data[['Province/State']] = cleaned_data[['Province/State']].fillna('')
cleaned_data[base_cases] = cleaned_data[base_cases].fillna(0)

# Active = Confirmed - Deaths - Recovered
cleaned_data['Active'] = cleaned_data['Confirmed'] - cleaned_data['Deaths'] - cleaned_data['Recovered']

cleaned_data.head()

In [None]:
# Last rows of the dataset, displayed after the 'Active' column has been recomputed
cleaned_data.tail()

**Here we consider the COVID-19 situation across the whole world up to 27th July 2020**

In [None]:
# Number of distinct Country/Region values (a missing value, if any, counts as one)
cleaned_data['Country/Region'].nunique(dropna=False)

Let's look at the Confirmed, Deaths, Recovered and Active cases worldwide for every Country/Region, and then dig deeper into the top 5 Countries/Regions.

In [None]:
def _country_totals(data, column):
    """Sum ``column`` of ``data`` per 'Country/Region'.

    Returns a DataFrame with the summed ``column`` followed by a
    'Country/Region' label column, indexed 1..N where N is the number of
    countries/regions actually present (no hard-coded country count).
    """
    totals = pd.DataFrame(data.groupby('Country/Region')[column].sum())
    totals['Country/Region'] = totals.index
    # Derive the running index from the data instead of assuming 187 countries.
    totals.index = np.arange(1, len(totals) + 1)
    return totals


# NOTE(review): these sums run over every row of a country, i.e. over all
# dates. If the daily counts are cumulative (the mortality-rate cell, which
# uses only the latest date as the country total, suggests so), this over-counts
# and a latest-date snapshot would be the real total. Left unchanged because
# the written commentary is based on these numbers; confirm with the author.

# total number of confirmed cases in each country/region
confirmiedcases = _country_totals(cleaned_data, 'Confirmed')

# total number of death cases in each country/region
Deathcases = _country_totals(cleaned_data, 'Deaths')

# total number of recovered cases in each country/region
Recoveredcases = _country_totals(cleaned_data, 'Recovered')

# total number of active cases in each country/region
Activecases = _country_totals(cleaned_data, 'Active')

In [None]:
# Last rows of the per-country confirmed totals
confirmiedcases.tail()

In [None]:
# Per-country views with the label column first and the case count second,
# one frame per case type; these feed the bar charts.
global_confirmiedcases = confirmiedcases.loc[:, ['Country/Region', 'Confirmed']]
global_Recoveredcases = Recoveredcases.loc[:, ['Country/Region', 'Recovered']]
global_Deathcases = Deathcases.loc[:, ['Country/Region', 'Deaths']]
global_Activecases = Activecases.loc[:, ['Country/Region', 'Active']]

In [None]:
# 20 countries with the highest 'Confirmed' value, reversed so the largest is the last row
top20_confirmed = (
    global_confirmiedcases
    .sort_values('Confirmed', ascending=False)
    .iloc[:20]
    .iloc[::-1]
)
fig = px.bar(
    top20_confirmed,
    x='Confirmed',
    y='Country/Region',
    orientation='h',
    text='Confirmed',
    title='Confirmed Cases Worldwide',
    height=900,
)
fig.show()

In [None]:
# 20 countries with the highest 'Recovered' value, reversed so the largest is the last row
top20_recovered = (
    global_Recoveredcases
    .sort_values('Recovered', ascending=False)
    .iloc[:20]
    .iloc[::-1]
)
fig = px.bar(
    top20_recovered,
    x='Recovered',
    y='Country/Region',
    orientation='h',
    text='Recovered',
    title='Recovered Cases Worldwide',
    height=900,
)
fig.show()

In [None]:
# 20 countries with the highest 'Deaths' value, reversed so the largest is the last row
top20_deaths = (
    global_Deathcases
    .sort_values('Deaths', ascending=False)
    .iloc[:20]
    .iloc[::-1]
)
fig = px.bar(
    top20_deaths,
    x='Deaths',
    y='Country/Region',
    orientation='h',
    text='Deaths',
    title='Death Cases Worldwide',
    height=900,
)
fig.show()

In [None]:
# 20 countries with the highest 'Active' value, reversed so the largest is the last row
top20_active = (
    global_Activecases
    .sort_values('Active', ascending=False)
    .iloc[:20]
    .iloc[::-1]
)
fig = px.bar(
    top20_active,
    x='Active',
    y='Country/Region',
    orientation='h',
    text='Active',
    title='Active Cases Worldwide',
    height=900,
)
fig.show()

**Comments:**

Based on the COVID-19 cases recorded up to 27th July 2020, we can make the following points:

* About the confirmed cases: the US is the country most strongly affected by COVID-19, with the largest number of Confirmed, Deaths and Active cases. However, its number of Recovered cases is also the largest. It is followed by Brazil, Russia, India and Spain. In particular, Brazil's COVID-19 situation is really complicated in this period, as its confirmed cases are increasing quickly day by day. Spain is a European country whose number of confirmed cases is increasing alarmingly even after the quarantine.

* About the recovered cases: comparing the recoveries of the US and Brazil, the two countries most strongly affected by COVID-19, we see that Brazil has a larger number of recovered cases than the US. Spain's recovered cases, however, are followed by those of Iran and Italy, which means that it ranks 8th in the statistics of recovered cases.

* About the death cases: the US is followed by the UK, Brazil, Italy and France. In my view, France raises many questions, because its confirmed/active cases are not among the top 5 countries with the largest number of confirmed/active cases. I suspect there is a significant problem in how the number of confirmed/active cases is recorded in France.

* About the active cases: the US is followed by Brazil, the UK, Russia and India.

**Now let's make visualizations according to the above results**

In [None]:
# Worldwide confirmed / deaths / recovered / active counts summed per day,
# from 22nd Jan to 27th July 2020.
# The columns are selected with a list: selecting several groupby columns with
# a bare tuple is deprecated in pandas 1.x and raises in pandas 2.x.
date_c = cleaned_data.groupby('Date')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()
date_c.head()

In [None]:
# Most recent daily worldwide totals
date_c.tail()

The above table shows the global spread of COVID-19 over time. Counting up to 27th July 2020, the world has over **16M confirmed cases**, of which over 650k are deaths, over 9M are recovered cases and over **6M are active cases**.

In [None]:
# One panel per case type showing the worldwide daily totals from `date_c`.
# `make_subplots` is already imported in the notebook's import cell.
fig = make_subplots(rows=1, cols=4, subplot_titles=("Confirmed", "Deaths", "Recovered", 'Active'))

# (column in date_c, line colour) for each panel, in left-to-right order
panels = [
    ('Confirmed', 'orange'),
    ('Deaths', 'red'),
    ('Recovered', 'green'),
    ('Active', 'blue'),
]

for col_idx, (case, colour) in enumerate(panels, start=1):
    # add_trace(row=, col=) replaces the deprecated append_trace
    fig.add_trace(
        go.Scatter(
            x=date_c['Date'],
            y=date_c[case],
            name=case,
            line_color=colour,
            mode='lines+markers',
            opacity=0.8),
        row=1, col=col_idx)

fig.update_layout(template="plotly_dark",title_text = '<b>Global Spread of the Coronavirus Over Time </b>',
                  font=dict(family="Arial, Balto, Courier New, Droid Sans",color='white'))
fig.show()

In [None]:
def _daily_totals(frame):
    """Sum the four case columns of ``frame`` per 'Date'.

    Returns a DataFrame with 'Date' as a regular column followed by
    'Confirmed', 'Deaths', 'Recovered' and 'Active'.

    The grouping key 'Date' is deliberately NOT part of the column selection
    (summing a datetime column fails on recent pandas), and the selection is a
    list because tuple selection on a groupby is no longer supported.
    """
    return frame.groupby('Date')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()


def _rows_for(country):
    """Return the rows of ``cleaned_data`` for one 'Country/Region' (old index kept as a column)."""
    return cleaned_data[cleaned_data['Country/Region'] == country].reset_index()


# The five countries singled out in the commentary
top5_countries = ['US', 'Brazil', 'Russia', 'India', 'Spain']

grouped_us = _rows_for("US")
grouped_us_date = _daily_totals(grouped_us)

grouped_brazil = _rows_for("Brazil")
grouped_brazil_date = _daily_totals(grouped_brazil)

grouped_russia = _rows_for("Russia")
grouped_russia_date = _daily_totals(grouped_russia)

grouped_india = _rows_for("India")
grouped_india_date = _daily_totals(grouped_india)

grouped_spain = _rows_for("Spain")
grouped_spain_date = _daily_totals(grouped_spain)

# Everything that is not one of the five countries above
grouped_rest = cleaned_data[~cleaned_data['Country/Region'].isin(top5_countries)].reset_index()
grouped_rest_date = _daily_totals(grouped_rest)

In [None]:
# One panel per case type showing the daily US totals from `grouped_us_date`.
fig = make_subplots(rows=1, cols=4, subplot_titles=("Confirmed", "Deaths", "Recovered", 'Active'))

# (column in grouped_us_date, line colour) for each panel, in left-to-right order
us_panels = [
    ('Confirmed', 'orange'),
    ('Deaths', 'red'),
    ('Recovered', 'green'),
    ('Active', 'blue'),
]

for col_idx, (case, colour) in enumerate(us_panels, start=1):
    # add_trace(row=, col=) replaces the deprecated append_trace
    fig.add_trace(
        go.Scatter(x=grouped_us_date['Date'], y=grouped_us_date[case], name=case,
                   line_color=colour, mode='lines+markers', opacity=0.8),
        row=1, col=col_idx)

fig.update_layout(template="plotly_dark",title_text = '<b>Spread of the Coronavirus Over Time in US (TOP 1)</b>',
                  font=dict(family="Arial, Balto, Courier New, Droid Sans",color='white'))
fig.show()


**Comments:** Looking at the charts of the spread of COVID-19 in the US, this country has been heavily affected. Its number of confirmed cases makes up a quarter of the confirmed cases of the whole world, and so does its number of deaths, while its active cases make up almost half of the active cases of the whole world. This alarming COVID-19 situation is presenting a powerful country with many challenges.

In [None]:
# Create the map and display it.
# NOTE(review): newer folium releases appear to no longer bundle the Stamen
# tile sets; if this line fails or the map is blank, switch to another built-in
# tile set. Confirm against the installed folium version.
world_map = folium.Map(location=[10, -20], zoom_start=2.5,tiles='Stamen Toner')

# `grouped_rest` holds one row per location AND per date. Drawing every row
# would stack one marker per day on the same coordinates, so only the most
# recent date is plotted: one marker per location with its latest counts.
latest_rest = grouped_rest[grouped_rest['Date'] == grouped_rest['Date'].max()]

for lat, lon, Confirmed,Deaths,Recovered,name in zip(latest_rest['Lat'], latest_rest['Long'], latest_rest['Confirmed'],latest_rest['Deaths'],latest_rest['Recovered'], latest_rest['Country/Region']):
    folium.CircleMarker([lat, lon],
                        radius=7,
                        popup = ('<strong>Country</strong>: ' + str(name).capitalize() + '<br>'
                                '<strong>Confirmed Cases</strong>: ' + str(Confirmed) + '<br>'
                                '<strong>Recovered Cases</strong>: ' + str(Recovered) +'<br>'
                                '<strong>Deaths Cases</strong>: ' + str(Deaths) +'<br>'),
                        color='red',
                        fill_color='red',
                        fill_opacity=0.7 ).add_to(world_map)
world_map

In [None]:
# Write the folium map to 'world_map.html' (relative path)
world_map.save('world_map.html')

**Comparisons**
How about comparing the cases to better assess the situation?

In [None]:
# Worldwide recovered / deaths / active counts per day.
# List selection is required: tuple selection on a groupby raises in pandas 2.x.
temp = cleaned_data.groupby('Date')[['Recovered', 'Deaths', 'Active']].sum().reset_index()

# Long format (one row per date and case type) so px.area can colour by 'case'
temp = temp.melt(id_vars="Date", value_vars=['Recovered', 'Deaths', 'Active'],
                 var_name='case', value_name='count')

fig = px.area(temp, x="Date", y="count", color='case',
             title='Cases over time: Area Plot', color_discrete_sequence = ['cyan', 'red', 'orange'])
fig.show()

**Mortality and Recovery Rates**
It is worth looking at these stats as well; they may well have a story to tell.

In [None]:
# Country totals on the most recent date in the dataset
cleaned_latest = cleaned_data[cleaned_data['Date'] == cleaned_data['Date'].max()]
# List selection is required: tuple selection on a groupby raises in pandas 2.x.
flg = cleaned_latest.groupby('Country/Region')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()

# Deaths as a percentage of confirmed cases, i.e. deaths per 100 confirmed cases
flg['mortalityRate'] = ((flg['Deaths'] / flg['Confirmed']) * 100).round(2)

# Only keep countries with more than 1000 confirmed cases
temp = flg[flg['Confirmed']>1000]

# The title previously said "per 1000", which did not match the percentage plotted
fig = px.bar(temp.sort_values(by="mortalityRate", ascending=False)[:10][::-1],
             x = 'mortalityRate', y = 'Country/Region',
             title='Deaths per 100 Confirmed Cases', text='mortalityRate', height=800, orientation='h',
             color_discrete_sequence=['darkred']
            )
fig.show()

**COVID-19: Spread Over Time**

In [None]:
# Confirmed / deaths per day and country. Rows are summed so that a country
# split into several provinces/states is represented by its country total
# (a max would only show its largest province), matching the per-country sums
# used elsewhere in the notebook. List selection is required on pandas 2.x.
formated_gdf = cleaned_data.groupby(['Date', 'Country/Region'])[['Confirmed', 'Deaths']].sum()
formated_gdf = formated_gdf.reset_index()

# Format the date as text only after grouping, so rows stay in chronological order
formated_gdf['Date'] = pd.to_datetime(formated_gdf['Date']).dt.strftime('%m/%d/%Y')

# Compress the bubble size so large counts do not dwarf small ones
formated_gdf['size'] = formated_gdf['Confirmed'].pow(0.3)

fig = px.scatter_geo(formated_gdf, locations="Country/Region", locationmode='country names',
                     color="Confirmed", size='size', hover_name="Country/Region",
                     range_color= [0, 1500],
                     projection="natural earth", animation_frame="Date",
                     title='COVID-19: Spread Over Time', color_continuous_scale="portland")
fig.show()

**COVID-19: Deaths Over Time**

In [None]:
# Confirmed / deaths per day and country. Rows are summed so that a country
# split into several provinces/states is represented by its country total
# (a max would only show its largest province), matching the per-country sums
# used elsewhere in the notebook. List selection is required on pandas 2.x.
formated_gdf = cleaned_data.groupby(['Date', 'Country/Region'])[['Confirmed', 'Deaths']].sum()
formated_gdf = formated_gdf.reset_index()

# Format the date as text only after grouping, so rows stay in chronological order
formated_gdf['Date'] = pd.to_datetime(formated_gdf['Date']).dt.strftime('%m/%d/%Y')

# Compress the bubble size so large counts do not dwarf small ones
formated_gdf['size'] = formated_gdf['Deaths'].pow(0.3)

fig = px.scatter_geo(formated_gdf, locations="Country/Region", locationmode='country names',
                     color="Deaths", size='size', hover_name="Country/Region",
                     range_color= [0, 100],
                     projection="natural earth", animation_frame="Date",
                     title='COVID-19: Deaths Over Time', color_continuous_scale="peach")
fig.show()

**Active cases over time**

In [None]:
# Active cases per day and country. Rows are summed so that a country split
# into several provinces/states is represented by its country total (a max
# would only show its largest province).
formated_gdf = cleaned_data.groupby(['Date', 'Country/Region'])['Active'].sum()
formated_gdf = formated_gdf.reset_index()

# Format the date as text only after grouping, so rows stay in chronological order
formated_gdf['Date'] = pd.to_datetime(formated_gdf['Date']).dt.strftime('%m/%d/%Y')

# Compress the bubble size. A negative 'Active' value gives NaN under a
# fractional power, so those sizes fall back to the mean size.
formated_gdf['size'] = formated_gdf['Active'].pow(0.3)
# Assign the result back: fillna(inplace=True) on a column selection is
# chained assignment and may leave the frame unchanged on recent pandas.
formated_gdf['size'] = formated_gdf['size'].fillna(formated_gdf['size'].mean())

fig = px.scatter_geo(formated_gdf, locations="Country/Region", locationmode='country names',
                     color="Active", size='size', hover_name="Country/Region",
                     range_color= [0, 1000],
                     projection="natural earth", animation_frame="Date",
                     title='COVID-19: Active Cases Over Time', color_continuous_scale="portland")
fig.update(layout_coloraxis_showscale=False)
fig.show()