<a href="https://colab.research.google.com/github/yallapragada/covid19/blob/master/uy_covid19.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Analysis of the daily report and the time-series (timeline) report from the Johns Hopkins COVID-19 datasets.

In [0]:
import pandas as pd
from datetime import datetime, timedelta
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pytz

In [0]:
# Show floating-point values with 3 decimal places in pandas output
pd.options.display.precision = 3

In [0]:
# use matplotlib backend for plotting
%matplotlib inline

#use classic style in matplotlib
plt.style.use('classic')

#set seaborn style
sns.set_style('white', {'legend.frameon':True})

# USA - Daily Report

The daily report provides the total number of cases and the total number of deaths for each state.

In [0]:
# Read the US/Eastern clock once and derive both labels from it, so `today` and
# `yesterday` are always exactly one day apart (two separate clock reads could
# straddle midnight). Both are formatted as MM-DD-YYYY.
now_eastern = datetime.now(pytz.timezone('US/Eastern'))
today = now_eastern.strftime('%m-%d-%Y')
yesterday = (now_eastern - timedelta(days=1)).strftime('%m-%d-%Y')

In [0]:
# GitHub folder holding the raw Johns Hopkins US daily reports;
# the report fetched below is the CSV named after yesterday's date
base_url_daily_report = (
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
    'csse_covid_19_data/csse_covid_19_daily_reports_us/'
)
url_daily_report = f'{base_url_daily_report}{yesterday}.csv'

In [0]:
# download the daily report and normalise its column names in one pass:
# any '/' in a column name becomes '_'
covid_daily_report = pd.read_csv(url_daily_report).rename(
    columns=lambda column_name: column_name.replace('/', '_')
)

In [0]:
# plot deaths by state (one point per Province_State row of the daily report)

plt.figure(figsize=(20, 7))
plt.xticks(
    rotation=90,
    horizontalalignment='right',
    fontweight='light'
)
plt.box(False)
plt.title(f'{today} - covid deaths')
ax = sns.lineplot(x='Province_State', y='Deaths', data=covid_daily_report, color='coral')
ax.set(xlabel='state', ylabel='deaths')
# The on/off flag is passed positionally: the `b=` keyword was renamed to
# `visible=` in matplotlib 3.5 and is rejected by later releases, while the
# positional form works on both old and new versions.
ax.grid(True, which='major', axis='both', color='green')
plt.show()

In [0]:
# plot confirmed cases by state (one point per Province_State row of the daily report)

plt.figure(figsize=(20, 7))
plt.xticks(
    rotation=90,
    horizontalalignment='right',
    fontweight='light'
)
plt.box(False)
plt.title(f'{today} - covid cases')
ax = sns.lineplot(x='Province_State', y='Confirmed', data=covid_daily_report, color='coral')
ax.set(xlabel='state', ylabel='cases')
# The on/off flag is passed positionally: the `b=` keyword was renamed to
# `visible=` in matplotlib 3.5 and is rejected by later releases, while the
# positional form works on both old and new versions.
ax.grid(True, which='major', axis='both', color='green')
plt.show()

# USA - Timeline Report

#### Macro picture of timeline data

In [0]:
# raw GitHub location of the Johns Hopkins US time-series CSV of confirmed cases
time_series_url = (
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
    'csse_covid_19_data/csse_covid_19_time_series/'
    'time_series_covid19_confirmed_US.csv'
)

In [0]:
# download the time-series data and normalise its column names in one pass:
# any '/' in a column name (this includes the date columns) becomes '_'
covid_time_series_df = pd.read_csv(time_series_url).rename(
    columns=lambda column_name: column_name.replace('/', '_')
)

In [0]:
# reshape from wide to long: every column that is not a location identifier
# is treated as a date column and becomes one (Date, Cases) row per location

location_columns = [
    'UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2',
    'Province_State', 'Country_Region', 'Lat', 'Long_', 'Combined_Key',
]
covid_time_series_df_melted = pd.melt(
    covid_time_series_df,
    id_vars=location_columns,
    var_name='Date',
    value_name='Cases',
)

In [0]:
# Build one row per date with the total number of cases over all locations.
# Only `Cases` is summed: the remaining columns are identifiers, coordinates or
# text, and summing text columns fails on recent pandas versions.
dates_df = covid_time_series_df_melted.groupby('Date', as_index=False)['Cases'].sum()

# The date labels are strings such as '3_9_20'. Parse them into real timestamps
# so that sorting and filtering are chronological; as strings, '10_1_20' would
# sort before '1_22_20' and a comparison like `> "3-10-2020"` would only look
# at the first character.
dates_df['Date'] = pd.to_datetime(dates_df['Date'], format='%m_%d_%y')

# sort by date (must happen before diff() so each row is compared with the previous day)
dates_df = dates_df.sort_values(by='Date').reset_index(drop=True)

# add a column for new cases: the change in `Cases` from the previous date
dates_df['New_Cases'] = dates_df['Cases'].diff()

# not much happened before march 10th in the US, so keep only later dates
dates_df = dates_df[dates_df['Date'] > datetime(2020, 3, 10)]

In [0]:
# plot total cases (top panel) and new cases (bottom panel) in the US over time

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 16), sharex=True)

# (axes, column to plot, title, y-axis label) for each panel
panels = (
    (ax1, 'Cases', f'{today} - total covid cases in US', 'cases'),
    (ax2, 'New_Cases', f'{today} - new covid cases in US', 'new cases'),
)

for panel_ax, column, title, ylabel in panels:
    panel_ax.xaxis.set_tick_params(labelrotation=90)
    panel_ax.set_title(title)
    # x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
    sns.lineplot(x='Date', y=column, data=dates_df, color='coral', ax=panel_ax)
    panel_ax.set(xlabel='date', ylabel=ylabel)
    # on/off flag passed positionally: the `b=` keyword was renamed to `visible=`
    # in matplotlib 3.5 and is rejected by later releases
    panel_ax.grid(True, which='major', axis='both', color='green')
    # remove the frame on every panel; plt.box(False) would only affect the
    # current (last created) axes
    panel_ax.set_frame_on(False)

plt.show()

In [0]:
# Group by state and total every numeric column, giving one row per state with
# one column per date. numeric_only=True skips the text columns explicitly;
# recent pandas versions no longer drop them silently and would fail on them.
# NOTE: the numeric identifier/coordinate columns (UID, code3, FIPS, Lat, Long_)
# are summed as well; those sums carry no meaning but the columns are kept
# because a later cell names them when melting this frame.

covid_time_series_by_state_df = covid_time_series_df.groupby('Province_State').sum(numeric_only=True)

#### States with max relative change yesterday

In [0]:
# For every state, compare the increase in cases on the latest day with the
# average daily increase over the three days before it, as a percentage.
# The result has one row per state with columns `Province_State` and `ratio`.

# Take the date columns from the data itself instead of rebuilding their labels
# from the wall clock. This avoids a KeyError when the newest column has not
# been published yet, avoids the platform-specific '%-m'/'%-d' format codes and
# does not depend on the kernel's timezone. Labels that do not parse as
# month_day_year (the identifier columns) become NaT and are dropped.
column_dates = pd.Series(
    pd.to_datetime(
        covid_time_series_by_state_df.columns.astype(str),
        format='%m_%d_%y',
        errors='coerce',
    ),
    index=covid_time_series_by_state_df.columns,
).dropna().sort_values()

if len(column_dates) < 5:
    raise ValueError('need at least 5 daily columns to compare the last day with the previous 3 days')

# labels of the five most recent days, oldest first
days = list(column_dates.index[-5:])
day_minus_5, day_minus_4, day_minus_3, day_minus_2, day_minus_1 = days

print(f'latest day in data: {column_dates.iloc[-1]:%Y-%m-%d} (column {day_minus_1})')

last_day_increase = covid_time_series_by_state_df[day_minus_1] - covid_time_series_by_state_df[day_minus_2]

# The three previous daily increases (d2-d3) + (d3-d4) + (d4-d5) telescope to d2 - d5.
prev_3_day_average = (covid_time_series_by_state_df[day_minus_2] - covid_time_series_by_state_df[day_minus_5]) / 3

# A state whose previous average is 0 produces NaN (0/0) or +/-inf (x/0);
# both are reported as 0 so they neither top the ranking nor break the plot.
last_day_increase_relative_percent = (
    (last_day_increase / prev_3_day_average * 100)
    .replace([float('inf'), float('-inf')], float('nan'))
    .fillna(0.0)
    .rename('ratio')
    .reset_index()
)

In [0]:
# rank states from largest to smallest relative change, then show the first 10

last_day_increase_relative_percent = last_day_increase_relative_percent.sort_values(
    by=['ratio'],
    ascending=False,
)
top_10_states = last_day_increase_relative_percent.head(10)
print(top_10_states)

In [0]:
# plot the relative change (`ratio`) for every state

plt.figure(figsize=(20, 7))
plt.xticks(
    rotation=90,
    horizontalalignment='right',
    fontweight='light'
)
plt.box(False)
plt.title(f'{today} : relative increase')
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
ax = sns.lineplot(x='Province_State', y='ratio', data=last_day_increase_relative_percent, color='coral')
ax.set(xlabel='state', ylabel='relative increase')
# on/off flag passed positionally: the `b=` keyword was renamed to `visible=`
# in matplotlib 3.5 and is rejected by later releases
ax.grid(True, which='major', axis='both', color='green')
plt.show()

#### State level analysis of covid data

In [0]:
# Melt the per-state frame so that all the date columns become a single `Date`
# column, giving one (state, date) row with `Cases` and `New_Cases`.

# reset_index() is applied to a local copy rather than written back to
# covid_time_series_by_state_df, so re-running this cell (or the cells above
# that index that frame by state) keeps working.
state_totals = covid_time_series_by_state_df.reset_index()
states_df = state_totals.melt(
    id_vars=['Province_State', 'UID', 'code3', 'FIPS', 'Lat', 'Long_'],
    var_name='Date',
    value_name='Cases',
)

# Parse the 'm_d_yy' labels into real timestamps (helper column `Day`) so the
# filter and the sort below are chronological; compared as strings, '10_01_20'
# would sort before '4_01_20'.
states_df = (
    states_df
    .assign(Day=pd.to_datetime(states_df['Date'], format='%m_%d_%y'))
    .loc[lambda frame: frame['Day'] > datetime(2020, 3, 10)]   # keep dates after march 10th
    .sort_values(by=['Province_State', 'Day'])
)

states_df = states_df.assign(
    # difference within each state, so the first row of a state is not compared
    # with the last row of the previous state
    New_Cases=states_df.groupby('Province_State')['Cases'].diff(),
    # same label format as before (month unpadded, day always 2 characters,
    # e.g. '4_01_20'), built without the platform-specific '%-m' code
    Date=states_df['Day'].map(lambda day: f'{day.month}_{day.day:02d}_{day:%y}'),
)

# drop the helper column; the first date of every state has no previous day, so its New_Cases becomes 0
states_df = states_df.drop(columns='Day').fillna(0)

In [0]:
# Select the rows to plot: a fixed set of states, and the 5 states with the
# largest relative change, both restricted to dates after April 1st 2020.

states_of_interest1 = ['Virginia', 'Maryland', 'District of Columbia']

# Rank by `ratio` here instead of relying on an earlier cell having sorted the
# frame in place; the stable sort keeps the existing order among ties.
states_of_interest2 = (
    last_day_increase_relative_percent
    .sort_values(by='ratio', ascending=False, kind='mergesort')
    .Province_State
    .head(5)
    .tolist()
)

# Compare real dates rather than strings: as text, a label such as '10_01_20'
# is "smaller" than '4_01_20' and would be dropped.
is_after_april_1 = pd.to_datetime(states_df.Date, format='%m_%d_%y') > datetime(2020, 4, 1)
recent_states_df = states_df[is_after_april_1]

states_of_interest_df1 = recent_states_df[recent_states_df.Province_State.isin(states_of_interest1)]
states_of_interest_df2 = recent_states_df[recent_states_df.Province_State.isin(states_of_interest2)]

##### DC, Virginia and Maryland

In [0]:
# new cases over time, one line per state in states_of_interest_df1

plt.figure(figsize=(20, 7))
plt.xticks(
    rotation=90,
    horizontalalignment='right',
    fontweight='light'
)
plt.box(False)
plt.title('new covid cases')
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
# No `color=` here: with `hue` set, each line takes its colour from the hue palette.
ax = sns.lineplot(x='Date', y='New_Cases', hue='Province_State', data=states_of_interest_df1,
                  legend='brief', linewidth=4.0)
# the x axis shows dates, not states
ax.set(xlabel='date', ylabel='new cases')
# on/off flag passed positionally: the `b=` keyword was renamed to `visible=`
# in matplotlib 3.5 and is rejected by later releases
ax.grid(True, which='major', axis='both', color='green')

# build the legend once (upper left, no frame), then thicken the sample lines
# inside the legend; the plotted lines keep the linewidth requested above
leg = ax.legend(loc=2, frameon=False)
for legend_line in leg.get_lines():
    legend_line.set_linewidth(6)

plt.show()

##### Top 5 states with highest relative change yesterday

In [0]:
# new cases over time, one line per state in states_of_interest_df2

plt.figure(figsize=(20, 7))
plt.xticks(
    rotation=90,
    horizontalalignment='right',
    fontweight='light'
)
plt.box(False)
plt.title('new covid cases')
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
# No `color=` here: with `hue` set, each line takes its colour from the hue palette.
ax = sns.lineplot(x='Date', y='New_Cases', hue='Province_State', data=states_of_interest_df2,
                  legend='brief', linewidth=4.0)
# the x axis shows dates, not states
ax.set(xlabel='date', ylabel='new cases')
# on/off flag passed positionally: the `b=` keyword was renamed to `visible=`
# in matplotlib 3.5 and is rejected by later releases
ax.grid(True, which='major', axis='both', color='green')

# build the legend once (upper left, no frame), then thicken the sample lines
# inside the legend; the plotted lines keep the linewidth requested above
leg = ax.legend(loc=2, frameon=False)
for legend_line in leg.get_lines():
    legend_line.set_linewidth(6)

plt.show()