In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import hashlib
%matplotlib inline 

In [None]:
# The dataset is read straight from the CSV hosted in the workshop's GitHub repository.
# When running locally, the relative path '../data/covid19dataPortugal.csv' works as well.
data_path = 'https://raw.githubusercontent.com/vohcolab/PandaViz-Workshop/main/Pandas/Time%20Series/data/covid19dataPortugal.csv'

# Keep the file contents untouched under `data_raw`; the exercises work on copies of it.
data_raw = pd.read_csv(filepath_or_buffer=data_path)

# Quick look at the first five rows (5 is the default of `head`).
data_raw.head()

# Exercise 1 : Index and datetime

Convert the `date` column to datetime, set it as the index, and sort the index.

In [None]:
# We expect the solution to be a dataframe

# Work on a copy so that `data_raw` keeps the untouched file contents and this
# cell can be re-run safely.
data = data_raw.copy()

# Parse the 'date' strings into datetimes. `infer_datetime_format=True` is no
# longer passed: it is deprecated since pandas 2.0 (consistent format inference
# is the default there) and only triggers a warning.
data['date'] = pd.to_datetime(data['date'])

# Use the dates as index and sort them (best practice: slicing a time series by
# date relies on a sorted index).
data = data.set_index('date').sort_index()

In [None]:
# Show the first three rows of the date-indexed frame as a visual sanity check.
data.head(3)

In [None]:
# Grading checks for exercise 1. Answers are compared through their SHA-256
# digest so that the expected values are not readable in the notebook.
expected_hash = '8aed642bf5118b9d3c859bd4be35ecac75b6e873cce34e7b6f554b06f75550d7'
expected_dtype_hash = '261738f2e43a1c47a16f043b46deb993943d61f4a2bbe5ef4b03c3fb1af362b5'

# The first row after sorting must carry the expected number of deaths.
assert hashlib.sha256(str(data.iloc[0].deaths).encode()).hexdigest() == expected_hash, (
    "The 'deaths' value of the first row is not the expected one - "
    "is the index sorted by date?"
)

# The index must have the expected dtype.
assert hashlib.sha256(str(data.index.dtype).encode()).hexdigest() == expected_dtype_hash, (
    "The index does not have the expected dtype - "
    "did you convert the 'date' column to datetime before setting it as index?"
)

# Exercise 2: Working with timestamps

#### 2.1) Day with most deaths

In [None]:
# hint: the answer should be a timestamp

# `idxmax` returns the index label - here a date - of the row holding the
# largest value of the 'deaths' column.
worst_day = data['deaths'].idxmax()

In [None]:
# Display the timestamp stored in `worst_day`.
worst_day

In [None]:
# Grading check for exercise 2.1: the SHA-256 digest of the answer's string
# form is compared, so the expected value itself stays hidden.
expected_hash = '7c3185b857e1103d2e9aed349c3797c03510902dea53857bbb05e0ede17441d1'

assert hashlib.sha256(str(worst_day).encode()).hexdigest() == expected_hash, (
    "worst_day does not match the expected answer - "
    "it should be the timestamp of the day with the most deaths"
)

#### 2.2) Tuesday with most confirmed cases


The answer should be a timestamp

In [None]:
# Take the confirmed cases, keep only the rows whose date falls on a Tuesday,
# and return the date (index label) at which the remaining values peak.
worst_tuesday_cases = (
    data['confirmed_cases']
    .loc[data.index.day_name() == 'Tuesday']
    .idxmax()
)

In [None]:
# Display the timestamp stored in `worst_tuesday_cases`.
worst_tuesday_cases

In [None]:
# Grading check for exercise 2.2: the SHA-256 digest of the answer's string
# form is compared, so the expected value itself stays hidden.
expected_hash = 'ed0cfdeed24c0a3cde2783625c6af8e952f2be2989d74dfc0762d844d4727cd0'

assert hashlib.sha256(str(worst_tuesday_cases).encode()).hexdigest() == expected_hash, (
    "worst_tuesday_cases does not match the expected answer - "
    "it should be the timestamp of the Tuesday with the most confirmed cases"
)

# Exercise 3: Time series methods

#### 3.1) A new high since the summer started

It's the first week of October. Most students are back in class, and the summer vacation is officially over. A friend comes up to you and says: "This week has had the most deaths of any week since the 1st of July. I believe we need to start being more careful now."

To check whether your friend is right, find the deadliest week in the COVID data from the 1st of July up to and including the first week of October, and confirm whether it really is the first week of October.

The answer should be a timestamp

In [None]:
# For grading purposes, the first week of October is assumed to end on the 4th.
interval = slice('July 1st 2020', 'October 4th 2020')

weekly_high = (
    data.loc[interval, 'deaths']   # deaths inside the period of interest
    .resample('W')                 # regroup the rows into weekly bins
    .sum()                         # total deaths per week
    .idxmax()                      # label of the week with the largest total
)

In [None]:
# Display the timestamp stored in `weekly_high`.
weekly_high

In [None]:
# Grading check for exercise 3.1: the SHA-256 digest of the answer's string
# form is compared, so the expected value itself stays hidden.
expected_hash = '290519bb74966f3ebeab68bcb3b51550f6954f65918ee48c805aea3a8e4a4422'

assert hashlib.sha256(str(weekly_high).encode()).hexdigest() == expected_hash, (
    "weekly_high does not match the expected answer - it should be the timestamp "
    "labelling the deadliest week between July 1st and October 4th 2020"
)

#### 3.2) Is it getting better?

Summer is reaching its halfway point, and rumor has it that the number of new daily confirmed cases has been decreasing for a week now. Can you confirm that?

It's currently the 29th of August, and you are asked to return the day-to-day variation of confirmed cases over the past 7 days, excluding today. In other words, from the 22nd up to and including the 28th.

The answer should be a series with the dates in the index and the variations from the previous date in the values

_hint_: be wary of the order of operations that you are going to use if you don't want any NaN values in your final result


In [None]:
# Order matters: the differences are taken over the full history and only then
# sliced, so the value for the 22nd is computed against the previous row
# instead of being the NaN that `diff` leaves on the first row it sees.
pastweek = (
    data.drop(columns='Country')   # 'Country' is left out before differencing
    .diff()                        # change with respect to the previous row
    .loc['22nd August 2020':'28th August 2020', 'confirmed_cases']
)

In [None]:
# Display the series stored in `pastweek`.
pastweek

In [None]:
# Grading check for exercise 3.2: the SHA-256 digest of the printed series is
# compared, so index, values, name and dtype all have to match.
expected_hash = '5f646c309cc217ddbb46b4b46cf94b54a2dd9c44c77036b82ad36dd1ff3385be'

assert hashlib.sha256(str(pastweek).encode()).hexdigest() == expected_hash, (
    "pastweek does not match the expected answer - it should be the series of "
    "confirmed-case variations for 22-28 August 2020, without NaN values"
)

#### 3.3) Let's get the trend


It's currently the 20th of October and you want to understand the current trend in the number of deaths without being distracted by noise. You decide to use data from the beginning of October up to today (excluding today, because today's data hasn't arrived yet).

Use a window of 7 days to compute the trend of the number of deaths in October. Your final result must not contain any NaN values.

You should present the results in a pd.Series with time in index and death trend in values

In [None]:
# Select the 'deaths' column *before* rolling. Rolling over the whole frame
# would also aggregate the other columns; 'Country' is presumably text, and on
# pandas >= 2.0 a non-numeric column makes `rolling(...).mean()` raise a
# DataError instead of being silently dropped.
#
# The 7-row window is rolled over the full history and sliced afterwards, so
# October 1st already has its preceding rows available and no NaN shows up.
trend_deaths_october_so_far = (
    data['deaths']
    .rolling(7)
    .mean()
    .loc['October 1st 2020':'October 19th 2020']
)

In [None]:
# Display the series stored in `trend_deaths_october_so_far`.
trend_deaths_october_so_far

In [None]:
# Grading check for exercise 3.3: the SHA-256 digest of the printed series is
# compared, so index, values, name and dtype all have to match.
expected_hash = 'e4a1f1a7670518a033b529883223f1549d1b2b127e718fba5283392a39daa9d0'

assert hashlib.sha256(str(trend_deaths_october_so_far).encode()).hexdigest() == expected_hash, (
    "trend_deaths_october_so_far does not match the expected answer - it should be "
    "the 7-day rolling mean of deaths for 1-19 October 2020, without NaN values"
)

Would you like to plot it? (Not graded)

In [None]:
# Give the figure a title and axis labels so it can be read on its own.
# The trailing ';' keeps the notebook from echoing the Axes repr.
trend_deaths_october_so_far.plot(
    title='Deaths: 7-day rolling mean, 1-19 October 2020',
    xlabel='date',
    ylabel='deaths (7-day rolling mean)',
);