# Objectives
1. Observe progression of covid-19 by continent and country
2. Investigate whether current level of vaccination is slowing down the spread or not
3. Determine where the vaccine is needed the most by looking at mortality rate

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime as dt
from matplotlib import pyplot as plt
from collections import Counter, OrderedDict
import pycountry_convert as pc

In [2]:
covid = pd.read_csv(r"C:\Users\taewoo\PycharmProjects\data_professionals\Projects\Datasets\Covid-19 Activity.csv")
covid.info()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1687420 entries, 0 to 1687419
Data columns (total 13 columns):
 #   Column                           Non-Null Count    Dtype  
---  ------                           --------------    -----  
 0   PEOPLE_POSITIVE_CASES_COUNT      1687420 non-null  int64  
 1   COUNTY_NAME                      1558424 non-null  object 
 2   PROVINCE_STATE_NAME              1585556 non-null  object 
 3   REPORT_DATE                      1687420 non-null  object 
 4   CONTINENT_NAME                   1685516 non-null  object 
 5   DATA_SOURCE_NAME                 1687420 non-null  object 
 6   PEOPLE_DEATH_NEW_COUNT           1687420 non-null  int64  
 7   COUNTY_FIPS_NUMBER               1531768 non-null  float64
 8   COUNTRY_ALPHA_3_CODE             1685516 non-null  object 
 9   COUNTRY_SHORT_NAME               1687420 non-null  object 
 10  COUNTRY_ALPHA_2_CODE             1685040 non-null  object 
 11  PEOPLE_POSITIVE_NEW_CASES_COUNT  1687420 non-null 

In [3]:
covid.head(10)

Unnamed: 0,PEOPLE_POSITIVE_CASES_COUNT,COUNTY_NAME,PROVINCE_STATE_NAME,REPORT_DATE,CONTINENT_NAME,DATA_SOURCE_NAME,PEOPLE_DEATH_NEW_COUNT,COUNTY_FIPS_NUMBER,COUNTRY_ALPHA_3_CODE,COUNTRY_SHORT_NAME,COUNTRY_ALPHA_2_CODE,PEOPLE_POSITIVE_NEW_CASES_COUNT,PEOPLE_DEATH_COUNT
0,18046,Kenosha,Wisconsin,2021-04-26,America,New York Times,0,55059.0,USA,United States,US,3,321
1,18096,Kenosha,Wisconsin,2021-04-27,America,New York Times,0,55059.0,USA,United States,US,50,321
2,18123,Kenosha,Wisconsin,2021-04-28,America,New York Times,1,55059.0,USA,United States,US,27,322
3,18148,Kenosha,Wisconsin,2021-04-29,America,New York Times,0,55059.0,USA,United States,US,25,322
4,18163,Kenosha,Wisconsin,2021-04-30,America,New York Times,0,55059.0,USA,United States,US,15,322
5,18188,Kenosha,Wisconsin,2021-05-01,America,New York Times,0,55059.0,USA,United States,US,25,322
6,18212,Kenosha,Wisconsin,2021-05-02,America,New York Times,0,55059.0,USA,United States,US,24,322
7,18212,Kenosha,Wisconsin,2021-05-03,America,New York Times,0,55059.0,USA,United States,US,0,322
8,18239,Kenosha,Wisconsin,2021-05-04,America,New York Times,1,55059.0,USA,United States,US,27,323
9,18251,Kenosha,Wisconsin,2021-05-05,America,New York Times,0,55059.0,USA,United States,US,12,323


### Data Cleansing

In [4]:
# dropping columns that aren't too useful for this analysis
covid.drop(columns=['COUNTY_FIPS_NUMBER', 'DATA_SOURCE_NAME', 'COUNTRY_ALPHA_2_CODE', 'COUNTRY_ALPHA_3_CODE'], inplace=True)
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1687420 entries, 0 to 1687419
Data columns (total 9 columns):
 #   Column                           Non-Null Count    Dtype 
---  ------                           --------------    ----- 
 0   PEOPLE_POSITIVE_CASES_COUNT      1687420 non-null  int64 
 1   COUNTY_NAME                      1558424 non-null  object
 2   PROVINCE_STATE_NAME              1585556 non-null  object
 3   REPORT_DATE                      1687420 non-null  object
 4   CONTINENT_NAME                   1685516 non-null  object
 5   PEOPLE_DEATH_NEW_COUNT           1687420 non-null  int64 
 6   COUNTRY_SHORT_NAME               1687420 non-null  object
 7   PEOPLE_POSITIVE_NEW_CASES_COUNT  1687420 non-null  int64 
 8   PEOPLE_DEATH_COUNT               1687420 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 115.9+ MB


In [5]:
covid.isnull().sum()

PEOPLE_POSITIVE_CASES_COUNT             0
COUNTY_NAME                        128996
PROVINCE_STATE_NAME                101864
REPORT_DATE                             0
CONTINENT_NAME                       1904
PEOPLE_DEATH_NEW_COUNT                  0
COUNTRY_SHORT_NAME                      0
PEOPLE_POSITIVE_NEW_CASES_COUNT         0
PEOPLE_DEATH_COUNT                      0
dtype: int64

In [6]:
def country_to_continent(country_name):
    '''
    return continent of a country
    
        Param:
            country_name (str) = name of a country
        Return:
            continent_name (str) = name of a continent
    '''
    country_alpha2 = pc.country_name_to_country_alpha2(country_name)
    country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
    continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
    return continent_name

# drop duplicates
covid.drop_duplicates(inplace=True)
# extract countries with nontype continent
country_of_nan_continent = covid[covid['CONTINENT_NAME'].isnull()]['COUNTRY_SHORT_NAME'].unique() 
# fill continents
for country in country_of_nan_continent:
    country_index = covid.loc[covid['COUNTRY_SHORT_NAME'] == country].index
    covid.loc[country_index, 'CONTINENT_NAME'] = country_to_continent(country)

# It's possible to find province if county can be found, not possible vice versa
print('county without null, province with null')
print(covid.loc[covid['COUNTY_NAME'].notnull() & covid['PROVINCE_STATE_NAME'].isnull()].shape[0])
print('county with null, province with null')
print(covid.loc[covid['COUNTY_NAME'].isnull() & covid['PROVINCE_STATE_NAME'].isnull()].shape[0])
print('county with null, province without null')
print(covid.loc[covid['COUNTY_NAME'].isnull() & covid['PROVINCE_STATE_NAME'].notnull()].shape[0])
print()

# all county are unknown, so fill nan with 'Unknown'
covid.fillna('Unknown', inplace=True)
print(covid.isnull().sum())

county without null, province with null
0
county with null, province with null
101864
county with null, province without null
27132

PEOPLE_POSITIVE_CASES_COUNT        0
COUNTY_NAME                        0
PROVINCE_STATE_NAME                0
REPORT_DATE                        0
CONTINENT_NAME                     0
PEOPLE_DEATH_NEW_COUNT             0
COUNTRY_SHORT_NAME                 0
PEOPLE_POSITIVE_NEW_CASES_COUNT    0
PEOPLE_DEATH_COUNT                 0
dtype: int64


In [8]:
def provincial_case(df, country):
    '''
    Args:
        df: DataFrame
        country: str
    Returns:
        a list of *confirmed* cases of Covid-19 for each province in the country
    Order:
        Descending
    '''
    country_table = df.loc[df['COUNTRY_SHORT_NAME'] == country]
    province_list = country_table['PROVINCE_STATE_NAME'].unique()
    province_dict = dict()
    for province in province_list:
        province_dict[province] = country_table.loc[country_table['PROVINCE_STATE_NAME'] == province]['PEOPLE_POSITIVE_CASES_COUNT'].max()
    return sorted(province_dict.items(), key=lambda x: x[1], reverse=True)


def country_case(df, country):
    '''
    Args:
        df: DataFrame
        country: str
    Returns:
        a sum of *confirmed* cases of Covid-19 in a country
    '''
    province_list = df.loc[df['COUNTRY_SHORT_NAME'] == country]['PROVINCE_STATE_NAME'].unique()
    total = 0
    for province in province_list:
        total += df.loc[df['PROVINCE_STATE_NAME'] == province]['PEOPLE_POSITIVE_NEW_CASES_COUNT'].sum()
    return (country, total)

### 10 countries with the most positive cases of Covid-19

In [9]:
every_country = covid['COUNTRY_SHORT_NAME'].unique()
country_list = []
for country in every_country:
    country_list.append(country_case(covid, country))
country_list = sorted(country_list, key=lambda x:x[1], reverse=True)

for i in range(10):
    print('Top {} {}'.format(i+1, country_list[i]))

Top 1 ('China', 124873898)
Top 2 ('Brunei', 124771255)
Top 3 ('Czechia', 124771255)
Top 4 ('Russia', 124771255)
Top 5 ('Egypt', 124771255)
Top 6 ('South Sudan', 124771255)
Top 7 ('Qatar', 124771255)
Top 8 ('Belarus', 124771255)
Top 9 ('Uzbekistan', 124771255)
Top 10 ('Malta', 124771255)


### Debugging
Every country got the value of **124771255**

Let's examine what's happening.

In [11]:
def country_case(df, country):
    province_list = df.loc[df['COUNTRY_SHORT_NAME'] == country]['PROVINCE_STATE_NAME'].unique()
    return (province_list)

# checking if there's anything in the province column that's causing the problem
print(country_case(covid, 'Uruguay'))
print(country_case(covid, 'Estonia'))
print(country_case(covid, 'South Sudan'))

['Unknown']
['Unknown']
['Unknown']


Found the problem!

The code is summing up cases of all 'Unknown' provinces from the world.

To limit the search to within a country, I need to use a temporary dataframe.

In [12]:
def country_case(df, country):
    '''
    Args:
        df: DataFrame
        country: str
    Returns:
        a sum of *confirmed* cases of Covid-19 in a country
    '''
    country_df = df.loc[df['COUNTRY_SHORT_NAME'] == country]
    province_list = country_df['PROVINCE_STATE_NAME'].unique()
    total = 0
    for province in province_list:
        total += country_df.loc[country_df['PROVINCE_STATE_NAME'] == province]['PEOPLE_POSITIVE_NEW_CASES_COUNT'].sum()
    return (country, total)

country_list = []
for country in every_country:
    country_list.append(country_case(covid, country))
country_list = sorted(country_list, key=lambda x:x[1], reverse=True)
for i in range(10):
    print('Top {} {}'.format(i+1, country_list[i]))

Top 1 ('United States', 32772514)
Top 2 ('India', 22992517)
Top 3 ('Brazil', 15209990)
Top 4 ('France', 5730585)
Top 5 ('Turkey', 5044936)
Top 6 ('Russia', 4832959)
Top 7 ('United Kingdom', 4437217)
Top 8 ('Italy', 4116287)
Top 9 ('Spain', 3581392)
Top 10 ('Germany', 3538208)


Now I got the result that's more probable.

Debugging is done.

Since the *U.S.* is the country with **the most cases**, let's take a look at the continent: America.

Let's sort countries from the most cases to the least.

In [None]:
america = covid.loc[covid['CONTINENT_NAME'] == 'America']['COUNTRY_SHORT_NAME'].unique()
america_list = []
for country in america:
    america_list.append(country_case(covid, country))
print(sorted(america_list, key=lambda x:x[1], reverse=True))

Positive Cases of Covid-19 in all states of the United States

In [None]:
print(provincial_case(covid, 'United States'))

### Investigating the progression of COVID-19 from Jan - July 2020 by continent
**Goals**
* Group data by continents and time
* Use month as the time measurement unit
* Create a newly reported cases graph

In [None]:
covid['YEAR'] = pd.to_datetime(covid['REPORT_DATE']).dt.year
covid['MONTH'] = pd.to_datetime(covid['REPORT_DATE']).dt.month
continent_new_case = covid[['CONTINENT_NAME', 'PEOPLE_POSITIVE_NEW_CASES_COUNT','YEAR', 'MONTH']].groupby(
                       ['CONTINENT_NAME','YEAR', 'MONTH'], as_index=False).sum()
print(continent_new_case)

To display all this information on the same graph, let's drop Asia's Dec of 2019 and add Africa's January of 2020.

Then, I can start making graphs

### Positive Cases of each Continent

In [None]:
continent_new_case.drop(index=13, inplace=True)
continent_new_case = continent_new_case.append({'CONTINENT_NAME':'Africa', 'YEAR':2020, 'MONTH':1, 'PEOPLE_POSITIVE_NEW_CASES_COUNT':0}, ignore_index=True)
continent_new_case.sort_values(by=['CONTINENT_NAME', 'MONTH'], inplace=True)
continent_new_case.reset_index(drop=True, inplace=True)

In [None]:
print(continent_new_case[['CONTINENT_NAME', 'MONTH', 'PEOPLE_POSITIVE_NEW_CASES_COUNT']])

In [None]:
continent_list = continent_new_case['CONTINENT_NAME'].unique()
color_list = ['tab:purple', 'tab:red', 'tab:blue', 'tab:green', 'tab:orange']
plt.style.use('bmh')
plt.rcParams['figure.figsize'] = [18, 10]

# use for loop to create a line graph for each continent
for continent, color in zip(continent_list, color_list):
    plt.plot('MONTH', 'PEOPLE_POSITIVE_NEW_CASES_COUNT', data=continent_new_case.loc[continent_new_case['CONTINENT_NAME']==continent], c=color, label=continent, marker='o', linewidth=3)

# create a list of sum of cases for each continent in July (last month)
max_continent = []
for continent in continent_list:
    max_continent.append(continent_new_case.loc[(continent_new_case['CONTINENT_NAME'] == continent) & 
                                                (continent_new_case['MONTH'] == 7)]['PEOPLE_POSITIVE_NEW_CASES_COUNT'].sum())

# create and position labels
for y in max_continent:
    if y >= 1000000:
        plt.text(7.05, y, str(round(y/1000000, 2)) +'M')
    else:
        plt.text(7.05, y, str(round(y/1000)) +'K')
plt.legend()
plt.ylim(-100000, 3000000)
plt.xlim(1, 7.3)
plt.xlabel('Month')
plt.title('Newly Reported Cases of Covid-19 for each Continent')
plt.ylabel('Number of Infectee (M)')
plt.show()

To be clear each plot **doesn't** represent the sum of previous cases + new cases. It's a quantity of newly reported cases.

* Initially, Asia experienced the fastest growth rate as Wuhan is the place of origin of Covid-19
* The number in Europe spiked up in between February and April then experienced a consistent decline in growth rate
* America started slowly, then has been experiencing the fastest growth rate (it's on-going)
* Growth rates in both Africa and Oceania have been steadily and very slowly increasing

In [None]:
# create a list of total of new cases for each time period
aggregate_cases = []
total = 0
for continent in continent_list:
    for item, new_case in zip(continent_new_case['CONTINENT_NAME'], continent_new_case['PEOPLE_POSITIVE_NEW_CASES_COUNT']):
        if continent == item:
            total += new_case
            aggregate_cases.append(total)
        else:
            total = 0

# add it as a new column
continent_new_case['AGGREGATE_CASES'] = aggregate_cases

In [None]:
for continent, color in zip(continent_list, color_list):
    plt.plot('MONTH', 'AGGREGATE_CASES', data=continent_new_case.loc[continent_new_case['CONTINENT_NAME']==continent], c=color, label=continent, marker='o', linewidth=3)

max_continent = []
for continent in continent_list:
    max_continent.append(continent_new_case.loc[(continent_new_case['CONTINENT_NAME'] == continent) & 
                                                (continent_new_case['MONTH'] == 7)]['AGGREGATE_CASES'].sum())
for y in max_continent:
    if y >= 1000000:
        plt.text(7.05, y, str(round(y/1000000, 2)) +'M')
    else:
        plt.text(7.05, y, str(round(y/1000)) +'K')
plt.legend()
plt.xlabel('Month')
plt.xlim(1, 7.3)
plt.title('Sum of all cases of Covid-19 for each Continent')
plt.ylabel('Number of Infectee (M)')
plt.show()

* America is by far the *most* dangerous continent to travel in the current state - **fastest growth rate** since March
* Asia and Africa are experiencing a steady increase
* Europe and Oceania are managing to flatten the curves

### Counting Deaths of each Continent

In [None]:
continent_death = covid[['CONTINENT_NAME', 'PEOPLE_DEATH_NEW_COUNT', 'YEAR', 'MONTH']].groupby(
                       ['CONTINENT_NAME','YEAR', 'MONTH'], as_index=False).sum()
continent_death = continent_death.append({'CONTINENT_NAME':'Africa', 'YEAR':2020, 'MONTH':1, 'PEOPLE_DEATH_NEW_COUNT':0}, ignore_index=True)
continent_death.drop(index=13, inplace=True)
continent_death.sort_values(by=['CONTINENT_NAME', 'MONTH'], inplace=True)
continent_death.reset_index(drop=True, inplace=True)

DEATH_AGGREGATE = []
death_count = 0
for continent in continent_list:
    for item, death in zip(continent_death['CONTINENT_NAME'], continent_death['PEOPLE_DEATH_NEW_COUNT']):
        if continent == item:
            death_count += death
            DEATH_AGGREGATE.append(death_count)
        else:
            death_count = 0
continent_death['DEATH_AGGREGATE'] = DEATH_AGGREGATE
print(continent_death)

In [None]:
for continent, color in zip(continent_list, color_list):
    plt.plot('MONTH', 'DEATH_AGGREGATE', data=continent_death.loc[continent_death['CONTINENT_NAME']==continent], c=color, label=continent, marker='o', linewidth=3)

max_death = []
for continent in continent_list:
    max_death.append(continent_death.loc[(continent_death['CONTINENT_NAME'] == continent) & 
                                                (continent_death['MONTH'] == 7)]['DEATH_AGGREGATE'].sum())

for y in max_death:
    if y >= 1000:
        plt.text(7.05, y, str(round(y/1000)) +'K')
    else:
        plt.text(7.05, y, str(y))
plt.legend()
plt.xlim(1, 7.3)
plt.xlabel('Month')
plt.title('Death Toll for each Continent')
yticks = [50000, 100000, 150000, 200000, 250000, 300000, 350000]
ylabels = ['50K', '100K', '150K', '200K', '250K', '300K', '350K']
plt.yticks(ticks=yticks, labels=ylabels)
plt.ylabel('Fatality')
plt.show()

* As observed from previous graphs, the decline of Covid-19's positive cases in Europe has resulted in gentler slopes after April
* The extreme growth rate of positive cases in America is reflected by the steep slope on this graph
* Although the total of positive cases in Asia is greater than that of Europe by 80,000, death toll is much lower (it could be the consequence of how well the countries handled the situation)

### Mortality Rate

In [None]:
def death_rate(df, continent):
    '''
    Args:
        df: DataFrame
        header: str or a list of str, a header of a column 
    Return:
         death rate of a continent or an average death rate of continents
    '''
    # if calculating death rate of a continent, find the lastest updates of death and infected population
    if type(continent) == str:
        temp = df.loc[df['CONTINENT_NAME'] == continent]
        return '{}%'.format(round(temp['PEOPLE_DEATH_COUNT'].max()/temp['PEOPLE_POSITIVE_CASES_COUNT'].max()*100, 2))
    # if calculating death rate of continents, find each continent's death rate and find the average
    else:
        death = []
        population = []
        total = 0
        for item in continent:
            death.append(df.loc[df['CONTINENT_NAME'] == item]['PEOPLE_DEATH_COUNT'].max())
            population.append(df.loc[df['CONTINENT_NAME'] == item]['PEOPLE_POSITIVE_CASES_COUNT'].max())
        for d, p in zip(death, population):
            total += d/p*100
        return '{}%'.format(round(total/len(death), 2))

print(death_rate(covid,'Africa'))
print(death_rate(covid,'America'))
print(death_rate(covid,['Africa', 'America']))

In [None]:
for continent in continent_list:
    print(f"Mortality rate of Covid-19 in {continent}: {death_rate(covid, continent)}")
print(f'===============\nAverage mortality rate of Covid-19: {death_rate(covid, continent_list)}')

*Europe* has recorded the highest mortality rate of **5.77%** while *Oceania* has managed the lowest of **0.99%**.

Overall, the average is **2.88%**, which is considered **not fatal**.

In [None]:
total_all = covid[['MONTH', 'PEOPLE_POSITIVE_NEW_CASES_COUNT', 'PEOPLE_DEATH_NEW_COUNT']].groupby(['MONTH']).sum()
total_all.drop(12, inplace=True)
total_positive_list = []
total_death_list = []
count_positive = 0
count_death = 0
for positive, death in zip(total_all['PEOPLE_POSITIVE_NEW_CASES_COUNT'], total_all['PEOPLE_DEATH_NEW_COUNT']):
    count_positive += positive
    count_death += death
    total_positive_list.append(count_positive)
    total_death_list.append(count_death)
total_all['TOTAL_POSITIVE_CASE'] = total_positive_list
total_all['TOTAL_DEATH'] = total_death_list
print(total_all)

In [None]:
total_all['TOTAL_POSITIVE_CASE'].plot(linewidth=3, marker='o')
plt.title('All Positive Cases of Covid-19')
#setting up y-axis and x-axis of all positive cases
plt.xlim(1, 7.3)
plt.ylim(-100000, 16000000)
yticks = [0, 2000000, 4000000, 6000000, 8000000, 10000000, 12000000, 14000000, 16000000]
ylabels = ['0', '2M', '4M', '6M', '8M', '10M', '12M', '14M', '16M']
plt.yticks(ticks=yticks, labels=ylabels)
y = total_all['TOTAL_POSITIVE_CASE'].max()
plt.text(6.85, y+200000, str(y))
plt.show()

total_all['TOTAL_DEATH'].plot(linewidth=3, marker='o')
plt.title('International Death Toll')
#setting up y-axis and x-axis of death toll
plt.xlim(1, 7.3)
plt.ylim(-10000, 700000)
yticks = [0, 100000, 200000, 300000, 400000, 500000, 600000, 700000]
ylabels = ['0', '100K', '200K', '300K', '400K', '500K', '600K', '700k']
plt.yticks(ticks=yticks, labels=ylabels)
y = total_all['TOTAL_DEATH'].max()
plt.text(6.9, y+10000, str(y))
plt.show()

In [None]:
covid['REPORT_DATE'] = pd.to_datetime(covid['REPORT_DATE'])
covid.sort_values(by='REPORT_DATE', inplace=True)

### Plotting every point

In [None]:
covid_daily = covid.groupby('REPORT_DATE', as_index=False).sum()
plt.plot_date(x=covid_daily['REPORT_DATE'], y=covid_daily['PEOPLE_POSITIVE_CASES_COUNT'])
yticks = [0, 2000000, 4000000, 6000000, 8000000, 10000000, 12000000, 14000000, 16000000]
ylabels = ['0', '2M', '4M', '6M', '8M', '10M', '12M', '14M', '16M']
plt.yticks(ticks=yticks, labels=ylabels)
plt.title('The Tracks of Covid-19')
plt.show()

Why is the last point out of place?

Since this graph is reporting the aggregate of all reported cases in the past, the last point cannot be where it is plotted.

Let's investigate

### Tracking down the problem

In [None]:
print(covid_daily.tail(2))

In [None]:
print(covid.loc[covid['REPORT_DATE']=='2020-07-23']['PEOPLE_POSITIVE_CASES_COUNT'].isnull().sum())
print(covid.loc[covid['REPORT_DATE'].isin(['2020-07-23', '2020-07-22'])]
              [['PEOPLE_POSITIVE_CASES_COUNT', 'CONTINENT_NAME', 'REPORT_DATE']]
              .groupby(['CONTINENT_NAME', 'REPORT_DATE']).sum())

In America, cases went from 8026207 to 4037927. Given it happened within a day, the number should've been more like 8037927

Let's see where in America it happened

In [None]:
pd.set_option('display.max_rows', None)
print(covid.loc[(covid['REPORT_DATE'].isin(['2020-07-23', '2020-07-22'])) & (covid['CONTINENT_NAME'] == 'America')]
[['PEOPLE_POSITIVE_CASES_COUNT', 'COUNTRY_SHORT_NAME', 'REPORT_DATE']].groupby(['COUNTRY_SHORT_NAME', 'REPORT_DATE']).sum())

Everywhere looks fine except the U.S.

It was due to missing 2020-07-23 reports in the U.S.

Since the total on 2020-07-23 is highly affected by this factor, I'm going to graph til 2020-07-22

In [None]:
# drop the reports on 2020-07-23
covid_temp = covid.copy()
covid_temp.drop(covid_temp.loc[covid_temp['REPORT_DATE'] == '2020-07-23'].index, inplace=True)
covid_daily = covid_temp.groupby('REPORT_DATE', as_index=False).sum()
plt.plot_date(x=covid_daily['REPORT_DATE'], y=covid_daily['PEOPLE_POSITIVE_CASES_COUNT'])
yticks = [0, 2000000, 4000000, 6000000, 8000000, 10000000, 12000000, 14000000, 16000000]
ylabels = ['0', '2M', '4M', '6M', '8M', '10M', '12M', '14M', '16M']
plt.yticks(ticks=yticks, labels=ylabels)
plt.title('The Tracks of Covid-19')
plt.show()

Now it looks great with no error

In [None]:
covid_daily = covid.groupby('REPORT_DATE', as_index=False).sum()
covid_daily.drop(covid_temp.loc[covid_temp['REPORT_DATE'] == '2020-07-23'].index, inplace=True)
plt.plot_date(x=covid_daily['REPORT_DATE'], y=covid_daily['PEOPLE_POSITIVE_CASES_COUNT'])
yticks = [0, 2000000, 4000000, 6000000, 8000000, 10000000, 12000000, 14000000, 16000000]
ylabels = ['0', '2M', '4M', '6M', '8M', '10M', '12M', '14M', '16M']
plt.yticks(ticks=yticks, labels=ylabels)
plt.title('The Tracks of Covid-19')
plt.show()