In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from collections import Counter, OrderedDict

In [2]:
covid = pd.read_csv("Covid-19 Activity.csv")
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 639710 entries, 0 to 639709
Data columns (total 13 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   COUNTY_NAME                      595792 non-null  object 
 1   PEOPLE_POSITIVE_CASES_COUNT      639710 non-null  float64
 2   REPORT_DATE                      639710 non-null  object 
 3   PROVINCE_STATE_NAME              597892 non-null  object 
 4   CONTINENT_NAME                   639710 non-null  object 
 5   DATA_SOURCE_NAME                 639710 non-null  object 
 6   PEOPLE_DEATH_NEW_COUNT           639710 non-null  float64
 7   COUNTY_FIPS_NUMBER               585672 non-null  float64
 8   COUNTRY_ALPHA_3_CODE             639710 non-null  object 
 9   COUNTRY_SHORT_NAME               639710 non-null  object 
 10  COUNTRY_ALPHA_2_CODE             639504 non-null  object 
 11  PEOPLE_POSITIVE_NEW_CASES_COUNT  639710 non-null  float64
 12  PE

In [3]:
covid.head()

Unnamed: 0,COUNTY_NAME,PEOPLE_POSITIVE_CASES_COUNT,REPORT_DATE,PROVINCE_STATE_NAME,CONTINENT_NAME,DATA_SOURCE_NAME,PEOPLE_DEATH_NEW_COUNT,COUNTY_FIPS_NUMBER,COUNTRY_ALPHA_3_CODE,COUNTRY_SHORT_NAME,COUNTRY_ALPHA_2_CODE,PEOPLE_POSITIVE_NEW_CASES_COUNT,PEOPLE_DEATH_COUNT
0,Wahkiakum,0.0,2020-02-18,Washington,America,New York Times,0.0,53069.0,USA,United States,US,0.0,0.0
1,Wahkiakum,5.0,2020-07-17,Washington,America,New York Times,0.0,53069.0,USA,United States,US,0.0,0.0
2,Wahkiakum,0.0,2020-03-02,Washington,America,New York Times,0.0,53069.0,USA,United States,US,0.0,0.0
3,Stephenson,223.0,2020-06-11,Illinois,America,New York Times,0.0,17177.0,USA,United States,US,1.0,5.0
4,Stephenson,1.0,2020-03-22,Illinois,America,New York Times,0.0,17177.0,USA,United States,US,1.0,0.0


In [4]:
covid.isnull().sum()

COUNTY_NAME                        43918
PEOPLE_POSITIVE_CASES_COUNT            0
REPORT_DATE                            0
PROVINCE_STATE_NAME                41818
CONTINENT_NAME                         0
DATA_SOURCE_NAME                       0
PEOPLE_DEATH_NEW_COUNT                 0
COUNTY_FIPS_NUMBER                 54038
COUNTRY_ALPHA_3_CODE                   0
COUNTRY_SHORT_NAME                     0
COUNTRY_ALPHA_2_CODE                 206
PEOPLE_POSITIVE_NEW_CASES_COUNT        0
PEOPLE_DEATH_COUNT                     0
dtype: int64

In [5]:
# dropping columns that aren't too useful for this analysis
covid.drop(columns=['COUNTRY_ALPHA_2_CODE', 'COUNTY_FIPS_NUMBER'], inplace=True)
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 639710 entries, 0 to 639709
Data columns (total 11 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   COUNTY_NAME                      595792 non-null  object 
 1   PEOPLE_POSITIVE_CASES_COUNT      639710 non-null  float64
 2   REPORT_DATE                      639710 non-null  object 
 3   PROVINCE_STATE_NAME              597892 non-null  object 
 4   CONTINENT_NAME                   639710 non-null  object 
 5   DATA_SOURCE_NAME                 639710 non-null  object 
 6   PEOPLE_DEATH_NEW_COUNT           639710 non-null  float64
 7   COUNTRY_ALPHA_3_CODE             639710 non-null  object 
 8   COUNTRY_SHORT_NAME               639710 non-null  object 
 9   PEOPLE_POSITIVE_NEW_CASES_COUNT  639710 non-null  float64
 10  PEOPLE_DEATH_COUNT               639710 non-null  float64
dtypes: float64(4), object(7)
memory usage: 53.7+ MB


In [6]:
# since a count of people will always be a whole number, change floats to ints
covid['PEOPLE_POSITIVE_CASES_COUNT'] = covid['PEOPLE_POSITIVE_CASES_COUNT'].astype(int)
covid['PEOPLE_DEATH_NEW_COUNT'] = covid['PEOPLE_DEATH_NEW_COUNT'].astype(int)
covid['PEOPLE_POSITIVE_NEW_CASES_COUNT'] = covid['PEOPLE_POSITIVE_NEW_CASES_COUNT'].astype(int)
covid['PEOPLE_DEATH_COUNT'] = covid['PEOPLE_DEATH_COUNT'].astype(int)

# filter the rows that hold no information for country name, positive cases, and death counts
no_stats = covid.loc[(covid['COUNTY_NAME'].isnull()) & (covid['PEOPLE_POSITIVE_CASES_COUNT'] == 0) &
         (covid['PEOPLE_DEATH_NEW_COUNT'] == 0) & (covid['PEOPLE_POSITIVE_NEW_CASES_COUNT'] == 0)
         & (covid['PEOPLE_DEATH_COUNT'] == 0)]

# drop those filtered rows and get rid of duplicates
covid.drop(list(no_stats.index), inplace=True)
covid.drop_duplicates(inplace=True)

# for the rest, replace with 'unknown' since they hold stats
covid.fillna('Unknown', inplace=True)
covid.isnull().sum()

COUNTY_NAME                        0
PEOPLE_POSITIVE_CASES_COUNT        0
REPORT_DATE                        0
PROVINCE_STATE_NAME                0
CONTINENT_NAME                     0
DATA_SOURCE_NAME                   0
PEOPLE_DEATH_NEW_COUNT             0
COUNTRY_ALPHA_3_CODE               0
COUNTRY_SHORT_NAME                 0
PEOPLE_POSITIVE_NEW_CASES_COUNT    0
PEOPLE_DEATH_COUNT                 0
dtype: int64

In [7]:
def provincial_case(df, country):
    '''
    Args:
        df: DataFrame
        country: str
    Returns:
        a list of *confirmed* cases of Covid-19 for each province in the country
    Order:
        Descending
    '''
    country_table = df.loc[df['COUNTRY_SHORT_NAME'] == country]
    province_list = country_table['PROVINCE_STATE_NAME'].unique()
    province_dict = dict()
    for province in province_list:
        province_table = country_table.loc[country_table['PROVINCE_STATE_NAME'] == province]
        province_dict[province] = province_table['PEOPLE_POSITIVE_CASES_COUNT'].sum()
    return sorted(province_dict.items(), key=lambda x: x[1], reverse=True)


def country_case(df, country):
    '''
    Args:
        df: DataFrame
        country: str
    Returns:
        a list of *confirmed* cases of Covid-19 in a country
    '''
    country_table = df.loc[df['COUNTRY_SHORT_NAME'] == country]
    total = country_table['PEOPLE_POSITIVE_CASES_COUNT'].sum()
    return (country, total)

print(provincial_case(covid, 'Canada'))

[('Quebec', 4642131), ('Ontario', 2749327), ('Alberta', 701478), ('British Columbia', 281343), ('Nova Scotia', 105003), ('Saskatchewan', 67935), ('Manitoba', 31709), ('Newfoundland and Labrador', 29661), ('New Brunswick', 15835), ('Prince Edward Island', 3229), ('Repatriated travellers', 1666), ('Yukon', 1218), ('Northwest Territories', 567)]


In [8]:
covid['CONTINENT_NAME'].unique()

array(['America', 'Africa', 'Europe', 'Asia', 'Oceania'], dtype=object)

In [9]:
# let's sort countries from the most cases to the least in America
america = covid.loc[covid['CONTINENT_NAME'] == 'America']['COUNTRY_SHORT_NAME'].unique()
america_list = []
for country in america:
    america_list.append(country_case(covid, country))
print(sorted(america_list, key=lambda x:x[1], reverse=True))

[('United States', 204289126), ('Brazil', 77614924), ('Peru', 17372801), ('Chile', 14441391), ('Mexico', 13026170), ('Canada', 8631102), ('Colombia', 5692756), ('Ecuador', 4257480), ('Argentina', 3581405), ('Dominican Republic', 2190059), ('Panama', 2087264), ('Bolivia', 1830412), ('Guatemala', 1053953), ('Honduras', 1036961), ('El Salvador', 383427), ('Venezuela', 337278), ('Haiti', 298353), ('Costa Rica', 274691), ('Cuba', 203929), ('Paraguay', 134883), ('Nicaragua', 121961), ('Uruguay', 88493), ('Jamaica', 57368), ('Suriname', 25335), ('Guyana', 17264), ('Cayman Islands', 14836), ('Trinidad and Tobago', 14258), ('Bermuda', 13808), ('Aruba', 11470), ('Bahamas', 10501), ('Barbados', 10130), ('Sint Maarten', 8078), ('Antigua and Barbuda', 4096), ('Turks and Caicos islands', 2536), ('Saint Vincent and the Grenadines', 2532), ('Belize', 2404), ('Grenada', 2353), ('Curaçao', 2259), ('Saint Lucia', 2119), ('Dominica', 1972), ('Saint Kitts and Nevis', 1694), ('Greenland', 1430), ('Falkland 