# Exploratory Data Analysis On World Health Organization Covid19 Report

Globally, as of 8:38pm CEST, 4 May 2022, there have been 512,607,587 confirmed cases of COVID-19, including 6,243,038 deaths, reported to WHO. As of 2 May 2022, a total of 11,560,378,840 vaccine doses have been administered.

our data set is gotten from [WHO](https://covid19.who.int/), the data is constantly being updated.

In [1]:
import pandas as pd 
import numpy as np
import os
import urllib
%matplotlib inline

In [2]:
url = "https://covid19.who.int/WHO-COVID-19-global-data.csv"
file_path = os.path.join("data", "covid")

In [3]:
os.makedirs(file_path, exist_ok=True)
csv_path = os.path.join(file_path, "WHO-COVID-19-global-data.csv")
urllib.request.urlretrieve(url,csv_path)

('data\\covid\\WHO-COVID-19-global-data.csv',
 <http.client.HTTPMessage at 0x232315bb988>)

In [4]:
df = pd.read_csv(csv_path)

In [5]:
df

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
0,2020-01-03,AF,Afghanistan,EMRO,0,0,0,0
1,2020-01-04,AF,Afghanistan,EMRO,0,0,0,0
2,2020-01-05,AF,Afghanistan,EMRO,0,0,0,0
3,2020-01-06,AF,Afghanistan,EMRO,0,0,0,0
4,2020-01-07,AF,Afghanistan,EMRO,0,0,0,0
...,...,...,...,...,...,...,...,...
202156,2022-04-30,ZW,Zimbabwe,AFRO,58,247842,0,5469
202157,2022-05-01,ZW,Zimbabwe,AFRO,0,247842,0,5469
202158,2022-05-02,ZW,Zimbabwe,AFRO,0,247842,0,5469
202159,2022-05-03,ZW,Zimbabwe,AFRO,0,247842,0,5469


### Let's get some infomation from our data

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202161 entries, 0 to 202160
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Date_reported      202161 non-null  object
 1   Country_code       201308 non-null  object
 2   Country            202161 non-null  object
 3   WHO_region         202161 non-null  object
 4   New_cases          202161 non-null  int64 
 5   Cumulative_cases   202161 non-null  int64 
 6   New_deaths         202161 non-null  int64 
 7   Cumulative_deaths  202161 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 12.3+ MB


In [27]:
df.describe()

Unnamed: 0,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
count,202161.0,202161.0,202161.0,202161.0
mean,2535.64,630908.5,30.881515,11830.825001
std,16647.53,3354937.0,163.375557,54976.114864
min,-32952.0,0.0,-2448.0,0.0
25%,0.0,201.0,0.0,2.0
50%,24.0,11680.0,0.0,151.0
75%,509.0,165421.0,6.0,2778.0
max,1252940.0,80676060.0,11447.0,986698.0


In [21]:
df.dtypes

Date_reported        object
Country_code         object
Country              object
WHO_region           object
New_cases             int64
Cumulative_cases      int64
New_deaths            int64
Cumulative_deaths     int64
dtype: object

In [23]:
df.shape

(202161, 8)

In [24]:
df.head()

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
0,2020-01-03,AF,Afghanistan,EMRO,0,0,0,0
1,2020-01-04,AF,Afghanistan,EMRO,0,0,0,0
2,2020-01-05,AF,Afghanistan,EMRO,0,0,0,0
3,2020-01-06,AF,Afghanistan,EMRO,0,0,0,0
4,2020-01-07,AF,Afghanistan,EMRO,0,0,0,0


In [25]:
df.tail()

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
202156,2022-04-30,ZW,Zimbabwe,AFRO,58,247842,0,5469
202157,2022-05-01,ZW,Zimbabwe,AFRO,0,247842,0,5469
202158,2022-05-02,ZW,Zimbabwe,AFRO,0,247842,0,5469
202159,2022-05-03,ZW,Zimbabwe,AFRO,0,247842,0,5469
202160,2022-05-04,ZW,Zimbabwe,AFRO,0,247842,0,5469


In [7]:
df_index = df.index
df_index

RangeIndex(start=0, stop=202161, step=1)

In [8]:
df_columns = df.columns
df_columns

Index(['Date_reported', 'Country_code', 'Country', 'WHO_region', 'New_cases',
       'Cumulative_cases', 'New_deaths', 'Cumulative_deaths'],
      dtype='object')

In [9]:
df_index.values

array([     0,      1,      2, ..., 202158, 202159, 202160], dtype=int64)

In [22]:
df.values

array([['2020-01-03', 'AF', 'Afghanistan', ..., 0, 0, 0],
       ['2020-01-04', 'AF', 'Afghanistan', ..., 0, 0, 0],
       ['2020-01-05', 'AF', 'Afghanistan', ..., 0, 0, 0],
       ...,
       ['2022-05-02', 'ZW', 'Zimbabwe', ..., 247842, 0, 5469],
       ['2022-05-03', 'ZW', 'Zimbabwe', ..., 247842, 0, 5469],
       ['2022-05-04', 'ZW', 'Zimbabwe', ..., 247842, 0, 5469]],
      dtype=object)

In [33]:
df["Country"]

0         Afghanistan
1         Afghanistan
2         Afghanistan
3         Afghanistan
4         Afghanistan
             ...     
202156       Zimbabwe
202157       Zimbabwe
202158       Zimbabwe
202159       Zimbabwe
202160       Zimbabwe
Name: Country, Length: 202161, dtype: object

In [31]:
df["Country"].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra',
       'Angola', 'Anguilla', 'Antigua and Barbuda', 'Argentina',
       'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan',
       'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
       'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan',
       'Bolivia (Plurinational State of)', 'Bonaire',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'British Virgin Islands', 'Brunei Darussalam', 'Bulgaria',
       'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon',
       'Canada', 'Cayman Islands', 'Central African Republic', 'Chad',
       'Chile', 'China', 'Colombia', 'Comoros', 'Congo', 'Cook Islands',
       'Costa Rica', 'Côte d’Ivoire', 'Croatia', 'Cuba', 'Curaçao',
       'Cyprus', 'Czechia', "Democratic People's Republic of Korea",
       'Democratic Republic of the Congo', 'Denmark', 'Djibouti',
       'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvado

In [35]:
# Let us strip out whitespace so we can use dot notation to index our dataFrame
df.columns = [col.strip() for col in df.columns]
df.columns

Index(['Date_reported', 'Country_code', 'Country', 'WHO_region', 'New_cases',
       'Cumulative_cases', 'New_deaths', 'Cumulative_deaths'],
      dtype='object')

In [36]:
df.Country

0         Afghanistan
1         Afghanistan
2         Afghanistan
3         Afghanistan
4         Afghanistan
             ...     
202156       Zimbabwe
202157       Zimbabwe
202158       Zimbabwe
202159       Zimbabwe
202160       Zimbabwe
Name: Country, Length: 202161, dtype: object

###  Indexing individual rows and column


In [37]:
df.loc[1:4,'Country']

1    Afghanistan
2    Afghanistan
3    Afghanistan
4    Afghanistan
Name: Country, dtype: object

In [46]:
# Let's see countries where there are new cases
df.loc[1:10, ['Country','New_cases']]

Unnamed: 0,Country,New_cases
1,Afghanistan,0
2,Afghanistan,0
3,Afghanistan,0
4,Afghanistan,0
5,Afghanistan,0
6,Afghanistan,0
7,Afghanistan,0
8,Afghanistan,0
9,Afghanistan,0
10,Afghanistan,0


In [47]:
df.Country == 'United State of America'

0         False
1         False
2         False
3         False
4         False
          ...  
202156    False
202157    False
202158    False
202159    False
202160    False
Name: Country, Length: 202161, dtype: bool

In [48]:
df[df.Country == 'United State of America']

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths


In [51]:
# Let's see places where new death is greater than 1000
df[df.New_deaths > 1000]

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
20721,2020-09-08,BO,Bolivia (Plurinational State of),AMRO,528,120769,1610,7008
24023,2020-05-21,BR,Brazil,AMRO,17408,271628,1179,17971
24025,2020-05-23,BR,Brazil,AMRO,18508,310087,1188,20047
24026,2020-05-24,BR,Brazil,AMRO,20803,330890,1001,21048
24030,2020-05-28,BR,Brazil,AMRO,16324,391222,1039,24512
...,...,...,...,...,...,...,...,...
193582,2022-03-17,US,United States of America,AMRO,24417,78952606,1427,962817
193583,2022-03-18,US,United States of America,AMRO,37835,78990441,1491,964308
193584,2022-03-19,US,United States of America,AMRO,33075,79023516,1050,965358
193590,2022-03-25,US,United States of America,AMRO,45092,79182913,1235,969566


In [52]:
# Specifically, which countries have new death greater than 1000
df.loc[df.New_deaths > 1000, ['New_deaths', 'Country']]

Unnamed: 0,New_deaths,Country
20721,1610,Bolivia (Plurinational State of)
24023,1179,Brazil
24025,1188,Brazil
24026,1001,Brazil
24030,1039,Brazil
...,...,...
193582,1427,United States of America
193583,1491,United States of America
193584,1050,United States of America
193590,1235,United States of America


In [53]:
df.loc[(df.New_deaths > 1000) & (df.Country_code == 'US'), ['Date_reported', 'Country','New_cases','New_deaths','Cumulative_deaths']]

Unnamed: 0,Date_reported,Country,New_cases,New_deaths,Cumulative_deaths
192869,2020-04-03,United States of America,28400,1121,6070
192870,2020-04-04,United States of America,29912,1309,7379
192871,2020-04-05,United States of America,31709,1369,8748
192872,2020-04-06,United States of America,32491,1546,10294
192873,2020-04-07,United States of America,27019,1869,12163
...,...,...,...,...,...
193582,2022-03-17,United States of America,24417,1427,962817
193583,2022-03-18,United States of America,37835,1491,964308
193584,2022-03-19,United States of America,33075,1050,965358
193590,2022-03-25,United States of America,45092,1235,969566


### The extrema

In [58]:
df.loc[df.Country_code == 'US', ['New_cases']].max()

New_cases    1252940
dtype: int64

In [59]:
df.loc[df.Country_code == 'US', ['New_cases']].min()

New_cases    0
dtype: int64

In [60]:
df.loc[df.Country_code == 'US', ['New_cases']].sum()

New_cases    80676055
dtype: int64

In [62]:
df.loc[df.Country_code == 'US', ['Cumulative_cases']].max()

Cumulative_cases    80676055
dtype: int64

In [65]:
# Index number of maximum entry
df.New_deaths.idxmax()

35782

In [66]:
df.loc[df.New_deaths.idxmax(), ['Date_reported', 'Country', 'New_cases', 'New_deaths']]

Date_reported    2022-03-22
Country               Chile
New_cases              8167
New_deaths            11447
Name: 35782, dtype: object

In [68]:
# Let's see where enew death is less tha zero
df[df.New_deaths < 0]

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
12182,2020-08-30,BS,Bahamas,AMRO,37,2057,-10,40
14294,2021-10-10,BD,Bangladesh,SEARO,481,1562359,-6,27688
17481,2021-02-27,BZ,Belize,AMRO,9,12280,-1,314
23825,2022-03-07,BW,Botswana,AFRO,18,304931,-8,2667
62018,2021-08-27,FR,France,EURO,18812,6511793,-6,112390
62027,2021-09-05,FR,France,EURO,13143,6627205,-32,112981
71768,2020-04-28,GP,Guadeloupe,AMRO,0,149,-3,10
71949,2020-10-26,GP,Guadeloupe,AMRO,0,7474,-1,126
73469,2020-04-23,GT,Guatemala,AMRO,22,316,-1,8
73480,2020-05-04,GT,Guatemala,AMRO,44,688,-1,17


In [73]:
# let us create a column to hold the percentage of new cases
df['pct_cases'] = (df['New_cases'] / df['Cumulative_cases']) * 100
df

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths,pct_cases
0,2020-01-03,AF,Afghanistan,EMRO,0,0,0,0,
1,2020-01-04,AF,Afghanistan,EMRO,0,0,0,0,
2,2020-01-05,AF,Afghanistan,EMRO,0,0,0,0,
3,2020-01-06,AF,Afghanistan,EMRO,0,0,0,0,
4,2020-01-07,AF,Afghanistan,EMRO,0,0,0,0,
...,...,...,...,...,...,...,...,...,...
202156,2022-04-30,ZW,Zimbabwe,AFRO,58,247842,0,5469,0.023402
202157,2022-05-01,ZW,Zimbabwe,AFRO,0,247842,0,5469,0.000000
202158,2022-05-02,ZW,Zimbabwe,AFRO,0,247842,0,5469,0.000000
202159,2022-05-03,ZW,Zimbabwe,AFRO,0,247842,0,5469,0.000000
