## Lab 1: Question 4. Covid19 and vaccination in the world

Date: 2021-11-3

---
The data come from ECDC and WHO and can be downloaded from [FHM][link].

[link]:https://www.folkhalsomyndigheten.se/smittskydd-beredskap/utbrott/aktuella-utbrott/covid-19/statistik-och-analyser/antal-fall-globalt/

In this exercise, I will examine the **Covid19** and **vaccination status** around the world.


---

## Investigate the Covid19 status using data from ECDC:
- plot Covid19 dataset on the Map
- plot the data in the world
- plot the data in continents: Asia, Europe, Africa
---

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly_express as px
import plotly.graph_objects as go
from datetime import datetime
from CovidData import CovidData

In [2]:
# Load the ECDC COVID-19 dataset through the project's CovidData helper
df_object = CovidData("ecdc.csv")
# Parse the file, then print the helper's summary
# (dataframe info, file name, sheet name and the first rows)
df_object.parse_data()
print(df_object.show_info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38552 entries, 0 to 38551
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   country           38552 non-null  object 
 1   country_code      37412 non-null  object 
 2   continent         38552 non-null  object 
 3   population        38552 non-null  int64  
 4   indicator         38552 non-null  object 
 5   weekly_count      38552 non-null  int64  
 6   year_week         38552 non-null  object 
 7   rate_14_day       38108 non-null  float64
 8   cumulative_count  38552 non-null  int64  
 9   source            38552 non-null  object 
dtypes: float64(1), int64(3), object(6)
memory usage: 2.9+ MB

        Dataframe info:
None

        Name: ecdc.csv

        Sheet name: None

        Data head():
       country country_code continent  population indicator  weekly_count  \
0  Afghanistan          AFG      Asia    38928341     cases             0   
1  Afghan

In [3]:
# main_pipe() returns the cleaned dataframe with the rows containing NAs dropped
# (38552 parsed rows -> 36980 rows in the recorded output)
covid19_ecdc = df_object.main_pipe()
covid19_ecdc.shape

(36980, 10)

## Plotting the COVID-19 Dataset on a Map

In [4]:
def plot_on_map(df, locations="country_code",
                    color="cumulative_count",
                    hover_name="country",
                    animation_frame="year_week",
                    title = "Covid19 Cases in cumulative count",
                    scope=None):
    """Draw an animated choropleth map, display it and save it as HTML.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain the columns named by the other arguments.
    locations : str
        Column holding the location codes handed to ``px.choropleth``.
    color : str
        Numeric column that drives the colour of each location.
    hover_name : str
        Column shown as the hover title.
    animation_frame : str
        Column whose values define the animation frames.
    title : str
        Figure title; it is also used in the name of the saved HTML file.
    scope : str or None
        Map scope forwarded to ``px.choropleth`` (``None`` keeps its default).

    Side effects
    ------------
    Shows the figure and writes ``Visualiseringar/Q4.<title>.html``
    (opened automatically after writing).
    """
    from pathlib import Path

    # Cap the colour scale at the 80th percentile of the plotted column so
    # that a few very large values do not flatten the rest of the map.
    color_cap = df[color].quantile(0.8)

    fig = px.choropleth(df, locations=locations,
                    color=color,
                    scope=scope,
                    hover_name=hover_name,
                    animation_frame=animation_frame,
                    title = title,
                    range_color=[0, color_cap],
                    color_continuous_scale=px.colors.sequential.Plasma)

    # Drop the layout's "updatemenus" entry (where Plotly Express puts the
    # animation buttons); the frame slider is kept.
    fig["layout"].pop("updatemenus")
    fig.show()

    # write_html does not create missing folders, so make sure the output
    # directory exists before saving.
    Path("Visualiseringar").mkdir(parents=True, exist_ok=True)
    fig.write_html(f'Visualiseringar/Q4.{title}.html', auto_open=True)

    

In [5]:
# World maps of the ECDC data: cumulative counts first (the function defaults),
# then the weekly counts
plot_on_map(covid19_ecdc)
plot_on_map(covid19_ecdc, color="weekly_count", title="Covid19 Cases in weekly count")

In [6]:
# Keep only the rows that belong to Asia
covid19_Asia = covid19_ecdc.loc[covid19_ecdc["continent"] == "Asia"]

# One map per measure, both restricted to the Asian map scope:
# cumulative counts first, weekly counts second
for column, heading in (
    ("cumulative_count", "Cumulative COVID19 cases in Asia"),
    ("weekly_count", "Weekly new COVID19 cases in Asia"),
):
    plot_on_map(covid19_Asia, color=column, scope="asia", title=heading)

In [7]:
# Continents to map. They are listed explicitly instead of taking the first
# three unique values of the column: that selection depended on the row order
# of the file, and the lower-cased name must be a valid Plotly map scope
# ("asia", "europe" and "africa" are; e.g. "america" or "oceania" are not).
available_continents = set(covid19_ecdc["continent"].unique())
continents = [name for name in ("Asia", "Europe", "Africa") if name in available_continents]

for continent in continents:
    covid19_continent = covid19_ecdc[covid19_ecdc["continent"] == continent]

    # Cumulative and weekly maps, zoomed to the current continent
    plot_on_map(covid19_continent, scope = continent.lower(), title = f"Cumulative COVID19 cases in {continent}")
    plot_on_map(covid19_continent, color="weekly_count", scope = continent.lower(), title = f"Weekly new COVID19 cases in {continent}")


## Investigate the Covid19 status using data from WHO:
- pie chart of the Covid19 dataset, highlighting the highest-risk countries

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly_express as px
import plotly.graph_objects as go
from datetime import datetime
from CovidData import CovidData

In [9]:
# Load the WHO COVID-19 global dataset through the project's CovidData helper
df_object = CovidData('WHO-COVID-19-global-data.csv')
# Parse the file, then print the helper's summary
# (dataframe info, file name, sheet name and the first rows)
df_object.parse_data()
print(df_object.show_info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159027 entries, 0 to 159026
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Date_reported      159027 non-null  object
 1   Country_code       158356 non-null  object
 2   Country            159027 non-null  object
 3   WHO_region         159027 non-null  object
 4   New_cases          159027 non-null  int64 
 5   Cumulative_cases   159027 non-null  int64 
 6   New_deaths         159027 non-null  int64 
 7   Cumulative_deaths  159027 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 9.7+ MB

        Dataframe info:
None

        Name: WHO-COVID-19-global-data.csv

        Sheet name: None

        Data head():
  Date_reported Country_code      Country WHO_region  New_cases  \
0    2020-01-03           AF  Afghanistan       EMRO          0   
1    2020-01-04           AF  Afghanistan       EMRO          0   
2    2020-01-05           AF  Afghani

In [10]:
# main_pipe() returns the cleaned dataframe with the rows containing NAs dropped
# (159027 parsed rows -> 158356 rows in the recorded output)
who_covid19 = df_object.main_pipe()
who_covid19.shape

(158356, 8)

In [11]:
# Convert Date_reported from YYYY-MM-DD strings to pandas datetimes so the
# dates can be compared and the most recent one found later on
who_covid19["Date_reported"] = pd.to_datetime(who_covid19["Date_reported"], format="%Y-%m-%d")
who_covid19.head()

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
0,2020-01-03,AF,Afghanistan,EMRO,0,0,0,0
1,2020-01-04,AF,Afghanistan,EMRO,0,0,0,0
2,2020-01-05,AF,Afghanistan,EMRO,0,0,0,0
3,2020-01-06,AF,Afghanistan,EMRO,0,0,0,0
4,2020-01-07,AF,Afghanistan,EMRO,0,0,0,0


In [12]:
# Pie chart of the cumulative number of deaths per country on the most recent
# reporting date. Countries below the threshold are relabelled to a single
# "Other countries" slice so that only the hardest-hit countries stand out.
DEATHS_THRESHOLD = 100000

last_day = who_covid19["Date_reported"].max()
# .copy() makes the snapshot independent of who_covid19, so the relabelling
# below no longer raises pandas' SettingWithCopyWarning and cannot leak back
# into the full dataframe.
who_covid19_latest = who_covid19.loc[who_covid19["Date_reported"] == last_day].copy()

below_threshold = who_covid19_latest["Cumulative_deaths"] < DEATHS_THRESHOLD
who_covid19_latest.loc[below_threshold, "Country"] = 'Other countries'

fig = px.pie(who_covid19_latest, values="Cumulative_deaths", title=f"Cumulative deaths across countries on {last_day}",
             names="Country")
fig.update_traces(textinfo = "label+percent")

fig.show()
fig.write_html('Visualiseringar/Q4.Pie chart of cumulative deaths across countries.html', auto_open=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Investigate the vaccination status using data from WHO:
- bar plots of the vaccination dataset
- bar plot of the number of persons fully vaccinated per 100 population
- bar plot of the total number of vaccinations per 100 population

In [13]:
# Load the WHO vaccination dataset through the project's CovidData helper
df_object = CovidData('vaccination-data.csv')
# Keep the dataframe returned by parse_data() as-is (missing values are
# handled in later cells), then print the helper's summary
who_vaccination = df_object.parse_data()
print(df_object.show_info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228 entries, 0 to 227
Data columns (total 14 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   COUNTRY                               228 non-null    object 
 1   ISO3                                  228 non-null    object 
 2   WHO_REGION                            228 non-null    object 
 3   DATA_SOURCE                           228 non-null    object 
 4   DATE_UPDATED                          228 non-null    object 
 5   TOTAL_VACCINATIONS                    228 non-null    int64  
 6   PERSONS_VACCINATED_1PLUS_DOSE         225 non-null    float64
 7   TOTAL_VACCINATIONS_PER100             228 non-null    float64
 8   PERSONS_VACCINATED_1PLUS_DOSE_PER100  225 non-null    float64
 9   PERSONS_FULLY_VACCINATED              226 non-null    float64
 10  PERSONS_FULLY_VACCINATED_PER100       226 non-null    float64
 11  VACCINES_USED      

In [14]:
# Number of distinct countries in the vaccination data
who_vaccination["COUNTRY"].nunique()

228

In [15]:
# TOTAL_VACCINATIONS_PER100 can be larger than 100, although it is mostly
# below 200, because in most cases one person has received at most two doses.
# The row for China is displayed as an example.
who_vaccination[who_vaccination["COUNTRY"]=="China"]

Unnamed: 0,COUNTRY,ISO3,WHO_REGION,DATA_SOURCE,DATE_UPDATED,TOTAL_VACCINATIONS,PERSONS_VACCINATED_1PLUS_DOSE,TOTAL_VACCINATIONS_PER100,PERSONS_VACCINATED_1PLUS_DOSE_PER100,PERSONS_FULLY_VACCINATED,PERSONS_FULLY_VACCINATED_PER100,VACCINES_USED,FIRST_VACCINE_DATE,NUMBER_VACCINES_TYPES_USED
117,China,CHN,WPRO,REPORTING,2021-10-22,2252894651,1155014000.0,153.124,78.504,1014700000.0,68.967,"Anhui ZL - Recombinant,Beijing CNBG - BBIBP-Co...",2020-07-22,8.0


In [16]:
# Missing-value treatment, based on the non-null counts printed earlier:
#   1. remove the "FIRST_VACCINE_DATE" column altogether;
#   2. remove the rows without a "PERSONS_FULLY_VACCINATED_PER100" value
#      ("TOTAL_VACCINATIONS_PER100" has no missing values, so it needs nothing);
#   3. renumber the index from 0.
who_vaccination = (
    who_vaccination
    .drop(columns=["FIRST_VACCINE_DATE"])
    .dropna(subset=["PERSONS_FULLY_VACCINATED_PER100"])
    .reset_index(drop=True)
)
who_vaccination.head()

Unnamed: 0,COUNTRY,ISO3,WHO_REGION,DATA_SOURCE,DATE_UPDATED,TOTAL_VACCINATIONS,PERSONS_VACCINATED_1PLUS_DOSE,TOTAL_VACCINATIONS_PER100,PERSONS_VACCINATED_1PLUS_DOSE_PER100,PERSONS_FULLY_VACCINATED,PERSONS_FULLY_VACCINATED_PER100,VACCINES_USED,NUMBER_VACCINES_TYPES_USED
0,Falkland Islands (Malvinas),FLK,AMRO,OWID,2021-04-14,4407,2632.0,126.529,75.567,1775.0,50.962,AstraZeneca - AZD1222,1.0
1,Saint Helena,SHN,AFRO,OWID,2021-05-05,7892,4361.0,129.995,71.833,3531.0,58.162,AstraZeneca - AZD1222,1.0
2,Greenland,GRL,EURO,OWID,2021-10-29,77164,40121.0,135.919,70.67,37043.0,65.249,Moderna - mRNA-1273,1.0
3,Gibraltar,GIB,EURO,OWID,2021-10-30,89068,40547.0,264.367,120.35,39808.0,118.156,Pfizer BioNTech - Comirnaty,1.0
4,Isle of Man,IMN,EURO,OWID,2021-10-30,132309,67873.0,155.599,79.821,64436.0,75.779,"Moderna - mRNA-1273, AstraZeneca - AZD1222, Pf...",3.0


In [17]:
# The plots below use PERSONS_FULLY_VACCINATED_PER100 and TOTAL_VACCINATIONS_PER100.
# A share of fully vaccinated persons above 100 per 100 is not realistic,
# so the rows reporting such a value are removed.
over_100 = who_vaccination["PERSONS_FULLY_VACCINATED_PER100"] > 100
rows_to_drop = who_vaccination.index[over_100.to_numpy()]

who_vaccination = who_vaccination.drop(index=rows_to_drop)
who_vaccination.shape

(225, 13)

In [18]:
# Column to plot -> human-readable description used as the figure title
vaccination_variables = {
    "PERSONS_FULLY_VACCINATED_PER100": "The percentage of persons fully vaccinated per 100",
    "TOTAL_VACCINATIONS_PER100": "The total number of vaccinations per 100",
}

# One bar chart per variable, countries ordered from highest to lowest value
for column, description in vaccination_variables.items():
    who_vaccination_sorted = who_vaccination.sort_values(by=column, ascending=False)
    fig = px.bar(who_vaccination_sorted, x="COUNTRY", y=column, title=description)
    fig.show()
    # NOTE: the file name has no space before "across"; kept as-is so the
    # paths of already generated files do not change.
    fig.write_html(f"Visualiseringar/Q4.{description}across countries.html", auto_open=True)



In [19]:
import plotly.graph_objects as go

# Grouped bar chart comparing, per country, the persons fully vaccinated per
# 100 with the total vaccinations per 100.
fig = go.Figure()

# Alphabetical order makes individual countries easy to find on the x axis
who_vaccination = who_vaccination.sort_values("COUNTRY", ascending=True)

# `text` must receive the column values: passing the column *name* as a plain
# string would label every bar with that same string instead of its value.
fig.add_bar(x=who_vaccination["COUNTRY"], y=who_vaccination["PERSONS_FULLY_VACCINATED_PER100"], name="persons fully vaccinated per 100",
              text=who_vaccination["PERSONS_FULLY_VACCINATED_PER100"])

fig.add_bar(x=who_vaccination["COUNTRY"], y=who_vaccination["TOTAL_VACCINATIONS_PER100"], name="persons take total vaccinations per 100",
              text=who_vaccination["TOTAL_VACCINATIONS_PER100"])

fig.update_layout(
    xaxis_title='Country',
    yaxis_title='Percentage',
    title="The vaccination status across countries"
    )

# Draw the value labels above the bars
fig.update_traces(textposition='outside')

fig.show()
fig.write_html("Visualiseringar/Q4.The vaccination status across countries.html", auto_open=True)
