In [1]:
import pandas as pd
import numpy as np

In [4]:
covid_owid = pd.read_csv("../data_source/owid-covid-data.csv")

In [31]:
covid_owid.shape

(157936, 67)

Our target is work with Europe, first let's check all the continents stored and which percetange they represent 

In [34]:
continent_counts = covid_owid.loc[:, ["location","continent"] ].groupby(["continent"]).count()
continent_counts["percentage"] = continent_counts["location"] / covid_owid.shape[0]
continent_counts.sort_values("percentage", ascending=False)

Unnamed: 0_level_0,location,percentage
continent,Unnamed: 1_level_1,Unnamed: 2_level_1
Africa,37437,0.237039
Europe,35256,0.22323
Asia,34445,0.218095
North America,23669,0.149865
South America,9114,0.057707
Oceania,8527,0.05399


As we can observe our target represents 22% of the population. Now let's filter our full dataset by only our case and some columns selected 

In [None]:
columns_to_be_analyzed = [
    "location"
    , "date"
    , "total_cases"
    , "new_cases"
    , "total_deaths"
    , "new_deaths"
    , "icu_patients"
    , "hosp_patients"
    , "total_tests"
    , "positive_rate"
    , "total_vaccinations"
    , "people_vaccinated"
    , "people_fully_vaccinated"
    , "life_expectancy"
]

In [39]:
covid_europe = covid_owid.loc[ covid_owid["continent"] == "Europe" , columns_to_be_analyzed ]

For country let's analyze percentage of NaNs by column:

In [113]:
nans_columns = covid_europe.groupby(["location"], as_index=False).count()

In [114]:
for c_ in  nans_columns.columns:
    if c_ not in ['location','date']:
        nans_columns[c_] = 1 - nans_columns[c_] / nans_columns["date"] 

Based on our current variables let's analyze which of them can have NaNs:

- total_cases: that one should be have always values. (IMPORTANT)
- new_cases: we can have NaNs cases, not all days have new cases. > .9 nans will be excluded 
- total_deaths: that one should be have always values. (IMPORTANT)
- new_deaths: we can have NaNs cases, not all days have new cases.
- icu_patients:we can have NaNs cases, not all days have new cases.
- hosp_patients:we can have NaNs cases, not all days have new cases. > .9 nans will be excluded
- total_tests: we can have NaNs cases, not all days have new cases. > .9 nans will be excluded
- positive_rate: we can have NaNs cases, not all days have new cases. > .9 nans will be excluded
- total_vaccinations: we can have NaNs cases, not all days have new cases. 1.0 nans will be excluded
- people_vaccinated: we can have NaNs cases, not all days have new cases. .9 nans will be excluded
- people_fully_vaccinated: we can have NaNs cases, not all days have new cases.
- life_expectancy: that one should be have always values. (IMPORTANT)


In [115]:
nans_columns.shape

(51, 14)

In [118]:
nans_columns = nans_columns.loc[
    (nans_columns["total_cases"] < 1.0)
    & (nans_columns["new_cases"] < 0.9 )
    & (nans_columns["total_deaths"] < 1.0 )
    & (nans_columns["hosp_patients"] < 0.9 )
    & (nans_columns["total_tests"] < 0.9 )
    & (nans_columns["positive_rate"] < 0.9 )
    & (nans_columns["total_vaccinations"] < 1.0 )
    & (nans_columns["people_vaccinated"] < 0.9 )
    & (nans_columns["life_expectancy"] < 1.0 )
    ,
    :
]

In [119]:
nans_columns.shape

(24, 14)

In [127]:
covid_europe = covid_europe.loc[ covid_europe["location"].isin( nans_columns["location"].array ) , : ] 

In [129]:
covid_europe

Unnamed: 0,location,date,total_cases,new_cases,total_deaths,new_deaths,icu_patients,hosp_patients,total_tests,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,life_expectancy
9183,Austria,2020-02-25,2.0,2.0,,,,,,,,,,81.54
9184,Austria,2020-02-26,2.0,0.0,,,,,,,,,,81.54
9185,Austria,2020-02-27,3.0,1.0,,,,,,,,,,81.54
9186,Austria,2020-02-28,3.0,0.0,,,,,,,,,,81.54
9187,Austria,2020-02-29,9.0,6.0,,,,,,,,,,81.54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149295,United Kingdom,2022-01-24,15992837.0,95317.0,154042.0,54.0,598.0,17210.0,429972989.0,0.0683,137474493.0,52252579.0,48224978.0,81.32
149296,United Kingdom,2022-01-25,16087344.0,94507.0,154485.0,443.0,575.0,16608.0,431317684.0,0.0686,137570759.0,52266515.0,48253766.0,81.32
149297,United Kingdom,2022-01-26,16189420.0,102076.0,154831.0,346.0,561.0,16514.0,432822845.0,0.0687,137671500.0,52281873.0,48284685.0,81.32
149298,United Kingdom,2022-01-27,16286017.0,96597.0,155169.0,338.0,549.0,16149.0,,,137770110.0,52297579.0,48314633.0,81.32


Replace NaNs with zero:

In [137]:
covid_europe = covid_europe.fillna(0)

In [138]:
covid_europe

Unnamed: 0,location,date,total_cases,new_cases,total_deaths,new_deaths,icu_patients,hosp_patients,total_tests,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,life_expectancy
9183,Austria,2020-02-25,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,81.54
9184,Austria,2020-02-26,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,81.54
9185,Austria,2020-02-27,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,81.54
9186,Austria,2020-02-28,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,81.54
9187,Austria,2020-02-29,9.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,81.54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149295,United Kingdom,2022-01-24,15992837.0,95317.0,154042.0,54.0,598.0,17210.0,429972989.0,0.0683,137474493.0,52252579.0,48224978.0,81.32
149296,United Kingdom,2022-01-25,16087344.0,94507.0,154485.0,443.0,575.0,16608.0,431317684.0,0.0686,137570759.0,52266515.0,48253766.0,81.32
149297,United Kingdom,2022-01-26,16189420.0,102076.0,154831.0,346.0,561.0,16514.0,432822845.0,0.0687,137671500.0,52281873.0,48284685.0,81.32
149298,United Kingdom,2022-01-27,16286017.0,96597.0,155169.0,338.0,549.0,16149.0,0.0,0.0000,137770110.0,52297579.0,48314633.0,81.32


In [139]:
covid_europe.to_csv( "../data_source/covid_europe.csv" , index_label=False )