# Data Preparation

- Is everything prepared for modeling?
- What results does the analysis of the data produce?
- Can the data be reduced?

In [13]:
##
# IMPORTS
#
import os
import urllib.request
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
#import datetime as dt

##
# SETTINGS
#
# Render all plots inline in the notebook output (instead of in a popup window)
%matplotlib inline

##
# RUN
#
# Execute the previous notebook in this kernel so that the names it defines are
# available here. The cells below use df_covid_src and df_gdp_src, which are
# presumably created by that notebook -- TODO confirm.
# TODO: move the shared loading code into a script/module instead of
# %run-ning a whole notebook.
%run ./01-data-understanding.ipynb

The dataset contains 71050 rows and 59 columns.


The columns contain data about: 

Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'new_tests', 'total_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated',

In [14]:
## Data Standardization

# TODO: apply z-score standardization (not implemented yet)


## Data Restructuring

In [15]:
# NOTE(review): exporter() and split_by_value() are only defined in later cells
# of this notebook. On a fresh kernel they must already exist (e.g. via the
# %run'd notebook) or those cells must be executed first -- confirm.

# Add year, month and weekday columns derived from 'date'
# (can be used as index for plotting)
for part in ('year', 'month', 'weekday'):
    df_covid_src[part] = getattr(df_covid_src['date'].dt, part)

# Columns every Covid-based topic subset starts with
key_cols = ['continent', 'iso_code', 'location', 'year', 'month', 'weekday', 'date']

# Topic subsets
df_gdp_annual = df_covid_src[key_cols + [
    'gdp_per_capita', 'extreme_poverty', 'life_expectancy', 'human_development_index',
]]

df_gdp_quarter = df_gdp_src[['LOCATION', 'TIME', 'Value']]

df_covid = df_covid_src[key_cols + [
    'total_cases', 'new_cases', 'total_deaths', 'new_deaths',
    'total_cases_per_million', 'new_cases_per_million',
    'total_deaths_per_million', 'new_deaths_per_million',
    'reproduction_rate', 'stringency_index',
]]

df_vaccination = df_covid_src[key_cols + [
    'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated',
    'new_vaccinations', 'total_vaccinations_per_hundred',
    'people_vaccinated_per_hundred', 'people_fully_vaccinated_per_hundred',
]]

df_tests = df_covid_src[key_cols + [
    'new_tests', 'total_tests', 'total_tests_per_thousand',
    'new_tests_per_thousand', 'positive_rate', 'tests_per_case', 'tests_units',
]]

df_population = df_covid_src[key_cols + [
    'population', 'population_density', 'median_age', 'aged_65_older', 'aged_70_older',
]]

# Make sure all output folders exist (topic subsets first, then location subsets)
for directory in (
    './data/01-prep/by-topic/gdp',
    './data/01-prep/by-topic/covid',
    './data/01-prep/by-topic/vaccination',
    './data/01-prep/by-topic/tests',
    './data/01-prep/by-topic/population',
    './data/01-prep/by-location/gdp',
    './data/01-prep/by-location/covid',
    './data/01-prep/by-location/vaccination',
    './data/01-prep/by-location/tests',
    './data/01-prep/by-location/population',
):
    os.makedirs(directory, exist_ok=True)

# Export each topic subset as a single CSV file
for frame, name, out_dir in (
    (df_gdp_annual, 'gdp-annual', './data/01-prep/by-topic/gdp/'),
    (df_gdp_quarter, 'gdp-quarter', './data/01-prep/by-topic/gdp/'),
    (df_covid, 'covid', './data/01-prep/by-topic/covid/'),
    (df_vaccination, 'vaccination', './data/01-prep/by-topic/vaccination/'),
    (df_tests, 'tests', './data/01-prep/by-topic/tests/'),
    (df_population, 'population', './data/01-prep/by-topic/population/'),
):
    exporter(frame, name, out_dir)

# Split each Covid-based subset into one file per location
for frame, out_dir in (
    (df_gdp_annual, './data/01-prep/by-location/gdp/'),
    (df_covid, './data/01-prep/by-location/covid/'),
    (df_vaccination, './data/01-prep/by-location/vaccination/'),
    (df_tests, './data/01-prep/by-location/tests/'),
    (df_population, './data/01-prep/by-location/population/'),
):
    split_by_value(frame, out_dir, 'location')

## Average GDP per region

In [16]:
# make sure the output folder for the aggregated GDP data exists
os.makedirs('./data/01-prep/by-aggregation/gdp', exist_ok=True)

# group by location and take the mean of gdp_per_capita;
# the result has the columns 'location' and 'gdp_average'
df_gdp_grouped = (
    df_gdp_annual
    .groupby(['location'])['gdp_per_capita']
    .mean()
    .to_frame(name='gdp_average')
    .reset_index()
)

# write the aggregate to disk
exporter(df_gdp_grouped, 'gdp_grouped', './data/01-prep/by-aggregation/gdp/')

## Total Covid-19 Cases (per million) per region

In [17]:
# make sure the output folder for the aggregated Covid data exists
os.makedirs('./data/01-prep/by-aggregation/covid', exist_ok=True)

# group by location and sum up new_cases_per_million;
# the result has the columns 'location' and 'covid_sum'
df_covid_grouped = (
    df_covid
    .groupby(['location'])['new_cases_per_million']
    .sum()
    .to_frame(name='covid_sum')
    .reset_index()
)

# write the aggregate to disk
exporter(df_covid_grouped, 'covid_grouped', './data/01-prep/by-aggregation/covid/')

## Functions

In [18]:
# create subsets
def split_by_value(df, path, value):
    """Export one CSV file per distinct entry of column ``value``.

    The frame is grouped by ``value`` and every group is handed to
    ``exporter``. The file name is the ``iso_code`` of the group's first
    row; if the frame has no ``iso_code`` column or that entry is missing,
    the group key itself is used instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    path : str
        Output directory, passed on to ``exporter`` unchanged.
    value : str
        Name of the column whose distinct entries define the subsets.
    """
    has_iso = 'iso_code' in df.columns
    # Iterating the groupby yields each subset directly, so the frame does not
    # have to be filtered again for every group.
    for key, df_sub in df.groupby(value):
        iso = df_sub['iso_code'].iloc[0] if has_iso else None
        # A missing iso code cannot be concatenated into a file name,
        # so fall back to the group key in that case.
        filename = str(key) if pd.isna(iso) else str(iso)
        exporter(df_sub, filename, path)

In [19]:
def exporter(data, filename, path):
    """Write ``data`` as a comma-separated file ``<path>/<filename>.csv``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to write; its index is not included in the file.
    filename : str
        File name without extension.
    path : str
        Target directory, with or without a trailing path separator.
        The directory must already exist.
    """
    # os.path.join only inserts a separator when one is missing, so
    # './out/' and './out' both end up writing './out/<filename>.csv'.
    target = os.path.join(path, filename + '.csv')
    data.to_csv(target, sep=',', index=False)