# Data Preparation
## Import Datasets (CSV and Excel)

In [11]:
## import libraries and adjust settings
import pandas as pd
import matplotlib.pyplot as plt

# Render our plots inline
%matplotlib inline

# set figure size
plt.rcParams['figure.figsize'] = (15, 8)

# load datasets

# Confirmed | Recovered | Deaths per day for each country
## Covid
df_countries = pd.read_csv('data/covid/countries-aggregated.csv')
df_keyCountries = pd.read_csv('data/covid/key-countries-pivoted.csv')
df_references = pd.read_csv('data/covid/reference.csv')
df_timeSeries = pd.read_csv('data/covid/time-series-19-covid-combined.csv')
df_wwAggr = pd.read_csv('data/covid/worldwide-aggregate.csv')

## Industry
## us-data
# Read the workbook once: with a list of sheet names, read_excel returns a
# dict {sheet name: DataFrame}, so the file is not re-opened for every sheet.
usGDP_sheets = pd.read_excel(
    'data/industry/us-gdp.xlsx',
    sheet_name=['Table 3', 'Table 12', 'Table 14'],
)
df_usGDP3 = usGDP_sheets['Table 3']
df_usGDP12 = usGDP_sheets['Table 12']
df_usGDP14 = usGDP_sheets['Table 14']




# COVID-Data
## Countries data set
The countries-aggregated.csv provides the number of **confirmed cases, recoveries and deaths** per day for each country.

In [12]:

# .head() without an argument shows the first five rows
df_countries.head()

Unnamed: 0,Date,Country,Confirmed,Recovered,Deaths
0,2020-01-22,Afghanistan,0,0,0
1,2020-01-23,Afghanistan,0,0,0
2,2020-01-24,Afghanistan,0,0,0
3,2020-01-25,Afghanistan,0,0,0
4,2020-01-26,Afghanistan,0,0,0


## Key-Countries data set
The key-countries-pivoted.csv provides the confirmed cases per day for the key countries China, US, UK, Italy, France, Germany, Spain and Iran, with one column per country.

In [13]:
# Preview the first rows of the key-countries table
df_keyCountries.head()

Unnamed: 0,Date,China,US,United_Kingdom,Italy,France,Germany,Spain,Iran
0,2020-01-22,548,1,0,0,0,0,0,0
1,2020-01-23,643,1,0,0,0,0,0,0
2,2020-01-24,920,2,0,0,2,0,0,0
3,2020-01-25,1406,2,0,0,3,0,0,0
4,2020-01-26,2075,5,0,0,3,0,0,0


## References data set
The reference.csv provides general information about the regions covered by the collected data: UID, country codes (iso2, iso3, code3), province/state, country/region, geographical location (latitude and longitude), combined key and population.

In [14]:
# Preview the first rows of the reference table
df_references.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Population
0,4,AF,AFG,4.0,,,,Afghanistan,33.93911,67.709953,Afghanistan,38928341.0
1,8,AL,ALB,8.0,,,,Albania,41.1533,20.1683,Albania,2877800.0
2,12,DZ,DZA,12.0,,,,Algeria,28.0339,1.6596,Algeria,43851043.0
3,20,AD,AND,20.0,,,,Andorra,42.5063,1.5218,Andorra,77265.0
4,24,AO,AGO,24.0,,,,Angola,-11.2027,17.8739,Angola,32866268.0


## Time-Series-19-COVID-Combined data set
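The time-series-19-covid-combined.csv provides the confirmed, recovered and death counts per day for each country/region, with an additional province/state column.

TODO: Clarify how this data set differs from the countries data set.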

In [15]:
# Preview the first rows of the combined time-series table
df_timeSeries.head()

Unnamed: 0,Date,Country/Region,Province/State,Confirmed,Recovered,Deaths
0,2020-01-22,Afghanistan,,0,0.0,0
1,2020-01-23,Afghanistan,,0,0.0,0
2,2020-01-24,Afghanistan,,0,0.0,0
3,2020-01-25,Afghanistan,,0,0.0,0
4,2020-01-26,Afghanistan,,0,0.0,0


## Worldwide-Aggregated data set
The worldwide-aggregate.csv provides the confirmed, recovered and death counts aggregated worldwide per day, together with an increase rate.

In [16]:
# Preview the first rows of the worldwide aggregate table
df_wwAggr.head()

Unnamed: 0,Date,Confirmed,Recovered,Deaths,Increase rate
0,2020-01-22,557,30,17,
1,2020-01-23,655,32,18,17.594255
2,2020-01-24,941,39,26,43.664122
3,2020-01-25,1433,42,42,52.284803
4,2020-01-26,2118,56,56,47.801814


# Industry Data
## US GDP Data

In [17]:
# Needs preprocessing: the sheet title ended up as the column header, the
# remaining columns are "Unnamed: N", and rows 0-2 hold the actual multi-row
# header (period type, year, quarter) rather than data.
# NOTE(review): this frame is read from the sheet named 'Table 3', but its
# title reads "Table 1. ..." - confirm that this is the intended sheet.
df_usGDP3.head()

Unnamed: 0,Table 1. Real Gross Domestic Product and Related Measures: Percent Change from Preceding Period,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
0,Line,,2017.0,2018.0,2019.0,Seasonally adjusted at annual rates,Seasonally adjusted at annual rates,Seasonally adjusted at annual rates,Seasonally adjusted at annual rates,Seasonally adjusted at annual rates,...,Seasonally adjusted at annual rates,Seasonally adjusted at annual rates,Seasonally adjusted at annual rates,Seasonally adjusted at annual rates,Seasonally adjusted at annual rates,Seasonally adjusted at annual rates,Seasonally adjusted at annual rates,Seasonally adjusted at annual rates,Seasonally adjusted at annual rates,
1,Line,,2017.0,2018.0,2019.0,2016,2017,2017,2017,2017,...,2018,2018,2019,2019,2019,2019,2020,2020,2020,
2,Line,,2017.0,2018.0,2019.0,Q4,Q1,Q2,Q3,Q4,...,Q3,Q4,Q1,Q2,Q3,Q4,Q1,Q2,Q3 r,
3,1,Gross domestic product (GDP),2.3,3.0,2.2,2.5,2.3,1.7,2.9,3.9,...,2.1,1.3,2.9,1.5,2.6,2.4,-5,-31.4,33.4,
4,2,Personal consumption expenditures,2.6,2.7,2.4,2.5,3.2,1.8,2.3,4.2,...,2.7,1.6,1.8,3.7,2.7,1.6,-6.9,-33.2,41,
