# 02 DATA ACQUISITION AND UNDERSTANDING

## Goals

Produce a clean, high-quality data set whose relationship to the target variables is understood. Locate the data set in the appropriate analytics environment so you are ready to model.
Develop a solution architecture of the data pipeline that refreshes and scores the data regularly.

## How to do it

There are three main tasks addressed in this stage:

1. Ingest the data into the target analytic environment.
2. Explore the data to determine if the data quality is adequate to answer the question.
3. Set up a data pipeline to score new or regularly refreshed data.

Our source: https://ourworldindata.org/covid-cases

# 0.0. Imports

In [1]:
# Data manipulation
import numpy as np
import pandas as pd
# Use the fully qualified option name: abbreviated keys are resolved by
# pattern matching and can become ambiguous if pandas adds similar options.
pd.set_option('display.max_colwidth', 200)

# Graphs
import matplotlib.pyplot as plt

import seaborn as sns
from darkstyle import dark_style as dks

# Statistics
from pandas_profiling import ProfileReport
from statsmodels.distributions.empirical_distribution import ECDF

ModuleNotFoundError: No module named 'darkstyle'

## 0.1. Helper Functions

In [None]:
# set some default figure paramenters and style
def settings():
    %matplotlibe inline
    dks.dark_style() # module for matplot darkstyle
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 8
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
settings()

In [None]:
def show_density(var_data):
    """Draw the density curve of ``var_data`` with its central tendencies.

    The mean, median and (first) mode are marked with dashed vertical
    lines, a legend is placed in the upper-right corner and the figure is
    shown immediately.

    ``var_data`` must provide ``.plot.density()``, ``.mean()``,
    ``.median()`` and ``.mode()`` (e.g. a pandas Series).
    """
    var_data.plot.density()
    plt.title('Data Density')

    # (legend label, line colour, statistic) for each marker, in drawing order
    central_tendencies = (
        ('mean', 'cyan', lambda series: series.mean()),
        ('median', 'red', lambda series: series.median()),
        ('mode', 'yellow', lambda series: series.mode()[0]),
    )
    for label, color, statistic in central_tendencies:
        plt.axvline(
            x=statistic(var_data),
            color=color,
            linestyle='dashed',
            linewidth=2,
            label=label,
        )

    plt.legend(loc='upper right')
    plt.show()

# 1.0. Data

## 1.1. Load the Data

In [None]:
# Locations of the main data file and of the file describing its columns
main_csv = 'dataset/owid-covid-data.csv'
describe_csv = 'dataset/describe.csv'

# Read both files into DataFrames
df = pd.read_csv(main_csv)
cols_describe = pd.read_csv(describe_csv)

In [None]:
# Preview the first five rows
df.head(n=5)

## 1.2. Knowing the Data

### 1.2.1. Data shape

In [None]:
# Report the (rows, columns) tuple of the raw dataset
print(f'The dataset shape is: {df.shape}')

### 1.2.2. Type and Structure

In [None]:
# Column dtypes plus non-null counts (show_counts forces the counts to be printed)
df.info(show_counts=True)

* Some features have null values;
* dtypes: float64(54), object(5);
* Total entries: 75558;
* Total of 59 columns;

### 1.2.3. Checking Missing Values

In [None]:
# Number of missing entries per column
df.isna().sum()

Some features have many null values; columns with more than 60% missing values will be dropped.

In [None]:
# Checking the missing-value rate of every column.
# `isna().mean()` yields the fraction of missing entries per column in one
# vectorized pass, replacing the manual count/length loop.
missing_rates = df.isna().mean()

# Keep the names of the columns with more than 60% missing values
columns_to_drop = missing_rates[missing_rates > 0.6].index.tolist()

columns_to_drop

In [None]:
# Drop the columns with too many missing values; `drop` returns a new
# frame, so `df` keeps all of its columns.
df2 = df.drop(columns=columns_to_drop)

## 1.6. Descriptive Statistics

### 1.6.1. Numerical Attributes

In [None]:
# Numerical attributes: every column that is not stored as object dtype
num_df = df2.select_dtypes(exclude=['object'])

# Base summary statistics, transposed so that each feature is one row
describe = num_df.describe().T

# Additional spread / shape metrics, appended in this order
extra_metrics = {
    'range': (num_df.max() - num_df.min()).tolist(),
    'unique val.': num_df.nunique(),
    'variation coefficient': np.round((num_df.std() / num_df.mean()), 4).tolist(),
    'skew': np.round(num_df.skew(), 4).tolist(),
    'kurtosis': np.round(num_df.kurtosis(), 4).tolist(),
}
for metric_name, metric_values in extra_metrics.items():
    describe[metric_name] = metric_values

describe

In [None]:
# Features whose absolute skewness is above 2.
# A boolean mask over the summary table replaces the positional
# `range(len(...))` / `iloc` loop; NaN skew values compare as False.
high_skewness = describe.index[describe['skew'].abs() > 2].tolist()
print(f'There\'s {len(high_skewness)} features with high skew:')
print(high_skewness)

In [None]:
# Features whose absolute kurtosis is above 3.
# A boolean mask over the summary table replaces the positional
# `range(len(...))` / `iloc` loop; NaN kurtosis values compare as False.
high_kurstosis = describe.index[describe['kurtosis'].abs() > 3].tolist()
print(f'There\'s {len(high_kurstosis)} features with high kurtosis:')
print(high_kurstosis)

In [None]:
# Features that contain at least one negative value, i.e. whose minimum is
# strictly below zero. The previous test `abs(min <= 0)` took the absolute
# value of a boolean and also reported features whose minimum is exactly 0,
# which contradicts the message printed below.
negative_values = describe.index[describe['min'] < 0].tolist()
print(f'There\'s {len(negative_values)} features with negative values:')
print(negative_values)

These features will be dealt with later.

### 1.6.2. Categorical Attributes

In [None]:
# Categorical attributes: select the object-dtype columns explicitly.
# Excluding only 'float64' would also pull in any integer, boolean or
# datetime column, and would not be the complement of the
# `exclude=['object']` selection used for the numerical attributes.
cat_df = df2.select_dtypes(include=['object'])

cat_df.describe().T

* There are 215 countries in the present dataset;
* 6 continents;

# 2.0. Feature Engineering and Hypothesis Creation

## 2.1. Feature Engineering

In [None]:
# Convert the `date` column to datetime values
df2['date'] = pd.to_datetime(df2['date'])

# Derive the calendar year of each observation
df2['year'] = df2['date'].dt.year

## 2.3. Hypothesis

1. Brazil is more prone to Covid19 than the USA.
2. Continents with a high vaccination rate are more effective at controlling Covid19.
3. Countries with a high population density are more prone to Covid19.
4. Countries with a high share of elderly people are more prone to Covid19.
5. Countries with a high gross domestic product are less prone to Covid19.
6. The USA has the most Covid19 deaths.
7. Europe performed the most Covid19 tests.

# 3.0. Exploratory Data Analysis

## 3.1. Univariate Analysis

### 3.1.1. Target Variable

Our target variable is `total_cases`.

In [None]:
# Density of the target on a log(1 + x) scale
show_density(np.log1p(df2['total_cases']))

### 3.1.2. Numerical variables

In [None]:
# All numerical features except the target variable
numerical = num_df.drop(columns='total_cases')

# One 25-bin histogram per remaining feature
numerical.hist(bins=25)
plt.tight_layout()
plt.show()

## 3.2. Bivariate Analysis

### 3.2.1. Total Cases

In [None]:
# Total cases over time, one line per continent.
# NOTE(review): lineplot aggregates rows that share an x value with its
# default estimator (the mean). Assuming one row per country per day, each
# line is the per-date average across a continent's countries, not the
# continent-wide sum - confirm this is the intended reading.
sns.lineplot(data=df2, x='date', y='total_cases', hue='continent', style='continent', ci=None)
# Title typo fixed: "wordwide" -> "worldwide"
plt.title('Covid19 situation update worldwide')
plt.ylabel('Total Cases')
plt.xlabel('Date')
plt.show()

In [None]:
# Keep only the rows whose continent is South America
south_america = df2[df2['continent'] == 'South America']

# Total cases over time, one line per iso code
ax = sns.lineplot(data=south_america, x='date', y='total_cases', hue='iso_code', style='iso_code', ci=None)
ax.set_ylabel('Total Cases')
ax.set_xlabel('Date')
ax.set_title('Covid19 pandemic in South America')
plt.show()

### 3.2.2. Total Deaths

In [None]:
# Total deaths over time, one line per continent.
# NOTE(review): lineplot aggregates rows that share an x value with its
# default estimator (the mean). Assuming one row per country per day, each
# line is the per-date average across a continent's countries, not the
# continent-wide sum - confirm this is the intended reading.
sns.lineplot(data=df2, x='date', y='total_deaths', hue='continent', style='continent', ci=None)
# Title typo fixed: "wordwide" -> "worldwide"
plt.title('Covid19 situation update worldwide - Total Deaths')
plt.ylabel('Total Deaths')
plt.xlabel('Date')
plt.show()

In [None]:
# Total deaths over time in South America, one line per iso code
ax = sns.lineplot(data=south_america, x='date', y='total_deaths', hue='iso_code', style='iso_code', ci=None)
ax.set_ylabel('Total Deaths')
ax.set_xlabel('Date')
ax.set_title('Covid19 pandemic in South America - Total Deaths')
plt.show()

In [None]:
# Keep only rows whose iso code does not contain the "OWID" marker
# (presumably aggregate / non-country entries - verify against the data).
# `na=False` keeps the mask boolean when an iso code is missing, so the
# negation no longer raises; `regex=False` matches the marker literally.
countries = df2[~df2['iso_code'].str.contains('OWID', regex=False, na=False)]

countries.iso_code.unique()

In [None]:
# Total cases and deaths per country.
# `total_cases` and `total_deaths` are cumulative running totals (per the
# OWID codebook), so summing them over every date counts the same cases
# many times. The maximum per country is the latest cumulative figure.
countries_cases = (
    countries
    .groupby('iso_code')[['total_cases', 'total_deaths']]
    .max()
    .reset_index()
)

In [None]:
# Side-by-side bars of total cases and total deaths for every iso code
countries_cases.plot.bar(x='iso_code', y=['total_cases', 'total_deaths'])