In [None]:
import pandas as pd
from joblib import dump, load
from datetime import datetime

## Load datasets

### Covid-19 cases


In [None]:
# Load the COVID-19 activity records from the local CSV copy.
df_covid = pd.read_csv(
    filepath_or_buffer='Datasets/covid_19_cases/COVID-19 Activity.csv',
)

Este dataset pode ser encontrado em: https://data.world/covid-19-data-resource-hub/covid-19-case-counts

O download do arquivo utilizado neste estudo foi realizado em: **TODO: informar a data do download**

### Demographic

In [None]:
# Read the demographic indicators CSV, decoding the file as Latin-1.
df_demographic = pd.read_csv(
    filepath_or_buffer='Datasets/demographic/demographic-2019.csv',
    encoding='latin-1',
)

Este dataset pode ser encontrado em: https://data.world/hdx/749ed4a9-6a89-4a3f-a4c8-b5359966a6e9
        
O download do arquivo utilizado neste estudo foi realizado em: **TODO: informar a data do download**

In [None]:
# Columns kept from the demographic table: the ISO code plus the 2019 indicators.
demographic_columns = [
    'ISO',
    'Total population in millions, 2019',
    'Total fertility rate, per woman, 2019',
    'Population aged 0-14, percent, 2019',
    'Population aged 10-24, percent, 2019',
    'Population aged 15-64, percent, 2019',
    'Population aged 65 and older, percent, 2019',
    'Life expectancy at birth, years, 2019',
]

# intersection() keeps only the listed labels that exist in the frame, so a
# label missing from the CSV is skipped instead of raising a KeyError.
df_demographic = df_demographic.loc[
    :, df_demographic.columns.intersection(demographic_columns)
]

In [None]:
# Replace the verbose source headers with short UPPER_SNAKE_CASE column names.
df_demographic = df_demographic.rename(
    columns={
        'Total population in millions, 2019': 'TOTAL_POPULATION',
        'Total fertility rate, per woman, 2019': 'FERTILITY_RATE',
        'Population aged 0-14, percent, 2019': 'PERCENT_POPULATION_AGED_0_14',
        'Population aged 10-24, percent, 2019': 'PERCENT_POPULATION_AGED_10_24',
        'Population aged 15-64, percent, 2019': 'PERCENT_POPULATION_AGED_15_64',
        'Population aged 65 and older, percent, 2019': 'PERCENT_POPULATION_AGED_65_OLDER',
        'Life expectancy at birth, years, 2019': 'LIFE_EXPECTANCY',
    },
)

### Clean and prepare Dataset

In [None]:
# Parse the REPORT_DATE strings (month/day/four-digit year) into datetimes.
df_covid['REPORT_DATE'] = pd.to_datetime(
    df_covid['REPORT_DATE'],
    format='%m/%d/%Y',
)

In [None]:
# Use the report date as the row index; the per-country day counters below read it.
df_covid = df_covid.set_index(keys='REPORT_DATE')

In [None]:
# Distinct country names, in order of first appearance.
countries = pd.unique(df_covid['COUNTRY_SHORT_NAME'])

In [None]:
 # Derive three per-country features from the REPORT_DATE index:
 #   DAYS_AFTER_FIRST_DEATH  - whole days since the first date with a new death
 #   DAYS_AFTER_FIRST_CASE   - whole days since the first date with a new case
 #   DAYS_BETWEEN_FIRST_CASE_AND_FIRST_DEATH - gap between those two dates
 # The two DAYS_AFTER_* columns are NaN on or before the reference day and
 # when the country never reported the event.

 # Row masks that do not depend on the country are computed once, up front.
 death_rows = (df_covid['PEOPLE_DEATH_NEW_COUNT'] > 0).to_numpy()
 case_rows = (df_covid['PEOPLE_POSITIVE_NEW_CASES_COUNT'] > 0).to_numpy()

 for country in countries:
     country_rows = (df_covid['COUNTRY_SHORT_NAME'] == country).to_numpy()
     if not country_rows.any():
         # A missing (NaN) country name never compares equal to itself,
         # so there are no rows to fill for it.
         continue
     country_dates = df_covid.index[country_rows]

     # Earliest report date with a positive count; NaT when there is none.
     day_0_for_death = df_covid.index[country_rows & death_rows].min()
     day_0_for_cases = df_covid.index[country_rows & case_rows].min()
     DAYS_BETWEEN_FIRST_CASE_AND_FIRST_DEATH = (day_0_for_death - day_0_for_cases).days

     for column, day_0 in (('DAYS_AFTER_FIRST_DEATH', day_0_for_death),
                           ('DAYS_AFTER_FIRST_CASE', day_0_for_cases)):
         # Vectorised date arithmetic replaces the row-by-row apply; values
         # that are not strictly positive (or are NaN) become NaN.
         elapsed = pd.Series((country_dates - day_0).days, dtype='float64')
         df_covid.loc[country_rows, column] = elapsed.where(elapsed > 0).to_numpy()
     df_covid.loc[country_rows, 'DAYS_BETWEEN_FIRST_CASE_AND_FIRST_DEATH'] = DAYS_BETWEEN_FIRST_CASE_AND_FIRST_DEATH

#### Final Dataset

In [None]:
# Left join: every COVID row is kept, and the demographic columns are NaN where
# no ISO code matches. Joining on columns does not carry the REPORT_DATE index over.
df = df_covid.merge(
    df_demographic,
    how='left',
    left_on='COUNTRY_ALPHA_2_CODE',
    right_on='ISO',
)

## Training Model

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor

### Pre-processing

In [None]:
# The two columns the model has to predict.
target_columns = [
    'PEOPLE_DEATH_NEW_COUNT',
    'PEOPLE_POSITIVE_NEW_CASES_COUNT',
]

# Taken before `df` is narrowed down to the feature columns.
df_target = df.loc[:, df.columns.intersection(target_columns)]

In [None]:
# Model inputs: the country code (one-hot encoded later), the day counters
# and the demographic indicators.
feature_columns = [
    'COUNTRY_ALPHA_2_CODE',
    'DAYS_BETWEEN_FIRST_CASE_AND_FIRST_DEATH',
    'DAYS_AFTER_FIRST_DEATH',
    'DAYS_AFTER_FIRST_CASE',
    'TOTAL_POPULATION',
    'FERTILITY_RATE',
    'PERCENT_POPULATION_AGED_10_24',
    'PERCENT_POPULATION_AGED_15_64',
    'PERCENT_POPULATION_AGED_65_OLDER',
    'LIFE_EXPECTANCY',
]

# Labels missing from `df` are skipped silently by intersection().
df = df.loc[:, df.columns.intersection(feature_columns)]

In [None]:
# Replace every missing value in the feature frame with 0.
df = df.fillna(value=0)

In [None]:
# Step 1: any string cell containing '-' becomes 0 (with regex=True the pattern
# matches anywhere in the cell). Step 2: commas are stripped from string cells.
# NOTE(review): presumably '-' marks a missing value and ',' is a thousands
# separator in the source CSV -- confirm. A negative number stored as text
# would also be turned into 0 by step 1.
df = (
    df
    .replace(to_replace='-', value=0, regex=True)
    .replace(to_replace=',', value='', regex=True)
)

In [None]:
# One indicator column per distinct country code.
df_dummies = pd.get_dummies(data=df['COUNTRY_ALPHA_2_CODE'])

In [None]:
# Append the country indicator columns to the right of the feature frame.
df = pd.concat(objs=[df, df_dummies], axis='columns')

In [None]:
# The raw country code is no longer needed once it has been one-hot encoded.
df = df.drop(columns=['COUNTRY_ALPHA_2_CODE'])

In [None]:
# Cast the listed feature columns to float; every column maps to the same dtype.
df = df.astype(dict.fromkeys(
    [
        'DAYS_AFTER_FIRST_DEATH',
        'TOTAL_POPULATION',
        'FERTILITY_RATE',
        'PERCENT_POPULATION_AGED_10_24',
        'PERCENT_POPULATION_AGED_15_64',
        'PERCENT_POPULATION_AGED_65_OLDER',
        'LIFE_EXPECTANCY',
    ],
    'float',
))

### Training

In [None]:
# Feature matrix and target frame used for training.
X = df
y = df_target

In [None]:
# Hold out 10% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=20)

# Hyper-parameters passed to GradientBoostingRegressor.
# 'loss' is deliberately left unset: the previous value 'ls' was deprecated in
# scikit-learn 1.0 and removed in 1.2 (renamed 'squared_error'), while the
# default loss is least squares on every version, so omitting the key keeps
# the same objective and runs on both old and new releases.
params = {'n_estimators': 700,
          'max_depth': 4,
          'min_samples_split': 5,
          'learning_rate': 0.01}

In [None]:
# MultiOutputRegressor fits one gradient boosting regressor per target column.
reg = MultiOutputRegressor(
    estimator=ensemble.GradientBoostingRegressor(**params),
)
reg.fit(X_train, y_train)

In [None]:
# Score on the training rows (R^2 for scikit-learn regressors).
reg.score(X=X_train, y=y_train)

In [None]:
# Score on the held-out rows, for comparison with the training score.
reg.score(X=X_test, y=y_test)

### Persist Model

In [None]:
# Serialise the fitted model to disk with joblib.
dump(value=reg, filename='gradient_boosting.joblib')

### Predict Brazil Data

In [None]:
# Rows to run the prediction on, read from a CSV in the working directory.
df_predict_br = pd.read_csv(filepath_or_buffer='brazil_to_predict.csv')

In [None]:
# Preview the first five rows of the prediction input.
df_predict_br.head(n=5)

In [None]:
# REPORT_DATE is not one of the model's feature columns, so it is removed
# before the frame is passed to predict(). The source frame is `df_predict_br`;
# the name used here before, `df_teste_br`, is never defined in this notebook.
df_predict_br = df_predict_br.drop(columns=['REPORT_DATE'])

In [None]:
# Reload the persisted model. joblib files are pickle-based, so only load
# files that come from a trusted source.
gradient_boosting = load(filename='gradient_boosting.joblib')

In [None]:
# Predict the targets for the Brazil rows.
# NOTE(review): the columns of df_predict_br must match the training features
# (including the one-hot country columns) -- confirm against brazil_to_predict.csv.
predicted = gradient_boosting.predict(X=df_predict_br)