In [3]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit, cross_validate, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from xgboost import XGBRegressor

# Importação do Dataset

In [4]:
# import directly from GitHub
df = pd.read_csv("https://raw.githubusercontent.com/wcota/covid19br/master/cases-brazil-states.csv")
df.head()

Unnamed: 0,epi_week,date,country,state,city,newDeaths,deaths,newCases,totalCases,deathsMS,totalCasesMS,deaths_per_100k_inhabitants,totalCases_per_100k_inhabitants,deaths_by_totalCases,recovered,suspects,tests,tests_per_100k_inhabitants,vaccinated,vaccinated_per_100_inhabitants,vaccinated_second,vaccinated_second_per_100_inhabitants,vaccinated_single,vaccinated_single_per_100_inhabitants,vaccinated_third,vaccinated_third_per_100_inhabitants
0,9,2020-02-25,Brazil,SP,TOTAL,0,0,1,1,0,0,0.0,0.00218,0.0,,,,,,,,,,,,
1,9,2020-02-25,Brazil,TOTAL,TOTAL,0,0,1,1,0,0,0.0,0.00047,0.0,,,,,,,,,,,,
2,9,2020-02-26,Brazil,SP,TOTAL,0,0,0,1,0,1,0.0,0.00218,0.0,,,,,,,,,,,,
3,9,2020-02-26,Brazil,TOTAL,TOTAL,0,0,0,1,0,1,0.0,0.00047,0.0,,,,,,,,,,,,
4,9,2020-02-27,Brazil,SP,TOTAL,0,0,0,1,0,1,0.0,0.00218,0.0,,,,,,,,,,,,


# Definindo dataset com os totais diários para o Brasil

In [5]:
# Ajusta tipo da coluna 'date' para data
df['date'] = pd.to_datetime(df['date'])

In [6]:
# Define novo dataset com os totais diários para o BRASIL

df_total = df[(df['state'] == 'TOTAL')]
df_total.reset_index(drop=True, inplace=True)

In [7]:
df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 723 entries, 0 to 722
Data columns (total 26 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   epi_week                               723 non-null    int64         
 1   date                                   723 non-null    datetime64[ns]
 2   country                                723 non-null    object        
 3   state                                  723 non-null    object        
 4   city                                   723 non-null    object        
 5   newDeaths                              723 non-null    int64         
 6   deaths                                 723 non-null    int64         
 7   newCases                               723 non-null    int64         
 8   totalCases                             723 non-null    int64         
 9   deathsMS                               723 non-null    int64     

In [8]:
df_total.head()

Unnamed: 0,epi_week,date,country,state,city,newDeaths,deaths,newCases,totalCases,deathsMS,totalCasesMS,deaths_per_100k_inhabitants,totalCases_per_100k_inhabitants,deaths_by_totalCases,recovered,suspects,tests,tests_per_100k_inhabitants,vaccinated,vaccinated_per_100_inhabitants,vaccinated_second,vaccinated_second_per_100_inhabitants,vaccinated_single,vaccinated_single_per_100_inhabitants,vaccinated_third,vaccinated_third_per_100_inhabitants
0,9,2020-02-25,Brazil,TOTAL,TOTAL,0,0,1,1,0,0,0.0,0.00047,0.0,,,,,,,,,,,,
1,9,2020-02-26,Brazil,TOTAL,TOTAL,0,0,0,1,0,1,0.0,0.00047,0.0,,,,,,,,,,,,
2,9,2020-02-27,Brazil,TOTAL,TOTAL,0,0,0,1,0,1,0.0,0.00047,0.0,,,,,,,,,,,,
3,9,2020-02-28,Brazil,TOTAL,TOTAL,0,0,1,2,0,1,0.0,0.00094,0.0,,,,,,,,,,,,
4,9,2020-02-29,Brazil,TOTAL,TOTAL,0,0,0,2,0,2,0.0,0.00094,0.0,,,,,,,,,,,,


In [9]:
df_total.tail()

Unnamed: 0,epi_week,date,country,state,city,newDeaths,deaths,newCases,totalCases,deathsMS,totalCasesMS,deaths_per_100k_inhabitants,totalCases_per_100k_inhabitants,deaths_by_totalCases,recovered,suspects,tests,tests_per_100k_inhabitants,vaccinated,vaccinated_per_100_inhabitants,vaccinated_second,vaccinated_second_per_100_inhabitants,vaccinated_single,vaccinated_single_per_100_inhabitants,vaccinated_third,vaccinated_third_per_100_inhabitants
718,206,2022-02-12,Brazil,TOTAL,TOTAL,866,638366,133193,27439759,638048,27425743,299.25608,12863.33335,0.02326,22013249.0,8866327.0,67689970.0,31732.00787,169084658.0,79.26426,147161640.0,68.98709,5033112.0,2.35944,56347125.0,26.41466
719,207,2022-02-13,Brazil,TOTAL,TOTAL,342,638708,55628,27495387,638362,27479963,299.4164,12889.41089,0.02323,22013249.0,8866327.0,67689970.0,31732.00787,169181364.0,79.3096,147304984.0,69.05429,5033860.0,2.3598,56492158.0,26.48265
720,207,2022-02-14,Brazil,TOTAL,TOTAL,494,639202,64807,27560194,638835,27538503,299.64798,12919.79141,0.02319,22032729.0,8866327.0,67689970.0,31732.00787,169403339.0,79.41366,147465536.0,69.12956,5017453.0,2.3521,57031422.0,26.73545
721,207,2022-02-15,Brazil,TOTAL,TOTAL,874,640076,117274,27677468,639689,27659052,300.0577,12974.76764,0.02313,24773656.0,8866327.0,69640999.0,32646.62,169637258.0,79.52331,147547875.0,69.16815,5028165.0,2.35713,57563050.0,26.98466
722,207,2022-02-16,Brazil,TOTAL,TOTAL,0,640076,0,27677468,639689,27659052,300.0577,12974.76764,0.02313,24773656.0,8866327.0,69640999.0,32646.62,169749398.0,79.57588,147601582.0,69.19333,5028782.0,2.35741,57822761.0,27.10641


In [10]:
df_total.columns

Index(['epi_week', 'date', 'country', 'state', 'city', 'newDeaths', 'deaths',
       'newCases', 'totalCases', 'deathsMS', 'totalCasesMS',
       'deaths_per_100k_inhabitants', 'totalCases_per_100k_inhabitants',
       'deaths_by_totalCases', 'recovered', 'suspects', 'tests',
       'tests_per_100k_inhabitants', 'vaccinated',
       'vaccinated_per_100_inhabitants', 'vaccinated_second',
       'vaccinated_second_per_100_inhabitants', 'vaccinated_single',
       'vaccinated_single_per_100_inhabitants', 'vaccinated_third',
       'vaccinated_third_per_100_inhabitants'],
      dtype='object')

## Adição de novas features
**Features adicionadas:** newCasesMS, newDeathMS, day, month, year, day_of_week

In [11]:
# Definindo o número de casos e o número de mortes diários segundo o MS
df_total['newCasesMS'] = df_total['totalCasesMS'].diff()
df_total['newDeathsMS'] = df_total['deathsMS'].diff()

# Definindo o número de recuperados, suspeitos, testes, vacinados e vacinados pela segunda vez diários
df_total['newRecovered'] = df_total['recovered'].diff()
df_total['newSuspects'] = df_total['suspects'].diff()
df_total['newTests'] = df_total['tests'].diff()
df_total['newVaccinated'] = df_total['vaccinated'].diff()
df_total['newVaccinated_second'] = df_total['vaccinated_second'].diff()

# Ajustando o valor do primeiro elemento de cada nova coluna. A operação 'diff()' faz com que esses valaores sejam NAN.
df_total.loc[0,'newDeathsMS'] = 0
df_total.loc[0,'newCasesMS'] = 0

# Criando novas features baseadas na coluna 'date'
df_total['day'] = df_total['date'].dt.day
df_total['month'] = df_total['date'].dt.month
#df_total['year'] = df_total['date'].dt.year
df_total['day_of_week'] = df_total['date'].dt.dayofweek

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead


In [12]:
# Verificação da existência de valores negativos após o cálculo da primeira diferença.
# Isso pode ocorrer devido aos erros de preenchimento do dataset
features_list_diff = ['newCasesMS', 'newDeathsMS', 'newRecovered', 'newSuspects', 'newTests', 'newVaccinated', 'newVaccinated_second']
neg_cols = []

for col in features_list_diff:
  neg_check = df_total[df_total[col] < 0][col].count()

  if neg_check > 0:
    neg_cols.append(col)
    print(col,'--->',neg_check)
print('\nLista de colunas com valores negativos: \n', neg_cols)

newCasesMS ---> 1
newSuspects ---> 39
newTests ---> 1
newVaccinated ---> 1
newVaccinated_second ---> 1

Lista de colunas com valores negativos: 
 ['newCasesMS', 'newSuspects', 'newTests', 'newVaccinated', 'newVaccinated_second']


In [13]:
# Substituindo registros negativos pela mediana

print('Valores substituídos e mediana das colunas: \n')

for col in neg_cols:
  median_col = df_total[df_total[col] > 0][col].median()

  subs_list = df_total[df_total[col] < 0][col].to_list()
  print(col, '---> ', subs_list, '---> mediana: ', median_col)

  df_total[col] = df_total[col].replace(to_replace=subs_list, value=median_col)

Valores substituídos e mediana das colunas: 

newCasesMS --->  [-573.0] ---> mediana:  30434.0
newSuspects --->  [-903.0, -404.0, -2896.0, -406.0, -1836.0, -14266.0, -19096.0, -2963.0, -2584.0, -16823.0, -1847.0, -2293.0, -4141.0, -1319.0, -17762.0, -642.0, -2333.0, -21240.0, -37942.0, -3247.0, -1009.0, -38306.0, -5567.0, -274.0, -1381.0, -268.0, -4330.0, -1283.0, -3125.0, -409.0, -1423.0, -547.0, -1565.0, -76.0, -74376.0, -600.0, -7424.0, -5135.0, -2625.0] ---> mediana:  7370.5
newTests --->  [-92358.0] ---> mediana:  66592.0
newVaccinated --->  [-158674.0] ---> mediana:  289378.5
newVaccinated_second --->  [-203052.0] ---> mediana:  256683.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [14]:
# Definindo coluna 'date' como index
df_total.set_index(df_total['date'], inplace=True)

## Removendo colunas e reordenando as demais

In [15]:
# Removendo colunas não desejadas
df_total = df_total.drop(['epi_week', 'country', 'state', 'city', 'date'], axis=1)

In [16]:
df_total.columns

Index(['newDeaths', 'deaths', 'newCases', 'totalCases', 'deathsMS',
       'totalCasesMS', 'deaths_per_100k_inhabitants',
       'totalCases_per_100k_inhabitants', 'deaths_by_totalCases', 'recovered',
       'suspects', 'tests', 'tests_per_100k_inhabitants', 'vaccinated',
       'vaccinated_per_100_inhabitants', 'vaccinated_second',
       'vaccinated_second_per_100_inhabitants', 'vaccinated_single',
       'vaccinated_single_per_100_inhabitants', 'vaccinated_third',
       'vaccinated_third_per_100_inhabitants', 'newCasesMS', 'newDeathsMS',
       'newRecovered', 'newSuspects', 'newTests', 'newVaccinated',
       'newVaccinated_second', 'day', 'month', 'day_of_week'],
      dtype='object')

In [17]:
# Reordenando as colunas
new_order = [ 'month', 'day', 'day_of_week', 'newDeaths', 'deaths', 'newCases', 'totalCases', 'newDeathsMS', 'deathsMS',
              'newCasesMS', 'totalCasesMS', 'deaths_per_100k_inhabitants',
              'totalCases_per_100k_inhabitants', 'deaths_by_totalCases', 'newRecovered', 'recovered', 'newSuspects', 
              'suspects', 'newTests', 'tests', 'tests_per_100k_inhabitants', 'newVaccinated', 'vaccinated',
              'vaccinated_per_100_inhabitants', 'newVaccinated_second', 'vaccinated_second',
              'vaccinated_second_per_100_inhabitants', 'vaccinated_single',
              'vaccinated_single_per_100_inhabitants', 'vaccinated_third',
              'vaccinated_third_per_100_inhabitants']

df_total = df_total[new_order]

In [18]:
df_total.head()

Unnamed: 0_level_0,month,day,day_of_week,newDeaths,deaths,newCases,totalCases,newDeathsMS,deathsMS,newCasesMS,totalCasesMS,deaths_per_100k_inhabitants,totalCases_per_100k_inhabitants,deaths_by_totalCases,newRecovered,recovered,newSuspects,suspects,newTests,tests,tests_per_100k_inhabitants,newVaccinated,vaccinated,vaccinated_per_100_inhabitants,newVaccinated_second,vaccinated_second,vaccinated_second_per_100_inhabitants,vaccinated_single,vaccinated_single_per_100_inhabitants,vaccinated_third,vaccinated_third_per_100_inhabitants
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2020-02-25,2,25,1,0,0,1,1,0.0,0,0.0,0,0.0,0.00047,0.0,,,,,,,,,,,,,,,,,
2020-02-26,2,26,2,0,0,0,1,0.0,0,1.0,1,0.0,0.00047,0.0,,,,,,,,,,,,,,,,,
2020-02-27,2,27,3,0,0,0,1,0.0,0,0.0,1,0.0,0.00047,0.0,,,,,,,,,,,,,,,,,
2020-02-28,2,28,4,0,0,1,2,0.0,0,0.0,1,0.0,0.00094,0.0,,,,,,,,,,,,,,,,,
2020-02-29,2,29,5,0,0,0,2,0.0,0,1.0,2,0.0,0.00094,0.0,,,,,,,,,,,,,,,,,


In [19]:
df_total.tail()

Unnamed: 0_level_0,month,day,day_of_week,newDeaths,deaths,newCases,totalCases,newDeathsMS,deathsMS,newCasesMS,totalCasesMS,deaths_per_100k_inhabitants,totalCases_per_100k_inhabitants,deaths_by_totalCases,newRecovered,recovered,newSuspects,suspects,newTests,tests,tests_per_100k_inhabitants,newVaccinated,vaccinated,vaccinated_per_100_inhabitants,newVaccinated_second,vaccinated_second,vaccinated_second_per_100_inhabitants,vaccinated_single,vaccinated_single_per_100_inhabitants,vaccinated_third,vaccinated_third_per_100_inhabitants
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2022-02-12,2,12,5,866,638366,133193,27439759,896.0,638048,140234.0,27425743,299.25608,12863.33335,0.02326,0.0,22013249.0,0.0,8866327.0,0.0,67689970.0,31732.00787,216090.0,169084658.0,79.26426,101896.0,147161640.0,68.98709,5033112.0,2.35944,56347125.0,26.41466
2022-02-13,2,13,6,342,638708,55628,27495387,314.0,638362,54220.0,27479963,299.4164,12889.41089,0.02323,0.0,22013249.0,0.0,8866327.0,0.0,67689970.0,31732.00787,96706.0,169181364.0,79.3096,143344.0,147304984.0,69.05429,5033860.0,2.3598,56492158.0,26.48265
2022-02-14,2,14,0,494,639202,64807,27560194,473.0,638835,58540.0,27538503,299.64798,12919.79141,0.02319,19480.0,22032729.0,0.0,8866327.0,0.0,67689970.0,31732.00787,221975.0,169403339.0,79.41366,160552.0,147465536.0,69.12956,5017453.0,2.3521,57031422.0,26.73545
2022-02-15,2,15,1,874,640076,117274,27677468,854.0,639689,120549.0,27659052,300.0577,12974.76764,0.02313,2740927.0,24773656.0,0.0,8866327.0,1951029.0,69640999.0,32646.62,233919.0,169637258.0,79.52331,82339.0,147547875.0,69.16815,5028165.0,2.35713,57563050.0,26.98466
2022-02-16,2,16,2,0,640076,0,27677468,0.0,639689,0.0,27659052,300.0577,12974.76764,0.02313,0.0,24773656.0,0.0,8866327.0,0.0,69640999.0,32646.62,112140.0,169749398.0,79.57588,53707.0,147601582.0,69.19333,5028782.0,2.35741,57822761.0,27.10641


In [20]:
# from pandas_profiling import ProfileReport
# relatorio_completo = ProfileReport(df_total, title='Relatorio Completo')
# relatorio_completo.to_file('relatorio_completo.html')

## Definição dos Conjuntos de Treino e Teste

Divisão por períodos:
* Treino: 2021-01-01 a 2022-01-31
* Teste: 2022-02-01 a 2022-02-10

In [21]:
df_train = df_total.loc['2021-03-15':'2022-01-31', df_total.columns]
df_train = df_train.sort_index() # Reordena o dataset através do index depois de ter feito a filtragem

df_test = df_total.loc['2022-02-01':'2022-02-10', df_total.columns]
df_test = df_test.sort_index()

In [22]:
print('Dimensões do conjunto de treino: ', df_train.shape)
print('Dimensões do conjunto de teste: ', df_test.shape)

Dimensões do conjunto de treino:  (323, 31)
Dimensões do conjunto de teste:  (10, 31)


## Correlações

In [23]:
round(df_train.corr(), 2)

Unnamed: 0,month,day,day_of_week,newDeaths,deaths,newCases,totalCases,newDeathsMS,deathsMS,newCasesMS,totalCasesMS,deaths_per_100k_inhabitants,totalCases_per_100k_inhabitants,deaths_by_totalCases,newRecovered,recovered,newSuspects,suspects,newTests,tests,tests_per_100k_inhabitants,newVaccinated,vaccinated,vaccinated_per_100_inhabitants,newVaccinated_second,vaccinated_second,vaccinated_second_per_100_inhabitants,vaccinated_single,vaccinated_single_per_100_inhabitants,vaccinated_third,vaccinated_third_per_100_inhabitants
month,1.0,-0.05,0.0,-0.5,0.53,-0.73,0.46,-0.48,0.53,-0.71,0.46,0.53,0.46,0.58,-0.2,0.52,-0.07,0.43,-0.02,0.47,0.47,-0.12,0.52,0.52,0.43,0.37,0.37,0.13,0.13,-0.62,-0.62
day,-0.05,1.0,-0.02,0.02,-0.01,0.15,0.02,0.01,-0.01,0.15,0.02,-0.01,0.02,-0.16,-0.02,0.0,-0.02,0.0,-0.02,0.01,0.01,0.02,0.01,0.01,0.02,0.03,0.03,0.05,0.05,0.23,0.23
day_of_week,0.0,-0.02,1.0,-0.07,0.02,-0.02,0.02,-0.03,0.02,0.02,0.02,0.02,0.02,0.02,-0.12,0.02,-0.02,0.02,-0.04,0.02,0.02,-0.28,0.01,0.01,-0.26,0.01,0.01,-0.02,-0.02,-0.0,-0.0
newDeaths,-0.5,0.02,-0.07,1.0,-0.86,0.5,-0.86,0.99,-0.86,0.5,-0.86,-0.86,-0.86,-0.39,0.39,-0.87,0.07,-0.85,0.08,-0.85,-0.85,0.44,-0.87,-0.87,-0.25,-0.79,-0.79,-0.71,-0.71,-0.33,-0.33
deaths,0.53,-0.01,0.02,-0.86,1.0,-0.34,0.99,-0.84,1.0,-0.34,0.99,1.0,0.99,0.56,-0.34,0.99,-0.06,0.98,-0.09,0.96,0.96,-0.23,0.96,0.96,0.37,0.83,0.83,0.83,0.83,0.9,0.9
newCases,-0.73,0.15,-0.02,0.5,-0.34,1.0,-0.25,0.5,-0.35,0.99,-0.25,-0.34,-0.25,-0.54,0.19,-0.34,0.12,-0.25,0.01,-0.29,-0.29,0.2,-0.35,-0.35,-0.33,-0.22,-0.22,-0.08,-0.08,0.64,0.64
totalCases,0.46,0.02,0.02,-0.86,0.99,-0.25,1.0,-0.84,0.99,-0.25,1.0,0.99,1.0,0.43,-0.34,0.99,-0.06,0.99,-0.09,0.97,0.97,-0.27,0.97,0.97,0.34,0.87,0.87,0.74,0.74,0.93,0.93
newDeathsMS,-0.48,0.01,-0.03,0.99,-0.84,0.5,-0.84,1.0,-0.84,0.51,-0.84,-0.84,-0.84,-0.38,0.38,-0.86,0.08,-0.84,0.09,-0.83,-0.83,0.42,-0.86,-0.86,-0.26,-0.78,-0.78,-0.71,-0.71,-0.34,-0.34
deathsMS,0.53,-0.01,0.02,-0.86,1.0,-0.35,0.99,-0.84,1.0,-0.34,0.99,1.0,0.99,0.56,-0.34,0.99,-0.06,0.98,-0.09,0.96,0.96,-0.23,0.96,0.96,0.37,0.83,0.83,0.83,0.83,0.9,0.9
newCasesMS,-0.71,0.15,0.02,0.5,-0.34,0.99,-0.25,0.51,-0.34,1.0,-0.25,-0.34,-0.25,-0.53,0.18,-0.34,0.12,-0.25,0.01,-0.29,-0.29,0.19,-0.35,-0.35,-0.33,-0.22,-0.22,-0.09,-0.09,0.63,0.63


# Preparação dos dados para o Modelo de Machine Learning

## Seleção de Features e definição do Target

In [24]:
# Target
target_label = 'newDeathsMS' #'recovered' #'newRecovered' #'newCasesMS'

y_train = df_train[[target_label]]
y_test = df_test[[target_label]]

# Features
'''
features_names = ['month', 'day', 'day_of_week', 'newDeathsMS','newCasesMS', 'deaths_per_100k_inhabitants',
                  'totalCases_per_100k_inhabitants', 'deaths_by_totalCases', 'newRecovered', 'newSuspects', 'newTests', 
                  'tests_per_100k_inhabitants', 'newVaccinated', 'vaccinated_per_100_inhabitants', 'newVaccinated_second',
                  'vaccinated_second_per_100_inhabitants']
'''
features_names = ['day', 'month', 'day_of_week', 'newDeathsMS', 'newCasesMS', 'suspects', 'newSuspects', 'recovered', 'suspects', 'newRecovered',
                  'tests', 'vaccinated', 'vaccinated_second'] #, 'vaccinated_single', 'vaccinated_third']

features_names.remove(target_label)

X_train = df_train[features_names]
X_test = df_test[features_names]

In [25]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 323 entries, 2021-03-15 to 2022-01-31
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   day                323 non-null    int64  
 1   month              323 non-null    int64  
 2   day_of_week        323 non-null    int64  
 3   newCasesMS         323 non-null    float64
 4   suspects           323 non-null    float64
 5   newSuspects        323 non-null    float64
 6   recovered          323 non-null    float64
 7   suspects           323 non-null    float64
 8   newRecovered       323 non-null    float64
 9   tests              323 non-null    float64
 10  vaccinated         323 non-null    float64
 11  vaccinated_second  323 non-null    float64
dtypes: float64(9), int64(3)
memory usage: 32.8 KB


In [26]:
# Descrição das features selecionadas
pd.options.display.float_format = '{:.2f}'.format

X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
day,323.0,16.18,8.83,1.0,9.0,16.0,24.0,31.0
month,323.0,7.07,3.29,1.0,5.0,7.0,10.0,12.0
day_of_week,323.0,2.99,2.01,0.0,1.0,3.0,5.0,6.0
newCasesMS,323.0,43264.34,40957.68,1419.0,12949.5,31359.0,69385.0,269968.0
suspects,323.0,7318628.67,996471.8,5115676.0,6655234.0,7958935.0,7958935.0,8866327.0
newSuspects,323.0,11611.92,69558.47,0.0,0.0,0.0,0.0,684678.0
recovered,323.0,17669988.9,3452392.75,10196596.0,14846553.5,18910110.0,20747506.0,21651890.0
suspects,323.0,7318628.67,996471.8,5115676.0,6655234.0,7958935.0,7958935.0,8866327.0
newRecovered,323.0,35638.76,49979.85,0.0,0.0,16009.0,53421.0,357041.0
tests,323.0,56026267.93,8351296.88,37907905.0,49811611.0,56636459.0,64755932.0,66606202.0


In [27]:
# Descrição da variável alvo
y_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
newDeathsMS,323.0,1080.21,1025.19,24.0,250.5,731.0,1656.5,4249.0


In [28]:
# Pipeline para preparação das variáveis numéricas

numeric_pipeline = Pipeline(steps=[
                                   ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
                                   #('normalization', MinMaxScaler())
])

In [29]:
# Ajustando variáveis numéricas usando o pipeline

X_train_prepared = numeric_pipeline.fit_transform(X_train)
y_train_prepared = numeric_pipeline.fit_transform(y_train)

X_test_prepared = numeric_pipeline.fit_transform(X_test)
y_test_prepared = numeric_pipeline.fit_transform(y_test)

# Implementação do XGBoost

### Definição da seed e Instanciando o XGBoost

In [30]:
seed = 1275

xgb = XGBRegressor(
                    booster='gbtree',
                    objective='reg:squarederror', max_depth=15,
                    learning_rate=0.1, n_estimators=100,
                    random_state=seed, n_jobs=-1
)

### Time Series cross-validator

In [31]:
tscv = TimeSeriesSplit(gap=0, max_train_size=None, n_splits=10, test_size=None)
print(tscv)

'''
>> n_splits: int, default=5
Number of splits. Must be at least 2.

>> max_train_size: int, default=None
Maximum size for a single training set.

>> test_size: int, default=None
Used to limit the size of the test set. Defaults to n_samples // (n_splits + 1), which is the maximum allowed value with gap=0.

>> gap: int, default=0
Number of samples to exclude from the end of each train set before the test set.
'''

TimeSeriesSplit(gap=0, max_train_size=None, n_splits=10, test_size=None)


'\n>> n_splits: int, default=5\nNumber of splits. Must be at least 2.\n\n>> max_train_size: int, default=None\nMaximum size for a single training set.\n\n>> test_size: int, default=None\nUsed to limit the size of the test set. Defaults to n_samples // (n_splits + 1), which is the maximum allowed value with gap=0.\n\n>> gap: int, default=0\nNumber of samples to exclude from the end of each train set before the test set.\n'

### Avaliação do Modelo antes da busca pelos melhores parâmetros

In [32]:
# Avaliação do modelo antes da busca de parâmetros

def evaluate(model, X, y, cv):
    cv_results = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error", "neg_mean_squared_error", "neg_mean_absolute_percentage_error"],
    )
    mae = -cv_results["test_neg_mean_absolute_error"]
    mape = -cv_results['test_neg_mean_absolute_percentage_error']
    mse = -cv_results['test_neg_mean_squared_error']
    rmse = -cv_results["test_neg_root_mean_squared_error"]
    
    print(
        f"Mean Absolute Error (MAE):     {mae.mean():.3f} +/- {mae.std():.3f}\n"
        f"Mean Absolute Percentage Error (MAPE): {mape.mean():.3f} +/- {mape.std():.3f}\n"
        f"Mean Squared Error (MSE): {mse.mean():.3f} +/- {mse.std():.3f}\n"
        f"Root Mean Squared Error (RMSE): {rmse.mean():.3f} +/- {rmse.std():.3f}"
    )
    #print(cv_results.keys())

evaluate(xgb, X_train_prepared, y_train_prepared, cv=tscv)

Mean Absolute Error (MAE):     294.870 +/- 310.485
Mean Absolute Percentage Error (MAPE): 0.857 +/- 1.628
Mean Squared Error (MSE): 237916.866 +/- 435700.442
Root Mean Squared Error (RMSE): 360.005 +/- 329.110


### Busca dos Melhores Parâmetros

### Busca inicial com o RandomizedSearch

In [33]:
param_distributions = [
    {
      'n_estimators': [50, 75, 85, 100, 125, 150, 200, 250, 300, 350], 
      'learning_rate':[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
      'max_depth':[3, 4, 5, 6, 7, 8],
      'booster':['gbtree', 'gblinear'],
      'objective':['reg:squarederror'], #,'reg:logistic'],
      'gamma':[0.1, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5]
    }
  ]

In [34]:
rnd_search = RandomizedSearchCV(estimator=xgb, 
                                param_distributions = param_distributions, 
                                n_iter=50, scoring= 'neg_root_mean_squared_error', #'neg_mean_squared_error', #'neg_root_mean_squared_error', #'neg_mean_absolute_percentage_error'
                                n_jobs=-1, cv=tscv, random_state=seed
                              )
rnd_search.fit(X_train_prepared, y_train_prepared)

RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=10, test_size=None),
                   estimator=XGBRegressor(max_depth=15, n_jobs=-1,
                                          objective='reg:squarederror',
                                          random_state=1275),
                   n_iter=50, n_jobs=-1,
                   param_distributions=[{'booster': ['gbtree', 'gblinear'],
                                         'gamma': [0.1, 0.5, 1.0, 1.5, 2.0, 2.5,
                                                   3.0, 3.5],
                                         'learning_rate': [0.1, 0.2, 0.3, 0.4,
                                                           0.5, 0.6, 0.7, 0.8],
                                         'max_depth': [3, 4, 5, 6, 7, 8],
                                         'n_estimators': [50, 75, 85, 100, 125,
                                                          150, 200, 250, 300,
                                                    

In [35]:
print('Melhores parâmetros (RamdomizedSearch): ', rnd_search.best_params_)

Melhores parâmetros (RamdomizedSearch):  {'objective': 'reg:squarederror', 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.2, 'gamma': 2.5, 'booster': 'gbtree'}


### Avaliação do Modelo após a Busca pelos Melhores Parâmetros

In [36]:
# Avaliação do modelo após a busca por parâmetros com o RandomizedSearch

xgb = XGBRegressor(**rnd_search.best_params_, random_state=seed)
print(xgb.get_params)
print('\n')

evaluate(xgb, X_train_prepared, y_train_prepared, cv=tscv)

<bound method XGBModel.get_params of XGBRegressor(gamma=2.5, learning_rate=0.2, objective='reg:squarederror',
             random_state=1275)>


Mean Absolute Error (MAE):     270.726 +/- 233.771
Mean Absolute Percentage Error (MAPE): 0.755 +/- 1.228
Mean Squared Error (MSE): 173774.926 +/- 266396.294
Root Mean Squared Error (RMSE): 330.661 +/- 253.846


## Testando o modelo

In [37]:
xgb = XGBRegressor(**rnd_search.best_params_, random_state=seed)
xgb.fit(X_train_prepared,y_train_prepared)
y_predicted = xgb.predict(X_test_prepared)

y_predicted

array([589.5828 , 443.15686, 582.0339 , 548.1528 , 516.12213, 196.09857,
       265.78864, 607.28156, 120.56733, 628.2014 ], dtype=float32)

In [50]:
# Comparação dos resultados

print('(Teste, Previsão) --- Previsão-Teste')

for pair in zip(np.reshape(y_test_prepared, len(y_test_prepared)), np.round(y_predicted,0)):
  print(pair, '---', pair[1]-pair[0])

(Teste, Previsão) --- Previsão-Teste
(929.0, 590.0) --- -339.0
(893.0, 443.0) --- -450.0
(1041.0, 582.0) --- -459.0
(493.0, 548.0) --- 55.0
(1308.0, 516.0) --- -792.0
(391.0, 196.0) --- -195.0
(428.0, 266.0) --- -162.0
(1189.0, 607.0) --- -582.0
(0.0, 121.0) --- 121.0
(2207.0, 628.0) --- -1579.0


In [47]:
# Métricas para o conjunto de testes
def test_metrics(y_pred, y_test):
  mae = mean_absolute_error(y_pred, y_test)
  mape = mean_absolute_percentage_error(y_pred, y_test)
  mse = mean_squared_error(y_pred, y_test)
  rmse = np.sqrt(mse)
      
  print(
        f"Mean Absolute Error (MAE):     {mae.mean():.3f} +/- {mae.std():.3f}\n"
        f"Mean Absolute Percentage Error (MAPE): {mape.mean():.3f} +/- {mape.std():.3f}\n"
        f"Mean Squared Error (MSE): {mse.mean():.3f} +/- {mse.std():.3f}\n"
        f"Root Mean Squared Error (RMSE): {rmse.mean():.3f} +/- {rmse.std():.3f}"
       )
  
test_metrics(y_predicted, y_test_prepared)

Mean Absolute Error (MAE):     473.345 +/- 0.000
Mean Absolute Percentage Error (MAPE): 1.009 +/- 0.000
Mean Squared Error (MSE): 406816.205 +/- 0.000
Root Mean Squared Error (RMSE): 637.821 +/- 0.000


### Refinamento da busca com o GridSearch

In [40]:
'''

xgb = XGBRegressor(**rnd_search.best_params_, random_state=seed)

param_grid = [
              {
                'n_estimators': [675, 677, 678, 680, 682, 684], 
                'learning_rate':[0.3, 0.4, 0.5],
                'max_depth':[3, 4, 5, 6, 7, 8],
                'booster':['gbtree', 'gblinear'],
                'objective':['reg:squarederror'],
                'gamma':[0.5, 1.0, 1.5, 2.0, 2.25, 2.5]
              }
]

grid_search = GridSearchCV( estimator=xgb, param_grid=param_grid, cv=tscv,
                            scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search.fit(X_train_prepared, y_train_prepared)

'''

"\n\nxgb = XGBRegressor(**rnd_search.best_params_, random_state=seed)\n\nparam_grid = [\n              {\n                'n_estimators': [675, 677, 678, 680, 682, 684], \n                'learning_rate':[0.3, 0.4, 0.5],\n                'max_depth':[3, 4, 5, 6, 7, 8],\n                'booster':['gbtree', 'gblinear'],\n                'objective':['reg:squarederror'],\n                'gamma':[0.5, 1.0, 1.5, 2.0, 2.25, 2.5]\n              }\n]\n\ngrid_search = GridSearchCV( estimator=xgb, param_grid=param_grid, cv=tscv,\n                            scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)\ngrid_search.fit(X_train_prepared, y_train_prepared)\n\n"

In [41]:
#print('Melhores parâmetros (GridSearch): ', grid_search.best_params_)

In [42]:
'''
# Avaliação do modelo após a busca por parâmetros com o GridSearch

xgb = XGBRegressor(**grid_search.best_params_, random_state=seed)
print(xgb.get_params)
print('\n')

evaluate(xgb, X_train_prepared, y_train_prepared, cv=tscv)
'''

"\n# Avaliação do modelo após a busca por parâmetros com o GridSearch\n\nxgb = XGBRegressor(**grid_search.best_params_, random_state=seed)\nprint(xgb.get_params)\nprint('\n')\n\nevaluate(xgb, X_train_prepared, y_train_prepared, cv=tscv)\n"

In [43]:
'''
cvres = rnd_search.cv_results_
for mean_score, params in sorted(zip(cvres["mean_test_score"], cvres["params"]), reverse=True, key=lambda v:v[0]):
    print(np.sqrt(-mean_score), params) 
'''

'\ncvres = rnd_search.cv_results_\nfor mean_score, params in sorted(zip(cvres["mean_test_score"], cvres["params"]), reverse=True, key=lambda v:v[0]):\n    print(np.sqrt(-mean_score), params) \n'