# Série temporal do Covid19 em Portugal

In [120]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit, cross_validate, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from xgboost import XGBRegressor

In [2]:
# importando  Dataset
df_pt = pd.read_csv("https://raw.githubusercontent.com/dssg-pt/covid19pt-data/master/data.csv")

In [3]:
# Redefinindo o tipo do campo de data
df_pt['data'] = pd.to_datetime(df_pt['data'])
df_pt['data_dados'] = pd.to_datetime(df_pt['data_dados'])

In [4]:
df_pt.head()

Unnamed: 0,data,data_dados,confirmados,confirmados_arsnorte,confirmados_arscentro,confirmados_arslvt,confirmados_arsalentejo,confirmados_arsalgarve,confirmados_acores,confirmados_madeira,confirmados_estrangeiro,confirmados_novos,recuperados,obitos,internados,internados_uci,lab,suspeitos,vigilancia,n_confirmados,cadeias_transmissao,transmissao_importada,confirmados_0_9_f,confirmados_0_9_m,confirmados_10_19_f,confirmados_10_19_m,confirmados_20_29_f,confirmados_20_29_m,confirmados_30_39_f,confirmados_30_39_m,confirmados_40_49_f,confirmados_40_49_m,confirmados_50_59_f,confirmados_50_59_m,confirmados_60_69_f,confirmados_60_69_m,confirmados_70_79_f,confirmados_70_79_m,confirmados_80_plus_f,confirmados_80_plus_m,...,obitos_acores,obitos_madeira,obitos_estrangeiro,recuperados_arsnorte,recuperados_arscentro,recuperados_arslvt,recuperados_arsalentejo,recuperados_arsalgarve,recuperados_acores,recuperados_madeira,recuperados_estrangeiro,obitos_0_9_f,obitos_0_9_m,obitos_10_19_f,obitos_10_19_m,obitos_20_29_f,obitos_20_29_m,obitos_30_39_f,obitos_30_39_m,obitos_40_49_f,obitos_40_49_m,obitos_50_59_f,obitos_50_59_m,obitos_60_69_f,obitos_60_69_m,obitos_70_79_f,obitos_70_79_m,obitos_80_plus_f,obitos_80_plus_m,obitos_f,obitos_m,confirmados_desconhecidos_m,confirmados_desconhecidos_f,ativos,internados_enfermaria,confirmados_desconhecidos,incidencia_nacional,incidencia_continente,rt_nacional,rt_continente
0,2020-02-26,2020-02-26,0,0,0,0,0,0,0,0,,0,0,0,,,,25.0,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2020-02-27,2020-02-27,0,0,0,0,0,0,0,0,,0,0,0,,,,51.0,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2020-02-28,2020-02-28,0,0,0,0,0,0,0,0,,0,0,0,,,,59.0,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2020-02-29,2020-02-29,0,0,0,0,0,0,0,0,,0,0,0,,,,70.0,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2020-01-03,2020-01-03,0,0,0,0,0,0,0,0,,0,0,0,,,,85.0,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [5]:
df_pt.tail(15)

Unnamed: 0,data,data_dados,confirmados,confirmados_arsnorte,confirmados_arscentro,confirmados_arslvt,confirmados_arsalentejo,confirmados_arsalgarve,confirmados_acores,confirmados_madeira,confirmados_estrangeiro,confirmados_novos,recuperados,obitos,internados,internados_uci,lab,suspeitos,vigilancia,n_confirmados,cadeias_transmissao,transmissao_importada,confirmados_0_9_f,confirmados_0_9_m,confirmados_10_19_f,confirmados_10_19_m,confirmados_20_29_f,confirmados_20_29_m,confirmados_30_39_f,confirmados_30_39_m,confirmados_40_49_f,confirmados_40_49_m,confirmados_50_59_f,confirmados_50_59_m,confirmados_60_69_f,confirmados_60_69_m,confirmados_70_79_f,confirmados_70_79_m,confirmados_80_plus_f,confirmados_80_plus_m,...,obitos_acores,obitos_madeira,obitos_estrangeiro,recuperados_arsnorte,recuperados_arscentro,recuperados_arslvt,recuperados_arsalentejo,recuperados_arsalgarve,recuperados_acores,recuperados_madeira,recuperados_estrangeiro,obitos_0_9_f,obitos_0_9_m,obitos_10_19_f,obitos_10_19_m,obitos_20_29_f,obitos_20_29_m,obitos_30_39_f,obitos_30_39_m,obitos_40_49_f,obitos_40_49_m,obitos_50_59_f,obitos_50_59_m,obitos_60_69_f,obitos_60_69_m,obitos_70_79_f,obitos_70_79_m,obitos_80_plus_f,obitos_80_plus_m,obitos_f,obitos_m,confirmados_desconhecidos_m,confirmados_desconhecidos_f,ativos,internados_enfermaria,confirmados_desconhecidos,incidencia_nacional,incidencia_continente,rt_nacional,rt_continente
708,2022-03-02,2022-03-02,2795830,1081148,399195,1010845,93361,107779,37465,66037,,50447,2133640,20077,2440.0,155.0,,,653062.0,,,,138466.0,144948.0,178615.0,180540.0,223655.0,210641.0,239648.0,204353.0,269803.0,219382.0,187898.0,155401.0,110304.0,97524.0,64365.0,56700.0,72511.0,38524.0,...,64,167,,,,,,,,,,1.0,2.0,2.0,1.0,8.0,10.0,22.0,32.0,82.0,123.0,179.0,415.0,568.0,1275.0,1646.0,2706.0,7001.0,6004.0,9509.0,10568.0,,,642113.0,2285.0,2552.0,7081.7,7111.8,1.09,1.1
709,2022-04-02,2022-04-02,2843029,1099251,408393,1024826,95313,109979,38656,66611,,47199,2180109,20127,2445.0,174.0,,,660347.0,,,,141461.0,148026.0,182564.0,184572.0,227277.0,213778.0,244102.0,207879.0,274544.0,223037.0,190289.0,157201.0,111792.0,98697.0,65373.0,57481.0,73366.0,38993.0,...,66,170,,,,,,,,,,1.0,2.0,2.0,1.0,8.0,10.0,22.0,32.0,83.0,123.0,180.0,417.0,571.0,1278.0,1648.0,2713.0,7018.0,6018.0,9533.0,10594.0,,,642793.0,2271.0,2597.0,7163.7,7207.0,1.05,1.05
710,2022-05-02,2022-05-02,2884540,1115235,416658,1036682,97119,111877,39753,67216,,41511,2226548,20171,2409.0,169.0,,,665706.0,,,,143978.0,150638.0,186165.0,188207.0,230350.0,216556.0,247909.0,210867.0,278686.0,226131.0,192513.0,158849.0,113158.0,99782.0,66199.0,58264.0,74198.0,39478.0,...,68,170,,,,,,,,,,1.0,2.0,2.0,1.0,8.0,10.0,22.0,32.0,83.0,125.0,180.0,419.0,571.0,1279.0,1653.0,2720.0,7038.0,6025.0,9558.0,10613.0,,,637821.0,2240.0,2612.0,7163.7,7207.0,1.05,1.05
711,2022-06-02,2022-06-02,2915971,1126462,422504,1046521,98358,113514,40966,67646,,31431,2266939,20222,2511.0,180.0,,,664442.0,,,,146066.0,152863.0,188983.0,191057.0,232724.0,218508.0,250783.0,213173.0,281765.0,228488.0,194087.0,160072.0,114107.0,100528.0,66818.0,58795.0,74742.0,39788.0,...,70,170,,,,,,,,,,1.0,2.0,2.0,1.0,8.0,10.0,22.0,32.0,84.0,127.0,181.0,419.0,572.0,1281.0,1659.0,2728.0,7061.0,6032.0,9590.0,10632.0,,,628810.0,2331.0,2624.0,7163.7,7207.0,1.05,1.05
712,2022-07-02,2022-07-02,2932990,1134070,424773,1050847,98922,114284,42043,68051,,17019,2304585,20258,2560.0,178.0,,,665534.0,,,,147287.0,154182.0,190423.0,192606.0,233951.0,219515.0,252363.0,214358.0,283487.0,229776.0,194978.0,160745.0,114633.0,100929.0,67111.0,59038.0,75001.0,39959.0,...,70,170,,,,,,,,,,1.0,2.0,2.0,1.0,8.0,10.0,22.0,32.0,84.0,127.0,181.0,420.0,574.0,1285.0,1662.0,2731.0,7072.0,6044.0,9606.0,10652.0,,,608147.0,2382.0,2648.0,6901.0,6953.7,0.97,0.97
713,2022-08-02,2022-08-02,2963747,1144795,431483,1059782,100469,115460,43195,68563,,30757,2343448,20302,2419.0,171.0,,,655520.0,,,,149148.0,156168.0,193043.0,195208.0,236176.0,221247.0,255109.0,216324.0,286532.0,231979.0,196744.0,162002.0,115883.0,101849.0,67888.0,59732.0,75657.0,40381.0,...,70,172,,,,,,,,,,1.0,2.0,2.0,1.0,8.0,10.0,22.0,32.0,84.0,127.0,181.0,420.0,576.0,1289.0,1665.0,2737.0,7083.0,6062.0,9622.0,10680.0,,,599997.0,2248.0,2677.0,6901.0,6953.7,0.97,0.97
714,2022-09-02,2022-09-02,2997770,1156683,438304,1069700,102182,117411,44337,69153,,34023,2370709,20354,2435.0,163.0,,,646368.0,,,,151207.0,158404.0,196052.0,198191.0,238898.0,223445.0,258135.0,218735.0,289729.0,234317.0,198574.0,163335.0,117053.0,102749.0,68684.0,60359.0,76416.0,40790.0,...,72,173,,,,,,,,,,1.0,2.0,2.0,1.0,8.0,10.0,22.0,32.0,84.0,128.0,181.0,422.0,576.0,1293.0,1671.0,2738.0,7103.0,6080.0,9648.0,10706.0,,,606707.0,2272.0,2697.0,6562.1,6610.1,0.92,0.92
715,2022-10-02,2022-10-02,3025421,1165803,443820,1078220,103751,118863,45238,69726,,27651,2388235,20401,2366.0,168.0,,,638788.0,,,,152725.0,160003.0,198587.0,200568.0,241194.0,225310.0,260512.0,220650.0,292292.0,236261.0,200057.0,164415.0,118078.0,103532.0,69303.0,60967.0,77111.0,41143.0,...,74,174,,,,,,,,,,1.0,2.0,2.0,1.0,8.0,10.0,22.0,32.0,84.0,129.0,182.0,422.0,579.0,1294.0,1675.0,2742.0,7119.0,6097.0,9672.0,10729.0,,,616785.0,2198.0,2713.0,6562.1,6610.1,0.92,0.92
716,2022-11-02,2022-11-02,3049692,1173638,448685,1085323,105025,120557,46259,70205,,24271,2428926,20442,2332.0,159.0,,,628109.0,,,,153993.0,161421.0,200711.0,202680.0,243091.0,226931.0,262663.0,222388.0,294593.0,237846.0,201366.0,165382.0,118982.0,104223.0,69909.0,61509.0,77799.0,41479.0,...,75,176,,,,,,,,,,1.0,2.0,2.0,1.0,8.0,10.0,22.0,32.0,84.0,129.0,183.0,423.0,580.0,1296.0,1677.0,2750.0,7130.0,6112.0,9687.0,10755.0,,,600324.0,2173.0,2726.0,6099.6,6133.0,0.88,0.88
717,2022-12-02,2022-12-02,3069128,1180020,452325,1091101,106150,121849,47045,70638,,19436,2463423,20492,2232.0,160.0,,,615777.0,,,,155093.0,162510.0,202426.0,204294.0,244567.0,228212.0,264331.0,223616.0,296347.0,239122.0,202488.0,166200.0,119768.0,104826.0,70432.0,61985.0,78380.0,41798.0,...,75,176,,,,,,,,,,1.0,2.0,2.0,1.0,8.0,10.0,22.0,32.0,84.0,130.0,183.0,423.0,582.0,1300.0,1678.0,2759.0,7150.0,6125.0,9710.0,10782.0,,,585213.0,2072.0,2733.0,6099.6,6133.0,0.88,0.88


In [6]:
df_pt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 723 entries, 0 to 722
Data columns (total 93 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   data                               723 non-null    datetime64[ns]
 1   data_dados                         723 non-null    datetime64[ns]
 2   confirmados                        723 non-null    int64         
 3   confirmados_arsnorte               723 non-null    int64         
 4   confirmados_arscentro              723 non-null    int64         
 5   confirmados_arslvt                 723 non-null    int64         
 6   confirmados_arsalentejo            723 non-null    int64         
 7   confirmados_arsalgarve             723 non-null    int64         
 8   confirmados_acores                 723 non-null    int64         
 9   confirmados_madeira                723 non-null    int64         
 10  confirmados_estrangeiro            16 

In [7]:
round(df_pt.describe(), 2)

Unnamed: 0,confirmados,confirmados_arsnorte,confirmados_arscentro,confirmados_arslvt,confirmados_arsalentejo,confirmados_arsalgarve,confirmados_acores,confirmados_madeira,confirmados_estrangeiro,confirmados_novos,recuperados,obitos,internados,internados_uci,lab,suspeitos,vigilancia,n_confirmados,cadeias_transmissao,transmissao_importada,confirmados_0_9_f,confirmados_0_9_m,confirmados_10_19_f,confirmados_10_19_m,confirmados_20_29_f,confirmados_20_29_m,confirmados_30_39_f,confirmados_30_39_m,confirmados_40_49_f,confirmados_40_49_m,confirmados_50_59_f,confirmados_50_59_m,confirmados_60_69_f,confirmados_60_69_m,confirmados_70_79_f,confirmados_70_79_m,confirmados_80_plus_f,confirmados_80_plus_m,sintomas_tosse,sintomas_febre,...,obitos_acores,obitos_madeira,obitos_estrangeiro,recuperados_arsnorte,recuperados_arscentro,recuperados_arslvt,recuperados_arsalentejo,recuperados_arsalgarve,recuperados_acores,recuperados_madeira,recuperados_estrangeiro,obitos_0_9_f,obitos_0_9_m,obitos_10_19_f,obitos_10_19_m,obitos_20_29_f,obitos_20_29_m,obitos_30_39_f,obitos_30_39_m,obitos_40_49_f,obitos_40_49_m,obitos_50_59_f,obitos_50_59_m,obitos_60_69_f,obitos_60_69_m,obitos_70_79_f,obitos_70_79_m,obitos_80_plus_f,obitos_80_plus_m,obitos_f,obitos_m,confirmados_desconhecidos_m,confirmados_desconhecidos_f,ativos,internados_enfermaria,confirmados_desconhecidos,incidencia_nacional,incidencia_continente,rt_nacional,rt_continente
count,723.0,723.0,723.0,723.0,723.0,723.0,723.0,723.0,16.0,723.0,723.0,723.0,715.0,706.0,164.0,173.0,716.0,155.0,15.0,167.0,716.0,716.0,716.0,716.0,716.0,716.0,716.0,716.0,716.0,716.0,716.0,716.0,716.0,716.0,716.0,716.0,716.0,716.0,167.0,167.0,...,723.0,723.0,31.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,695.0,695.0,695.0,695.0,695.0,695.0,695.0,695.0,695.0,695.0,695.0,695.0,694.0,694.0,694.0,694.0,694.0,694.0,696.0,696.0,76.0,76.0,718.0,718.0,667.0,338.0,338.0,338.0,338.0
mean,659123.28,260272.04,89883.47,250115.27,22581.28,22208.44,5337.34,8725.23,9.12,4354.62,580167.25,10555.54,1238.47,194.37,2210.07,258703.39,77816.61,254759.04,12.2,640.63,21380.21,22311.65,34493.51,34326.31,53363.42,49603.24,52997.41,46003.75,60960.22,49126.68,51410.14,40957.32,32228.31,29392.51,20729.48,18416.3,30663.43,15423.27,0.44,0.34,...,27.55,45.42,0.0,10.8,9.1,13.9,0.0,0.0,0.0,0.0,0.0,0.79,0.82,0.72,0.61,3.42,4.81,12.28,14.66,41.74,63.46,90.55,218.32,300.19,684.73,873.2,1455.42,3905.91,3297.04,5226.96,5737.98,27.22,20.05,68876.81,1042.08,641.66,823.25,824.7,1.04,1.04
std,630632.68,238814.87,90082.01,235610.15,22342.78,25347.63,7773.07,13637.1,5.86,9996.66,541426.17,7750.96,1376.65,193.29,1387.71,157914.29,117090.33,123634.45,7.69,249.6,27461.23,28671.95,38217.2,38425.6,51144.83,48816.23,51930.08,45170.42,58999.02,48097.75,45084.21,36815.3,27648.95,24934.04,17049.48,15096.41,22764.08,11639.38,0.11,0.11,...,16.01,45.92,0.0,6.71,1.2,4.48,0.0,0.0,0.0,0.0,0.0,0.41,0.84,0.63,0.49,2.21,3.29,8.79,11.71,28.49,44.65,63.22,151.15,212.12,481.73,614.57,1030.98,2671.41,2286.83,3599.86,4010.51,14.59,9.23,113904.55,1193.25,1051.94,1703.26,1704.81,0.12,0.13
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,9.0,9.0,30.0,25.0,81.0,1746.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.1,0.15,...,0.0,0.0,0.0,3.0,7.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,4.0,0.0,5.0,9.0,12.0,10.0,23.0,0.0,0.0,2.0,0.0,-152.0,50.3,47.5,0.76,0.75
25%,55816.0,19996.5,4673.0,28874.0,892.5,1036.5,203.0,140.5,5.0,388.0,40950.5,1803.0,406.5,70.0,1417.75,115158.0,25754.0,189274.0,6.0,707.0,1005.5,1145.0,1461.75,1313.75,4781.25,4205.25,4943.25,4491.0,5176.0,4276.0,4828.75,3700.0,3009.75,2705.25,2075.0,1849.75,4252.0,2057.25,0.36,0.28,...,15.0,0.0,0.0,3.0,8.0,11.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,3.0,1.0,10.0,12.0,17.0,41.0,51.0,112.0,148.0,229.0,695.75,545.25,928.25,941.75,15.0,13.0,21551.25,335.5,66.0,82.9,78.4,0.95,0.95
50%,797525.0,324612.0,114051.0,301242.0,28141.0,19810.0,3717.0,5952.0,9.0,884.0,699222.0,15962.0,675.0,129.0,1629.0,306171.0,36505.5,290510.0,11.0,767.0,21640.5,22538.5,37404.0,37020.5,60777.0,53347.0,62603.0,52268.5,74569.5,58783.5,66456.5,52082.0,42005.0,38911.5,28090.5,24812.5,44367.0,21897.5,0.4,0.29,...,28.0,59.0,0.0,16.0,10.0,17.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,5.0,7.0,20.0,20.0,60.0,87.0,123.0,316.0,445.0,1014.0,1297.5,2191.5,5918.5,5015.5,7871.0,8655.0,21.0,19.0,31992.0,543.0,302.0,173.6,177.4,1.04,1.03
75%,1015970.0,391375.5,135149.0,396061.5,36022.0,37626.0,8367.0,11369.0,11.0,3251.0,953592.0,17626.0,1359.0,198.0,2755.5,389169.0,78898.0,351165.5,18.5,767.0,30094.5,31256.0,53699.25,53300.25,84494.25,79226.0,80033.5,70623.75,91136.0,74070.75,78438.75,62355.25,49781.75,45510.5,32357.5,28789.25,49157.0,24650.75,0.52,0.37,...,40.0,72.0,0.0,16.0,10.0,17.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,5.0,8.0,20.0,26.0,66.0,106.0,153.0,353.0,488.75,1118.0,1420.0,2375.5,6260.5,5304.25,8413.75,9291.25,43.0,28.5,67921.25,1082.0,740.0,394.6,403.1,1.12,1.13
max,3148387.0,1203128.0,467675.0,1117071.0,110358.0,126749.0,50231.0,73175.0,21.0,65706.0,2589510.0,20708.0,6869.0,904.0,5903.0,468937.0,665706.0,413609.0,24.0,770.0,159831.0,167337.0,209413.0,211125.0,250871.0,233248.0,270814.0,228521.0,303828.0,244273.0,207059.0,169435.0,122769.0,107216.0,72525.0,63713.0,80607.0,42996.0,0.77,0.89,...,81.0,181.0,0.0,16.0,10.0,17.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,1.0,8.0,11.0,22.0,33.0,85.0,133.0,185.0,430.0,592.0,1310.0,1700.0,2775.0,7221.0,6197.0,9816.0,10892.0,46.0,33.0,642793.0,6004.0,6204.0,7163.7,7207.0,1.43,1.43


## Ajustando as Features

### Excluindo features 

In [8]:
df_pt = df_pt.drop(['data_dados', 'confirmados_estrangeiro', 'cadeias_transmissao', 
                    'obitos_estrangeiro','recuperados_arscentro','recuperados_arslvt',
                    'recuperados_arsalentejo','recuperados_arsalgarve','recuperados_acores', 'recuperados_arsnorte',
                    'recuperados_madeira','recuperados_estrangeiro','obitos_arsnorte',
                    'obitos_arscentro','obitos_arslvt','obitos_arsalentejo','obitos_arsalgarve',
                    'obitos_acores','obitos_madeira','confirmados_arsnorte', 'confirmados_arscentro',
                    'confirmados_arslvt', 'confirmados_arsalentejo','confirmados_arsalgarve', 'confirmados_acores', 
                    'confirmados_madeira', 'transmissao_importada', 'confirmados_desconhecidos', 
                    'confirmados_desconhecidos_m', 'confirmados_desconhecidos_f'
                   ], axis=1)

In [9]:
df_pt.shape

(723, 63)

In [10]:
df_pt.columns

Index(['data', 'confirmados', 'confirmados_novos', 'recuperados', 'obitos',
       'internados', 'internados_uci', 'lab', 'suspeitos', 'vigilancia',
       'n_confirmados', 'confirmados_0_9_f', 'confirmados_0_9_m',
       'confirmados_10_19_f', 'confirmados_10_19_m', 'confirmados_20_29_f',
       'confirmados_20_29_m', 'confirmados_30_39_f', 'confirmados_30_39_m',
       'confirmados_40_49_f', 'confirmados_40_49_m', 'confirmados_50_59_f',
       'confirmados_50_59_m', 'confirmados_60_69_f', 'confirmados_60_69_m',
       'confirmados_70_79_f', 'confirmados_70_79_m', 'confirmados_80_plus_f',
       'confirmados_80_plus_m', 'sintomas_tosse', 'sintomas_febre',
       'sintomas_dificuldade_respiratoria', 'sintomas_cefaleia',
       'sintomas_dores_musculares', 'sintomas_fraqueza_generalizada',
       'confirmados_f', 'confirmados_m', 'obitos_0_9_f', 'obitos_0_9_m',
       'obitos_10_19_f', 'obitos_10_19_m', 'obitos_20_29_f', 'obitos_20_29_m',
       'obitos_30_39_f', 'obitos_30_39_m', 'obit

### Redefinindo algumas features utilizando a primeira diferença
* O objetivo é transformar os valores que representam o quantitativo acumulado diariamente no quantitativo diário

In [11]:
# Fazendo cópia do Dataset orginal e após aplicação do filtro
df_pt_diff = df_pt.copy()
df_pt_diff.shape

(723, 63)

A coluna 'confirmados_0_9_m' possui uma lacuna em uma data crucial para os resultados por isso, está sendo feito o tratamento do NAN nesta coluna antes da aplicação da primeira diferença. A lacuna é para o registro 424 correspondente a data 2021-04-25

In [12]:
df_pt_diff[df_pt_diff['confirmados_0_9_m'].isna()]

Unnamed: 0,data,confirmados,confirmados_novos,recuperados,obitos,internados,internados_uci,lab,suspeitos,vigilancia,n_confirmados,confirmados_0_9_f,confirmados_0_9_m,confirmados_10_19_f,confirmados_10_19_m,confirmados_20_29_f,confirmados_20_29_m,confirmados_30_39_f,confirmados_30_39_m,confirmados_40_49_f,confirmados_40_49_m,confirmados_50_59_f,confirmados_50_59_m,confirmados_60_69_f,confirmados_60_69_m,confirmados_70_79_f,confirmados_70_79_m,confirmados_80_plus_f,confirmados_80_plus_m,sintomas_tosse,sintomas_febre,sintomas_dificuldade_respiratoria,sintomas_cefaleia,sintomas_dores_musculares,sintomas_fraqueza_generalizada,confirmados_f,confirmados_m,obitos_0_9_f,obitos_0_9_m,obitos_10_19_f,obitos_10_19_m,obitos_20_29_f,obitos_20_29_m,obitos_30_39_f,obitos_30_39_m,obitos_40_49_f,obitos_40_49_m,obitos_50_59_f,obitos_50_59_m,obitos_60_69_f,obitos_60_69_m,obitos_70_79_f,obitos_70_79_m,obitos_80_plus_f,obitos_80_plus_m,obitos_f,obitos_m,ativos,internados_enfermaria,incidencia_nacional,incidencia_continente,rt_nacional,rt_continente
0,2020-02-26,0,0,0,0,,,,25.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2020-02-27,0,0,0,0,,,,51.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2020-02-28,0,0,0,0,,,,59.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2020-02-29,0,0,0,0,,,,70.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2020-01-03,0,0,0,0,,,,85.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,2020-02-03,2,2,0,0,,,,85.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,,,,
424,2021-04-25,834442,478,792685,16965,348.0,98.0,,,24313.0,,,,,,,,,,,,,,,,,,,,,,,,,,455613.0,378523.0,,,,,,,,,,,,,,,,,,,8056.0,8909.0,24792.0,250.0,72.1,68.3,0.98,0.99


In [13]:
df_pt_diff.dropna(subset=['confirmados_0_9_m'], axis=0,inplace=True)
df_pt_diff.reset_index(inplace=True, drop=True)

In [14]:
df_pt_diff.shape

(716, 63)

In [15]:
# Cálculo da primeira diferença

features_list_diff = ['recuperados','obitos','suspeitos','n_confirmados','confirmados_0_9_f', 'confirmados_0_9_m', 'confirmados_10_19_f',
                      'confirmados_10_19_m', 'confirmados_20_29_f', 'confirmados_20_29_m', 'confirmados_30_39_f', 'confirmados_30_39_m',
                      'confirmados_40_49_f', 'confirmados_40_49_m', 'confirmados_50_59_f', 'confirmados_50_59_m', 'confirmados_60_69_f',
                      'confirmados_60_69_m', 'confirmados_70_79_f', 'confirmados_70_79_m', 'confirmados_80_plus_f', 'confirmados_80_plus_m',
                      'confirmados_f', 'confirmados_m', 'obitos_f', 'obitos_m'
                    ]

for col in features_list_diff:
  temp = df_pt_diff.loc[0,col]
  df_pt_diff[col] = df_pt_diff[col].diff()
  df_pt_diff.loc[0,col] = temp

In [16]:
# Verificação da existência de valores negativos após o cálculo da primeira diferença.
# Isso pode ocorrer devido aos erros de preenchimento do dataset

neg_cols = []

for col in features_list_diff:
  neg_check = df_pt_diff[df_pt_diff[col] < 0][col].count()

  if neg_check > 0:
    neg_cols.append(col)
    print(col,'--->',neg_check)
print('\nLista de colunas com valores negativos: \n', neg_cols)

confirmados_0_9_f ---> 8
confirmados_0_9_m ---> 6
confirmados_10_19_f ---> 3
confirmados_10_19_m ---> 4
confirmados_20_29_m ---> 2
confirmados_30_39_f ---> 1
confirmados_30_39_m ---> 1
confirmados_40_49_f ---> 2
confirmados_60_69_f ---> 1
confirmados_60_69_m ---> 2
confirmados_70_79_f ---> 3
confirmados_70_79_m ---> 3
confirmados_80_plus_f ---> 1
confirmados_80_plus_m ---> 3

Lista de colunas com valores negativos: 
 ['confirmados_0_9_f', 'confirmados_0_9_m', 'confirmados_10_19_f', 'confirmados_10_19_m', 'confirmados_20_29_m', 'confirmados_30_39_f', 'confirmados_30_39_m', 'confirmados_40_49_f', 'confirmados_60_69_f', 'confirmados_60_69_m', 'confirmados_70_79_f', 'confirmados_70_79_m', 'confirmados_80_plus_f', 'confirmados_80_plus_m']


In [17]:
# Substituindo registros negativos pela mediana

print('Valores substituídos e mediana das colunas: \n')

for col in neg_cols:
  median_col = df_pt_diff[df_pt_diff[col] > 0][col].median()

  subs_list = df_pt_diff[df_pt_diff[col] < 0][col].to_list()
  print(col, '---> ', subs_list, '---> mediana: ', median_col)

  df_pt_diff[col] = df_pt_diff[col].replace(to_replace=subs_list, value=median_col)

Valores substituídos e mediana das colunas: 

confirmados_0_9_f --->  [-1.0, -5.0, -1.0, -2.0, -1.0, -3.0, -1.0, -9.0] ---> mediana:  40.0
confirmados_0_9_m --->  [-5.0, -6.0, -1.0, -4.0, -5.0, -7.0] ---> mediana:  42.0
confirmados_10_19_f --->  [-1.0, -6.0, -2.0] ---> mediana:  46.0
confirmados_10_19_m --->  [-2.0, -4.0, -3.0, -5.0] ---> mediana:  51.0
confirmados_20_29_m --->  [-118.0, -5.0] ---> mediana:  80.5
confirmados_30_39_f --->  [-5.0] ---> mediana:  71.0
confirmados_30_39_m --->  [-192.0] ---> mediana:  67.0
confirmados_40_49_f --->  [-2.0, -4.0] ---> mediana:  76.5
confirmados_60_69_f --->  [-14.0] ---> mediana:  47.0
confirmados_60_69_m --->  [-6.0, -9.0] ---> mediana:  39.5
confirmados_70_79_f --->  [-9.0, -1.0, -2.0] ---> mediana:  31.0
confirmados_70_79_m --->  [-5.0, -1.0, -4.0] ---> mediana:  27.0
confirmados_80_plus_f --->  [-1.0] ---> mediana:  37.0
confirmados_80_plus_m --->  [-2.0, -1.0, -1.0] ---> mediana:  20.0


### Criando novas features baseadas na coluna data

In [18]:
# Criando novas features baseadas na coluna 'date'
df_pt_diff['dia'] = df_pt_diff['data'].dt.day
df_pt_diff['mes'] = df_pt_diff['data'].dt.month
#df_pt_diff['ano'] = df_pt_diff['data'].dt.year
df_pt_diff['dia_da_semana'] = df_pt_diff['data'].dt.dayofweek

In [19]:
df_pt_diff.columns

Index(['data', 'confirmados', 'confirmados_novos', 'recuperados', 'obitos',
       'internados', 'internados_uci', 'lab', 'suspeitos', 'vigilancia',
       'n_confirmados', 'confirmados_0_9_f', 'confirmados_0_9_m',
       'confirmados_10_19_f', 'confirmados_10_19_m', 'confirmados_20_29_f',
       'confirmados_20_29_m', 'confirmados_30_39_f', 'confirmados_30_39_m',
       'confirmados_40_49_f', 'confirmados_40_49_m', 'confirmados_50_59_f',
       'confirmados_50_59_m', 'confirmados_60_69_f', 'confirmados_60_69_m',
       'confirmados_70_79_f', 'confirmados_70_79_m', 'confirmados_80_plus_f',
       'confirmados_80_plus_m', 'sintomas_tosse', 'sintomas_febre',
       'sintomas_dificuldade_respiratoria', 'sintomas_cefaleia',
       'sintomas_dores_musculares', 'sintomas_fraqueza_generalizada',
       'confirmados_f', 'confirmados_m', 'obitos_0_9_f', 'obitos_0_9_m',
       'obitos_10_19_f', 'obitos_10_19_m', 'obitos_20_29_f', 'obitos_20_29_m',
       'obitos_30_39_f', 'obitos_30_39_m', 'obit

In [20]:
# Reordenando as colunas
new_order = [ 'data', 'mes','dia', 'dia_da_semana', 'confirmados', 'confirmados_novos', 'recuperados', 'obitos',
              'internados', 'internados_uci', 'lab', 'suspeitos', 'vigilancia',
              'n_confirmados', 'confirmados_0_9_f', 'confirmados_0_9_m',
              'confirmados_10_19_f', 'confirmados_10_19_m', 'confirmados_20_29_f',
              'confirmados_20_29_m', 'confirmados_30_39_f', 'confirmados_30_39_m',
              'confirmados_40_49_f', 'confirmados_40_49_m', 'confirmados_50_59_f',
              'confirmados_50_59_m', 'confirmados_60_69_f', 'confirmados_60_69_m',
              'confirmados_70_79_f', 'confirmados_70_79_m', 'confirmados_80_plus_f',
              'confirmados_80_plus_m', 'sintomas_tosse', 'sintomas_febre',
              'sintomas_dificuldade_respiratoria', 'sintomas_cefaleia',
              'sintomas_dores_musculares', 'sintomas_fraqueza_generalizada',
              'confirmados_f', 'confirmados_m', 'obitos_0_9_f', 'obitos_0_9_m',
              'obitos_10_19_f', 'obitos_10_19_m', 'obitos_20_29_f', 'obitos_20_29_m',
              'obitos_30_39_f', 'obitos_30_39_m', 'obitos_40_49_f', 'obitos_40_49_m',
              'obitos_50_59_f', 'obitos_50_59_m', 'obitos_60_69_f', 'obitos_60_69_m',
              'obitos_70_79_f', 'obitos_70_79_m', 'obitos_80_plus_f',
              'obitos_80_plus_m', 'obitos_f', 'obitos_m', 'ativos',
              'internados_enfermaria', 'incidencia_nacional', 'incidencia_continente',
              'rt_nacional', 'rt_continente',]

df_pt_diff = df_pt_diff[new_order]

### Definindo coluna 'data' como index

In [21]:
df_pt_diff.set_index(df_pt_diff['data'], inplace=True)
df_pt_diff = df_pt_diff.drop(['data'], axis=1)
df_pt_diff = df_pt_diff.sort_index()
df_pt_diff.index

DatetimeIndex(['2020-01-04', '2020-01-05', '2020-01-06', '2020-01-07',
               '2020-01-08', '2020-01-09', '2020-01-10', '2020-01-11',
               '2020-01-12', '2020-02-04',
               ...
               '2022-08-01', '2022-08-02', '2022-09-01', '2022-09-02',
               '2022-10-01', '2022-10-02', '2022-11-01', '2022-11-02',
               '2022-12-01', '2022-12-02'],
              dtype='datetime64[ns]', name='data', length=716, freq=None)

## Definindo intervalo que será utilizado para filtrar o dataset - 2021-03-15 a 2022-01-31

Definição dos Conjuntos de Treino e Teste:
* Treino: 2021-03-15 a 2022-01-21
* Teste: 2022-01-22 a 2022-01-31 (Intervalo de 10 dias para o conjunto de teste)
<br>
<br>Obs1.: Erro na data a partir de fev. 2022
<br>Obs2.: Iniciando em de março de 2021 pois a features 'incidencia_nacional', 'incidencia_continente', 'rt_nacional' e 'rt_continente' só passaram a ser contabilizadas a partir dessa data.

In [22]:
df_train = df_pt_diff.loc['2021-03-15':'2022-01-21', :].copy()
df_train = df_train.sort_index() # Reordena o dataset através do index depois de ter feito a filtragem

df_test = df_pt_diff.loc['2022-01-22':'2022-01-31', :].copy()
df_test = df_test.sort_index()

In [23]:
# Verifica se as datas estão corretamente ordenadas
df_train.head()

Unnamed: 0_level_0,mes,dia,dia_da_semana,confirmados,confirmados_novos,recuperados,obitos,internados,internados_uci,lab,suspeitos,vigilancia,n_confirmados,confirmados_0_9_f,confirmados_0_9_m,confirmados_10_19_f,confirmados_10_19_m,confirmados_20_29_f,confirmados_20_29_m,confirmados_30_39_f,confirmados_30_39_m,confirmados_40_49_f,confirmados_40_49_m,confirmados_50_59_f,confirmados_50_59_m,confirmados_60_69_f,confirmados_60_69_m,confirmados_70_79_f,confirmados_70_79_m,confirmados_80_plus_f,confirmados_80_plus_m,sintomas_tosse,sintomas_febre,sintomas_dificuldade_respiratoria,sintomas_cefaleia,sintomas_dores_musculares,sintomas_fraqueza_generalizada,confirmados_f,confirmados_m,obitos_0_9_f,obitos_0_9_m,obitos_10_19_f,obitos_10_19_m,obitos_20_29_f,obitos_20_29_m,obitos_30_39_f,obitos_30_39_m,obitos_40_49_f,obitos_40_49_m,obitos_50_59_f,obitos_50_59_m,obitos_60_69_f,obitos_60_69_m,obitos_70_79_f,obitos_70_79_m,obitos_80_plus_f,obitos_80_plus_m,obitos_f,obitos_m,ativos,internados_enfermaria,incidencia_nacional,incidencia_continente,rt_nacional,rt_continente
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1
2021-03-15,3,15,0,814513,256,2371.0,10.0,996.0,231.0,,,16685.0,,40.0,4.0,13.0,51.0,11.0,21.0,21.0,0.0,11.0,25.0,28.0,25.0,30.0,10.0,13.0,17.0,22.0,13.0,,,,,,,144.0,113.0,1.0,1.0,1.0,1.0,5.0,7.0,20.0,21.0,61.0,89.0,126.0,323.0,452.0,1026.0,1307.0,2226.0,5968.0,5059.0,5.0,5.0,36031.0,765.0,96.0,84.2,0.83,0.79
2021-03-16,3,16,1,814897,384,1173.0,13.0,955.0,213.0,,,15774.0,,12.0,2.0,12.0,12.0,9.0,39.0,28.0,24.0,24.0,31.0,37.0,21.0,27.0,26.0,17.0,13.0,27.0,21.0,,,,,,,193.0,189.0,1.0,1.0,1.0,1.0,5.0,7.0,20.0,21.0,61.0,90.0,128.0,323.0,452.0,1027.0,1309.0,2229.0,5970.0,5061.0,6.0,7.0,35229.0,742.0,96.0,82.3,0.83,0.79
2021-03-17,3,17,2,815570,673,1058.0,15.0,856.0,205.0,,,15183.0,,12.0,12.0,19.0,32.0,56.0,37.0,50.0,42.0,60.0,46.0,72.0,44.0,40.0,41.0,29.0,22.0,40.0,20.0,,,,,,,378.0,296.0,1.0,1.0,1.0,1.0,5.0,7.0,20.0,21.0,61.0,90.0,128.0,324.0,453.0,1028.0,1311.0,2232.0,5973.0,5065.0,6.0,9.0,34829.0,651.0,90.3,79.1,0.84,0.8
2021-03-18,3,18,3,816055,485,580.0,21.0,828.0,187.0,,,15268.0,,6.0,15.0,12.0,15.0,37.0,43.0,27.0,44.0,40.0,37.0,42.0,30.0,28.0,28.0,31.0,15.0,16.0,14.0,,,,,,,239.0,241.0,1.0,1.0,1.0,1.0,5.0,7.0,20.0,21.0,61.0,90.0,129.0,325.0,453.0,1031.0,1313.0,2237.0,5976.0,5071.0,6.0,15.0,34713.0,641.0,90.3,79.1,0.84,0.8
2021-03-19,3,19,4,816623,568,1571.0,11.0,789.0,182.0,,,14915.0,,18.0,17.0,22.0,24.0,45.0,44.0,36.0,52.0,53.0,37.0,37.0,40.0,49.0,23.0,10.0,22.0,33.0,5.0,,,,,,,303.0,264.0,1.0,1.0,1.0,1.0,5.0,7.0,20.0,21.0,61.0,91.0,129.0,325.0,453.0,1035.0,1314.0,2238.0,5978.0,5073.0,3.0,8.0,33699.0,607.0,87.2,75.7,0.86,0.84


In [24]:
print('Dimensões do conjunto de treino: ', df_train.shape)
print('Dimensões do conjunto de teste: ', df_test.shape)

Dimensões do conjunto de treino:  (302, 65)
Dimensões do conjunto de teste:  (10, 65)


In [25]:
df_train.columns

Index(['mes', 'dia', 'dia_da_semana', 'confirmados', 'confirmados_novos',
       'recuperados', 'obitos', 'internados', 'internados_uci', 'lab',
       'suspeitos', 'vigilancia', 'n_confirmados', 'confirmados_0_9_f',
       'confirmados_0_9_m', 'confirmados_10_19_f', 'confirmados_10_19_m',
       'confirmados_20_29_f', 'confirmados_20_29_m', 'confirmados_30_39_f',
       'confirmados_30_39_m', 'confirmados_40_49_f', 'confirmados_40_49_m',
       'confirmados_50_59_f', 'confirmados_50_59_m', 'confirmados_60_69_f',
       'confirmados_60_69_m', 'confirmados_70_79_f', 'confirmados_70_79_m',
       'confirmados_80_plus_f', 'confirmados_80_plus_m', 'sintomas_tosse',
       'sintomas_febre', 'sintomas_dificuldade_respiratoria',
       'sintomas_cefaleia', 'sintomas_dores_musculares',
       'sintomas_fraqueza_generalizada', 'confirmados_f', 'confirmados_m',
       'obitos_0_9_f', 'obitos_0_9_m', 'obitos_10_19_f', 'obitos_10_19_m',
       'obitos_20_29_f', 'obitos_20_29_m', 'obitos_30_39_f', 

In [26]:
# Listando colunas nulas após definição do período
null_cols = []

for col in df_train.columns:
  if df_train[col].sum() == 0:
    null_cols.append(col)

print(null_cols)

['lab', 'suspeitos', 'n_confirmados', 'sintomas_tosse', 'sintomas_febre', 'sintomas_dificuldade_respiratoria', 'sintomas_cefaleia', 'sintomas_dores_musculares', 'sintomas_fraqueza_generalizada']


In [27]:
# Excluindo colunas nulas
df_train = df_train.drop(null_cols, axis=1)
df_test = df_test.drop(null_cols, axis=1)

In [28]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 302 entries, 2021-03-15 to 2022-01-21
Data columns (total 56 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   mes                    302 non-null    int64  
 1   dia                    302 non-null    int64  
 2   dia_da_semana          302 non-null    int64  
 3   confirmados            302 non-null    int64  
 4   confirmados_novos      302 non-null    int64  
 5   recuperados            302 non-null    float64
 6   obitos                 302 non-null    float64
 7   internados             302 non-null    float64
 8   internados_uci         302 non-null    float64
 9   vigilancia             302 non-null    float64
 10  confirmados_0_9_f      302 non-null    float64
 11  confirmados_0_9_m      302 non-null    float64
 12  confirmados_10_19_f    302 non-null    float64
 13  confirmados_10_19_m    302 non-null    float64
 14  confirmados_20_29_f    302 non-null    

## Correlação

In [29]:
round(df_train.corr(),4)

Unnamed: 0,mes,dia,dia_da_semana,confirmados,confirmados_novos,recuperados,obitos,internados,internados_uci,vigilancia,confirmados_0_9_f,confirmados_0_9_m,confirmados_10_19_f,confirmados_10_19_m,confirmados_20_29_f,confirmados_20_29_m,confirmados_30_39_f,confirmados_30_39_m,confirmados_40_49_f,confirmados_40_49_m,confirmados_50_59_f,confirmados_50_59_m,confirmados_60_69_f,confirmados_60_69_m,confirmados_70_79_f,confirmados_70_79_m,confirmados_80_plus_f,confirmados_80_plus_m,confirmados_f,confirmados_m,obitos_0_9_f,obitos_0_9_m,obitos_10_19_f,obitos_10_19_m,obitos_20_29_f,obitos_20_29_m,obitos_30_39_f,obitos_30_39_m,obitos_40_49_f,obitos_40_49_m,obitos_50_59_f,obitos_50_59_m,obitos_60_69_f,obitos_60_69_m,obitos_70_79_f,obitos_70_79_m,obitos_80_plus_f,obitos_80_plus_m,obitos_f,obitos_m,ativos,internados_enfermaria,incidencia_nacional,incidencia_continente,rt_nacional,rt_continente
mes,1.0,-0.0474,-0.0067,-0.0018,-0.2492,-0.2766,0.0104,-0.0227,-0.0019,-0.1737,-0.2968,-0.3019,-0.2763,-0.2845,-0.2052,-0.1733,-0.2594,-0.2332,-0.2588,-0.2486,-0.2254,-0.2179,-0.2004,-0.1786,-0.2094,-0.2153,-0.206,-0.225,-0.2529,-0.2455,,0.3732,0.0209,,-0.3127,0.0518,-0.0244,0.194,0.1419,0.2555,0.2123,0.1228,0.1295,0.1016,0.1236,0.0795,0.0857,0.0797,0.0044,0.0158,-0.2467,-0.0259,-0.3177,-0.3145,0.1248,0.132
dia,-0.0474,1.0,-0.0384,0.212,0.0017,-0.107,-0.4046,-0.4013,-0.4303,-0.1187,0.0176,0.0142,0.0177,0.0146,0.031,0.0464,0.0074,0.0197,0.0049,0.0079,-0.0125,-0.0029,-0.0423,-0.0428,-0.0916,-0.0914,-0.2354,-0.2018,-0.0025,0.0067,,0.2726,0.1257,,0.2474,0.3601,0.3731,0.383,0.3851,0.3888,0.3982,0.4124,0.4124,0.4099,0.4057,0.409,0.3947,0.3957,-0.3907,-0.412,-0.1164,-0.3941,-0.0358,-0.0365,0.0784,0.0675
dia_da_semana,-0.0067,-0.0384,1.0,0.0535,0.0562,0.0216,0.0171,0.0049,-0.0008,0.0466,0.0354,0.0377,0.0609,0.0602,0.0584,0.0591,0.0534,0.0576,0.0531,0.051,0.0549,0.0537,0.0614,0.0552,0.073,0.0684,0.0759,0.0677,0.0561,0.0559,,0.0203,0.0355,,0.0166,0.0305,0.0115,0.0244,0.0223,0.0246,0.026,0.0193,0.0173,0.0165,0.0185,0.0148,0.0153,0.0162,0.0178,0.0162,0.0596,0.0058,0.0521,0.0525,0.0123,0.0118
confirmados,-0.0018,0.212,0.0535,1.0,0.7448,0.6772,-0.1537,-0.1022,-0.2868,0.6654,0.7666,0.764,0.7675,0.77,0.7151,0.7182,0.7493,0.7436,0.7472,0.7472,0.6981,0.7005,0.6824,0.6665,0.646,0.6572,0.3837,0.4594,0.7407,0.7491,,0.7057,0.6867,,0.8072,0.8217,0.4836,0.7728,0.7643,0.7853,0.7855,0.736,0.7032,0.6883,0.7204,0.6707,0.6441,0.6532,-0.1473,-0.1577,0.716,-0.0716,0.8874,0.8884,0.3102,0.2986
confirmados_novos,-0.2492,0.0017,0.0562,0.7448,1.0,0.8816,0.2094,0.2883,0.1193,0.885,0.9412,0.9386,0.9916,0.9859,0.9824,0.9699,0.9978,0.9962,0.999,0.9983,0.9836,0.9789,0.9777,0.9657,0.9627,0.9649,0.8183,0.8685,0.9999,0.9998,,0.181,0.735,,0.5902,0.3262,0.0133,0.2396,0.2384,0.2335,0.233,0.2009,0.1592,0.1563,0.1863,0.1452,0.1113,0.1238,0.2156,0.2009,0.932,0.3141,0.9405,0.9395,0.3737,0.3598
recuperados,-0.2766,-0.107,0.0216,0.6772,0.8816,1.0,0.3828,0.4527,0.2951,0.8997,0.8538,0.8523,0.881,0.8802,0.836,0.8123,0.8758,0.8598,0.879,0.8751,0.8626,0.8519,0.8696,0.8602,0.8857,0.8939,0.8026,0.8496,0.8836,0.879,,0.1211,0.6034,,0.5452,0.2771,-0.0103,0.1771,0.1657,0.1517,0.1512,0.1231,0.0875,0.0906,0.1186,0.0833,0.0669,0.0757,0.3898,0.3713,0.9283,0.4756,0.9554,0.9554,0.2619,0.2503
obitos,0.0104,-0.4046,0.0171,-0.1537,0.2094,0.3828,1.0,0.9796,0.9439,0.4837,0.1321,0.1349,0.1635,0.1603,0.1804,0.165,0.1852,0.1788,0.1975,0.196,0.2472,0.2366,0.3061,0.3176,0.3872,0.3749,0.6171,0.5648,0.2151,0.2028,,-0.3284,0.0506,,-0.3998,-0.4587,-0.6144,-0.4902,-0.5347,-0.4823,-0.527,-0.5939,-0.6042,-0.5972,-0.5716,-0.6109,-0.5758,-0.5823,0.9926,0.9937,0.4121,0.9794,0.8085,0.8096,0.2088,0.194
internados,-0.0227,-0.4013,0.0049,-0.1022,0.2883,0.4527,0.9796,1.0,0.9678,0.558,0.2078,0.2093,0.2434,0.239,0.2632,0.2488,0.2643,0.2599,0.2749,0.2749,0.324,0.3131,0.376,0.3877,0.451,0.4406,0.6635,0.6167,0.2935,0.2824,,-0.3295,0.1204,,-0.3421,-0.4432,-0.6011,-0.4787,-0.5222,-0.4765,-0.5205,-0.5815,-0.5933,-0.5871,-0.5586,-0.5982,-0.5675,-0.5726,0.967,0.9784,0.4892,0.9992,0.8439,0.8454,0.2674,0.251
internados_uci,-0.0019,-0.4303,-0.0008,-0.2868,0.1193,0.2951,0.9439,0.9678,1.0,0.3962,0.0335,0.0348,0.0754,0.0693,0.1107,0.0999,0.0956,0.0957,0.1044,0.1054,0.1608,0.1508,0.2096,0.2253,0.2841,0.2707,0.5282,0.4716,0.1244,0.1134,,-0.4274,-0.0048,,-0.4711,-0.5602,-0.6189,-0.5824,-0.6213,-0.5833,-0.626,-0.6683,-0.6701,-0.6621,-0.6404,-0.6671,-0.6306,-0.6375,0.9261,0.948,0.3224,0.9567,0.3232,0.3262,0.0286,0.0161
vigilancia,-0.1737,-0.1187,0.0466,0.6654,0.885,0.8997,0.4837,0.558,0.3962,1.0,0.8473,0.8461,0.8866,0.8853,0.8516,0.8335,0.8819,0.869,0.8767,0.8756,0.851,0.8402,0.8742,0.8598,0.8943,0.8959,0.837,0.8664,0.8867,0.8827,,0.0927,0.6636,,0.4427,0.1945,-0.106,0.1466,0.1049,0.1127,0.1081,0.0599,0.0241,0.0254,0.0596,0.0113,-0.0067,0.0013,0.4861,0.4752,0.9686,0.5809,0.9611,0.9624,0.3879,0.3778


# Preparação dos dados para o Modelo de Machine Learning

## Seleção de Features e definição do Target

In [30]:
# Armazenando os nomes das colunas como lista

features_names = df_train.columns.to_list()

# Definindo a variável alvo e as features
target_name = 'confirmados_novos' #'recuperados' #'confirmados_novos' #'obitos' #'internados'

confirmados_list = ['confirmados_novos', 'confirmados', 'confirmados_0_9_f', 'confirmados_0_9_m',
                    'confirmados_10_19_f', 'confirmados_10_19_m', 'confirmados_20_29_f',
                    'confirmados_20_29_m', 'confirmados_30_39_f', 'confirmados_30_39_m',
                    'confirmados_40_49_f', 'confirmados_40_49_m', 'confirmados_50_59_f',
                    'confirmados_50_59_m', 'confirmados_60_69_f', 'confirmados_60_69_m',
                    'confirmados_70_79_f', 'confirmados_70_79_m', 'confirmados_80_plus_f',
                    'confirmados_80_plus_m', 'confirmados_f', 'confirmados_m']

obitos_list = ['obitos', 'obitos_0_9_m', 'obitos_0_9_f', 'obitos_10_19_m', 'obitos_10_19_f', 'obitos_20_29_f', 'obitos_20_29_m',
               'obitos_30_39_f', 'obitos_30_39_m', 'obitos_40_49_f', 'obitos_40_49_m', 
               'obitos_50_59_f', 'obitos_50_59_m', 'obitos_60_69_f', 'obitos_60_69_m', 
               'obitos_70_79_f', 'obitos_70_79_m', 'obitos_80_plus_f', 'obitos_80_plus_m', 
               'obitos_f', 'obitos_m']

internados_list = ['internados', 'internados_uci', 'internados_enfermaria'] #+ obitos_list

if target_name == 'confirmados_novos':
  for name in confirmados_list:
    features_names.remove(name)
elif target_name == 'obitos':
  for name in obitos_list:
    features_names.remove(name)
elif target_name == 'internados':
  for name in internados_list:
    features_names.remove(name)
else:
  features_names.remove(target_name)

print(features_names)

['mes', 'dia', 'dia_da_semana', 'recuperados', 'obitos', 'internados', 'internados_uci', 'vigilancia', 'obitos_0_9_f', 'obitos_0_9_m', 'obitos_10_19_f', 'obitos_10_19_m', 'obitos_20_29_f', 'obitos_20_29_m', 'obitos_30_39_f', 'obitos_30_39_m', 'obitos_40_49_f', 'obitos_40_49_m', 'obitos_50_59_f', 'obitos_50_59_m', 'obitos_60_69_f', 'obitos_60_69_m', 'obitos_70_79_f', 'obitos_70_79_m', 'obitos_80_plus_f', 'obitos_80_plus_m', 'obitos_f', 'obitos_m', 'ativos', 'internados_enfermaria', 'incidencia_nacional', 'incidencia_continente', 'rt_nacional', 'rt_continente']


In [31]:
y_train = df_train[[target_name]]
y_test = df_test[[target_name]]

X_train = df_train[features_names]
X_test = df_test[features_names]

In [32]:
#X_train.info()

In [33]:
# Descrição das features selecionadas
pd.options.display.float_format = '{:.2f}'.format

X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mes,302.0,7.48,2.99,1.0,5.0,8.0,10.0,12.0
dia,302.0,16.1,8.75,1.0,9.0,16.0,23.0,31.0
dia_da_semana,302.0,2.99,1.99,0.0,1.0,3.0,5.0,6.0
recuperados,302.0,2766.99,5431.81,226.0,605.0,1088.5,2631.0,44610.0
obitos,302.0,17.95,37.16,0.0,4.0,8.0,14.0,258.0
internados,302.0,878.1,1096.59,207.0,353.75,597.0,866.75,6496.0
internados_uci,302.0,155.95,154.39,49.0,75.0,119.0,156.5,904.0
vigilancia,302.0,59468.3,68915.38,14734.0,22031.5,34922.0,74863.0,639307.0
obitos_0_9_f,302.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
obitos_0_9_m,302.0,1.45,0.57,0.0,1.0,1.0,2.0,2.0


In [34]:
# Descrição da variável alvo
y_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
confirmados_novos,302.0,3680.2,8469.54,158.0,598.0,1182.5,2886.25,58530.0


In [35]:
# Pipeline para preparação das variáveis numéricas

numeric_pipeline = Pipeline(steps=[
                                   ('imputer', SimpleImputer(strategy='median')), # fill_value=0
                                   #('normalization', MinMaxScaler())
])

In [36]:
# Ajustando variáveis numéricas usando o pipeline

X_train_prepared = numeric_pipeline.fit_transform(X_train)
y_train_prepared = numeric_pipeline.fit_transform(y_train)

X_test_prepared = numeric_pipeline.fit_transform(X_test)
y_test_prepared = numeric_pipeline.fit_transform(y_test)

# Implementação do XGBoost

### Definição da seed e Instanciando o XGBoost

In [78]:
seed = 1275

xgb = XGBRegressor( #tree_method = 'gpu_hist',
                    booster='gbtree',
                    objective='reg:squarederror', max_depth=15,
                    learning_rate=0.1, n_estimators=100,
                    random_state=seed, n_jobs=-1, reg_lambda=2
)

### Time Series cross-validator

In [38]:
tscv = TimeSeriesSplit(gap=0, max_train_size=None, n_splits=10, test_size=None)
print(tscv)

TimeSeriesSplit(gap=0, max_train_size=None, n_splits=10, test_size=None)


### Avaliação do Modelo antes da busca pelos melhores parâmetros

In [79]:
# Avaliação do modelo antes da busca de parâmetros

def evaluate(model, X, y, cv):
    cv_results = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error", "neg_mean_squared_error", "neg_mean_absolute_percentage_error"],
    )
    mae = -cv_results["test_neg_mean_absolute_error"]
    mape = -cv_results['test_neg_mean_absolute_percentage_error']
    mse = -cv_results['test_neg_mean_squared_error']
    rmse = -cv_results["test_neg_root_mean_squared_error"]
    
    print(
        f"Mean Absolute Error (MAE):     {mae.mean():.3f} +/- {mae.std():.3f}\n"
        f"Mean Absolute Percentage Error (MAPE): {mape.mean():.3f} +/- {mape.std():.3f}\n"
        f"Mean Squared Error (MSE): {mse.mean():.3f} +/- {mse.std():.3f}\n"
        f"Root Mean Squared Error (RMSE): {rmse.mean():.3f} +/- {rmse.std():.3f}"
    )
    #print(cv_results.keys())

evaluate(xgb, X_train_prepared, y_train_prepared, cv=tscv)

Mean Absolute Error (MAE):     2444.396 +/- 5975.649
Mean Absolute Percentage Error (MAPE): 0.303 +/- 0.121
Mean Squared Error (MSE): 76623804.017 +/- 228074104.823
Root Mean Squared Error (RMSE): 3425.917 +/- 8055.240


### Busca dos Melhores Parâmetros

In [113]:
param_distributions = [
                        {
                          'n_estimators': [100, 125, 150, 175, 200, 225, 250, 275, 300, 325, 350, 375, 400], #, 425, 450, 475, 500, 525, 550, 575, 600], 
                          'learning_rate':[0.01, 0.03, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
                          'max_depth':[4, 5, 6, 7, 8, 9, 10, 12, 14, 15, 16, 17, 20, 22, 24],
                          'booster':['gbtree', 'gblinear'],
                          'objective':['reg:squarederror'], #,'reg:logistic'],
                          'gamma':[0, 0.1, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0],
                          'reg_lambda': [1, 1.5, 2, 2.5, 3, 3.5, 4],
                          #'reg_alpha': [0, 1, 1.5, 2, 2.5, 3, 3.5, 4]
                          #'tree_method': ['gpu_hist']
                        }
                      ]

In [122]:
rnd_search = RandomizedSearchCV(estimator=xgb, 
                                param_distributions = param_distributions, 
                                n_iter=50, scoring= 'neg_mean_squared_error', #'neg_mean_squared_error', #'neg_root_mean_squared_error', #'neg_mean_absolute_percentage_error'
                                n_jobs=-1, cv=tscv, random_state=seed
                              )
rnd_search.fit(X_train_prepared, y_train_prepared)

RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=10, test_size=None),
                   estimator=XGBRegressor(gamma=3.0, learning_rate=0.05,
                                          max_depth=6, n_estimators=250,
                                          objective='reg:squarederror',
                                          random_state=1275, reg_lambda=2.5),
                   n_iter=50, n_jobs=-1,
                   param_distributions=[{'booster': ['gbtree', 'gblinear'],
                                         'gamma': [0, 0.1, 0.5, 1.0, 1.5, 2.0,
                                                   2.5, 3.0, 3.5, 4.0],
                                         'learning_rate': [0.01, 0.03, 0.05,
                                                           0.1, 0.2, 0.3, 0.4,
                                                           0.5, 0.6, 0.7, 0.8],
                                         'max_depth': [4, 5, 6, 7, 8, 9, 10, 12,
                       

In [123]:
print('Melhores parâmetros: ', rnd_search.best_params_)

Melhores parâmetros:  {'reg_lambda': 3, 'objective': 'reg:squarederror', 'n_estimators': 375, 'max_depth': 7, 'learning_rate': 0.6, 'gamma': 3.0, 'booster': 'gblinear'}


### Avaliação do Modelo após a Busca pelos Melhores Parâmetros

In [124]:
# Avaliação do modelo após a busca por parâmetros

xgb = XGBRegressor(**rnd_search.best_params_, random_state=seed)
print(xgb.get_params)
print('\n')

evaluate(xgb, X_train_prepared, y_train_prepared, cv=tscv)

<bound method XGBModel.get_params of XGBRegressor(booster='gblinear', gamma=3.0, learning_rate=0.6, max_depth=7,
             n_estimators=375, objective='reg:squarederror', random_state=1275,
             reg_lambda=3)>


Mean Absolute Error (MAE):     1170.797 +/- 1993.626
Mean Absolute Percentage Error (MAPE): 0.385 +/- 0.070
Mean Squared Error (MSE): 8966254.542 +/- 24719719.403
Root Mean Squared Error (RMSE): 1638.486 +/- 2506.315


## Testando o Modelo

In [125]:
xgb = XGBRegressor(**rnd_search.best_params_, random_state=seed)
xgb.fit(X_train_prepared,y_train_prepared)
y_predicted = xgb.predict(X_test_prepared)

y_predicted

array([48735.65 , 48724.117, 55957.895, 74847.73 , 76993.91 , 67449.46 ,
       76881.13 , 81265.06 , 79808.75 , 72472.57 ], dtype=float32)

In [126]:
# Comparação dos resultados

print('(Teste, Previsão) --- Previsão-Teste')

for pair in zip(np.reshape(y_test_prepared, len(y_test_prepared)), np.round(y_predicted,0)):
  print(pair, '---', pair[1]-pair[0])

(Teste, Previsão) --- Previsão-Teste
(58131.0, 48736.0) --- -9395.0
(45569.0, 48724.0) --- 3155.0
(32758.0, 55958.0) --- 23200.0
(57657.0, 74848.0) --- 17191.0
(65578.0, 76994.0) --- 11416.0
(65706.0, 67449.0) --- 1743.0
(63833.0, 76881.0) --- 13048.0
(59194.0, 81265.0) --- 22071.0
(45335.0, 79809.0) --- 34474.0
(27916.0, 72473.0) --- 44557.0


In [127]:
# Métricas para o conjunto de testes
def test_metrics(y_pred, y_test):
  mae = mean_absolute_error(y_pred, y_test)
  mape = mean_absolute_percentage_error(y_pred, y_test)
  mse = mean_squared_error(y_pred, y_test)
  rmse = np.sqrt(mse)
      
  print(
        f"Mean Absolute Error (MAE):     {mae.mean():.3f} +/- {mae.std():.3f}\n"
        f"Mean Absolute Percentage Error (MAPE): {mape.mean():.3f} +/- {mape.std():.3f}\n"
        f"Mean Squared Error (MSE): {mse.mean():.3f} +/- {mse.std():.3f}\n"
        f"Root Mean Squared Error (RMSE): {rmse.mean():.3f} +/- {rmse.std():.3f}"
       )
  
test_metrics(y_predicted, y_test_prepared)

Mean Absolute Error (MAE):     18024.997 +/- 0.000
Mean Absolute Percentage Error (MAPE): 0.256 +/- 0.000
Mean Squared Error (MSE): 489645911.985 +/- 0.000
Root Mean Squared Error (RMSE): 22127.944 +/- 0.000
