# Série temporal do Covid19 em Portugal

In [138]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit, cross_validate, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from xgboost import XGBRegressor

In [139]:
# importando  Dataset
df_pt = pd.read_csv("https://raw.githubusercontent.com/dssg-pt/covid19pt-data/master/data.csv")

In [140]:
# Redefinindo o tipo do campo de data
df_pt['data'] = pd.to_datetime(df_pt['data'])
df_pt['data_dados'] = pd.to_datetime(df_pt['data_dados'])

In [141]:
df_pt.head()

Unnamed: 0,data,data_dados,confirmados,confirmados_arsnorte,confirmados_arscentro,confirmados_arslvt,confirmados_arsalentejo,confirmados_arsalgarve,confirmados_acores,confirmados_madeira,confirmados_estrangeiro,confirmados_novos,recuperados,obitos,internados,internados_uci,lab,suspeitos,vigilancia,n_confirmados,cadeias_transmissao,transmissao_importada,confirmados_0_9_f,confirmados_0_9_m,confirmados_10_19_f,confirmados_10_19_m,confirmados_20_29_f,confirmados_20_29_m,confirmados_30_39_f,confirmados_30_39_m,confirmados_40_49_f,confirmados_40_49_m,confirmados_50_59_f,confirmados_50_59_m,confirmados_60_69_f,confirmados_60_69_m,confirmados_70_79_f,confirmados_70_79_m,confirmados_80_plus_f,confirmados_80_plus_m,...,obitos_acores,obitos_madeira,obitos_estrangeiro,recuperados_arsnorte,recuperados_arscentro,recuperados_arslvt,recuperados_arsalentejo,recuperados_arsalgarve,recuperados_acores,recuperados_madeira,recuperados_estrangeiro,obitos_0_9_f,obitos_0_9_m,obitos_10_19_f,obitos_10_19_m,obitos_20_29_f,obitos_20_29_m,obitos_30_39_f,obitos_30_39_m,obitos_40_49_f,obitos_40_49_m,obitos_50_59_f,obitos_50_59_m,obitos_60_69_f,obitos_60_69_m,obitos_70_79_f,obitos_70_79_m,obitos_80_plus_f,obitos_80_plus_m,obitos_f,obitos_m,confirmados_desconhecidos_m,confirmados_desconhecidos_f,ativos,internados_enfermaria,confirmados_desconhecidos,incidencia_nacional,incidencia_continente,rt_nacional,rt_continente
0,2020-02-26,2020-02-26,0,0,0,0,0,0,0,0,,0,0,0,,,,25.0,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2020-02-27,2020-02-27,0,0,0,0,0,0,0,0,,0,0,0,,,,51.0,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2020-02-28,2020-02-28,0,0,0,0,0,0,0,0,,0,0,0,,,,59.0,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2020-02-29,2020-02-29,0,0,0,0,0,0,0,0,,0,0,0,,,,70.0,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2020-01-03,2020-01-03,0,0,0,0,0,0,0,0,,0,0,0,,,,85.0,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [142]:
df_pt.tail(15)

Unnamed: 0,data,data_dados,confirmados,confirmados_arsnorte,confirmados_arscentro,confirmados_arslvt,confirmados_arsalentejo,confirmados_arsalgarve,confirmados_acores,confirmados_madeira,confirmados_estrangeiro,confirmados_novos,recuperados,obitos,internados,internados_uci,lab,suspeitos,vigilancia,n_confirmados,cadeias_transmissao,transmissao_importada,confirmados_0_9_f,confirmados_0_9_m,confirmados_10_19_f,confirmados_10_19_m,confirmados_20_29_f,confirmados_20_29_m,confirmados_30_39_f,confirmados_30_39_m,confirmados_40_49_f,confirmados_40_49_m,confirmados_50_59_f,confirmados_50_59_m,confirmados_60_69_f,confirmados_60_69_m,confirmados_70_79_f,confirmados_70_79_m,confirmados_80_plus_f,confirmados_80_plus_m,...,obitos_acores,obitos_madeira,obitos_estrangeiro,recuperados_arsnorte,recuperados_arscentro,recuperados_arslvt,recuperados_arsalentejo,recuperados_arsalgarve,recuperados_acores,recuperados_madeira,recuperados_estrangeiro,obitos_0_9_f,obitos_0_9_m,obitos_10_19_f,obitos_10_19_m,obitos_20_29_f,obitos_20_29_m,obitos_30_39_f,obitos_30_39_m,obitos_40_49_f,obitos_40_49_m,obitos_50_59_f,obitos_50_59_m,obitos_60_69_f,obitos_60_69_m,obitos_70_79_f,obitos_70_79_m,obitos_80_plus_f,obitos_80_plus_m,obitos_f,obitos_m,confirmados_desconhecidos_m,confirmados_desconhecidos_f,ativos,internados_enfermaria,confirmados_desconhecidos,incidencia_nacional,incidencia_continente,rt_nacional,rt_continente
707,2022-02-02,2022-02-02,2745383,1060869,389740,996552,91089,105676,35954,65503,,54693,2112346,20024,2442.0,149.0,,,645697.0,,,,135259.0,141652.0,174379.0,176221.0,219766.0,207373.0,234773.0,200515.0,264593.0,215266.0,185387.0,153523.0,108729.0,96343.0,63452.0,55863.0,71702.0,38073.0,...,62,164,,,,,,,,,,1.0,2.0,2.0,1.0,8.0,10.0,22.0,32.0,82.0,122.0,179.0,415.0,567.0,1271.0,1642.0,2699.0,6982.0,5987.0,9485.0,10539.0,,,613013.0,2293.0,2514.0,7081.7,7111.8,1.09,1.1
708,2022-03-02,2022-03-02,2795830,1081148,399195,1010845,93361,107779,37465,66037,,50447,2133640,20077,2440.0,155.0,,,653062.0,,,,138466.0,144948.0,178615.0,180540.0,223655.0,210641.0,239648.0,204353.0,269803.0,219382.0,187898.0,155401.0,110304.0,97524.0,64365.0,56700.0,72511.0,38524.0,...,64,167,,,,,,,,,,1.0,2.0,2.0,1.0,8.0,10.0,22.0,32.0,82.0,123.0,179.0,415.0,568.0,1275.0,1646.0,2706.0,7001.0,6004.0,9509.0,10568.0,,,642113.0,2285.0,2552.0,7081.7,7111.8,1.09,1.1
709,2022-04-02,2022-04-02,2843029,1099251,408393,1024826,95313,109979,38656,66611,,47199,2180109,20127,2445.0,174.0,,,660347.0,,,,141461.0,148026.0,182564.0,184572.0,227277.0,213778.0,244102.0,207879.0,274544.0,223037.0,190289.0,157201.0,111792.0,98697.0,65373.0,57481.0,73366.0,38993.0,...,66,170,,,,,,,,,,1.0,2.0,2.0,1.0,8.0,10.0,22.0,32.0,83.0,123.0,180.0,417.0,571.0,1278.0,1648.0,2713.0,7018.0,6018.0,9533.0,10594.0,,,642793.0,2271.0,2597.0,7163.7,7207.0,1.05,1.05
710,2022-05-02,2022-05-02,2884540,1115235,416658,1036682,97119,111877,39753,67216,,41511,2226548,20171,2409.0,169.0,,,665706.0,,,,143978.0,150638.0,186165.0,188207.0,230350.0,216556.0,247909.0,210867.0,278686.0,226131.0,192513.0,158849.0,113158.0,99782.0,66199.0,58264.0,74198.0,39478.0,...,68,170,,,,,,,,,,1.0,2.0,2.0,1.0,8.0,10.0,22.0,32.0,83.0,125.0,180.0,419.0,571.0,1279.0,1653.0,2720.0,7038.0,6025.0,9558.0,10613.0,,,637821.0,2240.0,2612.0,7163.7,7207.0,1.05,1.05
711,2022-06-02,2022-06-02,2915971,1126462,422504,1046521,98358,113514,40966,67646,,31431,2266939,20222,2511.0,180.0,,,664442.0,,,,146066.0,152863.0,188983.0,191057.0,232724.0,218508.0,250783.0,213173.0,281765.0,228488.0,194087.0,160072.0,114107.0,100528.0,66818.0,58795.0,74742.0,39788.0,...,70,170,,,,,,,,,,1.0,2.0,2.0,1.0,8.0,10.0,22.0,32.0,84.0,127.0,181.0,419.0,572.0,1281.0,1659.0,2728.0,7061.0,6032.0,9590.0,10632.0,,,628810.0,2331.0,2624.0,7163.7,7207.0,1.05,1.05
712,2022-07-02,2022-07-02,2932990,1134070,424773,1050847,98922,114284,42043,68051,,17019,2304585,20258,2560.0,178.0,,,665534.0,,,,147287.0,154182.0,190423.0,192606.0,233951.0,219515.0,252363.0,214358.0,283487.0,229776.0,194978.0,160745.0,114633.0,100929.0,67111.0,59038.0,75001.0,39959.0,...,70,170,,,,,,,,,,1.0,2.0,2.0,1.0,8.0,10.0,22.0,32.0,84.0,127.0,181.0,420.0,574.0,1285.0,1662.0,2731.0,7072.0,6044.0,9606.0,10652.0,,,608147.0,2382.0,2648.0,6901.0,6953.7,0.97,0.97
713,2022-08-02,2022-08-02,2963747,1144795,431483,1059782,100469,115460,43195,68563,,30757,2343448,20302,2419.0,171.0,,,655520.0,,,,149148.0,156168.0,193043.0,195208.0,236176.0,221247.0,255109.0,216324.0,286532.0,231979.0,196744.0,162002.0,115883.0,101849.0,67888.0,59732.0,75657.0,40381.0,...,70,172,,,,,,,,,,1.0,2.0,2.0,1.0,8.0,10.0,22.0,32.0,84.0,127.0,181.0,420.0,576.0,1289.0,1665.0,2737.0,7083.0,6062.0,9622.0,10680.0,,,599997.0,2248.0,2677.0,6901.0,6953.7,0.97,0.97
714,2022-09-02,2022-09-02,2997770,1156683,438304,1069700,102182,117411,44337,69153,,34023,2370709,20354,2435.0,163.0,,,646368.0,,,,151207.0,158404.0,196052.0,198191.0,238898.0,223445.0,258135.0,218735.0,289729.0,234317.0,198574.0,163335.0,117053.0,102749.0,68684.0,60359.0,76416.0,40790.0,...,72,173,,,,,,,,,,1.0,2.0,2.0,1.0,8.0,10.0,22.0,32.0,84.0,128.0,181.0,422.0,576.0,1293.0,1671.0,2738.0,7103.0,6080.0,9648.0,10706.0,,,606707.0,2272.0,2697.0,6562.1,6610.1,0.92,0.92
715,2022-10-02,2022-10-02,3025421,1165803,443820,1078220,103751,118863,45238,69726,,27651,2388235,20401,2366.0,168.0,,,638788.0,,,,152725.0,160003.0,198587.0,200568.0,241194.0,225310.0,260512.0,220650.0,292292.0,236261.0,200057.0,164415.0,118078.0,103532.0,69303.0,60967.0,77111.0,41143.0,...,74,174,,,,,,,,,,1.0,2.0,2.0,1.0,8.0,10.0,22.0,32.0,84.0,129.0,182.0,422.0,579.0,1294.0,1675.0,2742.0,7119.0,6097.0,9672.0,10729.0,,,616785.0,2198.0,2713.0,6562.1,6610.1,0.92,0.92
716,2022-11-02,2022-11-02,3049692,1173638,448685,1085323,105025,120557,46259,70205,,24271,2428926,20442,2332.0,159.0,,,628109.0,,,,153993.0,161421.0,200711.0,202680.0,243091.0,226931.0,262663.0,222388.0,294593.0,237846.0,201366.0,165382.0,118982.0,104223.0,69909.0,61509.0,77799.0,41479.0,...,75,176,,,,,,,,,,1.0,2.0,2.0,1.0,8.0,10.0,22.0,32.0,84.0,129.0,183.0,423.0,580.0,1296.0,1677.0,2750.0,7130.0,6112.0,9687.0,10755.0,,,600324.0,2173.0,2726.0,6099.6,6133.0,0.88,0.88


In [143]:
df_pt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 722 entries, 0 to 721
Data columns (total 93 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   data                               722 non-null    datetime64[ns]
 1   data_dados                         722 non-null    datetime64[ns]
 2   confirmados                        722 non-null    int64         
 3   confirmados_arsnorte               722 non-null    int64         
 4   confirmados_arscentro              722 non-null    int64         
 5   confirmados_arslvt                 722 non-null    int64         
 6   confirmados_arsalentejo            722 non-null    int64         
 7   confirmados_arsalgarve             722 non-null    int64         
 8   confirmados_acores                 722 non-null    int64         
 9   confirmados_madeira                722 non-null    int64         
 10  confirmados_estrangeiro            16 

In [144]:
round(df_pt.describe(), 2)

Unnamed: 0,confirmados,confirmados_arsnorte,confirmados_arscentro,confirmados_arslvt,confirmados_arsalentejo,confirmados_arsalgarve,confirmados_acores,confirmados_madeira,confirmados_estrangeiro,confirmados_novos,recuperados,obitos,internados,internados_uci,lab,suspeitos,vigilancia,n_confirmados,cadeias_transmissao,transmissao_importada,confirmados_0_9_f,confirmados_0_9_m,confirmados_10_19_f,confirmados_10_19_m,confirmados_20_29_f,confirmados_20_29_m,confirmados_30_39_f,confirmados_30_39_m,confirmados_40_49_f,confirmados_40_49_m,confirmados_50_59_f,confirmados_50_59_m,confirmados_60_69_f,confirmados_60_69_m,confirmados_70_79_f,confirmados_70_79_m,confirmados_80_plus_f,confirmados_80_plus_m,sintomas_tosse,sintomas_febre,...,obitos_acores,obitos_madeira,obitos_estrangeiro,recuperados_arsnorte,recuperados_arscentro,recuperados_arslvt,recuperados_arsalentejo,recuperados_arsalgarve,recuperados_acores,recuperados_madeira,recuperados_estrangeiro,obitos_0_9_f,obitos_0_9_m,obitos_10_19_f,obitos_10_19_m,obitos_20_29_f,obitos_20_29_m,obitos_30_39_f,obitos_30_39_m,obitos_40_49_f,obitos_40_49_m,obitos_50_59_f,obitos_50_59_m,obitos_60_69_f,obitos_60_69_m,obitos_70_79_f,obitos_70_79_m,obitos_80_plus_f,obitos_80_plus_m,obitos_f,obitos_m,confirmados_desconhecidos_m,confirmados_desconhecidos_f,ativos,internados_enfermaria,confirmados_desconhecidos,incidencia_nacional,incidencia_continente,rt_nacional,rt_continente
count,722.0,722.0,722.0,722.0,722.0,722.0,722.0,722.0,16.0,722.0,722.0,722.0,714.0,705.0,164.0,173.0,715.0,155.0,15.0,167.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0,167.0,167.0,...,722.0,722.0,31.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,694.0,694.0,694.0,694.0,694.0,694.0,694.0,694.0,694.0,694.0,694.0,694.0,693.0,693.0,693.0,693.0,693.0,693.0,695.0,695.0,76.0,76.0,717.0,717.0,666.0,337.0,337.0,337.0,337.0
mean,655675.54,258966.14,89360.21,248914.5,22459.71,22063.64,5275.16,8635.96,9.12,4337.81,577384.22,10541.48,1237.37,194.46,2210.07,258703.39,77162.87,254759.04,12.2,640.63,21186.57,22108.82,34248.86,34079.04,53087.18,49346.39,52692.77,45748.48,60620.55,48853.75,51192.44,40777.63,32101.68,29283.66,20657.03,18352.95,30593.58,15384.7,0.44,0.34,...,27.48,45.23,0.0,10.8,9.1,13.9,0.0,0.0,0.0,0.0,0.0,0.79,0.82,0.71,0.61,3.42,4.8,12.26,14.63,41.68,63.36,90.41,218.01,299.77,683.82,872.0,1453.52,3901.13,3292.85,5220.36,5730.56,27.22,20.05,68222.29,1040.9,638.41,812.66,814.13,1.04,1.04
std,624213.94,236383.08,89038.15,233549.21,22117.66,25064.21,7596.4,13433.51,5.86,9993.36,536601.64,7747.09,1377.3,193.42,1387.71,157914.29,115857.3,123634.45,7.69,249.6,26986.86,28173.28,37678.74,37878.16,50643.32,48363.84,51322.2,44682.25,58335.51,47573.34,44737.61,36525.52,27459.74,24780.69,16950.79,15011.43,22703.1,11601.66,0.11,0.11,...,15.89,45.67,0.0,6.71,1.2,4.48,0.0,0.0,0.0,0.0,0.0,0.41,0.84,0.62,0.49,2.21,3.28,8.78,11.7,28.46,44.61,63.16,151.05,211.98,481.49,614.21,1030.51,2670.36,2285.82,3598.23,4008.62,14.59,9.23,112624.8,1193.66,1049.38,1694.62,1696.23,0.12,0.13
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,9.0,9.0,30.0,25.0,81.0,1746.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.1,0.15,...,0.0,0.0,0.0,3.0,7.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,4.0,0.0,5.0,9.0,12.0,10.0,23.0,0.0,0.0,2.0,0.0,-152.0,50.3,47.5,0.76,0.75
25%,55768.0,19984.25,4671.0,28845.0,889.75,1035.25,202.5,140.25,5.0,388.0,40915.25,1802.0,406.25,70.0,1417.75,115158.0,25716.0,189274.0,6.0,707.0,1003.0,1141.0,1459.5,1309.5,4774.5,4190.5,4932.5,4485.0,5167.0,4269.0,4822.5,3695.0,3005.5,2701.5,2074.0,1848.5,4249.0,2055.5,0.36,0.28,...,15.0,0.0,0.0,3.0,8.0,11.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,3.0,1.0,10.0,12.0,17.0,41.0,51.0,112.0,148.0,229.0,695.0,545.0,927.5,941.5,15.0,13.0,21548.0,335.0,65.5,82.9,78.4,0.95,0.95
50%,796932.0,324433.0,113950.0,301000.0,28120.0,19799.0,3713.5,5916.5,9.0,883.5,698069.0,15929.5,674.0,129.0,1629.0,306171.0,36481.0,290510.0,11.0,767.0,21621.0,22523.0,37374.0,36987.0,60727.0,53294.0,62551.0,52212.0,74505.0,58723.0,66399.0,52033.0,41959.0,38877.0,28058.0,24787.0,44329.0,21875.0,0.4,0.29,...,28.0,59.0,0.0,16.0,10.0,17.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,5.0,7.0,20.0,20.0,60.0,87.0,123.0,315.0,445.0,1014.0,1296.0,2187.0,5916.0,5012.0,7867.0,8645.0,21.0,19.0,31948.0,542.0,299.5,173.6,176.9,1.04,1.03
75%,1014005.25,390683.0,134861.0,395409.5,35930.0,37456.75,8334.75,11330.25,11.0,3232.0,951496.25,17619.75,1342.5,198.0,2755.5,389169.0,78776.0,351165.5,18.5,767.0,30035.5,31197.5,53550.0,53151.5,84322.5,79013.5,79950.5,70542.0,91066.0,74013.5,78376.5,62307.5,49749.0,45484.0,32332.5,28769.0,49138.5,24638.5,0.52,0.37,...,40.0,72.0,0.0,16.0,10.0,17.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,5.0,8.0,20.0,26.0,66.0,106.0,152.75,353.0,488.0,1118.0,1420.0,2374.0,6259.0,5302.0,8408.5,9287.5,43.0,28.5,67805.0,1073.0,740.0,394.6,403.1,1.12,1.13
max,3131899.0,1198738.0,464530.0,1111346.0,109416.0,125751.0,49488.0,72630.0,21.0,65706.0,2574750.0,20666.0,6869.0,904.0,5903.0,468937.0,665706.0,413609.0,24.0,770.0,158957.0,166450.0,207911.0,209715.0,249393.0,232137.0,269462.0,227535.0,302363.0,243241.0,206117.0,168737.0,122144.0,106691.0,72091.0,63355.0,80086.0,42722.0,0.77,0.89,...,80.0,179.0,0.0,16.0,10.0,17.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,1.0,8.0,11.0,22.0,33.0,85.0,132.0,185.0,427.0,591.0,1308.0,1697.0,2770.0,7209.0,6182.0,9800.0,10866.0,46.0,33.0,642793.0,6004.0,6204.0,7163.7,7207.0,1.43,1.43


## Ajustando as Features

### Excluindo features 

In [145]:
df_pt = df_pt.drop(['data_dados', 'confirmados_estrangeiro', 'cadeias_transmissao', 
                    'obitos_estrangeiro','recuperados_arscentro','recuperados_arslvt',
                    'recuperados_arsalentejo','recuperados_arsalgarve','recuperados_acores', 'recuperados_arsnorte',
                    'recuperados_madeira','recuperados_estrangeiro','obitos_arsnorte',
                    'obitos_arscentro','obitos_arslvt','obitos_arsalentejo','obitos_arsalgarve',
                    'obitos_acores','obitos_madeira','confirmados_arsnorte', 'confirmados_arscentro',
                    'confirmados_arslvt', 'confirmados_arsalentejo','confirmados_arsalgarve', 'confirmados_acores', 
                    'confirmados_madeira', 'transmissao_importada', 'confirmados_desconhecidos', 
                    'confirmados_desconhecidos_m', 'confirmados_desconhecidos_f'
                   ], axis=1)

In [146]:
df_pt.shape

(722, 63)

In [147]:
df_pt.columns

Index(['data', 'confirmados', 'confirmados_novos', 'recuperados', 'obitos',
       'internados', 'internados_uci', 'lab', 'suspeitos', 'vigilancia',
       'n_confirmados', 'confirmados_0_9_f', 'confirmados_0_9_m',
       'confirmados_10_19_f', 'confirmados_10_19_m', 'confirmados_20_29_f',
       'confirmados_20_29_m', 'confirmados_30_39_f', 'confirmados_30_39_m',
       'confirmados_40_49_f', 'confirmados_40_49_m', 'confirmados_50_59_f',
       'confirmados_50_59_m', 'confirmados_60_69_f', 'confirmados_60_69_m',
       'confirmados_70_79_f', 'confirmados_70_79_m', 'confirmados_80_plus_f',
       'confirmados_80_plus_m', 'sintomas_tosse', 'sintomas_febre',
       'sintomas_dificuldade_respiratoria', 'sintomas_cefaleia',
       'sintomas_dores_musculares', 'sintomas_fraqueza_generalizada',
       'confirmados_f', 'confirmados_m', 'obitos_0_9_f', 'obitos_0_9_m',
       'obitos_10_19_f', 'obitos_10_19_m', 'obitos_20_29_f', 'obitos_20_29_m',
       'obitos_30_39_f', 'obitos_30_39_m', 'obit

### Redefinindo algumas features utilizando a primeira diferença
* O objetivo é transformar os valores que representam o quantitativo acumulado diariamente no quantitativo diário

In [148]:
# Fazendo cópia do Dataset orginal e após aplicação do filtro
df_pt_diff = df_pt.copy()
df_pt_diff.shape

(722, 63)

A coluna 'confirmados_0_9_m' possui uma lacuna em uma data crucial para os resultados por isso, está sendo feito o tratamento do NAN nesta coluna antes da aplicação da primeira diferença. A lacuna é para o registro 424 correspondente a data 2021-04-25

In [149]:
df_pt_diff[df_pt_diff['confirmados_0_9_m'].isna()]

Unnamed: 0,data,confirmados,confirmados_novos,recuperados,obitos,internados,internados_uci,lab,suspeitos,vigilancia,n_confirmados,confirmados_0_9_f,confirmados_0_9_m,confirmados_10_19_f,confirmados_10_19_m,confirmados_20_29_f,confirmados_20_29_m,confirmados_30_39_f,confirmados_30_39_m,confirmados_40_49_f,confirmados_40_49_m,confirmados_50_59_f,confirmados_50_59_m,confirmados_60_69_f,confirmados_60_69_m,confirmados_70_79_f,confirmados_70_79_m,confirmados_80_plus_f,confirmados_80_plus_m,sintomas_tosse,sintomas_febre,sintomas_dificuldade_respiratoria,sintomas_cefaleia,sintomas_dores_musculares,sintomas_fraqueza_generalizada,confirmados_f,confirmados_m,obitos_0_9_f,obitos_0_9_m,obitos_10_19_f,obitos_10_19_m,obitos_20_29_f,obitos_20_29_m,obitos_30_39_f,obitos_30_39_m,obitos_40_49_f,obitos_40_49_m,obitos_50_59_f,obitos_50_59_m,obitos_60_69_f,obitos_60_69_m,obitos_70_79_f,obitos_70_79_m,obitos_80_plus_f,obitos_80_plus_m,obitos_f,obitos_m,ativos,internados_enfermaria,incidencia_nacional,incidencia_continente,rt_nacional,rt_continente
0,2020-02-26,0,0,0,0,,,,25.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2020-02-27,0,0,0,0,,,,51.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2020-02-28,0,0,0,0,,,,59.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2020-02-29,0,0,0,0,,,,70.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2020-01-03,0,0,0,0,,,,85.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,2020-02-03,2,2,0,0,,,,85.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,,,,
424,2021-04-25,834442,478,792685,16965,348.0,98.0,,,24313.0,,,,,,,,,,,,,,,,,,,,,,,,,,455613.0,378523.0,,,,,,,,,,,,,,,,,,,8056.0,8909.0,24792.0,250.0,72.1,68.3,0.98,0.99


In [150]:
df_pt_diff.dropna(subset=['confirmados_0_9_m'], axis=0,inplace=True)
df_pt_diff.reset_index(inplace=True, drop=True)

In [151]:
df_pt_diff.shape

(715, 63)

In [152]:
# Cálculo da primeira diferença

features_list_diff = ['recuperados','obitos','suspeitos','n_confirmados','confirmados_0_9_f', 'confirmados_0_9_m', 'confirmados_10_19_f',
                      'confirmados_10_19_m', 'confirmados_20_29_f', 'confirmados_20_29_m', 'confirmados_30_39_f', 'confirmados_30_39_m',
                      'confirmados_40_49_f', 'confirmados_40_49_m', 'confirmados_50_59_f', 'confirmados_50_59_m', 'confirmados_60_69_f',
                      'confirmados_60_69_m', 'confirmados_70_79_f', 'confirmados_70_79_m', 'confirmados_80_plus_f', 'confirmados_80_plus_m',
                      'confirmados_f', 'confirmados_m', 'obitos_f', 'obitos_m'
                    ]

for col in features_list_diff:
  temp = df_pt_diff.loc[0,col]
  df_pt_diff[col] = df_pt_diff[col].diff()
  df_pt_diff.loc[0,col] = temp

In [153]:
# Verificação da existência de valores negativos após o cálculo da primeira diferença.
# Isso pode ocorrer devido aos erros de preenchimento do dataset

neg_cols = []

for col in features_list_diff:
  neg_check = df_pt_diff[df_pt_diff[col] < 0][col].count()

  if neg_check > 0:
    neg_cols.append(col)
    print(col,'--->',neg_check)
print('\nLista de colunas com valores negativos: \n', neg_cols)

confirmados_0_9_f ---> 8
confirmados_0_9_m ---> 6
confirmados_10_19_f ---> 3
confirmados_10_19_m ---> 4
confirmados_20_29_m ---> 2
confirmados_30_39_f ---> 1
confirmados_30_39_m ---> 1
confirmados_40_49_f ---> 2
confirmados_60_69_f ---> 1
confirmados_60_69_m ---> 2
confirmados_70_79_f ---> 3
confirmados_70_79_m ---> 3
confirmados_80_plus_f ---> 1
confirmados_80_plus_m ---> 3

Lista de colunas com valores negativos: 
 ['confirmados_0_9_f', 'confirmados_0_9_m', 'confirmados_10_19_f', 'confirmados_10_19_m', 'confirmados_20_29_m', 'confirmados_30_39_f', 'confirmados_30_39_m', 'confirmados_40_49_f', 'confirmados_60_69_f', 'confirmados_60_69_m', 'confirmados_70_79_f', 'confirmados_70_79_m', 'confirmados_80_plus_f', 'confirmados_80_plus_m']


In [154]:
# Substituindo registros negativos pela mediana

print('Valores substituídos e mediana das colunas: \n')

for col in neg_cols:
  median_col = df_pt_diff[df_pt_diff[col] > 0][col].median()

  subs_list = df_pt_diff[df_pt_diff[col] < 0][col].to_list()
  print(col, '---> ', subs_list, '---> mediana: ', median_col)

  df_pt_diff[col] = df_pt_diff[col].replace(to_replace=subs_list, value=median_col)

Valores substituídos e mediana das colunas: 

confirmados_0_9_f --->  [-1.0, -5.0, -1.0, -2.0, -1.0, -3.0, -1.0, -9.0] ---> mediana:  40.0
confirmados_0_9_m --->  [-5.0, -6.0, -1.0, -4.0, -5.0, -7.0] ---> mediana:  42.0
confirmados_10_19_f --->  [-1.0, -6.0, -2.0] ---> mediana:  46.0
confirmados_10_19_m --->  [-2.0, -4.0, -3.0, -5.0] ---> mediana:  50.5
confirmados_20_29_m --->  [-118.0, -5.0] ---> mediana:  80.0
confirmados_30_39_f --->  [-5.0] ---> mediana:  71.0
confirmados_30_39_m --->  [-192.0] ---> mediana:  67.0
confirmados_40_49_f --->  [-2.0, -4.0] ---> mediana:  76.0
confirmados_60_69_f --->  [-14.0] ---> mediana:  47.0
confirmados_60_69_m --->  [-6.0, -9.0] ---> mediana:  39.0
confirmados_70_79_f --->  [-9.0, -1.0, -2.0] ---> mediana:  31.0
confirmados_70_79_m --->  [-5.0, -1.0, -4.0] ---> mediana:  27.0
confirmados_80_plus_f --->  [-1.0] ---> mediana:  36.5
confirmados_80_plus_m --->  [-2.0, -1.0, -1.0] ---> mediana:  20.0


### Criando novas features baseadas na coluna data

In [155]:
# Criando novas features baseadas na coluna 'date'
df_pt_diff['dia'] = df_pt_diff['data'].dt.day
df_pt_diff['mes'] = df_pt_diff['data'].dt.month
#df_pt_diff['ano'] = df_pt_diff['data'].dt.year
df_pt_diff['dia_da_semana'] = df_pt_diff['data'].dt.dayofweek

In [156]:
df_pt_diff.columns

Index(['data', 'confirmados', 'confirmados_novos', 'recuperados', 'obitos',
       'internados', 'internados_uci', 'lab', 'suspeitos', 'vigilancia',
       'n_confirmados', 'confirmados_0_9_f', 'confirmados_0_9_m',
       'confirmados_10_19_f', 'confirmados_10_19_m', 'confirmados_20_29_f',
       'confirmados_20_29_m', 'confirmados_30_39_f', 'confirmados_30_39_m',
       'confirmados_40_49_f', 'confirmados_40_49_m', 'confirmados_50_59_f',
       'confirmados_50_59_m', 'confirmados_60_69_f', 'confirmados_60_69_m',
       'confirmados_70_79_f', 'confirmados_70_79_m', 'confirmados_80_plus_f',
       'confirmados_80_plus_m', 'sintomas_tosse', 'sintomas_febre',
       'sintomas_dificuldade_respiratoria', 'sintomas_cefaleia',
       'sintomas_dores_musculares', 'sintomas_fraqueza_generalizada',
       'confirmados_f', 'confirmados_m', 'obitos_0_9_f', 'obitos_0_9_m',
       'obitos_10_19_f', 'obitos_10_19_m', 'obitos_20_29_f', 'obitos_20_29_m',
       'obitos_30_39_f', 'obitos_30_39_m', 'obit

In [157]:
# Reordenando as colunas
new_order = [ 'data', 'mes','dia', 'dia_da_semana', 'confirmados', 'confirmados_novos', 'recuperados', 'obitos',
              'internados', 'internados_uci', 'lab', 'suspeitos', 'vigilancia',
              'n_confirmados', 'confirmados_0_9_f', 'confirmados_0_9_m',
              'confirmados_10_19_f', 'confirmados_10_19_m', 'confirmados_20_29_f',
              'confirmados_20_29_m', 'confirmados_30_39_f', 'confirmados_30_39_m',
              'confirmados_40_49_f', 'confirmados_40_49_m', 'confirmados_50_59_f',
              'confirmados_50_59_m', 'confirmados_60_69_f', 'confirmados_60_69_m',
              'confirmados_70_79_f', 'confirmados_70_79_m', 'confirmados_80_plus_f',
              'confirmados_80_plus_m', 'sintomas_tosse', 'sintomas_febre',
              'sintomas_dificuldade_respiratoria', 'sintomas_cefaleia',
              'sintomas_dores_musculares', 'sintomas_fraqueza_generalizada',
              'confirmados_f', 'confirmados_m', 'obitos_0_9_f', 'obitos_0_9_m',
              'obitos_10_19_f', 'obitos_10_19_m', 'obitos_20_29_f', 'obitos_20_29_m',
              'obitos_30_39_f', 'obitos_30_39_m', 'obitos_40_49_f', 'obitos_40_49_m',
              'obitos_50_59_f', 'obitos_50_59_m', 'obitos_60_69_f', 'obitos_60_69_m',
              'obitos_70_79_f', 'obitos_70_79_m', 'obitos_80_plus_f',
              'obitos_80_plus_m', 'obitos_f', 'obitos_m', 'ativos',
              'internados_enfermaria', 'incidencia_nacional', 'incidencia_continente',
              'rt_nacional', 'rt_continente',]

df_pt_diff = df_pt_diff[new_order]

### Definindo coluna 'data' como index

In [158]:
df_pt_diff.set_index(df_pt_diff['data'], inplace=True)
df_pt_diff = df_pt_diff.drop(['data'], axis=1)
df_pt_diff = df_pt_diff.sort_index()
df_pt_diff.index

DatetimeIndex(['2020-01-04', '2020-01-05', '2020-01-06', '2020-01-07',
               '2020-01-08', '2020-01-09', '2020-01-10', '2020-01-11',
               '2020-01-12', '2020-02-04',
               ...
               '2022-08-01', '2022-08-02', '2022-09-01', '2022-09-02',
               '2022-10-01', '2022-10-02', '2022-11-01', '2022-11-02',
               '2022-12-01', '2022-12-02'],
              dtype='datetime64[ns]', name='data', length=715, freq=None)

## Definindo intervalo que será utilizado para filtrar o dataset - 2021-03-15 a 2022-01-31

Definição dos Conjuntos de Treino e Teste:
* Treino: 2021-03-15 a 2022-01-21
* Teste: 2022-01-22 a 2022-01-31 (Intervalo de 10 dias para o conjunto de teste)
<br>
<br>Obs1.: Erro na data a partir de fev. 2022
<br>Obs2.: Iniciando em de março de 2021 pois a features 'incidencia_nacional', 'incidencia_continente', 'rt_nacional' e 'rt_continente' só passaram a ser contabilizadas a partir dessa data.

In [159]:
df_train = df_pt_diff.loc['2021-03-15':'2022-01-21', :].copy()
df_train = df_train.sort_index() # Reordena o dataset através do index depois de ter feito a filtragem

df_test = df_pt_diff.loc['2022-01-22':'2022-01-31', :].copy()
df_test = df_test.sort_index()

In [160]:
# Verifica se as datas estão corretamente ordenadas
df_train.head()

Unnamed: 0_level_0,mes,dia,dia_da_semana,confirmados,confirmados_novos,recuperados,obitos,internados,internados_uci,lab,suspeitos,vigilancia,n_confirmados,confirmados_0_9_f,confirmados_0_9_m,confirmados_10_19_f,confirmados_10_19_m,confirmados_20_29_f,confirmados_20_29_m,confirmados_30_39_f,confirmados_30_39_m,confirmados_40_49_f,confirmados_40_49_m,confirmados_50_59_f,confirmados_50_59_m,confirmados_60_69_f,confirmados_60_69_m,confirmados_70_79_f,confirmados_70_79_m,confirmados_80_plus_f,confirmados_80_plus_m,sintomas_tosse,sintomas_febre,sintomas_dificuldade_respiratoria,sintomas_cefaleia,sintomas_dores_musculares,sintomas_fraqueza_generalizada,confirmados_f,confirmados_m,obitos_0_9_f,obitos_0_9_m,obitos_10_19_f,obitos_10_19_m,obitos_20_29_f,obitos_20_29_m,obitos_30_39_f,obitos_30_39_m,obitos_40_49_f,obitos_40_49_m,obitos_50_59_f,obitos_50_59_m,obitos_60_69_f,obitos_60_69_m,obitos_70_79_f,obitos_70_79_m,obitos_80_plus_f,obitos_80_plus_m,obitos_f,obitos_m,ativos,internados_enfermaria,incidencia_nacional,incidencia_continente,rt_nacional,rt_continente
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1
2021-03-15,3,15,0,814513,256,2371.0,10.0,996.0,231.0,,,16685.0,,40.0,4.0,13.0,50.5,11.0,21.0,21.0,0.0,11.0,25.0,28.0,25.0,30.0,10.0,13.0,17.0,22.0,13.0,,,,,,,144.0,113.0,1.0,1.0,1.0,1.0,5.0,7.0,20.0,21.0,61.0,89.0,126.0,323.0,452.0,1026.0,1307.0,2226.0,5968.0,5059.0,5.0,5.0,36031.0,765.0,96.0,84.2,0.83,0.79
2021-03-16,3,16,1,814897,384,1173.0,13.0,955.0,213.0,,,15774.0,,12.0,2.0,12.0,12.0,9.0,39.0,28.0,24.0,24.0,31.0,37.0,21.0,27.0,26.0,17.0,13.0,27.0,21.0,,,,,,,193.0,189.0,1.0,1.0,1.0,1.0,5.0,7.0,20.0,21.0,61.0,90.0,128.0,323.0,452.0,1027.0,1309.0,2229.0,5970.0,5061.0,6.0,7.0,35229.0,742.0,96.0,82.3,0.83,0.79
2021-03-17,3,17,2,815570,673,1058.0,15.0,856.0,205.0,,,15183.0,,12.0,12.0,19.0,32.0,56.0,37.0,50.0,42.0,60.0,46.0,72.0,44.0,40.0,41.0,29.0,22.0,40.0,20.0,,,,,,,378.0,296.0,1.0,1.0,1.0,1.0,5.0,7.0,20.0,21.0,61.0,90.0,128.0,324.0,453.0,1028.0,1311.0,2232.0,5973.0,5065.0,6.0,9.0,34829.0,651.0,90.3,79.1,0.84,0.8
2021-03-18,3,18,3,816055,485,580.0,21.0,828.0,187.0,,,15268.0,,6.0,15.0,12.0,15.0,37.0,43.0,27.0,44.0,40.0,37.0,42.0,30.0,28.0,28.0,31.0,15.0,16.0,14.0,,,,,,,239.0,241.0,1.0,1.0,1.0,1.0,5.0,7.0,20.0,21.0,61.0,90.0,129.0,325.0,453.0,1031.0,1313.0,2237.0,5976.0,5071.0,6.0,15.0,34713.0,641.0,90.3,79.1,0.84,0.8
2021-03-19,3,19,4,816623,568,1571.0,11.0,789.0,182.0,,,14915.0,,18.0,17.0,22.0,24.0,45.0,44.0,36.0,52.0,53.0,37.0,37.0,40.0,49.0,23.0,10.0,22.0,33.0,5.0,,,,,,,303.0,264.0,1.0,1.0,1.0,1.0,5.0,7.0,20.0,21.0,61.0,91.0,129.0,325.0,453.0,1035.0,1314.0,2238.0,5978.0,5073.0,3.0,8.0,33699.0,607.0,87.2,75.7,0.86,0.84


In [161]:
print('Dimensões do conjunto de treino: ', df_train.shape)
print('Dimensões do conjunto de teste: ', df_test.shape)

Dimensões do conjunto de treino:  (302, 65)
Dimensões do conjunto de teste:  (10, 65)


In [162]:
df_train.columns

Index(['mes', 'dia', 'dia_da_semana', 'confirmados', 'confirmados_novos',
       'recuperados', 'obitos', 'internados', 'internados_uci', 'lab',
       'suspeitos', 'vigilancia', 'n_confirmados', 'confirmados_0_9_f',
       'confirmados_0_9_m', 'confirmados_10_19_f', 'confirmados_10_19_m',
       'confirmados_20_29_f', 'confirmados_20_29_m', 'confirmados_30_39_f',
       'confirmados_30_39_m', 'confirmados_40_49_f', 'confirmados_40_49_m',
       'confirmados_50_59_f', 'confirmados_50_59_m', 'confirmados_60_69_f',
       'confirmados_60_69_m', 'confirmados_70_79_f', 'confirmados_70_79_m',
       'confirmados_80_plus_f', 'confirmados_80_plus_m', 'sintomas_tosse',
       'sintomas_febre', 'sintomas_dificuldade_respiratoria',
       'sintomas_cefaleia', 'sintomas_dores_musculares',
       'sintomas_fraqueza_generalizada', 'confirmados_f', 'confirmados_m',
       'obitos_0_9_f', 'obitos_0_9_m', 'obitos_10_19_f', 'obitos_10_19_m',
       'obitos_20_29_f', 'obitos_20_29_m', 'obitos_30_39_f', 

In [163]:
# Listando colunas nulas após definição do período
null_cols = []

for col in df_train.columns:
  if df_train[col].sum() == 0:
    null_cols.append(col)

print(null_cols)

['lab', 'suspeitos', 'n_confirmados', 'sintomas_tosse', 'sintomas_febre', 'sintomas_dificuldade_respiratoria', 'sintomas_cefaleia', 'sintomas_dores_musculares', 'sintomas_fraqueza_generalizada']


In [164]:
# Excluindo colunas nulas
df_train = df_train.drop(null_cols, axis=1)
df_test = df_test.drop(null_cols, axis=1)

In [165]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 302 entries, 2021-03-15 to 2022-01-21
Data columns (total 56 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   mes                    302 non-null    int64  
 1   dia                    302 non-null    int64  
 2   dia_da_semana          302 non-null    int64  
 3   confirmados            302 non-null    int64  
 4   confirmados_novos      302 non-null    int64  
 5   recuperados            302 non-null    float64
 6   obitos                 302 non-null    float64
 7   internados             302 non-null    float64
 8   internados_uci         302 non-null    float64
 9   vigilancia             302 non-null    float64
 10  confirmados_0_9_f      302 non-null    float64
 11  confirmados_0_9_m      302 non-null    float64
 12  confirmados_10_19_f    302 non-null    float64
 13  confirmados_10_19_m    302 non-null    float64
 14  confirmados_20_29_f    302 non-null    

## Correlação

In [166]:
round(df_train.corr(),4)

Unnamed: 0,mes,dia,dia_da_semana,confirmados,confirmados_novos,recuperados,obitos,internados,internados_uci,vigilancia,confirmados_0_9_f,confirmados_0_9_m,confirmados_10_19_f,confirmados_10_19_m,confirmados_20_29_f,confirmados_20_29_m,confirmados_30_39_f,confirmados_30_39_m,confirmados_40_49_f,confirmados_40_49_m,confirmados_50_59_f,confirmados_50_59_m,confirmados_60_69_f,confirmados_60_69_m,confirmados_70_79_f,confirmados_70_79_m,confirmados_80_plus_f,confirmados_80_plus_m,confirmados_f,confirmados_m,obitos_0_9_f,obitos_0_9_m,obitos_10_19_f,obitos_10_19_m,obitos_20_29_f,obitos_20_29_m,obitos_30_39_f,obitos_30_39_m,obitos_40_49_f,obitos_40_49_m,obitos_50_59_f,obitos_50_59_m,obitos_60_69_f,obitos_60_69_m,obitos_70_79_f,obitos_70_79_m,obitos_80_plus_f,obitos_80_plus_m,obitos_f,obitos_m,ativos,internados_enfermaria,incidencia_nacional,incidencia_continente,rt_nacional,rt_continente
mes,1.0,-0.05,-0.01,-0.0,-0.25,-0.28,0.01,-0.02,-0.0,-0.17,-0.3,-0.3,-0.28,-0.28,-0.21,-0.17,-0.26,-0.23,-0.26,-0.25,-0.23,-0.22,-0.2,-0.18,-0.21,-0.22,-0.21,-0.23,-0.25,-0.25,,0.37,0.02,,-0.31,0.05,-0.02,0.19,0.14,0.26,0.21,0.12,0.13,0.1,0.12,0.08,0.09,0.08,0.0,0.02,-0.25,-0.03,-0.32,-0.31,0.12,0.13
dia,-0.05,1.0,-0.04,0.21,0.0,-0.11,-0.4,-0.4,-0.43,-0.12,0.02,0.01,0.02,0.01,0.03,0.05,0.01,0.02,0.0,0.01,-0.01,-0.0,-0.04,-0.04,-0.09,-0.09,-0.24,-0.2,-0.0,0.01,,0.27,0.13,,0.25,0.36,0.37,0.38,0.39,0.39,0.4,0.41,0.41,0.41,0.41,0.41,0.39,0.4,-0.39,-0.41,-0.12,-0.39,-0.04,-0.04,0.08,0.07
dia_da_semana,-0.01,-0.04,1.0,0.05,0.06,0.02,0.02,0.0,-0.0,0.05,0.04,0.04,0.06,0.06,0.06,0.06,0.05,0.06,0.05,0.05,0.05,0.05,0.06,0.06,0.07,0.07,0.08,0.07,0.06,0.06,,0.02,0.04,,0.02,0.03,0.01,0.02,0.02,0.02,0.03,0.02,0.02,0.02,0.02,0.01,0.02,0.02,0.02,0.02,0.06,0.01,0.05,0.05,0.01,0.01
confirmados,-0.0,0.21,0.05,1.0,0.74,0.68,-0.15,-0.1,-0.29,0.67,0.77,0.76,0.77,0.77,0.72,0.72,0.75,0.74,0.75,0.75,0.7,0.7,0.68,0.67,0.65,0.66,0.38,0.46,0.74,0.75,,0.71,0.69,,0.81,0.82,0.48,0.77,0.76,0.79,0.79,0.74,0.7,0.69,0.72,0.67,0.64,0.65,-0.15,-0.16,0.72,-0.07,0.89,0.89,0.31,0.3
confirmados_novos,-0.25,0.0,0.06,0.74,1.0,0.88,0.21,0.29,0.12,0.89,0.94,0.94,0.99,0.99,0.98,0.97,1.0,1.0,1.0,1.0,0.98,0.98,0.98,0.97,0.96,0.96,0.82,0.87,1.0,1.0,,0.18,0.73,,0.59,0.33,0.01,0.24,0.24,0.23,0.23,0.2,0.16,0.16,0.19,0.15,0.11,0.12,0.22,0.2,0.93,0.31,0.94,0.94,0.37,0.36
recuperados,-0.28,-0.11,0.02,0.68,0.88,1.0,0.38,0.45,0.3,0.9,0.85,0.85,0.88,0.88,0.84,0.81,0.88,0.86,0.88,0.88,0.86,0.85,0.87,0.86,0.89,0.89,0.8,0.85,0.88,0.88,,0.12,0.6,,0.55,0.28,-0.01,0.18,0.17,0.15,0.15,0.12,0.09,0.09,0.12,0.08,0.07,0.08,0.39,0.37,0.93,0.48,0.96,0.96,0.26,0.25
obitos,0.01,-0.4,0.02,-0.15,0.21,0.38,1.0,0.98,0.94,0.48,0.13,0.13,0.16,0.16,0.18,0.17,0.19,0.18,0.2,0.2,0.25,0.24,0.31,0.32,0.39,0.37,0.62,0.56,0.22,0.2,,-0.33,0.05,,-0.4,-0.46,-0.61,-0.49,-0.53,-0.48,-0.53,-0.59,-0.6,-0.6,-0.57,-0.61,-0.58,-0.58,0.99,0.99,0.41,0.98,0.81,0.81,0.21,0.19
internados,-0.02,-0.4,0.0,-0.1,0.29,0.45,0.98,1.0,0.97,0.56,0.21,0.21,0.24,0.24,0.26,0.25,0.26,0.26,0.27,0.27,0.32,0.31,0.38,0.39,0.45,0.44,0.66,0.62,0.29,0.28,,-0.33,0.12,,-0.34,-0.44,-0.6,-0.48,-0.52,-0.48,-0.52,-0.58,-0.59,-0.59,-0.56,-0.6,-0.57,-0.57,0.97,0.98,0.49,1.0,0.84,0.85,0.27,0.25
internados_uci,-0.0,-0.43,-0.0,-0.29,0.12,0.3,0.94,0.97,1.0,0.4,0.03,0.03,0.08,0.07,0.11,0.1,0.1,0.1,0.1,0.11,0.16,0.15,0.21,0.23,0.28,0.27,0.53,0.47,0.12,0.11,,-0.43,-0.0,,-0.47,-0.56,-0.62,-0.58,-0.62,-0.58,-0.63,-0.67,-0.67,-0.66,-0.64,-0.67,-0.63,-0.64,0.93,0.95,0.32,0.96,0.32,0.33,0.03,0.02
vigilancia,-0.17,-0.12,0.05,0.67,0.89,0.9,0.48,0.56,0.4,1.0,0.85,0.85,0.89,0.89,0.85,0.83,0.88,0.87,0.88,0.88,0.85,0.84,0.87,0.86,0.89,0.9,0.84,0.87,0.89,0.88,,0.09,0.66,,0.44,0.19,-0.11,0.15,0.1,0.11,0.11,0.06,0.02,0.03,0.06,0.01,-0.01,0.0,0.49,0.48,0.97,0.58,0.96,0.96,0.39,0.38


# Preparação dos dados para o Modelo de Machine Learning

## Seleção de Features e definição do Target

In [167]:
# Armazenando os nomes das colunas como lista

features_names = df_train.columns.to_list()

# Definindo a variável alvo e as features
target_name = 'internados_uci' #'recuperados' #'confirmados_novos' #'obitos' #'internados'

confirmados_list = ['confirmados_novos', 'confirmados', 'confirmados_0_9_f', 'confirmados_0_9_m',
                    'confirmados_10_19_f', 'confirmados_10_19_m', 'confirmados_20_29_f',
                    'confirmados_20_29_m', 'confirmados_30_39_f', 'confirmados_30_39_m',
                    'confirmados_40_49_f', 'confirmados_40_49_m', 'confirmados_50_59_f',
                    'confirmados_50_59_m', 'confirmados_60_69_f', 'confirmados_60_69_m',
                    'confirmados_70_79_f', 'confirmados_70_79_m', 'confirmados_80_plus_f',
                    'confirmados_80_plus_m', 'confirmados_f', 'confirmados_m']

obitos_list = ['obitos', 'obitos_0_9_m', 'obitos_0_9_f', 'obitos_10_19_m', 'obitos_10_19_f', 'obitos_20_29_f', 'obitos_20_29_m',
               'obitos_30_39_f', 'obitos_30_39_m', 'obitos_40_49_f', 'obitos_40_49_m', 
               'obitos_50_59_f', 'obitos_50_59_m', 'obitos_60_69_f', 'obitos_60_69_m', 
               'obitos_70_79_f', 'obitos_70_79_m', 'obitos_80_plus_f', 'obitos_80_plus_m', 
               'obitos_f', 'obitos_m']

internados_list = ['internados', 'internados_uci', 'internados_enfermaria'] #+ obitos_list

if target_name == 'confirmados_novos':
  for name in confirmados_list:
    features_names.remove(name)
elif target_name == 'obitos':
  for name in obitos_list:
    features_names.remove(name)
elif target_name == 'internados':
  for name in internados_list:
    features_names.remove(name)
elif target_name == 'internados_uci':
  features_names.remove(target_name)
  features_names.remove('internados')
  #features_names.remove('internados_enfermaria')
else:
  features_names.remove(target_name)

print(features_names)

['mes', 'dia', 'dia_da_semana', 'confirmados', 'confirmados_novos', 'recuperados', 'obitos', 'vigilancia', 'confirmados_0_9_f', 'confirmados_0_9_m', 'confirmados_10_19_f', 'confirmados_10_19_m', 'confirmados_20_29_f', 'confirmados_20_29_m', 'confirmados_30_39_f', 'confirmados_30_39_m', 'confirmados_40_49_f', 'confirmados_40_49_m', 'confirmados_50_59_f', 'confirmados_50_59_m', 'confirmados_60_69_f', 'confirmados_60_69_m', 'confirmados_70_79_f', 'confirmados_70_79_m', 'confirmados_80_plus_f', 'confirmados_80_plus_m', 'confirmados_f', 'confirmados_m', 'obitos_0_9_f', 'obitos_0_9_m', 'obitos_10_19_f', 'obitos_10_19_m', 'obitos_20_29_f', 'obitos_20_29_m', 'obitos_30_39_f', 'obitos_30_39_m', 'obitos_40_49_f', 'obitos_40_49_m', 'obitos_50_59_f', 'obitos_50_59_m', 'obitos_60_69_f', 'obitos_60_69_m', 'obitos_70_79_f', 'obitos_70_79_m', 'obitos_80_plus_f', 'obitos_80_plus_m', 'obitos_f', 'obitos_m', 'ativos', 'internados_enfermaria', 'incidencia_nacional', 'incidencia_continente', 'rt_nacional',

In [168]:
y_train = df_train[[target_name]]
y_test = df_test[[target_name]]

X_train = df_train[features_names]
X_test = df_test[features_names]

In [169]:
#X_train.info()

In [170]:
# Descrição das features selecionadas
pd.options.display.float_format = '{:.2f}'.format

X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mes,302.0,7.48,2.99,1.0,5.0,8.0,10.0,12.0
dia,302.0,16.1,8.75,1.0,9.0,16.0,23.0,31.0
dia_da_semana,302.0,2.99,1.99,0.0,1.0,3.0,5.0,6.0
confirmados,302.0,993315.18,252809.67,431623.0,839621.5,964743.5,1083418.5,2690690.0
confirmados_novos,302.0,3680.2,8469.54,158.0,598.0,1182.5,2886.25,58530.0
recuperados,302.0,2766.99,5431.81,226.0,605.0,1088.5,2631.0,44610.0
obitos,302.0,17.95,37.16,0.0,4.0,8.0,14.0,258.0
vigilancia,302.0,59468.3,68915.38,14734.0,22031.5,34922.0,74863.0,639307.0
confirmados_0_9_f,302.0,190.79,588.26,0.0,22.25,56.0,136.0,4768.0
confirmados_0_9_m,302.0,199.96,623.92,0.0,24.25,59.5,130.75,5116.0


In [171]:
# Descrição da variável alvo
y_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
internados_uci,302.0,155.95,154.39,49.0,75.0,119.0,156.5,904.0


In [172]:
# Pipeline para preparação das variáveis numéricas

numeric_pipeline = Pipeline(steps=[
                                   ('imputer', SimpleImputer(strategy='median')), # fill_value=0
                                   #('normalization', MinMaxScaler())
])

In [173]:
# Ajustando variáveis numéricas usando o pipeline

X_train_prepared = numeric_pipeline.fit_transform(X_train)
y_train_prepared = numeric_pipeline.fit_transform(y_train)

X_test_prepared = numeric_pipeline.fit_transform(X_test)
y_test_prepared = numeric_pipeline.fit_transform(y_test)

# Implementação do XGBoost

### Definição da seed e Instanciando o XGBoost

In [174]:
seed = 1275

xgb = XGBRegressor( #tree_method = 'gpu_hist',
                    booster='gbtree',
                    objective='reg:squarederror', max_depth=15,
                    learning_rate=0.1, n_estimators=100,
                    random_state=seed, n_jobs=-1
)

### Time Series cross-validator

In [175]:
tscv = TimeSeriesSplit(gap=0, max_train_size=None, n_splits=10, test_size=None)
print(tscv)

TimeSeriesSplit(gap=0, max_train_size=None, n_splits=10, test_size=None)


### Avaliação do Modelo antes da busca pelos melhores parâmetros

In [176]:
# Avaliação do modelo antes da busca de parâmetros

def evaluate(model, X, y, cv):
    cv_results = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error", "neg_mean_squared_error", "neg_mean_absolute_percentage_error"],
    )
    mae = -cv_results["test_neg_mean_absolute_error"]
    mape = -cv_results['test_neg_mean_absolute_percentage_error']
    mse = -cv_results['test_neg_mean_squared_error']
    rmse = -cv_results["test_neg_root_mean_squared_error"]
    
    print(
        f"Mean Absolute Error (MAE):     {mae.mean():.3f} +/- {mae.std():.3f}\n"
        f"Mean Absolute Percentage Error (MAPE): {mape.mean():.3f} +/- {mape.std():.3f}\n"
        f"Mean Squared Error (MSE): {mse.mean():.3f} +/- {mse.std():.3f}\n"
        f"Root Mean Squared Error (RMSE): {rmse.mean():.3f} +/- {rmse.std():.3f}"
    )
    #print(cv_results.keys())

evaluate(xgb, X_train_prepared, y_train_prepared, cv=tscv)

Mean Absolute Error (MAE):     14.412 +/- 7.763
Mean Absolute Percentage Error (MAPE): 0.106 +/- 0.049
Mean Squared Error (MSE): 1301.705 +/- 2080.481
Root Mean Squared Error (RMSE): 28.320 +/- 22.354


### Busca dos Melhores Parâmetros

In [177]:
param_distributions = [
                        {
                          'n_estimators': [100, 120, 130, 140, 150, 160, 200, 225, 235, 250, 265, 275], 
                          'learning_rate':[0.01, 0.03, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
                          'max_depth':[4, 5, 6, 7, 8, 9, 10, 12, 14],
                          'booster':['gbtree', 'gblinear'],
                          'objective':['reg:squarederror'], #,'reg:logistic'],
                          'gamma':[0, 0.1, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0],
                          #'tree_method': ['gpu_hist']
                        }
                      ]

In [178]:
rnd_search = RandomizedSearchCV(estimator=xgb, 
                                param_distributions = param_distributions, 
                                n_iter=50, scoring='neg_mean_squared_error', #'neg_root_mean_squared_error', #'neg_mean_absolute_percentage_error'
                                n_jobs=-1, cv=tscv, random_state=seed
                              )
rnd_search.fit(X_train_prepared, y_train_prepared)

RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=10, test_size=None),
                   estimator=XGBRegressor(max_depth=15, n_jobs=-1,
                                          objective='reg:squarederror',
                                          random_state=1275),
                   n_iter=50, n_jobs=-1,
                   param_distributions=[{'booster': ['gbtree', 'gblinear'],
                                         'gamma': [0, 0.1, 0.5, 1.0, 1.5, 2.0,
                                                   2.5, 3.0],
                                         'learning_rate': [0.01, 0.03, 0.05,
                                                           0.1, 0.2, 0.3, 0.4,
                                                           0.5, 0.6, 0.7, 0.8],
                                         'max_depth': [4, 5, 6, 7, 8, 9, 10, 12,
                                                       14],
                                         'n_estimators': [100, 120, 

In [179]:
print('Melhores parâmetros: ', rnd_search.best_params_)

Melhores parâmetros:  {'objective': 'reg:squarederror', 'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.03, 'gamma': 0, 'booster': 'gbtree'}


### Avaliação do Modelo após a Busca pelos Melhores Parâmetros

In [180]:
# Avaliação do modelo após a busca por parâmetros

xgb = XGBRegressor(**rnd_search.best_params_, random_state=seed)
print(xgb.get_params)
print('\n')

evaluate(xgb, X_train_prepared, y_train_prepared, cv=tscv)

<bound method XGBModel.get_params of XGBRegressor(learning_rate=0.03, max_depth=10, n_estimators=200,
             objective='reg:squarederror', random_state=1275)>


Mean Absolute Error (MAE):     14.672 +/- 8.010
Mean Absolute Percentage Error (MAPE): 0.107 +/- 0.050
Mean Squared Error (MSE): 1233.808 +/- 2039.850
Root Mean Squared Error (RMSE): 27.920 +/- 21.314


## Testando o Modelo

In [181]:
xgb = XGBRegressor(**rnd_search.best_params_, random_state=seed)
xgb.fit(X_train_prepared,y_train_prepared)
y_predicted = xgb.predict(X_test_prepared)

y_predicted

array([156.29758, 159.949  , 159.62973, 156.09949, 152.22345, 156.90675,
       153.06445, 153.06445, 154.87793, 155.11096], dtype=float32)

In [182]:
# Comparação dos resultados

print('(Teste, Previsão) --- Previsão-Teste')

for pair in zip(np.reshape(y_test_prepared, len(y_test_prepared)), np.round(y_predicted,0)):
  print(pair, '---', pair[1]-pair[0])

(Teste, Previsão) --- Previsão-Teste
(154.0, 156.0) --- 2.0
(160.0, 160.0) --- 0.0
(172.0, 160.0) --- -12.0
(158.0, 156.0) --- -2.0
(154.0, 152.0) --- -2.0
(147.0, 157.0) --- 10.0
(152.0, 153.0) --- 1.0
(153.0, 153.0) --- 0.0
(160.0, 155.0) --- -5.0
(160.0, 155.0) --- -5.0


In [183]:
# Métricas para o conjunto de testes
def test_metrics(y_pred, y_test):
  mae = mean_absolute_error(y_pred, y_test)
  mape = mean_absolute_percentage_error(y_pred, y_test)
  mse = mean_squared_error(y_pred, y_test)
  rmse = np.sqrt(mse)
      
  print(
        f"Mean Absolute Error (MAE):     {mae.mean():.3f} +/- {mae.std():.3f}\n"
        f"Mean Absolute Percentage Error (MAPE): {mape.mean():.3f} +/- {mape.std():.3f}\n"
        f"Mean Squared Error (MSE): {mse.mean():.3f} +/- {mse.std():.3f}\n"
        f"Root Mean Squared Error (RMSE): {rmse.mean():.3f} +/- {rmse.std():.3f}"
       )
  
test_metrics(y_predicted, y_test_prepared)

Mean Absolute Error (MAE):     3.944 +/- 0.000
Mean Absolute Percentage Error (MAPE): 0.025 +/- 0.000
Mean Squared Error (MSE): 31.449 +/- 0.000
Root Mean Squared Error (RMSE): 5.608 +/- 0.000
