<a href="https://colab.research.google.com/github/yvpaulo/PythonMLCovid19_Previsao_Evolucao/blob/main/ModeloCovid19DIOLab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Projeto Covid19
##Digital Innovation One

In [1]:
import pandas as pd
import numpy as np # lib para computação científica
from datetime import datetime #para trabalhar com datas
import plotly.express as px #para visualização (gráficos)
import plotly.graph_objects as go #para gráficos mais complexos

In [4]:
# importar dados do git 
url = 'https://github.com/yvpaulo/PythonMLCovid19_Previsao_Evolucao/blob/main/covid_19_data.csv?raw=true'

In [6]:
# criando o data frame e informando os campos que são datas
df = pd.read_csv(url, parse_dates=['ObservationDate','Last Update'])
df

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,2020-01-22,Anhui,Mainland China,2020-01-22 17:00:00,1.0,0.0,0.0
1,2,2020-01-22,Beijing,Mainland China,2020-01-22 17:00:00,14.0,0.0,0.0
2,3,2020-01-22,Chongqing,Mainland China,2020-01-22 17:00:00,6.0,0.0,0.0
3,4,2020-01-22,Fujian,Mainland China,2020-01-22 17:00:00,1.0,0.0,0.0
4,5,2020-01-22,Gansu,Mainland China,2020-01-22 17:00:00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
26708,26709,2020-05-19,Wyoming,US,2020-05-20 02:32:19,776.0,10.0,0.0
26709,26710,2020-05-19,Xinjiang,Mainland China,2020-05-20 02:32:19,76.0,3.0,73.0
26710,26711,2020-05-19,Yukon,Canada,2020-05-20 02:32:19,11.0,0.0,11.0
26711,26712,2020-05-19,Yunnan,Mainland China,2020-05-20 02:32:19,185.0,2.0,183.0


In [7]:
#Conferindo tipos de cada coluna
df.dtypes

SNo                         int64
ObservationDate    datetime64[ns]
Province/State             object
Country/Region             object
Last Update        datetime64[ns]
Confirmed                 float64
Deaths                    float64
Recovered                 float64
dtype: object

Nomes de colunas não devem ter letras maiúsculas e nem caracteres especiais. Vamos agora implementar uma função para fazer a limpeza dos domes das colunas

In [9]:
#fazendo importe de lib para trabalhar com expressões regulares, obs: o recomendável é fazer os importes todos na parte superior do código
import re
#criando a função que remove caracteres especiais e espaços vazios e coloca o nomes das colunas em minúsculo.
def corrige_colunas(col_name):
  return re.sub(r"[/| ]", "", col_name).lower()

In [10]:
#testando a função
corrige_colunas("AdgE/P ou")

'adgepou'

In [12]:
#corrigindo as colunas
#Criamos uma lista e para cada coluna dentro do data frame aplicamos a função e corrigimos as colunas
df.columns = [corrige_colunas(col) for col in df.columns]
df

Unnamed: 0,sno,observationdate,provincestate,countryregion,lastupdate,confirmed,deaths,recovered
0,1,2020-01-22,Anhui,Mainland China,2020-01-22 17:00:00,1.0,0.0,0.0
1,2,2020-01-22,Beijing,Mainland China,2020-01-22 17:00:00,14.0,0.0,0.0
2,3,2020-01-22,Chongqing,Mainland China,2020-01-22 17:00:00,6.0,0.0,0.0
3,4,2020-01-22,Fujian,Mainland China,2020-01-22 17:00:00,1.0,0.0,0.0
4,5,2020-01-22,Gansu,Mainland China,2020-01-22 17:00:00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
26708,26709,2020-05-19,Wyoming,US,2020-05-20 02:32:19,776.0,10.0,0.0
26709,26710,2020-05-19,Xinjiang,Mainland China,2020-05-20 02:32:19,76.0,3.0,73.0
26710,26711,2020-05-19,Yukon,Canada,2020-05-20 02:32:19,11.0,0.0,11.0
26711,26712,2020-05-19,Yunnan,Mainland China,2020-05-20 02:32:19,185.0,2.0,183.0


In [13]:
#Vamos Agora selecionar somente os dados do Brasil
df.loc[df.countryregion == 'Brazil']

Unnamed: 0,sno,observationdate,provincestate,countryregion,lastupdate,confirmed,deaths,recovered
82,83,2020-01-23,,Brazil,2020-01-23 17:00:00,0.0,0.0,0.0
2455,2456,2020-02-26,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2559,2560,2020-02-27,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2668,2669,2020-02-28,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2776,2777,2020-02-29,,Brazil,2020-02-29 21:03:05,2.0,0.0,0.0
...,...,...,...,...,...,...,...,...
24850,24851,2020-05-15,,Brazil,2020-05-16 02:32:19,220291.0,14962.0,84970.0
25227,25228,2020-05-16,,Brazil,2020-05-17 02:32:32,233511.0,15662.0,89672.0
25604,25605,2020-05-17,,Brazil,2020-05-18 02:32:21,241080.0,16118.0,94122.0
25981,25982,2020-05-18,,Brazil,2020-05-19 02:32:18,255368.0,16853.0,100459.0


In [14]:
#testado a expressão vamos pegar somente as linhas que tenham casos confirmados
brasil = df.loc[
                (df.countryregion == 'Brazil') &
                (df.confirmed > 0)
]
brasil

Unnamed: 0,sno,observationdate,provincestate,countryregion,lastupdate,confirmed,deaths,recovered
2455,2456,2020-02-26,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2559,2560,2020-02-27,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2668,2669,2020-02-28,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2776,2777,2020-02-29,,Brazil,2020-02-29 21:03:05,2.0,0.0,0.0
2903,2904,2020-03-01,,Brazil,2020-02-29 21:03:05,2.0,0.0,0.0
...,...,...,...,...,...,...,...,...
24850,24851,2020-05-15,,Brazil,2020-05-16 02:32:19,220291.0,14962.0,84970.0
25227,25228,2020-05-16,,Brazil,2020-05-17 02:32:32,233511.0,15662.0,89672.0
25604,25605,2020-05-17,,Brazil,2020-05-18 02:32:21,241080.0,16118.0,94122.0
25981,25982,2020-05-18,,Brazil,2020-05-19 02:32:18,255368.0,16853.0,100459.0


In [15]:
#verificando os casos confirmados
# Gráfico da evolução dos casos confirmados
px.line(brasil, 'observationdate', 'confirmed', title='Casos Confirmados no Brasil')

#Novos casos por dia

In [16]:
#usando Técnica de programação funcional
#o resultado dessa função é transformado em uma lista que será incluída como uma nova coluna no df
#Essa função verifica se é o primeiro caso, e se for retorna 0, a partir do segundo caso retornamos hoje menos ontem
brasil['novoscasos'] = list(map(
    lambda x: 0 if (x==0) else brasil['confirmed'].iloc[x] - brasil['confirmed'].iloc[x-1],
    np.arange(brasil.shape[0])

))
brasil



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,sno,observationdate,provincestate,countryregion,lastupdate,confirmed,deaths,recovered,novoscasos
2455,2456,2020-02-26,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0,0.0
2559,2560,2020-02-27,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0,0.0
2668,2669,2020-02-28,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0,0.0
2776,2777,2020-02-29,,Brazil,2020-02-29 21:03:05,2.0,0.0,0.0,1.0
2903,2904,2020-03-01,,Brazil,2020-02-29 21:03:05,2.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
24850,24851,2020-05-15,,Brazil,2020-05-16 02:32:19,220291.0,14962.0,84970.0,17126.0
25227,25228,2020-05-16,,Brazil,2020-05-17 02:32:32,233511.0,15662.0,89672.0,13220.0
25604,25605,2020-05-17,,Brazil,2020-05-18 02:32:21,241080.0,16118.0,94122.0,7569.0
25981,25982,2020-05-18,,Brazil,2020-05-19 02:32:18,255368.0,16853.0,100459.0,14288.0


In [18]:
#visualizando
px.line(brasil, x='observationdate', y='novoscasos', title='Novos Casos por dia')

# Verificando o Nº de mortes

In [19]:
fig =go.Figure()
fig.add_trace(
    go.Scatter(x=brasil.observationdate, y=brasil.deaths, name='Mortes',
               mode='lines+markers', line={'color':'red'})
)
#Layout
fig.update_layout(title='Mortes por COVID-19 no Brasil')
fig.show()

#Taxa de crescimento
taxa_crescimento = (presente/passado)**(1/n) -1


In [20]:
def taxa_crescimento(data, variable, data_inicio=None, data_fim=None):
  #Se data início for None define como a primeira data disponíel
  if data_inicio == None:
    data_inicio = data.observationdate.loc[data[variable] > 0].min()
  else:
    data_inicio = pd.to_datetime(data_inicio)
  
  if data_fim == None:
    data_fim = data.observationdate.iloc[-1]
  else:
    data_fim = pd.to_datetime(data_fim)
  #Definir os valores do presente e passado
  passado = data.loc[data.observationdate == data_inicio, variable].values[0]
  presente = data.loc[data.observationdate == data_fim, variable].values[0]
  
  #Definir o número de pontos no tempo que pamos avaliar
  n = (data_fim - data_inicio).days

  #Calcular taxa
  taxa = (presente/passado)**(1/n) - 1

  return taxa*100

In [21]:
# Taxa de crescimento médio do COVID no Brasil em todo o período
taxa_crescimento(brasil, 'confirmed') #=16% ao dia

16.27183353112116

In [None]:
#parei em Estabelecendo a taxa de crescimento diário