In [1]:
#Importando bibliotecas
import re
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from datetime import datetime

In [5]:
# Source CSV hosted on GitHub; the '?raw=true' query asks for the file
# contents instead of the HTML viewer page.
url = 'https://github.com/neylsoncrepalde/projeto_eda_covid/blob/master/covid_19_data.csv?raw=true'

# Parse the two timestamp columns as datetimes while reading.
df = pd.read_csv(url, parse_dates=['ObservationDate', 'Last Update'])
df.head(5)

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,2020-01-22,Anhui,Mainland China,2020-01-22 17:00:00,1.0,0.0,0.0
1,2,2020-01-22,Beijing,Mainland China,2020-01-22 17:00:00,14.0,0.0,0.0
2,3,2020-01-22,Chongqing,Mainland China,2020-01-22 17:00:00,6.0,0.0,0.0
3,4,2020-01-22,Fujian,Mainland China,2020-01-22 17:00:00,1.0,0.0,0.0
4,5,2020-01-22,Gansu,Mainland China,2020-01-22 17:00:00,0.0,0.0,0.0


In [6]:
# Column dtypes and non-null counts (reveals which columns have missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26713 entries, 0 to 26712
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   SNo              26713 non-null  int64         
 1   ObservationDate  26713 non-null  datetime64[ns]
 2   Province/State   12882 non-null  object        
 3   Country/Region   26713 non-null  object        
 4   Last Update      26713 non-null  datetime64[ns]
 5   Confirmed        26713 non-null  float64       
 6   Deaths           26713 non-null  float64       
 7   Recovered        26713 non-null  float64       
dtypes: datetime64[ns](2), float64(3), int64(1), object(2)
memory usage: 1.6+ MB


In [7]:
# Summary statistics of the numeric columns, with the SNo column left out;
# transposed so each column becomes one row.
df.describe().drop(columns='SNo').T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Confirmed,26713.0,5690.64755,23417.369124,0.0,18.0,192.0,1350.0,352845.0
Deaths,26713.0,373.353236,2214.074387,0.0,0.0,3.0,28.0,35341.0
Recovered,26713.0,1736.979224,10864.727709,0.0,0.0,8.0,187.0,289392.0


In [8]:
# Summary (count / unique / top / freq) of the object-dtype columns.
df.describe(include='O').T


Unnamed: 0,count,unique,top,freq
Province/State,12882,354,Diamond Princess cruise ship,175
Country/Region,26713,223,US,4990


In [9]:
# Normalise the column names: '/' and spaces become '_', then everything is
# lower-cased (e.g. 'Province/State' -> 'province_state').
df.columns = (
    df.columns
    .str.replace(r'[/ ]', '_', regex=True)
    .str.lower()
)

In [10]:
# Peek at a few random rows. The fixed random_state keeps the displayed sample
# identical across "Restart & Run All" executions.
df.sample(4, random_state=42)

Unnamed: 0,sno,observationdate,province_state,country_region,last_update,confirmed,deaths,recovered
12075,12076,2020-04-05,,Slovenia,2020-04-05 23:13:44,997.0,28.0,79.0
8370,8371,2020-03-24,,Venezuela,2020-03-24 23:41:50,84.0,0.0,15.0
5462,5463,2020-03-14,Florida,US,2020-03-14 18:53:03,76.0,3.0,0.0
19394,19395,2020-04-28,,Moldova,2020-04-29 02:32:29,3638.0,103.0,975.0



# Análise dos casos de Covid-19 no Brasil

In [12]:
# Count the non-null values per column among the Brazil rows; this shows
# whether province_state is populated for Brazil.
df.loc[df.country_region == 'Brazil'].notnull().sum()

sno                85
observationdate    85
province_state      0
country_region     85
last_update        85
confirmed          85
deaths             85
recovered          85
dtype: int64

In [13]:
# Keep only the Brazil rows and drop the province_state and sno columns
# (province_state has no non-null value for Brazil, per the count above).
df_brasil = df.loc[df.country_region == 'Brazil'].drop(columns=['province_state','sno'])
df_brasil

Unnamed: 0,observationdate,country_region,last_update,confirmed,deaths,recovered
82,2020-01-23,Brazil,2020-01-23 17:00:00,0.0,0.0,0.0
2455,2020-02-26,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2559,2020-02-27,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2668,2020-02-28,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2776,2020-02-29,Brazil,2020-02-29 21:03:05,2.0,0.0,0.0
...,...,...,...,...,...,...
24850,2020-05-15,Brazil,2020-05-16 02:32:19,220291.0,14962.0,84970.0
25227,2020-05-16,Brazil,2020-05-17 02:32:32,233511.0,15662.0,89672.0
25604,2020-05-17,Brazil,2020-05-18 02:32:21,241080.0,16118.0,94122.0
25981,2020-05-18,Brazil,2020-05-19 02:32:18,255368.0,16853.0,100459.0


# Número de casos confirmados


In [14]:
# Discard the rows that have zero confirmed cases, then check the new shape.
df_brasil = df_brasil[df_brasil.confirmed > 0]
df_brasil.shape

(84, 6)

In [15]:
# Line chart of the confirmed-case count in Brazil by observation date.
fig = px.line(
    df_brasil,
    x='observationdate',
    y='confirmed',
    title='Casos confirmados no Brasil',
    labels={'observationdate': 'Data', 'confirmed': 'Número de casos confirmados'},
    width=1000,
    height=400,
)

# Tighten the margins and set the figure-wide font.
fig.update_layout(
    margin={'l': 30, 'r': 20, 't': 60, 'b': 5},
    font={'size': 15, 'color': 'black'},
)
fig.show()

# Número de novos casos por dia


In [16]:
def dif(v):
    """Return the element-to-element increments of a numeric sequence.

    Used to turn a running total into per-step counts: the first entry of
    the result is ``v[0]`` itself and every later entry is the current value
    minus the previous one.

    Parameters
    ----------
    v : array-like
        One-dimensional sequence of numbers (list, NumPy array, pandas
        Series, ...). Elements are always read by position.

    Returns
    -------
    list
        A list with the same length as ``v``; empty when ``v`` is empty.
    """
    # np.asarray guarantees positional indexing, so a pandas Series whose
    # index does not contain the label 0 no longer raises KeyError.
    values = np.asarray(v)
    # values[:1] is the first element, or nothing at all for empty input,
    # which avoids the IndexError that v[0] would raise on an empty sequence.
    return np.concatenate((values[:1], np.diff(values))).tolist()

In [17]:
def dif2(v):
    """Return the element-to-element increments of ``v`` as a NumPy array.

    Same computation as ``dif`` (first entry is ``v[0]``, each later entry
    is the current value minus the previous one), but the result is an
    ``ndarray`` instead of a list.

    Parameters
    ----------
    v : array-like
        One-dimensional sequence of numbers; elements are read by position.

    Returns
    -------
    numpy.ndarray
        Array with the same length as ``v``; empty when ``v`` is empty.
    """
    # Convert once so that indexing is positional for any input type.
    values = np.asarray(v)
    # values[:1] keeps the first element and is simply empty for empty input,
    # so no IndexError is raised in that case.
    return np.concatenate((values[:1], np.diff(values)))

In [18]:
# Add a 'novoscasos' column: each row's 'confirmed' minus the previous row's
# (the first row keeps its own value). .values hands dif() a plain NumPy
# array, so its v[0] / v[i] lookups are positional.
df_brasil = df_brasil.assign( novoscasos=dif(df_brasil['confirmed'].values) )
df_brasil

Unnamed: 0,observationdate,country_region,last_update,confirmed,deaths,recovered,novoscasos
2455,2020-02-26,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0,1.0
2559,2020-02-27,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0,0.0
2668,2020-02-28,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0,0.0
2776,2020-02-29,Brazil,2020-02-29 21:03:05,2.0,0.0,0.0,1.0
2903,2020-03-01,Brazil,2020-02-29 21:03:05,2.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
24850,2020-05-15,Brazil,2020-05-16 02:32:19,220291.0,14962.0,84970.0,17126.0
25227,2020-05-16,Brazil,2020-05-17 02:32:32,233511.0,15662.0,89672.0,13220.0
25604,2020-05-17,Brazil,2020-05-18 02:32:21,241080.0,16118.0,94122.0,7569.0
25981,2020-05-18,Brazil,2020-05-19 02:32:18,255368.0,16853.0,100459.0,14288.0


In [19]:
# Line chart of the new cases per day.
fig = px.line(
    df_brasil,
    x='observationdate',
    y='novoscasos',
    labels={'observationdate': 'Data', 'novoscasos': 'Novos casos'},
    title='Novos casos por dia',
    width=1000,
    height=400,
)

# Tighten the margins and set the figure-wide font.
fig.update_layout(
    margin={'l': 20, 'r': 20, 't': 60, 'b': 0},
    font={'family': "Times New Roman", 'size': 18},
)
fig.show()