In [1]:
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline
plt.style.use('ggplot')

data_raw_file = '../../Data/Raw/covid19_data_modeling.parquet'
data_proc_file = '../../Data/Processed/covid19_data_modeling.parquet'


pd.plotting.register_matplotlib_converters()

# Leitura dos Dados 

In [2]:
data = pd.read_parquet(data_raw_file)

print('shape:', data.shape)
print('columns:', data.columns)
data.head()

shape: (27154, 5)
columns: Index(['cases', 'countrycode', 'date', 'deaths', 'recovered'], dtype='object')


Unnamed: 0,cases,countrycode,date,deaths,recovered
0,852,AD,6/06/20,51,1
1,852,AD,6/05/20,51,1
2,852,AD,6/04/20,51,1
3,851,AD,6/03/20,51,1
4,844,AD,6/02/20,51,1


# Estatistica dos dados 

In [3]:
data.describe()

Unnamed: 0,cases,countrycode,date,deaths,recovered
count,27154,27154,27154,27154,27154
unique,10468,190,187,3518,566
top,1,JP,5/31/20,0,0
freq,796,187,189,6115,11246


# Correção do tipo dos dados

In [4]:
data['date'] = pd.to_datetime(data.date, format='%m/%d/%y')

data_types = {
    'cases': float,
    'countrycode': str,
    'deaths': float,
    'recovered': float,
}
for cname, dtype in data_types.items():
    if dtype == float:
        data.loc[data.recovered.astype(str) == '',cname] = '0'
    
    data[cname] = data[cname].astype(dtype)
data.describe()

Unnamed: 0,cases,deaths,recovered
count,27154.0,27154.0,27154.0
mean,29736.06,1562.512816,6415.592
std,166153.8,8287.240094,47517.53
min,0.0,0.0,0.0
25%,72.0,1.0,0.0
50%,874.0,15.0,2.0
75%,7457.0,161.0,33.0
max,4227364.0,146889.0,1089686.0


# Verificar Valores Nulos

In [5]:
data.isnull().sum()

cases          0
countrycode    0
date           0
deaths         0
recovered      0
dtype: int64

# Variável Alvo 

In [6]:
data.groupby('countrycode').cases.sum().sort_values(ascending=False).head(10)

countrycode
US    219284308.0
BR     86189347.0
RU     44566385.0
IN     38243347.0
ES     27125161.0
GB     26932978.0
IT     26496208.0
FR     21165421.0
DE     20848479.0
IR     19045661.0
Name: cases, dtype: float64

# Exportar Base de Dados 

In [7]:
data.to_parquet(data_proc_file)