In [1]:
import pandas as pd
import os

In [2]:
# Dataset folder and input file names used by the cells below.
pasta_datasets = "../Datasets/"   # folder that holds the CSV files (relative path)
arquivo_ipca = "ipca_mensal.csv"  # IPCA input file
arquivo_dolar = "dolar.csv"       # dollar quotes input file

In [3]:
# Read the IPCA file configured in `arquivo_ipca` (';'-separated, UTF-8,
# day-first dates) and use the parsed 'Data' column as the index.
caminho_ipca = pasta_datasets + arquivo_ipca
print(caminho_ipca)

df_ipca = pd.read_csv(
    caminho_ipca,
    sep=';',
    low_memory=False,
    parse_dates=['Data'],
    dayfirst=True,
    encoding='utf-8',
).set_index(['Data'])

# Show the frame and its column/dtype summary.
display(df_ipca)
df_ipca.info()



../Datasets/ipca_mensal.csv


Unnamed: 0_level_0,Indice_Dez93,Ipca_Mensal
Data,Unnamed: 1_level_1,Unnamed: 2_level_1
2005-01-01,2412.83,0.58
2005-02-01,2427.07,0.59
2005-03-01,2441.87,0.61
2005-04-01,2463.11,0.87
2005-05-01,2475.18,0.49
...,...,...
2021-08-01,5876.05,0.87
2021-09-01,5944.21,1.16
2021-10-01,6018.51,1.25
2021-11-01,6075.69,0.95


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 204 entries, 2005-01-01 to 2021-12-01
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Indice_Dez93  204 non-null    float64
 1   Ipca_Mensal   204 non-null    float64
dtypes: float64(2)
memory usage: 4.8 KB


In [4]:
# Read the dollar quotes file configured in `arquivo_dolar` (';'-separated,
# UTF-8, day-first dates) and use the parsed 'Data' column as the index.
caminho_dolar = pasta_datasets + arquivo_dolar
print(caminho_dolar)

df_dolar = pd.read_csv(
    caminho_dolar,
    sep=';',
    low_memory=False,
    parse_dates=['Data'],
    dayfirst=True,
    encoding='utf-8',
).set_index(['Data'])

# Show the frame and its column/dtype summary.
display(df_dolar)
df_dolar.info()


../Datasets/dolar.csv


Unnamed: 0_level_0,cotacaoCompra,cotacaoVenda
Data,Unnamed: 1_level_1,Unnamed: 2_level_1
2005-01-03,2.6674,2.6682
2005-01-04,2.6879,2.6887
2005-01-05,2.7088,2.7096
2005-01-06,2.7199,2.7207
2005-01-07,2.7024,2.7032
...,...,...
2021-12-27,5.6644,5.6650
2021-12-28,5.6432,5.6438
2021-12-29,5.6613,5.6619
2021-12-30,5.5799,5.5805


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4268 entries, 2005-01-03 to 2021-12-31
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   cotacaoCompra  4268 non-null   float64
 1   cotacaoVenda   4268 non-null   float64
dtypes: float64(2)
memory usage: 100.0 KB


In [5]:
# Outer-merge df_dolar (left) with df_ipca (right) on 'Data', so that dates
# present in either frame are kept.
# Note: df_ipca only has rows dated on the 1st of each month, so the IPCA
# columns are null on every other date at this point.
df_dolar_ipca = df_dolar.merge(df_ipca, how="outer", on="Data")

display(df_dolar_ipca)
df_dolar_ipca.info()

Unnamed: 0_level_0,cotacaoCompra,cotacaoVenda,Indice_Dez93,Ipca_Mensal
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005-01-03,2.6674,2.6682,,
2005-01-04,2.6879,2.6887,,
2005-01-05,2.7088,2.7096,,
2005-01-06,2.7199,2.7207,,
2005-01-07,2.7024,2.7032,,
...,...,...,...,...
2020-08-01,,,5357.46,0.24
2020-11-01,,,5486.52,0.89
2021-01-01,,,5574.49,0.25
2021-05-01,,,5739.56,0.83


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4350 entries, 2005-01-03 to 2021-08-01
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   cotacaoCompra  4268 non-null   float64
 1   cotacaoVenda   4268 non-null   float64
 2   Indice_Dez93   204 non-null    float64
 3   Ipca_Mensal    204 non-null    float64
dtypes: float64(4)
memory usage: 169.9 KB


In [6]:
# Order the merged rows by 'Data', oldest first (ascending is the default).
df_dolar_ipca = df_dolar_ipca.sort_values(by='Data')

display(df_dolar_ipca)

Unnamed: 0_level_0,cotacaoCompra,cotacaoVenda,Indice_Dez93,Ipca_Mensal
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005-01-01,,,2412.83,0.58
2005-01-03,2.6674,2.6682,,
2005-01-04,2.6879,2.6887,,
2005-01-05,2.7088,2.7096,,
2005-01-06,2.7199,2.7207,,
...,...,...,...,...
2021-12-27,5.6644,5.6650,,
2021-12-28,5.6432,5.6438,,
2021-12-29,5.6613,5.6619,,
2021-12-30,5.5799,5.5805,,


In [7]:
# After the outer join there are two kinds of rows:
#   1 - dates present in both inputs: every column is filled;
#   2 - dates present in only one input: the other input's columns are null
#       and still have to be handled.
# The IPCA figures are monthly and apply to every day of the month, so each
# reading is carried forward to the following dates until the next reading.
#
# A vectorised forward fill replaces the previous row-by-row loop. It relies
# on the rows already being sorted by 'Data' in ascending order.
# Rows dated before the first IPCA reading (if there are any) are left null
# instead of receiving a -1 placeholder: -1 looks like a legitimate number and
# would hide the gap from the `info()` non-null counts checked afterwards.
colunas_ipca = ['Indice_Dez93', 'Ipca_Mensal']
df_dolar_ipca[colunas_ipca] = df_dolar_ipca[colunas_ipca].ffill()

display(df_dolar_ipca)


Unnamed: 0_level_0,cotacaoCompra,cotacaoVenda,Indice_Dez93,Ipca_Mensal
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005-01-01,,,2412.83,0.58
2005-01-03,2.6674,2.6682,2412.83,0.58
2005-01-04,2.6879,2.6887,2412.83,0.58
2005-01-05,2.7088,2.7096,2412.83,0.58
2005-01-06,2.7199,2.7207,2412.83,0.58
...,...,...,...,...
2021-12-27,5.6644,5.6650,6120.04,0.73
2021-12-28,5.6432,5.6438,6120.04,0.73
2021-12-29,5.6613,5.6619,6120.04,0.73
2021-12-30,5.5799,5.5805,6120.04,0.73


In [8]:
# Check whether any null values remain in the frame.
# The boolean is printed explicitly: it is not the last expression of the
# cell, so the notebook would otherwise discard it without showing anything.
print(df_dolar_ipca.isnull().values.any())

# Per-column non-null counts show which columns still have gaps.
df_dolar_ipca.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4350 entries, 2005-01-01 to 2021-12-31
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   cotacaoCompra  4268 non-null   float64
 1   cotacaoVenda   4268 non-null   float64
 2   Indice_Dez93   4350 non-null   float64
 3   Ipca_Mensal    4350 non-null   float64
dtypes: float64(4)
memory usage: 299.0 KB


In [9]:
# 82 rows still have empty 'cotacaoCompra' and 'cotacaoVenda' columns.
# These are the cases where the 1st day of the month is not a business day,
# so those rows are discarded.
df_dolar_ipca = df_dolar_ipca.dropna(subset=['cotacaoCompra'])

display(df_dolar_ipca)

Unnamed: 0_level_0,cotacaoCompra,cotacaoVenda,Indice_Dez93,Ipca_Mensal
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005-01-03,2.6674,2.6682,2412.83,0.58
2005-01-04,2.6879,2.6887,2412.83,0.58
2005-01-05,2.7088,2.7096,2412.83,0.58
2005-01-06,2.7199,2.7207,2412.83,0.58
2005-01-07,2.7024,2.7032,2412.83,0.58
...,...,...,...,...
2021-12-27,5.6644,5.6650,6120.04,0.73
2021-12-28,5.6432,5.6438,6120.04,0.73
2021-12-29,5.6613,5.6619,6120.04,0.73
2021-12-30,5.5799,5.5805,6120.04,0.73


In [10]:
# Confirm that the frame now holds business days only (dayofweek from 0 to 4).
# dayofweek encoding:
#   0 = Monday, 1 = Tuesday, 2 = Wednesday, 3 = Thursday, 4 = Friday,
#   5 = Saturday, 6 = Sunday
dias_da_semana = df_dolar_ipca.index.dayofweek
dias_da_semana.unique()

Int64Index([0, 1, 2, 3, 4], dtype='int64', name='Data')

In [11]:
# Check whether any null values remain after dropping the rows without quotes.
# The boolean is printed explicitly: it is not the last expression of the
# cell, so the notebook would otherwise discard it without showing anything.
print(df_dolar_ipca.isnull().values.any())

# Per-column non-null counts show which columns still have gaps.
df_dolar_ipca.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4268 entries, 2005-01-03 to 2021-12-31
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   cotacaoCompra  4268 non-null   float64
 1   cotacaoVenda   4268 non-null   float64
 2   Indice_Dez93   4268 non-null   float64
 3   Ipca_Mensal    4268 non-null   float64
dtypes: float64(4)
memory usage: 166.7 KB


In [12]:
# Write the merged dataset to a ';'-separated CSV file in the datasets folder,
# keeping the 'Data' index as a column of the file.
caminho_saida = pasta_datasets + 'dolar_ipca.csv'
df_dolar_ipca.to_csv(caminho_saida, sep=';', index=True)