In [16]:
import yfinance as yf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import requests
from transformers import AutoTokenizer, BertForSequenceClassification
import torch
import PyPDF2
import json
import shutil

In [17]:
#c:\\Users\\thgcn\\OneDrive\\Academico\\Financial-Reports-Impact
pwd

NameError: name 'pwd' is not defined

In [6]:
folders = ["dataset", "dataset/prices", "dataset/prices_processed"]

# Verifica se as pastas existem, se não, cria-as
for folder in folders:
    if not os.path.exists(folder):
        os.makedirs(folder)
        print(f"Pasta '{folder}' foi criada.")
    else:
        print(f"Pasta '{folder}' já existe.")

Pasta 'dataset' foi criada.
Pasta 'dataset/prices' foi criada.
Pasta 'dataset/prices_processed' foi criada.


In [3]:
# API YahooFinance para baixar os dados historicos
def HistoricalData(ticker, startDate, endDate, path2save = ''):

  """
  ticker: Simbolo ação. Ex: VALE
  startDate: Data inicial. Ex: 2010-01-01
  endDate: Data final. Ex: 2020-12-31
  path2save: Caminho para salvar o dataframe.
  """
  data = yf.download(ticker, start=startDate, end=endDate)
  df = pd.DataFrame(data)

  if path2save != '':
    df.to_csv(path2save)

  return df


dados = HistoricalData("ANIM3.SA", startDate='2010-01-01', endDate='2023-01-01')
dados

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-10-28,6.500000,6.500000,6.353333,6.403333,5.971816,12658800
2013-10-29,6.400000,6.416666,6.283333,6.310000,5.884773,1794300
2013-10-30,6.266666,6.266666,6.170000,6.170000,5.754207,1532100
2013-10-31,6.116666,6.436666,6.116666,6.393333,5.962490,2972700
2013-11-01,6.323333,6.656666,6.296666,6.633333,6.186317,539700
...,...,...,...,...,...,...
2022-12-23,3.990000,4.190000,3.920000,4.180000,4.180000,4693300
2022-12-26,4.170000,4.260000,4.060000,4.160000,4.160000,1076000
2022-12-27,4.170000,4.230000,3.830000,3.850000,3.850000,4134200
2022-12-28,3.880000,4.110000,3.820000,4.110000,4.110000,5200300


In [4]:
# Criando ferramenta para adicionar novos campos para o dataframe
def catalog_return(row, x, name_return):
    if row[name_return] > x * row[f'Cumulative_std_{name_return}']:
        return 1
    elif row[name_return] < -x * row[f'Cumulative_std_{name_return}']:
        return -1
    else:
        return 0


class DataProcessing:
    def __init__(self, data):
        self.dataframe = data
        self.dataframe['Date'] = pd.to_datetime(self.dataframe['Date'])
        self.dataframe = self.dataframe.sort_values(by='Date')

    def get_by_date_range(self, start_date, end_date):
        mask = ((self.dataframe['Date'] >= start_date) & (self.dataframe['Date'] <= end_date))
        return self.dataframe.loc[mask]

    def get_by_date(self, date):
        return self.dataframe.loc[(self.dataframe['Date'] == date)]

    def create_return_by_period(self, name_return, period, remove_nan=False):
        self.dataframe[f'{name_return}'] = np.log(
            self.dataframe['Close'] / self.dataframe['Close'].shift(period))
        if remove_nan:
            self.dataframe = self.dataframe.dropna()

    def create_cumulative_std(self, name_return):
        self.dataframe[f'Cumulative_std_{name_return}'] = self.dataframe[name_return].expanding().std()

    def create_indicator(self, name_return, factor):
        self.dataframe[f'Indicator_{name_return}'] = self.dataframe.apply(lambda row:
                                                                          catalog_return(row, factor, name_return),
                                                                          axis=1)

In [7]:
# Baixar precos de varias empresas
empresas = ["ANIM3.SA", "AZUL4.SA", "BBAS3.SA"]
for emp in empresas:
  HistoricalData(ticker=emp, startDate="2010-01-01", endDate="2024-01-01", path2save=f"dataset/prices/{emp}.csv")

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


In [8]:
files = os.listdir('dataset/prices')
for file in files:
    data_processed = DataProcessing(pd.read_csv(f'dataset/prices/{file}'))
    data_processed.create_return_by_period(name_return='Daily_Return', period=1, remove_nan=False)
    data_processed.create_return_by_period(name_return='Week_Return', period=6, remove_nan=False)
    data_processed.create_return_by_period(name_return='Month_Return', period=22, remove_nan=False)
    data_processed.create_cumulative_std(name_return='Daily_Return')
    data_processed.create_cumulative_std(name_return='Week_Return')
    data_processed.create_cumulative_std(name_return='Month_Return')
    data_processed.create_indicator(name_return='Daily_Return', factor=0.1)
    data_processed.create_indicator(name_return='Week_Return', factor=0.1)
    data_processed.create_indicator(name_return='Month_Return', factor=0.1)
    data_processed.dataframe.to_csv(f'dataset/prices_processed/{file}', index_label=False)
    print(f'File {file} created and save in dataset/prices_processed/{file}')

File ANIM3.SA.csv created and save in dataset/prices_processed/ANIM3.SA.csv
File AZUL4.SA.csv created and save in dataset/prices_processed/AZUL4.SA.csv
File BBAS3.SA.csv created and save in dataset/prices_processed/BBAS3.SA.csv


In [9]:
def EventsDate(ticker, userName="aluno.thiago.nunes", password="NLPfinance2%4023", startDate='01012010', endDate="01012024"):
  url = "https://www.comdinheiro.com.br/Clientes/API/EndPoint001.php"
  querystring = {"code":"import_data"}
  payload = f"username={userName}&password={password}&URL=HistoricoIndicadoresFundamentalistas001.php%3F%26data_ini%3D{startDate}%26data_fim%3D{endDate}%26trailing%3D12%26conv%3DMIXED%26moeda%3DMOEDA_ORIGINAL%26c_c%3Dconsolidado%26m_m%3D1000000%26n_c%3D5%26f_v%3D1%26papel%3D{ticker}%26indic%3DNOME_EMPRESA%2BRL%2BLL%2BEBITDA%2BDATA_PUBLICACAO%2BPRECO_ABERTURA%2BPRECO_FECHAMENTO%26periodicidade%3Dtri%26graf_tab%3Dtabela%26desloc_data_analise%3D1%26flag_transpor%3D0%26c_d%3Dd%26enviar_email%3D0%26enviar_email_log%3D0%26cabecalho_excel%3Dmodo1%26relat_alias_automatico%3Dcmd_alias_01&format=json3"
  headers = {'Content-Type': 'application/x-www-form-urlencoded'}
  response = requests.request("POST", url, data=payload, headers=headers, params=querystring)
  data = json.loads(response.text)
  df = pd.DataFrame(data["tables"]["tab0"]).T
  novas_colunas = ["Data", "Empresa", "Receita", "Lucro", "EBITDA", "Data_Publicacao", "Preco_Abertura", "Preco_fechamento", "Consolidado", "Convencao", "Moeda", "Data_Demonstracao", "Meses", "Data_Analise"]
  df.columns = novas_colunas
  df = df.drop("lin0")
  df['Data_Publicacao'] = pd.to_datetime(df['Data_Publicacao'], errors = 'coerce')
  df.reset_index(drop=True, inplace=True)
  df['Data_Publicacao'] = pd.to_datetime(df['Data_Publicacao'], format='%d/%m/%Y').dt.strftime('%Y-%m-%d')
  return df


In [10]:
files = os.listdir('dataset/prices_processed')
for emp in files:
  price_p = pd.read_csv(f"dataset/prices_processed/{emp}")
  date_df = EventsDate(ticker= emp[0:-7])
  price_p.insert(1, 'event', price_p['Date'].apply(lambda date: 1 if date in date_df['Data_Publicacao'].values else 0))
  price_p.to_csv(f"dataset/prices_processed/{emp}", index=False)

  df['Data_Publicacao'] = pd.to_datetime(df['Data_Publicacao'], errors = 'coerce')


In [11]:
df_processed = pd.read_csv("dataset/prices_processed/AZUL4.SA.csv")

In [12]:
def separate_returns(final_data):
    # Inicializar colunas se não existirem
    if 'return_daily' not in final_data.columns:
        final_data['return_daily'] = np.nan
    if 'return_week' not in final_data.columns:
        final_data['return_week'] = np.nan
    if 'return_month' not in final_data.columns:
        final_data['return_month'] = np.nan

    first_return_daily_list = []
    remaining_return_daily_list = []
    first_return_week_list = []
    remaining_return_week_list = []
    first_return_month_list = []
    remaining_return_month_list = []

    start_idx = 0

    while start_idx < len(final_data):
        # Encontra o evento
        if 1 in final_data[start_idx:]['event'].values:
            event_idx = final_data[start_idx:]['event'].eq(1).idxmax()
        else:
            break
        # Encontrar o próximo evento
        if 1 in final_data[event_idx+1:]['event'].values:
            prox_event_idx = final_data[event_idx+1:]['event'].eq(1).idxmax()
        else:
            prox_event_idx = len(final_data)

        # Calcular o primeiro retorno diario logo após o evento
        if event_idx + 1 < len(final_data):
            final_data.loc[event_idx + 1, 'return_daily'] = np.log(final_data.loc[event_idx + 1, 'Close'] / final_data.loc[event_idx, 'Close'])
            first_return_daily_list.append(final_data.iloc[event_idx + 1])

        # Calcular os retornos diarios restantes até o próximo evento
        for i in range(event_idx + 2, prox_event_idx, 1):
            final_data.loc[i, 'return_daily'] = np.log(final_data.loc[i, 'Close'] / final_data.loc[i-1, 'Close'])
            remaining_return_daily_list.append(final_data.iloc[i])

        # Calcular o primeiro retorno semanal logo após o evento
        if event_idx + 5 < len(final_data):
            final_data.loc[event_idx + 5, 'return_week'] = np.log(final_data.loc[event_idx + 5, 'Close'] / final_data.loc[event_idx, 'Close'])
            first_return_week_list.append(final_data.iloc[event_idx + 5])

        # Calcular os retornos semanais restantes até o próximo evento
        for i in range(event_idx + 10, prox_event_idx, 5):
            final_data.loc[i, 'return_week'] = np.log(final_data.loc[i, 'Close'] / final_data.loc[i-5, 'Close'])
            remaining_return_week_list.append(final_data.iloc[i])


        # Calcular o primeiro retorno mensal logo após o evento
        if event_idx + 21 < len(final_data):
            final_data.loc[event_idx + 21, 'return_month'] = np.log(final_data.loc[event_idx + 21, 'Close'] / final_data.loc[event_idx, 'Close'])
            first_return_month_list.append(final_data.iloc[event_idx + 21])

        # Calcular os retornos mensais restantes até o próximo evento
        for i in range(event_idx + 42, prox_event_idx, 22):
            final_data.loc[i, 'return_month'] = np.log(final_data.loc[i, 'Close'] / final_data.loc[i-21, 'Close'])
            remaining_return_month_list.append(final_data.iloc[i])

        # Reinicia após o evento
        start_idx = event_idx + 1

    # Criar DataFrames para o primeiro e os demais retornos semanais e mensais
    first_return_daily_df = pd.DataFrame(first_return_daily_list).reset_index()[['index',
    'Close', 'Date'
, 'event', 'return_daily']]
    remaining_return_daily_df = pd.DataFrame(remaining_return_daily_list).reset_index()[['index',
 'Close', 'Date'
, 'event', 'return_daily']]
    first_return_week_df = pd.DataFrame(first_return_week_list).reset_index()[['index',
 'Close', 'Date'
, 'event', 'return_week']]
    remaining_return_week_df = pd.DataFrame(remaining_return_week_list).reset_index()[['index',
 'Close', 'Date'
, 'event', 'return_week']]
    first_return_month_df = pd.DataFrame(first_return_month_list).reset_index()[['index',
 'Close', 'Date'
, 'event', 'return_month']]
    remaining_return_month_df = pd.DataFrame(remaining_return_month_list).reset_index()[['index',
 'Close', 'Date'
, 'event', 'return_month']]

    first_return_daily_df.rename(columns={'return_daily': 'return'}, inplace=True)
    remaining_return_daily_df.rename(columns={'return_daily': 'return'}, inplace=True)
    first_return_week_df.rename(columns={'return_week': 'return'}, inplace=True)
    remaining_return_week_df.rename(columns={'return_week': 'return'}, inplace=True)
    first_return_month_df.rename(columns={'return_month': 'return'}, inplace=True)
    remaining_return_month_df.rename(columns={'return_month': 'return'}, inplace=True)

    return first_return_daily_df, remaining_return_daily_df, first_return_week_df, remaining_return_week_df, first_return_month_df, remaining_return_month_df


first_return_daily_df, remaining_return_daily_df, first_return_week_df, remaining_return_week_df, first_return_month_df, remaining_return_month_df = separate_returns(df_processed)
