In [2]:
# BIBLIOTECAS
# =====================
from __future__ import annotations
import os
import io
import math
import json
import time
import enum
import warnings
from dataclasses import dataclass
from typing import List, Dict, Optional, Tuple
from dotenv import load_dotenv

import numpy as np
import pandas as pd
import requests

from scipy import stats
from scipy.stats.mstats import winsorize

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.linear_model import LinearRegression
import ta
from collections import OrderedDict


In [6]:
# GENERAL SETTINGS
# =====================

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Comdinheiro API credentials. They are read from the environment only
# (COMD_USER / COMD_PASS, typically defined in .env) so that no secret is
# stored in the notebook source.
CD_USERNAME = os.getenv("COMD_USER", "")
CD_PASSWORD = os.getenv("COMD_PASS", "")

# Names consumed by the download cells further down.
userName = CD_USERNAME
password = CD_PASSWORD

if not (userName and password):
    # Warn early: without credentials every download below gets no data.
    warnings.warn(
        "Credenciais da Comdinheiro não encontradas: defina COMD_USER e "
        "COMD_PASS no ambiente ou no arquivo .env."
    )


# Market proxy and risk-free rate
#MARKET_PROXY = "^BVSP"  # alternative: "BOVA11.SA" (if an ETF is preferred)
#RISK_FREE_SERIES = "CDI"      # daily CDI as proxy (could be swapped for daily SELIC)

# Estimation window and holding period
#ESTIMATION_WINDOW = 504  # ~2 years of trading days
#HOLDING_DAYS = 30         # post-earnings window (CAR)

# Test universe (examples; replace with your own)
#B3_TICKERS = [
#    "ABEV3.SA", "ITUB4.SA", "PETR4.SA", "VALE3.SA", "BBDC4.SA",
#]

# Download interval, written as DDMMYYYY strings.
startDate = "01012010"
endDate   = "31122019"


# FOLDERS
# =====================
folders = ["dataset", "dataset/prices", "dataset/fundamental"]

# Create each output folder when it is missing and report what happened.
for folder in folders:
    if os.path.exists(folder):
        print(f"Pasta '{folder}' já existe.")
    else:
        # exist_ok avoids a crash if the folder appears between check and creation.
        os.makedirs(folder, exist_ok=True)
        print(f"Pasta '{folder}' foi criada.")


Pasta 'dataset' já existe.
Pasta 'dataset/prices' já existe.
Pasta 'dataset/fundamental' já existe.


In [7]:
# Reduced universe: the two index series plus a single stock
# (presumably meant for quick test runs — confirm before relying on it).
empresas = ["IBOV", "CDI", "ABEV3"]

In [3]:
# Index series downloaded alongside the stocks.
indices = ["IBOV", "CDI"]

# Full universe: the indices first, then the companies' most liquid share
# classes (ON, PN or Unit). The tickers are kept as a whitespace-separated
# block and split into a list of strings.
empresas = indices + """
    AEDU3 ABEV3 ALLL11 ALLL3 ALPA4 ALSC3 ALUP11 AMBV3
    AMBV4 AMIL3 ANIM3 ARTR3 ARZZ3 AZUL4 B3SA3 BBAS3
    BBDC3 BBDC4 BBRK3 BBSE3 BEEF3 BIDI11 BISA3 BPAC11
    BPNM4 BRAP4 BRDT3 BRFS3 BRKM5 BRML3 BRPR3 BRSR6
    BRTO4 BTOW3 BVMF3 CCRO3 CCXC3 CESP6 CIEL3 CMIG3
    CMIG4 CNFB4 CPFE3 CPLE6 CRFB3 CRUZ3 CSAN3 CSMG3
    CSNA3 CTIP3 CVCB3 CYRE3 DASA3 DTEX3 ECOD3 ECOR3
    EGIE3 ELET3 ELET6 ELPL4 ELPL6 EMBR3 ENAT3 ENBR3
    ENEV3 ENGI11 EQTL3 ESTC3 EVEN3 EZTC3 FFTL4 FIBR3
    FLRY3 GETI4 GFSA3 GGBR3 GGBR4 GNDI3 GOAU4 GOLL4
    GRND3 HAPV3 HGTX3 HRTP3 HYPE3 IGTA3 INPR3 IRBR3
    ITSA4 ITUB3 ITUB4 JBSS3 JHSF3 KEPL3 KLBN11 KLBN4
    KROT3 LAME3 LAME4 LCAM3 LEVE3 LIGT3 LINX3 LLXL3
    LREN3 LUPA3 MAGG3 MDIA3 MGLU3 MILS3 MMXM3 MPLU3
    MPXE3 MRFG3 MRVE3 MULT3 MYPK3 NATU3 NETC4 ODPV3
    OGXP3 OIBR3 OIBR4 OSXB3 PCAR4 PCAR5 PDGR3 PETR3
    PETR4 PLAS3 PMAM3 POMO4 POSI3 PRML3 PSSA3 QGEP3
    QUAL3 RADL3 RAIL3 RAPT4 RDCD3 RENT3 RLOG3 RPMG3
    RSID3 RUMO3 SANB11 SAPR11 SAPR4 SBSP3 SEER3 SLCE3
    SMLE3 SMLS3 SMTO3 SULA11 SUZB3 SUZB5 TAEE11 TAMM4
    TBLE3 TCSA3 TCSL3 TCSL4 TEND3 TERI3 TIET11 TIMP3
    TLPP4 TMAR5 TNLP3 TNLP4 TOTS3 TRPL4 TUPY3 UGPA3
    UGPA4 USIM3 USIM5 VAGR3 VALE3 VALE5 VIVO4 VIVT4
    VLID3 VVAR11 VVAR3 WEGE3 WIZS3 YDUQ3
""".split()


In [19]:
# DATA DOWNLOAD FUNCTIONS
def GetHistoricalPriceComdinheiro(ticker, startDate, endDate, userName, password, path2save):
    """Download the price history of one ticker from the Comdinheiro API.

    The ``HistoricoCotacaoAcao001`` report is requested in ``json3`` format,
    its first table (``tables/tab0``) is turned into a DataFrame, Brazilian
    formatted numbers ("1.234,56") are converted to floats and the result is
    optionally written to CSV.

    Parameters
    ----------
    ticker : str
        Code interpolated into the report name (e.g. "ABEV3").
    startDate, endDate : str
        Interval bounds, interpolated as-is into the report name.
    userName, password : str
        API credentials. They are sent as form fields and encoded by
        ``requests``, so special characters (``&``, ``=``, ``+`` ...) are safe.
    path2save : str
        CSV destination. Nothing is written when it is empty.

    Returns
    -------
    pandas.DataFrame
        Columns ``Data`` (``YYYY-MM-DD`` strings), ``FechAjust``, ``Var``,
        ``FechHist``, ``AbertAjust``, ``MinAjust``, ``MedAjust``, ``MaxAjust``,
        ``Vol``, ``Neg``, ``Fator`` (numeric) and ``Tipo``, sorted by ``Data``.
        An empty DataFrame when the reply carries no ``tables/tab0`` entry or
        is not valid JSON.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failure or timeout.
    ValueError
        If the returned table does not have the 14 expected columns.
    """
    url = "https://www.comdinheiro.com.br/Clientes/API/EndPoint001.php"
    querystring = {"code": "import_data"}
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}

    internal_url = f"HistoricoCotacaoAcao001-{ticker}-{startDate}-{endDate}-1-1"
    # A dict (instead of a hand-built string) lets requests percent-encode
    # every field, so credentials containing reserved characters stay intact.
    form = {
        "username": userName,
        "password": password,
        "URL": internal_url,
        "format": "json3",
    }

    # The timeout keeps an unresponsive server from blocking the notebook forever.
    response = requests.post(url, data=form, headers=headers, params=querystring, timeout=60)
    try:
        data = response.json()
    except ValueError:
        # Non-JSON body (e.g. an HTML error page): handled as "no data" below.
        data = None

    # --- Validation: no usable table in the reply ---
    tables = data.get("tables") if isinstance(data, dict) else None
    if not isinstance(tables, dict) or "tab0" not in tables:
        print(f"⚠️ Sem dados disponíveis para {ticker} entre {startDate} e {endDate}")
        return pd.DataFrame()

    # tab0 maps row labels to {column: value}; transpose to get one row per label.
    df = pd.DataFrame(tables["tab0"]).T
    # "lin0" is the header row of the report, not data.
    df = df.drop("lin0", errors='ignore')

    df.columns = [
        "Data", "FechAjust", "Var", "FechHist", "AbertAjust",
        "MinAjust", "MedAjust", "MaxAjust", "Vol", "Neg", "Fator", "Tipo", "COL_A", "COL_B"
    ]

    # Parse DD/MM/YYYY dates (anything else becomes missing) and store them as
    # ISO 'YYYY-MM-DD' strings, which also sort chronologically.
    parsed_dates = pd.to_datetime(df["Data"], errors="coerce", format='%d/%m/%Y')
    df["Data"] = parsed_dates.dt.strftime('%Y-%m-%d')

    # 'nd' is the API's placeholder for unavailable values.
    df = df.replace("nd", pd.NA)

    colunas_numericas = [
        "FechAjust", "Var", "FechHist", "AbertAjust",
        "MinAjust", "MedAjust", "MaxAjust", "Vol", "Neg", "Fator"
    ]

    for col in colunas_numericas:
        # Brazilian format: '.' is the thousands separator, ',' the decimal mark.
        df[col] = df[col].str.replace(".", "", regex=False).str.replace(",", ".", regex=False)
        df[col] = pd.to_numeric(df[col], errors='coerce')

    df = df.sort_values("Data").reset_index(drop=True)
    df = df.drop(columns=["COL_A", "COL_B"])

    if path2save:
        # 'Data' already holds formatted strings, so no date_format is needed.
        df.to_csv(path2save, index=False)

    return df

# Event dates + fundamental indicators download
def EventsDate(ticker, userName, password, startDate, endDate, path2save):
    """Download quarterly fundamentals and publication dates for one ticker.

    The ``HistoricoIndicadoresFundamentalistas001.php`` report is requested in
    ``json3`` format and its first table (``tables/tab0``) is returned as a
    DataFrame with fixed column names, parsed dates and the repeated
    indicators (RL, EBITDA) collapsed into a single column each.

    Parameters
    ----------
    ticker : str
        Code sent as the report's ``papel`` parameter.
    userName, password : str
        API credentials. They are sent as form fields and encoded by
        ``requests``, so special characters (``&``, ``=``, ``+`` ...) are safe.
    startDate, endDate : str
        Sent as the report's ``data_ini`` / ``data_fim`` parameters.
    path2save : str
        CSV destination. Nothing is written when it is empty.

    Returns
    -------
    pandas.DataFrame
        One row per report line. ``Data``, ``Data_Publicacao``,
        ``Data_Demonstracao`` and ``Data_Analise`` are datetimes (unparseable
        values become NaT); the other columns are returned as received from
        the API (no numeric conversion is done here). An empty DataFrame when
        the reply carries no ``tables/tab0`` entry or is not valid JSON.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failure or timeout.
    ValueError
        If the returned table does not have the 36 expected columns.
    """
    url = "https://www.comdinheiro.com.br/Clientes/API/EndPoint001.php"
    querystring = {"code": "import_data"}
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}

    # Indicators in request order. RL and EBITDA are requested more than once;
    # the column names assigned below depend on this exact order.
    indicadores = [
        "NOME_EMPRESA", "RL", "LL", "EBITDA", "DATA_PUBLICACAO",
        "PRECO_ABERTURA", "PRECO_FECHAMENTO", "LPA", "ROA", "ROE", "MEB",
        "RL", "CRESC_RL_12M", "CRESC_LL_12M", "CRESC_EBITDA_12M", "CAPEX",
        "RL", "FCO", "EBITDA", "FCF", "DIVIDA_LIQUIDA", "PL", "DIVIDA_BRUTA",
        "AT", "DVA_DESPESAS_FIN", "PC", "PNC", "OUTROS_PC", "LUB",
    ]
    report_params = [
        ("data_ini", startDate),
        ("data_fim", endDate),
        ("trailing", "12"),
        ("conv", "MIXED"),
        ("moeda", "MOEDA_ORIGINAL"),
        ("c_c", "consolidado"),
        ("m_m", "1000000"),
        ("n_c", "2"),
        ("f_v", "1"),
        ("papel", ticker),
        ("indic", "+".join(indicadores)),
        ("periodicidade", "tri"),
        ("graf_tab", "tabela"),
        ("desloc_data_analise", "1"),
        ("flag_transpor", "0"),
        ("c_d", "d"),
        ("enviar_email", "0"),
        ("enviar_email_log", "0"),
        ("cabecalho_excel", "modo1"),
        ("relat_alias_automatico", "cmd_alias_01"),
    ]
    # Plain (unescaped) report URL; requests percent-encodes it when the form
    # is sent, producing the %3F / %26 / %3D / %2B escapes the API expects.
    internal_url = "HistoricoIndicadoresFundamentalistas001.php?" + "".join(
        f"&{key}={value}" for key, value in report_params
    )
    form = {
        "username": userName,
        "password": password,
        "URL": internal_url,
        "format": "json3",
    }

    # The timeout keeps an unresponsive server from blocking the notebook forever.
    response = requests.post(url, data=form, headers=headers, params=querystring, timeout=60)
    try:
        data = response.json()
    except ValueError:
        # Non-JSON body (e.g. an HTML error page): handled as "no data" below.
        data = None

    # --- Validation: no usable table in the reply ---
    tables = data.get("tables") if isinstance(data, dict) else None
    if not isinstance(tables, dict) or "tab0" not in tables:
        print(f"⚠️ Sem dados disponíveis para {ticker} entre {startDate} e {endDate}")
        return pd.DataFrame()

    # tab0 maps row labels to {column: value}; transpose to get one row per label.
    df = pd.DataFrame(tables["tab0"]).T

    # Column names follow the indicator order above, plus the report's
    # trailing metadata columns.
    novas_colunas = [
        "Data", "Empresa", "RL", "LL", "EBITDA", "Data_Publicacao",
        "Preco_Abertura", "Preco_Fechamento", "LPA", "ROA", "ROE", "MEB",
        "RL_dup1", "CRESC_RL_12M", "CRESC_LL_12M", "CRESC_EBITDA_12M",
        "CAPEX", "RL_dup2", "FCO", "EBITDA_dup", "FCF",
        "Divida_Liquida", "PL", "Divida_Bruta", "AT", "DVA_Despesas_Fin",
        "PC", "PNC", "Outros_PC", "LUB",
        "Consolidado", "Convencao", "Moeda", "Data_Demonstracao",
        "Meses", "Data_Analise"
    ]
    df.columns = novas_colunas

    # "lin0" is the header row of the report, not data.
    df = df.drop("lin0", errors="ignore")

    # Parse DD/MM/YYYY dates; anything else becomes NaT.
    for col in ("Data", "Data_Publicacao", "Data_Demonstracao", "Data_Analise"):
        df[col] = pd.to_datetime(df[col], errors="coerce", format='%d/%m/%Y')
    df = df.reset_index(drop=True)

    # Collapse repeated indicators, keeping the first non-missing value per
    # row. Only cells pandas treats as missing are filled; textual
    # placeholders such as 'nd' are not normalized in this function.
    if {"RL", "RL_dup1", "RL_dup2"}.issubset(df.columns):
        df["RL"] = df[["RL", "RL_dup1", "RL_dup2"]].bfill(axis=1).iloc[:, 0]
        df = df.drop(columns=["RL_dup1", "RL_dup2"])
    if {"EBITDA", "EBITDA_dup"}.issubset(df.columns):
        df["EBITDA"] = df[["EBITDA", "EBITDA_dup"]].bfill(axis=1).iloc[:, 0]
        df = df.drop(columns=["EBITDA_dup"])

    if path2save:
        df.to_csv(path2save, index=False)

    return df

##### IBOV CONSTITUENTS DOWNLOAD

def GetIbovComposition(userName, password, startYear=2010, endYear=2019, dayMonth="2209"):
    """Download the IBOV index composition once per year and stack the results.

    For every year in ``[startYear, endYear]`` the ``ComposicaoIndices001.php``
    report is requested for the reference date ``<dayMonth><year>`` and the
    rows of its first table (``tables/tab0``) are appended to the output.

    Parameters
    ----------
    userName, password : str
        API credentials. They are sent as form fields and encoded by
        ``requests``, so special characters (``&``, ``=``, ``+`` ...) are safe.
    startYear, endYear : int
        First and last year (inclusive).
    dayMonth : str
        ``DDMM`` prefix of the reference date. The default "2209"
        (22 September) is the value the notebook has always sent.
        NOTE(review): an earlier comment described the date as 31/12 —
        confirm which reference day is intended.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the yearly tables with the API's own column labels
        and a fresh integer index; empty when no year returned data. The rows
        carry no year marker.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failure or timeout.
    """
    url = "https://api.comdinheiro.com.br/v1/ep1/import-data"
    headers = {"Content-Type": "application/x-www-form-urlencoded"}

    all_dfs = []

    for year in range(startYear, endYear + 1):
        data_analise = f"{dayMonth}{year}"  # DDMMYYYY

        inner_url = (
            f"ComposicaoIndices001.php?"
            f"data_analise={data_analise}"
            f"&indice=IBOV"
            f"&nome_portfolio="
            f"&tipo_portfolio="
            f"&overwrite=0"
            f"&design=2"
            f"&obs_portfolio=0"
            f"&num_casas=0"
            f"&salvar_dados=nenhum"
            f"&sufixo="
            f"&nome_serie="
            f"&filtro_avancado="
        )

        # The report URL contains '&' and '='. Sending the form as a dict makes
        # requests escape them, so the whole string reaches the API as the
        # single "URL" field instead of being split into separate form fields.
        form = {
            "username": userName,
            "password": password,
            "URL": inner_url,
            "format": "json3",
        }

        # The timeout keeps an unresponsive server from blocking the notebook forever.
        response = requests.post(url, data=form, headers=headers, timeout=60)
        try:
            data = response.json()
        except ValueError:
            # Non-JSON body (e.g. an HTML error page): handled as "no data" below.
            data = None

        tables = data.get("tables") if isinstance(data, dict) else None
        if not isinstance(tables, dict) or "tab0" not in tables:
            print(f"⚠️ Sem dados para {data_analise}")
            continue

        # tab0 maps row labels to {column: value}; "lin0" is the header row.
        df = pd.DataFrame(tables["tab0"]).T
        df = df.drop("lin0", errors="ignore").reset_index(drop=True)
        #df["Data_Analise"] = pd.to_datetime(data_analise, format="%d%m%Y")
        all_dfs.append(df)

    if all_dfs:
        final_df = pd.concat(all_dfs, ignore_index=True)
    else:
        final_df = pd.DataFrame()

    return final_df

# Example call (kept commented out; uncomment to download and save the IBOV composition)
#df_ibov = GetIbovComposition(
#    userName=userName,
#    password=password,
#    startYear=2010,
#    endYear=2019
#)
#
#print(df_ibov.head())
#df_ibov.to_csv("dataset/IBOV_Composicao_2010_2019.csv", index=False)


In [20]:
# DOWNLOAD PRICES
# One request per entry of `empresas`; each result is saved as
# dataset/prices/<ticker>.SA.csv by the download function itself.
for emp in empresas:
    price_csv = f"dataset/prices/{emp}.SA.csv"
    GetHistoricalPriceComdinheiro(
        ticker=emp,
        startDate=startDate,
        endDate=endDate,
        userName=userName,
        password=password,
        path2save=price_csv,
    )

In [14]:
# DOWNLOAD FUNDAMENTALS
# The tickers are recovered from the price files already on disk, so only
# names ending in ".SA.csv" are considered; any other directory entry
# (hidden files, checkpoints, ...) would otherwise yield a bogus ticker.
price_suffix = ".SA.csv"
files = os.listdir('dataset/prices')
for emp in files:
    if not emp.endswith(price_suffix):
        continue
    EventsDate(
        ticker=emp[:-len(price_suffix)],
        userName=userName,
        password=password,
        startDate=startDate,
        endDate=endDate,
        path2save=f"dataset/fundamental/{emp}",
    )
#  price_p.insert(1, 'event', price_p['Data'].apply(lambda date: 1 if date in date_df['Data_Publicacao'].values else 0))
#  price_p.to_csv(f"dataset/prices_processed/{emp}", index=False)