In [41]:
# BIBLIOTECAS
# =====================
from __future__ import annotations
import os
import io
import math
import json
import time
import enum
import warnings
from dataclasses import dataclass
from typing import List, Dict, Optional, Tuple
from dotenv import load_dotenv

import numpy as np
import pandas as pd
import requests

from scipy import stats
from scipy.stats.mstats import winsorize

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.linear_model import LinearRegression
import ta

import xgboost as xgb

In [46]:
# CONFIGURA√á√ïES GERAIS
# =====================

load_dotenv()

userName ="aluno.thiago.nunes" # os.getenv("USERNAME")
password = "NLPfinance2@23" #os.getenv("PASSWORD")

CD_USERNAME = os.getenv("COMD_USER", "aluno.thiago.nunes")
CD_PASSWORD = os.getenv("COMD_PASS", "NLPfinance2@23")


# Proxy de mercado e risk-free
MARKET_PROXY = "^BVSP"  # alternativas: "BOVA11.SA" (se preferir ETF)
RISK_FREE_SERIES = "CDI"      # CDI di√°rio como proxy (pode trocar para SELIC di√°ria se preferir)

# Janela de estima√ß√£o e holding
ESTIMATION_WINDOW = 252  # ~1 ano de preg√µes
HOLDING_DAYS = 30         # janela p√≥s-earnings (CAR)

# Universo de teste (exemplos; substitua pelos seus)
#B3_TICKERS = [
#    "ABEV3.SA", "ITUB4.SA", "PETR4.SA", "VALE3.SA", "BBDC4.SA",
#]

startDate = "01012010"
endDate   = "31122019"


# PASTAS
# =====================
folders = ["dataset", "dataset/prices", "dataset/prices_processed", "dataset/fundamental"]

# Verifica se as pastas existem, se n√£o, cria-as
for folder in folders:
    if not os.path.exists(folder):
        os.makedirs(folder)
        print(f"Pasta '{folder}' foi criada.")
    else:
        print(f"Pasta '{folder}' j√° existe.")


Pasta 'dataset' foi criada.
Pasta 'dataset/prices' foi criada.
Pasta 'dataset/prices_processed' foi criada.
Pasta 'dataset/fundamental' foi criada.


In [47]:
indices = ['IBOV', 'CDI']
empresas = [
    # Empresas com pap√©is mais l√≠quidos (ON, PN ou Unit)
    "AEDU3", "ABEV3", "ALLL11", "ALLL3",
    "ALPA4", "ALSC3", "ALUP11", "AMBV3",
    "AMBV4", "AMIL3", "ANIM3", "ARTR3",
    "ARZZ3", "AZUL4", "B3SA3", "BBAS3",
    "BBDC3", "BBDC4", "BBRK3", "BBSE3",
    "BEEF3", "BIDI11", "BISA3", "BPAC11",
    "BPNM4", "BRAP4", "BRDT3", "BRFS3",
    "BRKM5", "BRML3", "BRPR3", "BRSR6",
    "BRTO4", "BTOW3", "BVMF3", "CCRO3",
    "CCXC3", "CESP6", "CIEL3", "CMIG3",
    "CMIG4", "CNFB4", "CPFE3", "CPLE6",
    "CRFB3", "CRUZ3", "CSAN3", "CSMG3",
    "CSNA3", "CTIP3", "CVCB3", "CYRE3",
    "DASA3", "DTEX3", "ECOD3", "ECOR3",
    "EGIE3", "ELET3", "ELET6", "ELPL4",
    "ELPL6", "EMBR3", "ENAT3", "ENBR3",
    "ENEV3", "ENGI11", "EQTL3", "ESTC3",
    "EVEN3", "EZTC3", "FFTL4", "FIBR3",
    "FLRY3", "GETI4", "GFSA3", "GGBR3",
    "GGBR4", "GNDI3", "GOAU4", "GOLL4",
    "GRND3", "HAPV3", "HGTX3", "HRTP3",
    "HYPE3", "IGTA3", "INPR3", "IRBR3",
    "ITSA4", "ITUB3", "ITUB4", "JBSS3",
    "JHSF3", "KEPL3", "KLBN11", "KLBN4",
    "KROT3", "LAME3", "LAME4", "LCAM3",
    "LEVE3", "LIGT3", "LINX3", "LLXL3",
    "LREN3", "LUPA3", "MAGG3", "MDIA3",
    "MGLU3", "MILS3", "MMXM3", "MPLU3",
    "MPXE3", "MRFG3", "MRVE3", "MULT3",
    "MYPK3", "NATU3", "NETC4", "ODPV3",
    "OGXP3", "OIBR3", "OIBR4", "OSXB3",
    "PCAR4", "PCAR5", "PDGR3", "PETR3",
    "PETR4", "PLAS3", "PMAM3", "POMO4",
    "POSI3", "PRML3", "PSSA3", "QGEP3",
    "QUAL3", "RADL3", "RAIL3", "RAPT4",
    "RDCD3", "RENT3", "RLOG3", "RPMG3",
    "RSID3", "RUMO3", "SANB11", "SAPR11",
    "SAPR4", "SBSP3", "SEER3", "SLCE3",
    "SMLE3", "SMLS3", "SMTO3", "SULA11",
    "SUZB3", "SUZB5", "TAEE11", "TAMM4",
    "TBLE3", "TCSA3", "TCSL3", "TCSL4",
    "TEND3", "TERI3", "TIET11", "TIMP3",
    "TLPP4", "TMAR5", "TNLP3", "TNLP4",
    "TOTS3", "TRPL4", "TUPY3", "UGPA3",
    "UGPA4", "USIM3", "USIM5", "VAGR3",
    "VALE3", "VALE5", "VIVO4", "VIVT4",
    "VLID3", "VVAR11", "VVAR3", "WEGE3",
    "WIZS3", "YDUQ3"
]


In [175]:
# FUN√á√ïES CAPTURA VALORES
def GetHistoricalPriceComdinheiro(ticker, startDate, endDate, userName, password, path2save):
    url = "https://www.comdinheiro.com.br/Clientes/API/EndPoint001.php"
    querystring = {"code": "import_data"}

    internal_url = f"HistoricoCotacaoAcao001-{ticker}-{startDate}-{endDate}-1-1"
    payload = f"username={userName}&password={password}&URL={internal_url}&format=json3"
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}

    response = requests.post(url, data=payload, headers=headers, params=querystring)
    data = response.json()

    # --- Valida√ß√£o: se n√£o houver dados ---
    
    if not isinstance(data, dict) or "tables" not in data or "tab0" not in data["tables"]:
        print(f"‚ö†Ô∏è Sem dados dispon√≠veis para {ticker} entre {startDate} e {endDate}")
        return pd.DataFrame()    
 
    df = pd.DataFrame(data["tables"]["tab0"]).T
    df = df.drop("lin0", errors='ignore')

    df.columns = [
        "Data", "FechAjust", "Var", "FechHist", "AbertAjust",
        "MinAjust", "MedAjust", "MaxAjust", "Vol", "Neg", "Fator", "Tipo", "COL_A", "COL_B"
    ]

    # Converte data
    df["Data"] = pd.to_datetime(df["Data"], format="%d/%m/%Y", errors='coerce')

    # Substitui 'nd' por NaN
    df.replace("nd", pd.NA, inplace=True)

    # Converte colunas num√©ricas
    colunas_numericas = [
        "FechAjust", "Var", "FechHist", "AbertAjust",
        "MinAjust", "MedAjust", "MaxAjust", "Vol", "Neg", "Fator"
    ]

    for col in colunas_numericas:
        df[col] = df[col].str.replace(".", "", regex=False).str.replace(",", ".", regex=False)
        df[col] = pd.to_numeric(df[col], errors='coerce')

    df = df.sort_values("Data").reset_index(drop=True)
    df = df.drop(columns=[ "COL_A", "COL_B"])
    
    if path2save != '':
     df.to_csv(path2save, index=False)

    return df

# Captura de Eventos + Indicadores Fundamentalistas
def EventsDate(ticker, userName, password, startDate, endDate, path2save):
    url = "https://www.comdinheiro.com.br/Clientes/API/EndPoint001.php"
    querystring = {"code": "import_data"}
    
    # Payload atualizado com todos os indicadores
    payload = (
        f"username={userName}&password={password}"
        f"&URL=HistoricoIndicadoresFundamentalistas001.php%3F"
        f"%26data_ini%3D{startDate}"
        f"%26data_fim%3D{endDate}"
        f"%26trailing%3D12"
        f"%26conv%3DMIXED"
        f"%26moeda%3DMOEDA_ORIGINAL"
        f"%26c_c%3Dconsolidado"
        f"%26m_m%3D1000000"
        f"%26n_c%3D2"
        f"%26f_v%3D1"
        f"%26papel%3D{ticker}"
        f"%26indic%3DNOME_EMPRESA%2BRL%2BLL%2BEBITDA%2BDATA_PUBLICACAO"
        f"%2BPRECO_ABERTURA%2BPRECO_FECHAMENTO%2BLPA%2BROA%2BROE%2BMEB"
        f"%2BRL%2BCRESC_RL_12M%2BCRESC_LL_12M%2BCRESC_EBITDA_12M%2BCAPEX"
        f"%2BRL%2BFCO%2BEBITDA%2BFCF%2BDIVIDA_LIQUIDA%2BPL%2BDIVIDA_BRUTA"
        f"%2BAT%2BDVA_DESPESAS_FIN%2BPC%2BPNC%2BOUTROS_PC%2BLUB"
        f"%26periodicidade%3Dtri"
        f"%26graf_tab%3Dtabela"
        f"%26desloc_data_analise%3D1"
        f"%26flag_transpor%3D0"
        f"%26c_d%3Dd"
        f"%26enviar_email%3D0"
        f"%26enviar_email_log%3D0"
        f"%26cabecalho_excel%3Dmodo1"
        f"%26relat_alias_automatico%3Dcmd_alias_01"
        "&format=json3"
    )
    
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    response = requests.post(url, data=payload, headers=headers, params=querystring)
    data = json.loads(response.text)
    
    # --- Valida√ß√£o: se n√£o houver dados ---
    if not isinstance(data, dict) or "tables" not in data or "tab0" not in data["tables"]:
        print(f"‚ö†Ô∏è Sem dados dispon√≠veis para {ticker} entre {startDate} e {endDate}")
        return pd.DataFrame()
        
    # Constr√≥i DataFrame
    df = pd.DataFrame(data["tables"]["tab0"]).T
    
    # Colunas de acordo com a ordem dos indicadores
    novas_colunas = [
        "Data", "Empresa", "RL", "LL", "EBITDA", "Data_Publicacao",
        "Preco_Abertura", "Preco_Fechamento", "LPA", "ROA", "ROE", "MEB",
        "RL_dup1", "CRESC_RL_12M", "CRESC_LL_12M", "CRESC_EBITDA_12M",
        "CAPEX", "RL_dup2", "FCO", "EBITDA_dup", "FCF",
        "Divida_Liquida", "PL", "Divida_Bruta", "AT", "DVA_Despesas_Fin",
        "PC", "PNC", "Outros_PC", "LUB",
        "Consolidado", "Convencao", "Moeda", "Data_Demonstracao",
        "Meses", "Data_Analise"
    ]
    df.columns = novas_colunas
    
    # Remove linha de cabe√ßalho interno
    df = df.drop("lin0", errors="ignore")
    
    # Normaliza datas
    df['Data_Publicacao'] = pd.to_datetime(df['Data_Publicacao'], errors='coerce', format='%d/%m/%Y')
    df['Data_Publicacao'] = df['Data_Publicacao'].dt.strftime('%Y-%m-%d')
    df.reset_index(drop=True, inplace=True)
    
    # Consolida duplicados (mant√©m o primeiro v√°lido)
    if {"RL", "RL_dup1", "RL_dup2"}.issubset(df.columns):
        df["RL"] = df[["RL", "RL_dup1", "RL_dup2"]].bfill(axis=1).iloc[:, 0]
        df = df.drop(columns=["RL_dup1", "RL_dup2"])
    if {"EBITDA", "EBITDA_dup"}.issubset(df.columns):
        df["EBITDA"] = df[["EBITDA", "EBITDA_dup"]].bfill(axis=1).iloc[:, 0]
        df = df.drop(columns=["EBITDA_dup"])
    
    if path2save != '':
        df.to_csv(path2save, index=False)        
    
    return df

##### COLETA DE TICKERS DO IBOV

def GetIbovComposition(userName, password, startYear=2010, endYear=2019):
    url = "https://api.comdinheiro.com.br/v1/ep1/import-data"
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    
    all_dfs = []
    
    for year in range(startYear, endYear + 1):
        data_analise = f"2209{year}"  # 31/12/AAAA
        
        inner_url = (
            f"ComposicaoIndices001.php?"
            f"data_analise={data_analise}"
            f"&indice=IBOV"
            f"&nome_portfolio="
            f"&tipo_portfolio="
            f"&overwrite=0"
            f"&design=2"
            f"&obs_portfolio=0"
            f"&num_casas=0"
            f"&salvar_dados=nenhum"
            f"&sufixo="
            f"&nome_serie="
            f"&filtro_avancado="
        )
        
        payload = f"username={userName}&password={password}&URL={inner_url}&format=json3"
        
        response = requests.post(url, data=payload, headers=headers)
        data = response.json()
        
        if not isinstance(data, dict) or "tables" not in data or "tab0" not in data["tables"]:
            print(f"‚ö†Ô∏è Sem dados para {data_analise}")
            continue
        
        df = pd.DataFrame(data["tables"]["tab0"]).T
        df = df.drop("lin0", errors="ignore").reset_index(drop=True)
        df["Data_Analise"] = pd.to_datetime(data_analise, format="%d%m%Y")
        all_dfs.append(df)
    
    if all_dfs:
        final_df = pd.concat(all_dfs, ignore_index=True)
    else:
        final_df = pd.DataFrame()
    
    return final_df

# Exemplo de chamada
#df_ibov = GetIbovComposition(
#    userName=userName,
#    password=password,
#    startYear=2010,
#    endYear=2019
#)
#
#print(df_ibov.head())
#df_ibov.to_csv("dataset/IBOV_Composicao_2010_2019.csv", index=False)


In [49]:
#DOWN
for emp in empresas:
  GetHistoricalPriceComdinheiro(ticker=emp, startDate=startDate, endDate=endDate, userName=userName, password=password, path2save=f"dataset/prices/{emp}.SA.csv")

In [137]:
files = os.listdir('dataset/prices')
for emp in files:  
  EventsDate(ticker= emp[0:-7], userName=userName, password=password, startDate=startDate, endDate=endDate, path2save=f"dataset/fundamental/{emp}")
  os.listdir('dataset/fundamental')
#  price_p.insert(1, 'event', price_p['Data'].apply(lambda date: 1 if date in date_df['Data_Publicacao'].values else 0))
#  price_p.to_csv(f"dataset/prices_processed/{emp}", index=False)

In [None]:
# -*- coding: utf-8 -*-
"""
dataprep_pipeline.py

Pr√©-processamento de:
  - Pre√ßos: dataset/prices/TICKER.SA.csv
  - Fundamentos (tri): dataset/fundamental/TICKER_fUND.csv

Sa√≠das:
  - dataset/prices_processed/TICKER.SA.csv (pre√ßo com retornos/indicadores)
  - dataset/final/TICKER.final.csv (pre√ßo + flag de evento + fundamentos asof)
  - dataset/final/final_dataprep.csv (consolidado de todos os tickers)

Requisitos: pandas, numpy
"""

from __future__ import annotations
import os, re
import numpy as np
import pandas as pd
from typing import List, Optional

# -------------------------
# Utils de I/O e diret√≥rios
# -------------------------
DEFAULT_DIRS = ["dataset", "dataset/prices", "dataset/prices_processed",
                "dataset/fundamental", "dataset/final"]

def ensure_dirs(paths: List[str] = DEFAULT_DIRS) -> None:
    for p in paths:
        os.makedirs(p, exist_ok=True)

# -------------------------
# Utils de parsing num√©rico
# -------------------------
def to_float_smart(x) -> float:
    """
    Converte string num√©rica brasileira/heterog√™nea para float.
    Regras:
    - 'nd'/'': NaN
    - se tem v√≠rgula -> assume v√≠rgula decimal: remove pontos (milhar) e troca v√≠rgula por ponto
    - sen√£o, se tem >1 ponto -> mant√©m apenas o √∫ltimo ponto como decimal
    - sen√£o, tenta float direto
    """
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return np.nan
    s = str(x).strip()
    if s == "" or s.lower() in {"nd", "nan", "none"}:
        return np.nan
    if "," in s:
        s = s.replace(".", "").replace(",", ".")
        try:
            return float(s)
        except Exception:
            return np.nan
    # n√∫meros com muitos pontos (milhar com ponto e decimal com ponto)
    if s.count(".") > 1:
        parts = s.split(".")
        s = "".join(parts[:-1]) + "." + parts[-1]
    # remove qualquer lixo exceto d√≠gitos, sinal e ponto
    s = re.sub(r"[^0-9\.\-]", "", s)
    try:
        return float(s)
    except Exception:
        return np.nan

def to_int_smart(x) -> float:
    """Converte para inteiro removendo tudo que n√£o for d√≠gito/sinal."""
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return np.nan
    s = re.sub(r"[^0-9\-]", "", str(x))
    try:
        return int(s)
    except Exception:
        return np.nan

# -------------------------
# Indicador discreto (‚àí1/0/+1)
# -------------------------
def catalog_return(row, x, name_return):
    try:
        val = float(row[name_return])
        std = float(row.get(f"Rolling_std_{name_return}", np.nan))
    except Exception:
        return 0
    if np.isnan(val) or np.isnan(std) or std == 0:
        return 0
    if val > x * std:   return 1
    if val < -x * std:  return -1
    return 0

# -------------------------
# Classe: Pre√ßos (expans√£o da sua DataProcessing)
# -------------------------
class PriceProcessing:
    def __init__(self, df_prices: pd.DataFrame, ticker: str):
        self.ticker = ticker
        self.df = df_prices.copy()
        # detec√ß√£o de separador autom√°tica + leitura como string recomend√°vel
        # (se vier pronto, apenas normaliza)
        # Normaliza√ß√£o de colunas esperadas
        # Esperado: Data, FechAjust, Var, FechHist, AbertAjust, MinAjust, MedAjust, MaxAjust, Vol, Neg, Fator, Tipo
        # 1) Data
        self.df["Data"] = pd.to_datetime(self.df["Data"], dayfirst=True, errors="coerce")
        # 2) Pre√ßos (float)
        float_cols = ["FechAjust", "FechHist", "AbertAjust", "MinAjust", "MedAjust", "MaxAjust", "Var", "Fator", "Vol"]
        for c in float_cols:
            if c in self.df.columns:
                self.df[c] = self.df[c].apply(to_float_smart)
        # 3) Volume/Neg√≥cios (int)
        for c in ["Neg"]:
            if c in self.df.columns:
                self.df[c] = self.df[c].apply(to_int_smart)
        # 4) Ordena
        self.df = self.df.sort_values("Data").drop_duplicates("Data").reset_index(drop=True)
        # 5) Fechamento preferido (ajustado se existir)
        if "FechAjust" in self.df.columns and self.df["FechAjust"].notna().any():
            self.df["Close"] = self.df["FechAjust"]
        else:
            if "AbertAjust" in self.df.columns:
                self.df["Close"] = self.df["AbertAjust"].shift(-1)
            else:
                self.df["Close"] = np.nan            

        # 6) Campo auxiliar ponderado
        if "Vol" in self.df.columns and self.df["Vol"].notna().any():
            vol_mean = self.df["Vol"].replace(0, np.nan).mean()
            self.df["FechPonderado"] = self.df["Close"] * self.df["Vol"] / (vol_mean if vol_mean else self.df["Vol"])
        else:
            self.df["FechPonderado"] = np.nan

    def create_return_by_period(self, name_return: str, period: int, column_name: str = "Close", remove_nan=False):
        self.df[name_return] = np.log(self.df[column_name] / self.df[column_name].shift(period))
        if remove_nan:
            self.df.dropna(subset=[name_return], inplace=True)

    def create_cumulative_std(self, name_return: str):
        self.df[f"Cumulative_std_{name_return}"] = self.df[name_return].expanding().std()
        
    def create_rolling_std(self, name_return: str, window: int = 20):
        """
        Calcula o desvio padr√£o em uma janela m√≥vel.
        Ex.: window=20 ‚Üí volatilidade de 20 per√≠odos.
        """
        self.df[f"Rolling_std_{name_return}_{window}"] = (
            self.df[name_return].rolling(window=window).std()
    )

    def create_indicator(self, name_return: str, factor: float = 0.1):
        self.df[f"Indicator_{name_return}"] = self.df.apply(lambda row: catalog_return(row, factor, name_return), axis=1)

    def finalize(self) -> pd.DataFrame:
        # garante colunas essenciais
        cols = ["Data", "Close", "FechPonderado", "Vol", "Neg", "Var"]
        cols += [c for c in self.df.columns if c.startswith("Daily_") or c.startswith("Week_") or c.startswith("Month_")]
        cols += [c for c in self.df.columns if c.startswith("Rolling_std_") or c.startswith("Indicator_")]
        cols = [c for c in cols if c in self.df.columns]
        out = self.df[cols].copy()
        out.insert(0, "Ticker", self.ticker)
        return out

# -------------------------
# Classe: Fundamentos (tri)
# -------------------------
class FundamentalProcessing:
    def __init__(self, df_fund: pd.DataFrame, ticker: str):
        self.ticker = ticker
        self.df = df_fund.copy()

        # Normaliza datas
        # Data_Publicacao (evento)
        if "Data_Publicacao" in self.df.columns:
            self.df["Data_Publicacao"] = pd.to_datetime(self.df["Data_Publicacao"], dayfirst=True, errors="coerce")
        # Data_Demonstracao / Data_Analise
        if "Data_Demonstracao" in self.df.columns:
            self.df["Data_Demonstracao"] = pd.to_datetime(self.df["Data_Demonstracao"], dayfirst=True, errors="coerce")
        if "Data_Analise" in self.df.columns:
            self.df["Data_Analise"] = pd.to_datetime(self.df["Data_Analise"], dayfirst=True, errors="coerce")

        # Converte colunas num√©ricas principais
        num_cols = [
            "RL","LL","EBITDA","Preco_Abertura","Preco_Fechamento","LPA","ROA","ROE","MEB",
            "CRESC_RL_12M","CRESC_LL_12M","CRESC_EBITDA_12M","CAPEX","FCO","FCF","Divida_Liquida",
            "PL","Divida_Bruta","AT","DVA_Despesas_Fin","PC","PNC","Outros_PC","LUB"
        ]
        for c in num_cols:
            if c in self.df.columns:
                self.df[c] = self.df[c].apply(to_float_smart)

        # Duplicatas (quando existirem)
        for dup_base in [("RL","RL_dup1","RL_dup2"), ("EBITDA","EBITDA_dup")]:
            keep = dup_base[0]
            alts = [c for c in dup_base[1:] if c in self.df.columns]
            if keep in self.df.columns and alts:
                self.df[keep] = self.df[[keep] + alts].bfill(axis=1).iloc[:,0]
                self.df.drop(columns=[c for c in alts if c in self.df.columns], inplace=True, errors="ignore")

        # Define QuarterEnd (prioridade Data_Demonstracao > Data_Analise)
        self.df["QuarterEnd"] = self.df.get("Data_Demonstracao", pd.NaT)
        if "QuarterEnd" in self.df.columns and self.df["QuarterEnd"].isna().all() and "Data_Analise" in self.df.columns:
            self.df["QuarterEnd"] = self.df["Data_Analise"]

        # Ordena e dedup
        self.df = self.df.sort_values(["QuarterEnd", "Data_Publicacao"]).drop_duplicates(["QuarterEnd"]).reset_index(drop=True)

    def get_publication_dates(self) -> pd.Series:
        """Retorna as datas de publica√ß√£o (eventos)."""
        return self.df["Data_Publicacao"].dropna().astype("datetime64[ns]")

    def features_for_asof_merge(self) -> pd.DataFrame:
        """
        Seleciona colunas num√©ricas + QuarterEnd + Data_Publicacao para merge_asof no pre√ßo.
        """
        keep = ["QuarterEnd", "Data_Publicacao"]
        num_cols = [c for c in self.df.columns if c not in keep and pd.api.types.is_numeric_dtype(self.df[c])]
        out = self.df[keep + num_cols].dropna(subset=["Data_Publicacao"]).sort_values("Data_Publicacao").reset_index(drop=True)
        # renomeia colunas para evitar colis√£o de nomes
        rename_map = {c: f"F_{c}" for c in num_cols}
        out = out.rename(columns=rename_map)
        return out

# -------------------------
# Classe: Merge & Orquestra√ß√£o
# -------------------------
class DataPrepPipeline:
    def __init__(self,
                 prices_dir: str = "dataset/prices",
                 fund_dir: str = "dataset/fundamental",
                 out_prices_dir: str = "dataset/prices_processed",
                 out_final_dir: str = "dataset/final"):
        self.prices_dir = prices_dir
        self.fund_dir = fund_dir
        self.out_prices_dir = out_prices_dir
        self.out_final_dir = out_final_dir
        ensure_dirs([prices_dir, fund_dir, out_prices_dir, out_final_dir])

    @staticmethod
    def _ticker_from_price_filename(fname: str) -> str:
        # "AZUL4.SA.csv" -> "AZUL4"
        base = os.path.basename(fname)
        if base.endswith(".csv"):
            base = base[:-4]
        return base.replace(".SA", "")

    @staticmethod
    def _fund_path_for_ticker(fund_dir: str, ticker: str) -> Optional[str]:
        # procura "<TICKER>_fUND.csv" (case-insensitive)
        for fn in os.listdir(fund_dir):
            if fn.lower() == f"{ticker.lower()}_fund.csv" or fn.lower() == f"{ticker.lower()}_fundamental.csv" or fn.lower() == f"{ticker.lower()}_fund.csv":
                return os.path.join(fund_dir, fn)
            if fn.lower().startswith(ticker.lower()) and "fund" in fn.lower():
                return os.path.join(fund_dir, fn)
        return None

    def process_one(self, price_csv_path: str,
                    indicator_factor: float = 0.1,
                    save_intermediate_prices: bool = True,
                    attach_fundamentals_asof: bool = True,
                    only_events: bool = False) -> Optional[pd.DataFrame]:
        """
        Processa um ticker:
          - pre√ßo -> retornos/STD/indicadores
          - evento -> flag 0/1 (Data ‚àà Data_Publicacao)
          - (opcional) merge_asof com fundamentos publicados
          - salva CSV final
        """
        # --- pre√ßos
        dfp = pd.read_csv(price_csv_path, sep=None, engine="python", dtype=str)
        tkr = self._ticker_from_price_filename(price_csv_path)
        price = PriceProcessing(dfp, tkr)
        # retornos
        price.create_return_by_period("Daily_Return", 1, column_name="Close", remove_nan=False)
        price.create_return_by_period("Week_Return", 5, column_name="Close", remove_nan=False)
        price.create_return_by_period("Month_Return", 22, column_name="Close", remove_nan=False)
        # stds
        price.create_rolling_std("Daily_Return", window=21)
        price.create_rolling_std("Week_Return", window=65)
        price.create_rolling_std("Month_Return", window=252)
        # indicadores discretos
        price.create_indicator("Daily_Return", indicator_factor)
        price.create_indicator("Week_Return", indicator_factor)
        price.create_indicator("Month_Return", indicator_factor)
        df_price_feat = price.finalize()

        # salva intermedi√°rio (compat√≠vel com seu script atual)
        if save_intermediate_prices:
            out_p_path = os.path.join(self.out_prices_dir, os.path.basename(price_csv_path))
            df_price_feat.to_csv(out_p_path, index=False)

        # --- fundamentos (se houver)
        fund_path = self._fund_path_for_ticker(self.fund_dir, tkr)
        if fund_path is None:
            # sem fundamentos -> apenas marca evento=0 e finaliza
            df_final = df_price_feat.copy()
            df_final["event"] = 0
        else:
            dff_raw = pd.read_csv(fund_path, sep=None, engine="python", dtype=str)
            fund = FundamentalProcessing(dff_raw, tkr)
            pub_dates = set(pd.to_datetime(fund.get_publication_dates(), errors="coerce").dropna().values)
            df_final = df_price_feat.copy()
            # flag de evento
            df_final["event"] = df_final["Data"].isin(pub_dates).astype(int)

            # (opcional) merge_asof colando fundamentos at√© a pr√≥xima publica√ß√£o
            if attach_fundamentals_asof:
                f_asof = fund.features_for_asof_merge()
                if not f_asof.empty:
                    df_final = df_final.sort_values("Data")
                    f_asof = f_asof.sort_values("Data_Publicacao")
                    df_final = pd.merge_asof(
                        df_final,
                        f_asof,
                        left_on="Data",
                        right_on="Data_Publicacao",
                        direction="backward"
                    )
                    # remove coluna-√¢ncora para n√£o poluir
                    if "Data_Publicacao" in df_final.columns:
                        df_final.drop(columns=["Data_Publicacao"], inplace=True)

            if only_events:
                df_final = df_final[df_final["event"] == 1].copy()

        # salva final por ticker
        out_final_path = os.path.join(self.out_final_dir, f"{tkr}.final.csv")
        df_final.to_csv(out_final_path, index=False)
        return df_final

    def process_all(self,
                    indicator_factor: float = 0.1,
                    save_intermediate_prices: bool = True,
                    attach_fundamentals_asof: bool = True,
                    only_events: bool = False) -> pd.DataFrame:
        """Processa todos os arquivos em dataset/prices e devolve consolidado."""
        all_final = []
        for fn in os.listdir(self.prices_dir):
            if not fn.lower().endswith(".csv"):
                continue
            try:
                path = os.path.join(self.prices_dir, fn)
                df_final = self.process_one(
                    path,
                    indicator_factor=indicator_factor,
                    save_intermediate_prices=save_intermediate_prices,
                    attach_fundamentals_asof=attach_fundamentals_asof,
                    only_events=only_events
                )
                if df_final is not None and not df_final.empty:
                    all_final.append(df_final.assign(Ticker=self._ticker_from_price_filename(fn)))
            except Exception as ex:
                print(f"Erro no ticker de {fn}: {ex}")
                continue

        if not all_final:
            return pd.DataFrame()

        df_all = pd.concat(all_final, ignore_index=True)
        df_all = df_all.sort_values(["Ticker", "Data"]).reset_index(drop=True)
        # salva consolidado
        df_all.to_csv(os.path.join(self.out_final_dir, "final_dataprep.csv"), index=False)
        return df_all


# -------------------------
# Exemplo de uso (script)
# -------------------------
if __name__ == "__main__":
    ensure_dirs()

    pipeline = DataPrepPipeline(
        prices_dir="dataset/prices",
        fund_dir="dataset/fundamental",
        out_prices_dir="dataset/prices_processed",
        out_final_dir="dataset/final"
    )

    # Processa todos os tickers:
    # - inclui fundamentos por merge_asof (attach_fundamentals_asof=True)
    # - filtra somente linhas de evento? -> only_events=True (opcional)
    df_consolidado = pipeline.process_all(
        indicator_factor=0.1,
        save_intermediate_prices=True,
        attach_fundamentals_asof=True,
        only_events=False
    )

    print("OK! Arquivos salvos em dataset/prices_processed/ e dataset/final/")
    print(df_consolidado.head())


### Pipeline quebrado em modulos:

In [138]:
##1) Utilidades (pastas, parsing num√©rico)
# ==========================
# Se√ß√£o 1 ‚Äî Utils e Parsing
# ==========================
import os, re
from typing import List, Optional, Dict
import numpy as np
import pandas as pd

DEFAULT_DIRS = ["dataset", "dataset/prices", "dataset/prices_processed",
                "dataset/fundamental", "dataset/final"]

def ensure_dirs(paths: List[str] = DEFAULT_DIRS) -> None:
    for p in paths:
        os.makedirs(p, exist_ok=True)

def to_float_smart(x) -> float:
    """Converte string BR (milhar com ponto, decimal com v√≠rgula) e outros formatos para float."""
    if x is None or (isinstance(x, float) and np.isnan(x)): return np.nan
    s = str(x).strip()
    if s == "" or s.lower() in {"nd", "nan", "none"}: return np.nan
    if "," in s:
        s = s.replace(".", "").replace(",", ".")
    else:
        s = re.sub(r"[^0-9\.\-]", "", s)
        if s.count(".") > 1:
            parts = s.split(".")
            s = "".join(parts[:-1]) + "." + parts[-1]
    try:
        return float(s)
    except Exception:
        return np.nan

def to_int_smart(x) -> float:
    """Converte texto para inteiro removendo n√£o-d√≠gitos."""
    if x is None or (isinstance(x, float) and np.isnan(x)): return np.nan
    s = re.sub(r"[^0-9\-]", "", str(x))
    try:
        return int(s)
    except Exception:
        return np.nan

def winsorize_series(s: pd.Series, p: float = 0.01) -> pd.Series:
    """Winsoriza 1%/1% (default) ‚Äî por s√©rie."""
    if s.notna().sum() < 5:
        return s
    lo, hi = np.nanpercentile(s, [p*100, (1-p)*100])
    return s.clip(lo, hi)


In [None]:
#2) Loader do Mercado (IBOV) e Risco-zero (CDI) a partir de CSV
# ================================================
# Se√ß√£o 2 ‚Äî Market/Risk Loader (IBOV e CDI locais)
# ================================================
class MarketAndRiskLoader:
    @staticmethod
    def load_ibov_csv(path_ibov: str) -> pd.DataFrame:
        """L√™ dataset/prices/IBOV.SA.csv ‚Üí DataFrame(Date, Close). Prioriza FechAjust."""
        df = pd.read_csv(path_ibov, dtype=str)
        df["Data"] = pd.to_datetime(df["Data"], dayfirst=True, errors="coerce")
        for c in ["FechAjust","FechHist"]:
            if c in df.columns: df[c] = df[c].apply(to_float_smart)
        close = "FechAjust" if "FechAjust" in df.columns and df["FechAjust"].notna().any() else "FechHist"
        out = df[["Data", close]].rename(columns={"Data":"Date", close:"Close"})
        return out.dropna(subset=["Date","Close"]).sort_values("Date").reset_index(drop=True)

    @staticmethod
    def load_cdi_csv(path_cdi: str) -> pd.DataFrame:
        """
        L√™ dataset/prices/CDI.SA.csv ‚Üí DataFrame(Date, rf_daily).
        Heur√≠stica principal: coluna Var como taxa di√°ria em % (ex.: ~0,03% ‚Üí 0.0003).
        Fallback: pct_change de FechAjust/FechHist.
        """
        df = pd.read_csv(path_cdi, dtype=str)
        df["Data"] = pd.to_datetime(df["Data"], dayfirst=True, errors="coerce")

        if "Var" in df.columns:
            df["Var"] = df["Var"].apply(to_float_smart)
            if df["Var"].notna().sum() > 3:
                out = df[["Data"]].copy()
                out["rf_daily"] = df["Var"] / 100.0
                return out.dropna().rename(columns={"Data":"Date"}).sort_values("Date").reset_index(drop=True)

        for c in ["FechAjust","FechHist"]:
            if c in df.columns:
                df[c] = df[c].apply(to_float_smart)
                if df[c].notna().sum() > 3:
                    r = df[c].pct_change()
                    out = df[["Data"]].copy()
                    out["rf_daily"] = r
                    return out.dropna().rename(columns={"Data":"Date"}).sort_values("Date").reset_index(drop=True)

        return pd.DataFrame(columns=["Date","rf_daily"])



In [None]:
#Se√ß√£o 3 ‚Äî PriceProcessing (filtro de datas nulas adicionado)
# =========================================
# Se√ß√£o 3 ‚Äî Classe de pre√ßos (PriceProcessing) [ATUALIZADA]
# =========================================
def catalog_return(row, x, name_return):
    val = row.get(name_return, np.nan)
    std = row.get(f"Rolling_std_{name_return}", np.nan)
    if pd.isna(val) or pd.isna(std) or std == 0: return 0
    if val > x * std:   return 1
    if val < -x * std:  return -1
    return 0

class PriceProcessing:
    def __init__(self, df_prices: pd.DataFrame, ticker: str):
        self.ticker = ticker
        self.df = df_prices.copy()

        # Datas
        self.df["Data"] = pd.to_datetime(self.df["Data"], dayfirst=True, errors="coerce")
        # üîß NOVO: remove linhas com Data = NaT (evita erro no merge_asof)
        self.df = self.df[~self.df["Data"].isna()].copy()

        # N√∫meros
        float_cols = ["FechAjust","FechHist","AbertAjust","MinAjust","MedAjust","MaxAjust","Var","Fator"]
        for c in float_cols:
            if c in self.df.columns: self.df[c] = self.df[c].apply(to_float_smart)
        for c in ["Vol","Neg"]:
            if c in self.df.columns: self.df[c] = self.df[c].apply(to_int_smart)

        # Ordena/dedup
        self.df = self.df.sort_values("Data").drop_duplicates("Data").reset_index(drop=True)

        # Close
        #self.df["Close"] = self.df["FechAjust"] if ("FechAjust" in self.df and self.df["FechAjust"].notna().any()) else self.df.get("FechHist", np.nan)
        if "FechAjust" in self.df.columns and self.df["FechAjust"].notna().any():
            self.df["Close"] = self.df["FechAjust"]
        else:
            if "AbertAjust" in self.df.columns:
                self.df["Close"] = self.df["AbertAjust"].shift(-1)
            else:
                self.df["Close"] = np.nan        

        # Fechamento ponderado por volume
        if "Vol" in self.df.columns and self.df["Vol"].notna().any():
            vol_mean = self.df["Vol"].replace(0, np.nan).mean()
            self.df["FechPonderado"] = self.df["Close"] * self.df["Vol"] / (vol_mean if vol_mean else self.df["Vol"])
        else:
            self.df["FechPonderado"] = np.nan

    def create_return_by_period(self, name_return: str, period: int, column_name: str = "Close", remove_nan=False):
        self.df[name_return] = np.log(self.df[column_name] / self.df[column_name].shift(period))
        if remove_nan:
            self.df.dropna(subset=[name_return], inplace=True)

   
    def create_rolling_std(self, name_return: str, window: int = 20):
        """
        Calcula o desvio padr√£o em uma janela m√≥vel.
        Ex.: window=20 ‚Üí volatilidade de 20 per√≠odos.
        """
        self.df[f"Rolling_std_{name_return}"] = (
            self.df[name_return].rolling(window=window).std()
    )    

    def create_indicator(self, name_return: str, factor: float = 0.1):
        self.df[f"Indicator_{name_return}"] = self.df.apply(lambda r: catalog_return(r, factor, name_return), axis=1)

    def finalize(self) -> pd.DataFrame:
        cols = ["Data","Close","FechPonderado","Vol","Neg","Var"]
        cols += [c for c in self.df.columns if c.startswith(("Daily_","Week_","Month_"))]
        cols += [c for c in self.df.columns if c.startswith(("Rolling_std_","Indicator_"))]
        cols = [c for c in cols if c in self.df.columns]
        out = self.df[cols].copy()
        out.insert(0, "Ticker", self.ticker)
        return out


In [141]:
#Se√ß√£o 4 ‚Äî Fundamentos (ŒîQoQ/ŒîYoY & EPS/Proxy) + ASOF
# ====================================================
# Se√ß√£o 4 ‚Äî Classe de fundamentos (FundamentalProcessing)
# ====================================================
class FundamentalProcessing:
    def __init__(self, df_fund: pd.DataFrame, ticker: str):
        self.ticker = ticker
        self.df = df_fund.copy()

        # Datas
        if "Data_Publicacao" in self.df.columns:
            self.df["Data_Publicacao"] = pd.to_datetime(self.df["Data_Publicacao"], dayfirst=True, errors="coerce")
        if "Data_Demonstracao" in self.df.columns:
            self.df["Data_Demonstracao"] = pd.to_datetime(self.df["Data_Demonstracao"], dayfirst=True, errors="coerce")
        if "Data_Analise" in self.df.columns:
            self.df["Data_Analise"] = pd.to_datetime(self.df["Data_Analise"], dayfirst=True, errors="coerce")

        # Num√©ricos relevantes
        num_cols = [
            "RL","LL","EBITDA","Preco_Abertura","Preco_Fechamento","LPA","ROA","ROE","MEB",
            "CRESC_RL_12M","CRESC_LL_12M","CRESC_EBITDA_12M","CAPEX","FCO","FCF",
            "Divida_Liquida","PL","Divida_Bruta","AT","DVA_Despesas_Fin","PC","PNC","Outros_PC","LUB"
        ]
        for c in num_cols:
            if c in self.df.columns: self.df[c] = self.df[c].apply(to_float_smart)

        # Duplicados comuns
        if set(["RL","RL_dup1","RL_dup2"]).issubset(self.df.columns):
            self.df["RL"] = self.df[["RL","RL_dup1","RL_dup2"]].bfill(axis=1).iloc[:,0]
            self.df.drop(columns=["RL_dup1","RL_dup2"], inplace=True, errors="ignore")
        if set(["EBITDA","EBITDA_dup"]).issubset(self.df.columns):
            self.df["EBITDA"] = self.df[["EBITDA","EBITDA_dup"]].bfill(axis=1).iloc[:,0]
            self.df.drop(columns=["EBITDA_dup"], inplace=True, errors="ignore")

        # QuarterEnd
        self.df["QuarterEnd"] = self.df.get("Data_Demonstracao", pd.NaT)
        if self.df["QuarterEnd"].isna().all() and "Data_Analise" in self.df.columns:
            self.df["QuarterEnd"] = self.df["Data_Analise"]

        # Ordena por Quarter/Publ.; 1 linha por quarter
        self.df = (self.df.sort_values(["QuarterEnd","Data_Publicacao"])
                        .drop_duplicates(["QuarterEnd"])
                        .reset_index(drop=True))

    def get_publication_dates(self) -> pd.Series:
        return self.df["Data_Publicacao"].dropna()

    def features_for_asof_merge(self) -> pd.DataFrame:
        keep = ["QuarterEnd","Data_Publicacao"]
        num_cols = [c for c in self.df.columns if c not in keep and pd.api.types.is_numeric_dtype(self.df[c])]
        out = self.df[keep + num_cols].dropna(subset=["Data_Publicacao"]).sort_values("Data_Publicacao").reset_index(drop=True)
        out = out.rename(columns={c: f"F_{c}" for c in num_cols})
        return out

    def build_qoq_yoy_and_eps(self) -> pd.DataFrame:
        """
        Tabela por Data_Publicacao/QuarterEnd com m√©tricas + ŒîQoQ/ŒîYoY e EPS Surprise (proxy ŒîLPA se n√£o houver consenso).
        """
        df = self.df.copy()
        out = df[["Data_Publicacao","QuarterEnd"]].copy()
        num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]

        for c in num_cols:
            out[c] = df[c].values
            out[f"{c}_Q_Change"] = df[c].diff(1).values
            out[f"{c}_Y_Change"] = (df[c] - df[c].shift(4)).values

        # EPS Surprise real se houver consenso; sen√£o proxy via ŒîLPA
        if "EPS_Consensus" in df.columns and "LPA" in df.columns:
            eps = df["LPA"] - df["EPS_Consensus"]
        elif "LPA" in df.columns:
            eps = df["LPA"].diff(1)
        else:
            eps = pd.Series([np.nan]*len(df))

        out["EPS_EarningsSurprise"] = eps
        out["EPS_Earnings_Surprise_Backward_Diff"] = eps - eps.shift(1)
        out["EPS_Earnings_Surprise_Backward_Ave_Diff"] = eps - eps.shift(3).rolling(3).mean()

        out = out.dropna(subset=["Data_Publicacao"]).sort_values("Data_Publicacao").reset_index(drop=True)
        return out.rename(columns={"Data_Publicacao":"AnnounceDate"})


In [None]:
# Se√ß√£o 5 ‚Äî Helpers PEAD: Œ≤ CAPM, CAR, T1  [ATUALIZADA]
# ==========================================================
from typing import Optional
import numpy as np
import pandas as pd

def detect_start_index(prices: pd.DataFrame,
                       announce_date: pd.Timestamp,
                       announce_time: Optional[str] = None) -> int:
    """
    Retorna o √≠ndice T1 na s√©rie de pre√ßos (coluna 'Date'):
      - Sem hor√°rio (None ou 'DUR'/'BMO'): 1¬∫ preg√£o >= announce_date
      - 'AMC' (after market):               1¬∫ preg√£o  > announce_date
    Compat√≠vel com chamadas de 2 ou 3 argumentos.
    """
    if prices is None or prices.empty or 'Date' not in prices:
        return 0

    # Garante dtype/ordem e vetor numpy ordenado
    ds = pd.to_datetime(prices['Date'], errors='coerce').sort_values().values
    if ds.size == 0 or pd.isna(announce_date):
        return 0

    idx_ge = np.searchsorted(ds, np.array(announce_date, dtype='datetime64[ns]'))

    # Se hor√°rio conhecido e for after-market, pula para o pr√≥ximo preg√£o
    if announce_time and str(announce_time).upper() == 'AMC':
        idx = idx_ge + 1
    else:
        idx = idx_ge

    # Limites seguros
    idx = int(max(0, min(idx, ds.size - 1)))
    return idx


def estimate_beta(stock_df: pd.DataFrame,
                  mkt_df: pd.DataFrame,
                  rf_df: pd.DataFrame,
                  event_idx: int,
                  estimation_window: int = 252) -> float:
    """
    Estima Œ≤ via OLS na janela [event_idx - estimation_window, event_idx-1],
    usando retornos em excesso (ri - rf) e (rm - rf).
    A janela j√° deve chegar 'adaptada' pela l√≥gica chamadora.
    """
    m = stock_df[['Date','Close']].merge(
            mkt_df[['Date','Close']], on='Date', suffixes=('_i','_m')
        )
    m = m.merge(rf_df[['Date','rf_daily']], on='Date', how='left').ffill().sort_values('Date')

    #m['ri'] = m['Close_i'].pct_change()
    #m['rm'] = m['Close_m'].pct_change()
    m['ri'] = np.log(m['Close_i'] / m['Close_i'].shift(1))
    m['rm'] = np.log(m['Close_m'] / m['Close_m'].shift(1))

    if event_idx < 2:
        return np.nan

    event_date = stock_df.iloc[event_idx]['Date']
    eidx = m.index[m['Date'] == event_date]
    if len(eidx) == 0:
        return np.nan
    eidx = eidx[0]

    start = max(m.index.min(), eidx - estimation_window)
    end   = eidx - 1
    if end - start < 30:  # resguardo m√≠nimo para OLS
        return np.nan

    win = m.loc[start:end].dropna()
    if win.empty:
        return np.nan

    x = (win['rm'] - win['rf_daily']).values.reshape(-1, 1)
    y = (win['ri'] - win['rf_daily']).values.reshape(-1, 1)
    beta = np.linalg.lstsq(x, y, rcond=None)[0].ravel()[0]
    return float(beta)


def compute_car(stock_df: pd.DataFrame,
                mkt_df: pd.DataFrame,
                rf_df: pd.DataFrame,
                event_idx: int,
                beta: float,
                holding_days: int = 30) -> float:
    """
    CAR = soma dos retornos anormais no intervalo [T1, T1+holding_days-1].
    Modelo de expectativa: CAPM com Œ≤ estimado na janela de estima√ß√£o.
    """
    m = stock_df[['Date','Close']].merge(
            mkt_df[['Date','Close']], on='Date', suffixes=('_i','_m')
        )
    m = m.merge(rf_df[['Date','rf_daily']], on='Date', how='left').ffill().sort_values('Date')

    #m['ri'] = m['Close_i'].pct_change()
    #m['rm'] = m['Close_m'].pct_change()
    m['ri'] = np.log(m['Close_i'] / m['Close_i'].shift(1))
    m['rm'] = np.log(m['Close_m'] / m['Close_m'].shift(1))

    start = event_idx
    end   = min(start + holding_days - 1, len(m) - 1)
    seg   = m.iloc[start:end+1].dropna()
    if seg.empty:
        return np.nan

    seg['E_ri'] = seg['rf_daily'] + beta * (seg['rm'] - seg['rf_daily'])
    seg['AR']   = seg['ri'] - seg['E_ri']
    return float(seg['AR'].sum())


In [None]:
#Se√ß√£o 6 ‚Äî DataPrepPipeline.process_all
def process_all(self,
                indicator_factor: float = 0.1,
                save_intermediate_prices: bool = True,
                attach_fundamentals_asof: bool = True,
                only_events: bool = False) -> pd.DataFrame:
    """Processa todos os arquivos em dataset/prices e devolve consolidado."""
    all_final = []
    for fn in os.listdir(self.prices_dir):
        if not fn.lower().endswith(".csv"):
            continue
        try:
            path = os.path.join(self.prices_dir, fn)
            df_final = self.process_one(
                path,
                indicator_factor=indicator_factor,
                save_intermediate_prices=save_intermediate_prices,
                attach_fundamentals_asof=attach_fundamentals_asof,
                only_events=only_events
            )
            if df_final is not None and not df_final.empty:
                all_final.append(df_final.assign(Ticker=self._ticker_from_price_filename(fn)))
        except Exception as ex:
            print(f"Erro no ticker de {fn}: {ex}")
            continue
    if not all_final:
        return pd.DataFrame()
    df_all = pd.concat(all_final, ignore_index=True)
    df_all = df_all.sort_values(["Ticker", "Data"]).reset_index(drop=True)
    # salva consolidado
    df_all.to_csv(os.path.join(self.out_final_dir, "final_dataprep.csv"), index=False)
    return df_all

In [None]:
#Se√ß√£o 6 ‚Äî DataPrepPipeline.process_one
# ============================================================
# Se√ß√£o 6 ‚Äî M√©todo process_one (DataPrepPipeline) [ATUALIZADO]
# ============================================================

def process_one(self, price_csv_path: str,
                indicator_factor: float = 0.1,
                save_intermediate_prices: bool = True,
                attach_fundamentals_asof: bool = True,
                only_events: bool = False) -> Optional[pd.DataFrame]:

    base = os.path.basename(price_csv_path).upper()
    if base in {"IBOV.SA.CSV", "CDI.SA.CSV"}:
        return pd.DataFrame()  # proxies ‚Äî n√£o s√£o pap√©is

    # Pre√ßo
    dfp = pd.read_csv(price_csv_path, sep=None, engine="python", dtype=str)
    tkr = self._ticker_from_price_filename(price_csv_path)
    price = PriceProcessing(dfp, tkr)

    # Retornos e indicadores
    price.create_return_by_period("Daily_Return", 1)
    price.create_return_by_period("Week_Return", 5)
    price.create_return_by_period("Month_Return", 22)
    price.create_rolling_std("Daily_Return", window=21)
    price.create_rolling_std("Week_Return", window=65)
    price.create_rolling_std("Month_Return", window=252)
    # indicadores discretos    
    price.create_indicator("Daily_Return", indicator_factor)
    price.create_indicator("Week_Return", indicator_factor)
    price.create_indicator("Month_Return", indicator_factor)

    df_price_feat = price.finalize()
    if save_intermediate_prices:
        df_price_feat.to_csv(os.path.join(self.out_prices_dir, os.path.basename(price_csv_path)), index=False)

    # Fundamentos
    fund_path = self._fund_path_for_pricefile(self.fund_dir, price_csv_path)
    if fund_path is None:
        df_final = df_price_feat.copy()
        df_final["event"] = 0
    else:
        dff_raw = pd.read_csv(fund_path, sep=None, engine="python", dtype=str)
        fund = FundamentalProcessing(dff_raw, tkr)

        # Diagn√≥stico opcional
        if fund.get_publication_dates().dropna().empty:
            print(f"[AVISO] Sem Data_Publicacao v√°lida para {tkr} em {os.path.basename(fund_path)}")

        pub_dates = set(pd.to_datetime(fund.get_publication_dates(), errors="coerce").dropna().values)
        df_final = df_price_feat.copy()
        df_final["event"] = df_final["Data"].isin(pub_dates).astype(int)

        if attach_fundamentals_asof:
            f_asof = fund.features_for_asof_merge()
            if not f_asof.empty:
                # üîß Chaves sem NaT + ordenadas (requisito do merge_asof)
                df_final = df_final.dropna(subset=["Data"]).sort_values("Data").reset_index(drop=True)
                f_asof   = f_asof.dropna(subset=["Data_Publicacao"]).sort_values("Data_Publicacao").reset_index(drop=True)

                df_final = pd.merge_asof(
                    df_final, f_asof,
                    left_on="Data",
                    right_on="Data_Publicacao",
                    direction="backward"
                )
                if "Data_Publicacao" in df_final.columns:
                    df_final.drop(columns=["Data_Publicacao"], inplace=True)

        if only_events:
            df_final = df_final[df_final["event"] == 1].copy()

    # Garante Ticker correto (sem duplicar)
    if "Ticker" in df_final.columns:
        df_final["Ticker"] = tkr
    else:
        df_final.insert(0, "Ticker", tkr)

    out_final_path = os.path.join(self.out_final_dir, f"{tkr}.final.csv")
    df_final.to_csv(out_final_path, index=False)
    return df_final


In [None]:
# Se√ß√£o 7 ‚Äî Builder do dataset de eventos (PT-BR)  [CORRIGIDA]
# ==========================================================
from typing import Optional, Dict, List
import os, re
import numpy as np
import pandas as pd

# ---------- Helpers de data em PT-BR (dd/mm/aaaa) + YYYYMMDD ----------
def _parse_date_br_str(s: str) -> pd.Timestamp:
    """
    Converte datas de 'dd/mm/aaaa' ou 'YYYYMMDD' para Timestamp normalizado (00:00).
    Aceita espa√ßos e caracteres soltos; retorna NaT se inv√°lida.
    """
    if s is None:
        return pd.NaT
    s = str(s).strip()
    if not s or s.lower() in {"nan", "none"}:
        return pd.NaT

    # limpa tudo que n√£o seja d√≠gito ou '/'
    raw = re.sub(r"[^0-9/]", "", s)

    # caso YYYYMMDD (8 d√≠gitos e sem '/'): parse expl√≠cito sem dayfirst
    if re.fullmatch(r"\d{8}", raw) and "/" not in raw:
        try:
            return pd.to_datetime(raw, format="%Y%m%d", errors="raise").normalize()
        except Exception:
            return pd.NaT

    # caso dd/mm/aaaa (PT-BR) ‚Äî parse expl√≠cito
    try:
        return pd.to_datetime(raw, format="%d/%m/%Y", errors="raise").normalize()
    except Exception:
        # fallback robusto com dayfirst
        return pd.to_datetime(raw, dayfirst=True, errors="coerce").normalize()

def _parse_series_date_br(sr: pd.Series) -> pd.Series:
    return sr.astype(str).map(_parse_date_br_str)

# ---------- Wrapper compat√≠vel para detect_start_index (2 ou 3 args) ----------
def _safe_detect_start_index(prices: pd.DataFrame,
                             announce_date: pd.Timestamp,
                             announce_time: Optional[str] = None) -> int:
    """
    Chama detect_start_index com 3 ou 2 argumentos conforme a vers√£o carregada.
    Evita TypeError no ambiente.
    """
    try:
        return detect_start_index(prices, announce_date, announce_time)  # nova assinatura
    except TypeError:
        return detect_start_index(prices, announce_date)                 # assinatura antiga


class EventDatasetBuilder:
    def __init__(self, mkt_df: pd.DataFrame, rf_df: pd.DataFrame,
                 estimation_window: int = 252, holding_days: int = 30, min_estimation: int = 60):
        """
        estimation_window: janela alvo para Œ≤ (m√°ximo). Ser√° reduzida se n√£o houver hist√≥rico.
        min_estimation   : m√≠nimo exigido para estimar Œ≤ (evita IPOs muito recentes).
        """
        self.mkt = mkt_df.sort_values('Date').reset_index(drop=True)
        self.rf  = rf_df.sort_values('Date').reset_index(drop=True)
        self.estimation_window = estimation_window
        self.holding_days      = holding_days
        self.min_estimation    = min_estimation

    def build_for_ticker(self, tkr: str,
                         price_final_csv: str,
                         fund_raw_csv: str,
                         announce_time_map: Optional[Dict[pd.Timestamp,str]] = None) -> pd.DataFrame:
        """
        price_final_csv: dataset/final/TICKER.final.csv ‚Äî precisa de [Data, Close]
        fund_raw_csv   : dataset/fundamental/TICKER.SA.csv ‚Äî fonte de Data_Publicacao (PT-BR)
        announce_time_map: opcional {Timestamp: 'BMO'|'AMC'|'DUR'}; se None, assume DUR/UNK
        """
        # ----- pre√ßos -----
        px = pd.read_csv(price_final_csv, parse_dates=['Data'])
        px = px.sort_values('Data').dropna(subset=['Close']).reset_index(drop=True)
        px_idx   = px[['Data']].rename(columns={'Data':'Date'})
        px_close = px[['Data','Close']].rename(columns={'Data':'Date'})

        # ----- fundamentos (parse PT-BR/YYYYMMDD) -----
        raw = pd.read_csv(fund_raw_csv, sep=None, engine='python', dtype=str)

        if 'Data_Publicacao' not in raw.columns:
            return pd.DataFrame()

        pub_series = _parse_series_date_br(raw['Data_Publicacao'])
        pub_dates  = pub_series.dropna().drop_duplicates().sort_values()
        if pub_dates.empty:
            return pd.DataFrame()

        # features de fundamentos (QoQ/YoY/EPS etc.), alinhadas √† AnnounceDate
        fund  = FundamentalProcessing(raw, tkr)
        feats = fund.build_qoq_yoy_and_eps()  # deve gerar 'AnnounceDate'
        if 'AnnounceDate' in feats.columns:
            feats['363'] = _parse_series_date_br(feats['AnnounceDate'])
            feats['AnnounceDate'] = feats['AnnounceDate'].dt.normalize()
            feats = feats.dropna(subset=['AnnounceDate'])\
                         .drop_duplicates(subset=['AnnounceDate'])\
                         .reset_index(drop=True)
        else:
            # sem coluna ‚Äî segue s√≥ com CAR/Beta
            feats = pd.DataFrame(columns=['AnnounceDate'])

        # ----- loop de eventos (AnnounceDate = Data_Publicacao) -----
        recs = []
        for ad in pub_dates:
            atime  = announce_time_map.get(ad) if announce_time_map else None  # sem hor√°rio ‚Üí DUR/UNK
            t1_idx = _safe_detect_start_index(px_idx, ad, atime)

            # janela adaptativa para Œ≤
            est_len = min(self.estimation_window, t1_idx)
            if est_len < self.min_estimation:
                continue  # pula eventos sem hist√≥rico suficiente

            beta = estimate_beta(px_close, self.mkt, self.rf, t1_idx, est_len)
            if pd.isna(beta): 
                continue
            car  = compute_car(px_close, self.mkt, self.rf, t1_idx, beta, self.holding_days)
            if pd.isna(car):
                continue

            row = {
                'Ticker'        : tkr,
                'AnnounceDate'  : ad,                             # = Data_Publicacao (PT-BR)
                'EventTradeDate': px_idx.iloc[t1_idx]['Date'],    # 1¬∫ preg√£o ‚â• an√∫ncio
                'CAR_30D'       : float(car),
                'CAR_Sign'      : int(car > 0),
                'Beta'          : float(beta),
                'EstimationLen' : int(est_len),
                'FundSource'    : os.path.basename(fund_raw_csv),
            }

            # anexa features exatamente na AnnounceDate (sem look-ahead)
            if not feats.empty:
                frow = feats.loc[feats['AnnounceDate'] == ad]\
                           .drop(columns=['AnnounceDate','QuarterEnd'], errors='ignore')
                if not frow.empty:
                    row.update(frow.iloc[0].to_dict())

            recs.append(row)

        return pd.DataFrame(recs)


def winsorize_and_standardize(events_df: pd.DataFrame,
                              by_col: str = 'Ticker',
                              exclude_cols: List[str] = ['Ticker','AnnounceDate','EventTradeDate','CAR_30D','CAR_Sign','FundSource']) -> pd.DataFrame:
    """
    Winsoriza 1%/1% por ticker e aplica z-score global.
    (EventTradeDate/FundSource s√£o n√£o-num√©ricas e ficam exclu√≠das.)
    """
    df = events_df.copy()
    num_cols = [c for c in df.columns if c not in exclude_cols and pd.api.types.is_numeric_dtype(df[c])]

    # winsor por ticker
    def _w(g):
        for c in num_cols:
            g[c] = winsorize_series(g[c], p=0.01)
        return g
    df = df.groupby(by_col, group_keys=False).apply(_w)

    # standardize global
    if num_cols:
        m = df[num_cols].fillna(0.0).mean()
        s = df[num_cols].fillna(0.0).std(ddof=0).replace(0, 1.0)
        X = (df[num_cols].fillna(0.0) - m) / s
        X.columns = [f"STD_{c}" for c in X.columns]
        out = pd.concat([df[exclude_cols], X], axis=1)
    else:
        out = df[exclude_cols].copy()
    return out


In [145]:
# Se√ß√£o 8 ‚Äî Helper de fallback p/ fundamentos [NOVO]
# ==================================
import re
from pathlib import Path
from typing import Optional

def find_fund_path_for_tkr(fund_dir: str, tkr: str) -> Optional[Path]:
    """
    Procura dataset/fundamental/<MESMO BASENAME>.SA.csv e, se n√£o achar,
    tenta o 'ticker base' (remove sufixos de classe: 11, 34, 3, 4, 5, 6).
    Ex.: AZUL4 -> AZUL ; SANB11 -> SANB
    """
    # 1) mesmo basename
    p = Path(fund_dir) / f"{tkr}.SA.csv"
    if p.exists():
        return p

    # 2) fallback por ticker base
    t_base = re.sub(r"(11|34|3|4|5|6)$", "", tkr.upper())
    cand = Path(fund_dir) / f"{t_base}.SA.csv"
    if cand.exists():
        return cand

    matches = list(Path(fund_dir).glob(f"{t_base}*.csv"))
    return matches[0] if matches else None


In [146]:
# P√≥s-processamento: limpeza + recomputo Œ≤ & CAR (consist√™ncia) [ATUALIZADO]
# ==========================================================
from pathlib import Path
import numpy as np
import pandas as pd

# --- Wrapper compat√≠vel com detect_start_index de 2 OU 3 argumentos ---
def safe_detect_start_index(prices: pd.DataFrame,
                            announce_date: pd.Timestamp,
                            announce_time=None) -> int:
    """
    Tenta chamar detect_start_index(prices, announce_date, announce_time).
    Se a assinatura antiga (2 args) estiver carregada no ambiente, faz fallback.
    """
    try:
        return detect_start_index(prices, announce_date, announce_time)  # nova assinatura
    except TypeError:
        return detect_start_index(prices, announce_date)                 # assinatura antiga

def _load_price_final_for(ticker: str, final_dir: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Retorna (px_idx, px_close) a partir de dataset/final/<TICKER>.final.csv
    """
    p = Path(final_dir) / f"{ticker}.final.csv"
    if not p.exists():
        return pd.DataFrame(), pd.DataFrame()
    px = pd.read_csv(p, parse_dates=['Data'])
    px = px.dropna(subset=['Close']).sort_values('Data').reset_index(drop=True)
    px_idx   = px[['Data']].rename(columns={'Data':'Date'})
    px_close = px[['Data','Close']].rename(columns={'Data':'Date'})
    return px_idx, px_close

def clean_events(df: pd.DataFrame) -> pd.DataFrame:
    """
    1) Drop AnnounceDate nula
    2) Normaliza dtype e ordena
    3) Remove duplicatas (Ticker, AnnounceDate) mantendo o menor EventTradeDate
    """
    out = df.copy()

    # normaliza datas
    out['AnnounceDate']   = pd.to_datetime(out['AnnounceDate'], errors='coerce')
    if 'EventTradeDate' in out.columns:
        out['EventTradeDate'] = pd.to_datetime(out['EventTradeDate'], errors='coerce')

    # remove nulos de announce
    out = out.dropna(subset=['AnnounceDate'])

    # para desempate em duplicatas, manter menor EventTradeDate
    sort_cols = ['Ticker','AnnounceDate'] + (['EventTradeDate'] if 'EventTradeDate' in out.columns else [])
    out = out.sort_values(sort_cols).drop_duplicates(subset=['Ticker','AnnounceDate'], keep='first').reset_index(drop=True)
    return out

def recompute_beta_car(ready_df: pd.DataFrame,
                       final_dir: str,
                       mkt_df: pd.DataFrame,
                       rf_df: pd.DataFrame,
                       estimation_window: int = 252,
                       min_estimation: int = 60,
                       holding_days: int = 30) -> pd.DataFrame:
    """
    Recalcula Beta, CAR_30D, EstimationLen e EventTradeDate para TODO o dataset,
    garantindo consist√™ncia com Se√ß√£o 5 (detect_start_index/estimate_beta/compute_car).
    """
    df = ready_df.copy()
    df['AnnounceDate'] = pd.to_datetime(df['AnnounceDate'], errors='coerce')

    for tkr, grp in df.groupby('Ticker', sort=False):
        px_idx, px_close = _load_price_final_for(tkr, final_dir)
        if px_idx.empty or px_close.empty:
            df.loc[grp.index, ['Beta','CAR_30D','EstimationLen','EventTradeDate']] = np.nan
            continue

        for i in grp.index:
            ad = df.at[i, 'AnnounceDate']
            if pd.isna(ad):
                df.at[i, 'Beta'] = np.nan
                df.at[i, 'CAR_30D'] = np.nan
                df.at[i, 'EstimationLen'] = np.nan
                df.at[i, 'EventTradeDate'] = pd.NaT
                continue

            # T1 de acordo com DUR/UNK (sem hor√°rio) ‚Äî usa wrapper compat√≠vel
            t1_idx = safe_detect_start_index(px_idx, ad, None)

            # janela adaptativa
            est_len = min(estimation_window, t1_idx)
            if est_len < min_estimation:
                df.at[i, 'Beta'] = np.nan
                df.at[i, 'CAR_30D'] = np.nan
                df.at[i, 'EstimationLen'] = est_len
                df.at[i, 'EventTradeDate'] = px_idx.iloc[t1_idx]['Date'] if len(px_idx) else pd.NaT
                continue

            beta = estimate_beta(px_close, mkt_df, rf_df, t1_idx, est_len)
            car  = compute_car(px_close, mkt_df, rf_df, t1_idx, beta, holding_days)

            df.at[i, 'Beta'] = beta
            df.at[i, 'CAR_30D'] = car
            df.at[i, 'EstimationLen'] = est_len
            df.at[i, 'EventTradeDate'] = px_idx.iloc[t1_idx]['Date']

    return df.sort_values(['Ticker','AnnounceDate']).reset_index(drop=True)


In [None]:
# Uso
# ==================================
if __name__ == "__main__":
    from pathlib import Path

    # 0) Pastas
    ensure_dirs()

    # 1) Pr√©-processa todos os pap√©is (gera dataset/prices_processed/ e dataset/final/TICKER.final.csv)
    pipeline = DataPrepPipeline(
        prices_dir="dataset/prices",
        fund_dir="dataset/fundamental",
        out_prices_dir="dataset/prices_processed",
        out_final_dir="dataset/final"
    )
    df_all_final = pipeline.process_all(
        indicator_factor=0.1,
        save_intermediate_prices=True,
        attach_fundamentals_asof=True,  # fundamentos ‚Äúvigentes‚Äù at√© a pr√≥xima publica√ß√£o
        only_events=False               # se quiser s√≥ linhas em datas de evento, mude para True
    )
    print("Step 1 OK ‚Äî final_dataprep.csv salvo em dataset/final/")

    # 2) Carrega proxies de mercado e CDI dos CSVs locais
    mkt = MarketAndRiskLoader.load_ibov_csv("dataset/prices/IBOV.SA.csv")   # Date, Close
    rf  = MarketAndRiskLoader.load_cdi_csv("dataset/prices/CDI.SA.csv")     # Date, rf_daily

    # 3) Constr√≥i dataset de eventos (CAR_30D, Beta, ŒîQoQ/ŒîYoY, EPS/proxy)
    builder = EventDatasetBuilder(mkt_df=mkt, rf_df=rf, estimation_window=252, holding_days=30)
    all_events = []
    for f in Path("dataset/final").glob("*.final.csv"):
        tkr = f.stem.split(".")[0]
        # Fundamental com o MESMO basename do pre√ßo
        fund_path = Path("dataset/fundamental") / f"{tkr}.SA.csv"
        if not fund_path.exists():
            cand = next(Path("dataset/fundamental").glob(f"{tkr}*.csv"), None)
            fund_path = cand if cand else None
        if fund_path is None or not Path(fund_path).exists():
            continue

        # Sem hor√°rio ‚Üí DUR/UNK (T1 = 1¬∫ preg√£o ‚â• Data de Publica√ß√£o)
        ev_tkr = builder.build_for_ticker(tkr, str(f), str(fund_path), announce_time_map=None)
        if not ev_tkr.empty:
            all_events.append(ev_tkr)

    if all_events:
        events_df = pd.concat(all_events, ignore_index=True)
        ready_df  = winsorize_and_standardize(events_df)
        ready_df.to_csv("dataset/final/pead_event_dataset_2010_2019.csv", index=False)
        print("OK ‚Äî dataset de eventos salvo em dataset/final/pead_event_dataset_2010_2019.csv")
        print(ready_df.head())
    else:
        print("Nenhum evento encontrado. Verifique se 'Data_Publicacao' existe e est√° no formato dd/mm/aaaa nos CSVs de fundamentos.")


In [149]:
# 1) Limpeza (nulos + duplicatas)
ready_df = clean_events(ready_df)

# 2) Recalcula Œ≤, CAR e ajusta EventTradeDate para todo o dataset (consist√™ncia)
ready_df = recompute_beta_car(
    ready_df,
    final_dir="dataset/final",
    mkt_df=mkt,
    rf_df=rf,
    estimation_window=252,
    min_estimation=60,
    holding_days=30
)

# 3) Salva o dataset final pronto
ready_df.to_csv("dataset/final/pead_event_dataset_2010_2019.csv", index=False)
print("OK ‚Äî dataset de eventos (limpo e consistente) salvo em dataset/final/pead_event_dataset_2010_2019.csv")


OK ‚Äî dataset de eventos (limpo e consistente) salvo em dataset/final/pead_event_dataset_2010_2019.csv


In [133]:
# validate_pead_outputs.py
import os, re, random
import numpy as np
import pandas as pd
from pathlib import Path

# === Ajuste caminhos se precisar
PEAD_DATASET = "dataset/final/pead_event_dataset_2010_2019.csv"
FINAL_DIR    = "dataset/final"
FUND_DIR     = "dataset/fundamental"
PRICES_DIR   = "dataset/prices"
IBOV_CSV     = "dataset/prices/IBOV.SA.csv"
CDI_CSV      = "dataset/prices/CDI.SA.csv"

# ---------- helpers leves (iguais aos do pipeline) ----------
def to_float_smart(x):
    if x is None or (isinstance(x, float) and np.isnan(x)): return np.nan
    s = str(x).strip()
    if s == "" or s.lower() in {"nd","nan","none"}: return np.nan
    # tenta detectar formato com v√≠rgula decimal
    if "," in s and s.count(",") == 1 and s.count(".") >= 1:
        s = s.replace(".", "").replace(",", ".")
    else:
        s = re.sub(r"[^0-9\.\-]", "", s)
        if s.count(".") > 1:
            parts = s.split(".")
            s = "".join(parts[:-1]) + "." + parts[-1]
    try:
        return float(s)
    except Exception:
        return np.nan

def load_ibov(path_ibov: str) -> pd.DataFrame:
    df = pd.read_csv(path_ibov, dtype=str)
    df["Date"] = pd.to_datetime(df["Data"], dayfirst=True, errors="coerce")
    for c in ["FechAjust","FechHist"]:
        if c in df: df[c] = df[c].apply(to_float_smart)
    close = "FechAjust" if "FechAjust" in df and df["FechAjust"].notna().any() else "FechHist"
    out = df[["Date", close]].rename(columns={close:"Close"}).dropna()
    return out.sort_values("Date").reset_index(drop=True)

def load_cdi(path_cdi: str) -> pd.DataFrame:
    df = pd.read_csv(path_cdi, dtype=str)
    df["Date"] = pd.to_datetime(df["Data"], dayfirst=True, errors="coerce")
    if "Var" in df:
        df["Var"] = df["Var"].apply(to_float_smart)
        if df["Var"].notna().sum() > 3:
            return df[["Date"]].assign(rf_daily=df["Var"]/100.0).dropna().sort_values("Date").reset_index(drop=True)
    for c in ["FechAjust","FechHist"]:
        if c in df:
            df[c] = df[c].apply(to_float_smart)
            if df[c].notna().sum() > 3:
                r = df[c].pct_change()
                return df[["Date"]].assign(rf_daily=r).dropna().sort_values("Date").reset_index(drop=True)
    return pd.DataFrame(columns=["Date","rf_daily"])

def detect_start_index(prices: pd.DataFrame, announce_date: pd.Timestamp) -> int:
    # Sem hor√°rio (DUR/UNK): t1 = primeiro preg√£o >= announce_date
    ts = prices['Date'].values
    idx_ge = np.searchsorted(ts, np.array(announce_date, dtype='datetime64[ns]'))
    return min(idx_ge, len(prices)-1)

def estimate_beta(stock_df: pd.DataFrame, mkt_df: pd.DataFrame, rf_df: pd.DataFrame,
                  event_idx: int, estimation_window: int = 252) -> float:
    m = stock_df[['Date','Close']].merge(mkt_df[['Date','Close']], on='Date', suffixes=('_i','_m'))
    m = m.merge(rf_df[['Date','rf_daily']], on='Date', how='left').fillna(method='ffill').sort_values('Date')
    m['ri'] = m['Close_i'].pct_change();  m['rm'] = m['Close_m'].pct_change()
    if event_idx < 2: return np.nan
    event_date = stock_df.iloc[event_idx]['Date']
    eidx = m.index[m['Date'] == event_date]
    if len(eidx) == 0: return np.nan
    eidx = eidx[0]; start = max(m.index.min(), eidx - estimation_window); end = eidx - 1
    if end - start < 30: return np.nan
    win = m.loc[start:end].dropna()
    if win.empty: return np.nan
    x = (win['rm'] - win['rf_daily']).values.reshape(-1, 1)
    y = (win['ri'] - win['rf_daily']).values.reshape(-1, 1)
    return float(np.linalg.lstsq(x, y, rcond=None)[0][0])

def compute_car(stock_df: pd.DataFrame, mkt_df: pd.DataFrame, rf_df: pd.DataFrame,
                event_idx: int, beta: float, holding_days: int = 30) -> float:
    m = stock_df[['Date','Close']].merge(mkt_df[['Date','Close']], on='Date', suffixes=('_i','_m'))
    m = m.merge(rf_df[['Date','rf_daily']], on='Date', how='left').fillna(method='ffill').sort_values('Date')
    m['ri'] = m['Close_i'].pct_change();  m['rm'] = m['Close_m'].pct_change()
    start = event_idx; end = min(start + holding_days - 1, len(m) - 1)
    seg = m.iloc[start:end+1].dropna()
    if seg.empty: return np.nan
    seg['E_ri'] = seg['rf_daily'] + beta * (seg['rm'] - seg['rf_daily'])
    seg['AR'] = seg['ri'] - seg['E_ri']
    return float(seg['AR'].sum())

def find_fund_path_for_tkr(fund_dir: str, tkr: str) -> Path | None:
    p = Path(fund_dir) / f"{tkr}.SA.csv"
    if p.exists(): return p
    t_base = re.sub(r"(11|34|3|4|5|6)$", "", tkr.upper())
    cand = Path(fund_dir) / f"{t_base}.SA.csv"
    if cand.exists(): return cand
    matches = list(Path(fund_dir).glob(f"{t_base}*.csv"))
    return matches[0] if matches else None

# ---------- (A) INTEGRIDADE DO DATASET ----------
print("\n[A] Checando integridade do dataset final...")
df = pd.read_csv(PEAD_DATASET, dtype=str)
# normaliza tipos
date_col = 'AnnounceDate'
df[date_col] = pd.to_datetime(df[date_col], dayfirst=True, errors='coerce')

num_cols = [c for c in df.columns if c.startswith('CAR_') or c.startswith('STD_')]
for c in num_cols:
    df[c] = pd.to_numeric(df[c].apply(to_float_smart), errors='coerce')

required = {'Ticker','AnnounceDate','CAR_30D','CAR_Sign'}
missing = required - set(df.columns)
print(f" - Colunas obrigat√≥rias faltando: {missing}" if missing else " - Colunas obrigat√≥rias OK")

null_dates = df['AnnounceDate'].isna().sum()
null_cars  = df['CAR_30D'].isna().sum()
dups = df.duplicated(['Ticker','AnnounceDate']).sum()
print(f" - Datas nulas: {null_dates} | CAR_30D nulos: {null_cars} | Duplicatas (Ticker,AnnounceDate): {dups}")

print(" - Stats CAR_30D:", df['CAR_30D'].describe(percentiles=[.01,.05,.5,.95,.99]))

# ---------- (B) SANIDADE DAS FEATURES (Z-SCORES) ----------
print("\n[B] Checando z-scores (STD_*)...")
zcols = [c for c in df.columns if c.startswith('STD_')]
if not zcols:
    print(" - Nenhuma coluna STD_* encontrada.")
else:
    means = df[zcols].mean(numeric_only=True)
    stds  = df[zcols].std(ddof=0, numeric_only=True)
    worst_mean = means.abs().sort_values(ascending=False).head(5)
    worst_std  = (stds - 1).abs().sort_values(ascending=False).head(5)
    print(" - |mean(STD_*)| maiores (esperado ~0):")
    print(worst_mean)
    print(" - |std(STD_*) - 1| maiores (esperado ~0):")
    print(worst_std)

    # Red flags de magnitude absurda (prov√°vel problema de leitura/locale)
    big = (df[zcols].abs() > 8).sum().sort_values(ascending=False)
    offenders = big[big > 0]
    if len(offenders):
        print("\n !!! Red flags: valores |STD_*| > 8 detectados (checar locale/parse). Top problem√°ticos:")
        print(offenders.head(10))
    else:
        print(" - Magnitudes de STD_* dentro do esperado (<= 8).")

# ---------- (C) RECONTAGEM DE CAR/Œ≤ PARA UMA AMOSTRA ----------
print("\n[C] Recontando CAR/Œ≤ para amostra (confirma√ß√£o econom√©trica)...")
mkt = load_ibov(IBOV_CSV)
rf  = load_cdi(CDI_CSV)

# Escolhe at√© 5 eventos aleat√≥rios (com ticker presente no FINAL_DIR)
events = df.dropna(subset=['AnnounceDate']).copy()
events['exists_final'] = events['Ticker'].apply(lambda t: Path(FINAL_DIR, f"{t}.final.csv").exists())
sample = events[events['exists_final']].sample(min(5, len(events[events['exists_final']])) , random_state=42) if len(events[events['exists_final']])>0 else pd.DataFrame()

def reload_close_series(tkr: str) -> pd.DataFrame:
    f = Path(FINAL_DIR, f"{tkr}.final.csv")
    d = pd.read_csv(f, parse_dates=['Data'])
    d = d[['Data','Close']].rename(columns={'Data':'Date'}).dropna().sort_values('Date')
    return d

if sample.empty:
    print(" - N√£o h√° amostra eleg√≠vel (confira se existem dataset/final/<TICKER>.final.csv).")
else:
    for _, row in sample.iterrows():
        tkr = row['Ticker']
        ad  = row['AnnounceDate']
        car_saved = row['CAR_30D']
        try:
            px = reload_close_series(tkr)
            t1 = detect_start_index(px[['Date']], ad)
            beta = estimate_beta(px, mkt, rf, t1, 252)
            car  = compute_car(px, mkt, rf, t1, beta, 30)
            diff = float(car) - float(car_saved)
            print(f" - {tkr} | {ad.date()} | CAR_saved={car_saved:.6f}  CAR_recalc={car:.6f}  Œî={diff:.6e}  Œ≤={beta:.3f}")
        except Exception as e:
            print(f"   ! Falha ao recontar {tkr} {ad.date()}: {e}")

# ---------- (D) COER√äNCIA EVENTO x FUNDAMENTO ----------
print("\n[D] Conferindo que AnnounceDate ‚àà Data_Publicacao (amostra por ticker)...")
def publication_set(tkr: str):
    p = find_fund_path_for_tkr(FUND_DIR, tkr)
    if not p: return set()
    fr = pd.read_csv(p, dtype=str)
    if 'Data_Publicacao' not in fr: return set()
    s = pd.to_datetime(fr['Data_Publicacao'], dayfirst=True, errors='coerce').dropna()
    return set(s.dt.normalize().values)

for tkr in df['Ticker'].unique()[:5]:  # checa primeiros 5 tickers
    pub = publication_set(tkr)
    if not pub:
        print(f" - {tkr}: sem Data_Publicacao no CSV de fundamentos.")
        continue
    ann = pd.to_datetime(df.loc[df['Ticker']==tkr, 'AnnounceDate']).dt.normalize()
    miss = ann[~ann.isin(pd.to_datetime(list(pub)).normalize())]
    rate = 1 - (len(miss)/max(1,len(ann)))
    print(f" - {tkr}: {rate:.0%} das AnnounceDate batem com Data_Publicacao (faltando {len(miss)})")



[A] Checando integridade do dataset final...
 - Colunas obrigat√≥rias OK
 - Datas nulas: 2 | CAR_30D nulos: 0 | Duplicatas (Ticker,AnnounceDate): 1
 - Stats CAR_30D: count    9.000000
mean     0.333776
std      0.291750
min     -0.035384
1%      -0.031518
5%      -0.016055
50%      0.249886
95%      0.689457
99%      0.698490
max      0.700748
Name: CAR_30D, dtype: float64

[B] Checando z-scores (STD_*)...
 - |mean(STD_*)| maiores (esperado ~0):
STD_Beta                       1.011537e-15
STD_EstimationLen              1.233581e-16
STD_Divida_Bruta_Y_Change      2.467162e-17
STD_Divida_Liquida_Q_Change    2.467162e-17
STD_ROE_Q_Change               2.467162e-17
dtype: float64
 - |std(STD_*) - 1| maiores (esperado ~0):
STD_CAPEX                        1.0
STD_DVA_Despesas_Fin             1.0
STD_CRESC_RL_12M_Y_Change        1.0
STD_CRESC_LL_12M_Y_Change        1.0
STD_CRESC_EBITDA_12M_Y_Change    1.0
dtype: float64
 - Magnitudes de STD_* dentro do esperado (<= 8).

[C] Recontando CAR/Œ

  m = m.merge(rf_df[['Date','rf_daily']], on='Date', how='left').fillna(method='ffill').sort_values('Date')
  return float(np.linalg.lstsq(x, y, rcond=None)[0][0])
  m = m.merge(rf_df[['Date','rf_daily']], on='Date', how='left').fillna(method='ffill').sort_values('Date')
  m = m.merge(rf_df[['Date','rf_daily']], on='Date', how='left').fillna(method='ffill').sort_values('Date')
  return float(np.linalg.lstsq(x, y, rcond=None)[0][0])
  m = m.merge(rf_df[['Date','rf_daily']], on='Date', how='left').fillna(method='ffill').sort_values('Date')
  m = m.merge(rf_df[['Date','rf_daily']], on='Date', how='left').fillna(method='ffill').sort_values('Date')
  return float(np.linalg.lstsq(x, y, rcond=None)[0][0])
  m = m.merge(rf_df[['Date','rf_daily']], on='Date', how='left').fillna(method='ffill').sort_values('Date')
  m = m.merge(rf_df[['Date','rf_daily']], on='Date', how='left').fillna(method='ffill').sort_values('Date')
  return float(np.linalg.lstsq(x, y, rcond=None)[0][0])
  m = m.merge(rf_df[

#### Testes 

In [None]:
#üîß C√©lula A ‚Äî Helpers robustos (NOVA)
import pandas as pd
import numpy as np

def coerce_date_any(s: pd.Series) -> pd.Series:
    """Aceita 'dd/mm/aaaa' e 'aaaa-mm-dd'. Retorna datetime64[ns]."""
    s = s.astype(str).str.strip()
    d1 = pd.to_datetime(s, format="%d/%m/%Y", errors="coerce")
    d2 = pd.to_datetime(s, format="%Y-%m-%d", errors="coerce")
    d3 = pd.to_datetime(s, errors="coerce")  # fallback
    out = d1.fillna(d2).fillna(d3)
    return out

def coerce_num_br(x: pd.Series) -> pd.Series:
    """
    Converte strings com milhares '.' e decimal ',' para float.
    Se j√° for num√©rico, mant√©m.
    """
    if pd.api.types.is_numeric_dtype(x):
        return x.astype(float)
    s = x.astype(str).str.replace(".", "", regex=False).str.replace(",", ".", regex=False)
    return pd.to_numeric(s, errors="coerce")

def read_events_csv(path: str) -> pd.DataFrame:
    """L√™ o dataset de eventos com datas robustas."""
    df = pd.read_csv(path)
    if "AnnounceDate" in df.columns:
        df["AnnounceDate"] = coerce_date_any(df["AnnounceDate"])
    if "EventTradeDate" in df.columns:
        df["EventTradeDate"] = coerce_date_any(df["EventTradeDate"])
    return df


In [158]:
#C√©lula 1 ‚Äî Splits temporais + Purged K-Fold com embargo
import numpy as np
import pandas as pd
from typing import Iterator, Tuple

def year_splits(df: pd.DataFrame,
                date_col: str = "EventTradeDate",
                train_end: int = 2016,
                val_end: int = 2018) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Split temporal fixo: Treino=<=train_end, Val=(train_end, val_end], Test=>val_end.
    Retorna arrays de √≠ndices (posi√ß√µes no df recebido).
    """
    y = pd.to_datetime(df[date_col]).dt.year
    i_train = np.where(y <= train_end)[0]
    i_val   = np.where((y > train_end) & (y <= val_end))[0]
    i_test  = np.where(y > val_end)[0]
    return i_train, i_val, i_test

def _intervals_from_events(dates: pd.Series, hold: int = 30) -> np.ndarray:
    """
    Para cada amostra, define o intervalo [T1, T1+hold-1] em segundos UNIX.
    Usado para 'purge' de sobreposi√ß√£o.
    """
    d = pd.to_datetime(dates).dt.floor("D").astype("int64") // 10**9
    start = d.values
    end   = d.values + 60*60*24*(hold-1)  # (hold-1) dias corridos (aprox conservadora)
    return np.vstack([start, end]).T  # shape (n,2)

def purged_kfold(df: pd.DataFrame,
                 n_splits: int = 5,
                 date_col: str = "EventTradeDate",
                 holding_days: int = 30,
                 embargo_days: int = 30) -> Iterator[Tuple[np.ndarray, np.ndarray]]:
    """
    K-fold temporal com 'purge' (remove do treino eventos que colidem com a janela de teste)
    e 'embargo' (remove eventos imediatamente ap√≥s a janela de teste).
    Retorna tuplas (idx_treino, idx_teste) como √≠ndices do df original.
    """
    df = df.reset_index(drop=True).copy()
    order = np.argsort(pd.to_datetime(df[date_col]).values)
    n = len(df)
    fold_edges = np.linspace(0, n, n_splits+1, dtype=int)

    # janelas [start,end] em segundos
    intervals = _intervals_from_events(df[date_col], hold=holding_days)

    for k in range(n_splits):
        lo, hi = fold_edges[k], fold_edges[k+1]
        test_idx = order[lo:hi]
        train_idx = np.setdiff1d(order, test_idx, assume_unique=False)

        # janela agregada de teste
        t_lo = intervals[test_idx, 0].min()
        t_hi = intervals[test_idx, 1].max()

        # embargo p√≥s-teste
        emb_hi = t_hi + 60*60*24*embargo_days

        # mant√©m no treino apenas quem N√ÉO intersecta [t_lo, emb_hi]
        keep = []
        for i in train_idx:
            s, e = intervals[i]
            # Sem interse√ß√£o se totalmente antes OU totalmente depois do embargo
            if (e < t_lo) or (s > emb_hi):
                keep.append(i)

        train_idx_purged = np.array(keep, dtype=int)
        yield np.sort(train_idx_purged), np.sort(test_idx)


In [166]:
#üß† C√©lula 2 ‚Äî prepare_xy / XGBoost / CV (SUBSTITUA A SUA)
import os, json
from typing import Dict, Tuple, List
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import roc_auc_score, accuracy_score, mean_squared_error, r2_score

RANDOM_STATE = 42

def prepare_xy(events_csv: str,
               target_mode: str = "cls",
               min_estimation: int = 60,
               drop_na_target: bool = True) -> Tuple[pd.DataFrame, pd.Series, List[str]]:
    df = read_events_csv(events_csv)  # << usa parser robusto
    if "EstimationLen" in df.columns:
        df = df[df["EstimationLen"] >= min_estimation].copy()

    # alvo
    if target_mode == "cls":
        y = (df["CAR_30D"] > 0).astype(int)
    else:
        y = df["CAR_30D"].astype(float)

    # features = apenas STD_* (evita vazamento)
    feats = [c for c in df.columns if c.startswith("STD_")]
    X = df[feats].copy().fillna(0.0)

    if drop_na_target:
        m = ~y.isna()
        X, y = X.loc[m].reset_index(drop=True), y.loc[m].reset_index(drop=True)

    # colunas auxiliares
    X["EventTradeDate"] = df.loc[X.index, "EventTradeDate"].values
    X["Ticker"] = df.loc[X.index, "Ticker"].values
    return X, y, feats

def xgb_default_params(mode: str) -> Dict:
    if mode == "cls":
        return dict(
            n_estimators=800, learning_rate=0.03, max_depth=4,
            subsample=0.8, colsample_bytree=0.8, min_child_weight=5,
            reg_lambda=1.0, gamma=0.0, random_state=RANDOM_STATE,
            objective="binary:logistic", eval_metric="auc"
        )
    else:
        return dict(
            n_estimators=800, learning_rate=0.03, max_depth=4,
            subsample=0.8, colsample_bytree=0.8, min_child_weight=5,
            reg_lambda=1.0, gamma=0.0, random_state=RANDOM_STATE,
            objective="reg:squarederror", eval_metric="rmse"
        )

# === Purged K-Fold com embargo (mesma assinatura de antes) ===
def year_splits(df: pd.DataFrame, date_col: str = "EventTradeDate", train_end: int = 2016, val_end: int = 2018):
    y = pd.to_datetime(df[date_col]).dt.year
    i_train = np.where(y <= train_end)[0]
    i_val   = np.where((y > train_end) & (y <= val_end))[0]
    i_test  = np.where(y > val_end)[0]
    return i_train, i_val, i_test

def _intervals_from_events(dates: pd.Series, hold: int = 30) -> np.ndarray:
    d = coerce_date_any(dates).dt.floor("D").astype("int64") // 10**9
    start = d.values
    end   = d.values + 60*60*24*(hold-1)
    return np.vstack([start, end]).T

def purged_kfold(df: pd.DataFrame,
                 n_splits: int = 5,
                 date_col: str = "EventTradeDate",
                 holding_days: int = 30,
                 embargo_days: int = 30):
    df = df.reset_index(drop=True).copy()
    order = np.argsort(coerce_date_any(df[date_col]).values)
    n = len(df)
    fold_edges = np.linspace(0, n, n_splits+1, dtype=int)
    intervals = _intervals_from_events(df[date_col], hold=holding_days)
    for k in range(n_splits):
        lo, hi = fold_edges[k], fold_edges[k+1]
        test_idx = order[lo:hi]
        train_idx = np.setdiff1d(order, test_idx, assume_unique=False)
        t_lo = intervals[test_idx, 0].min()
        t_hi = intervals[test_idx, 1].max()
        emb_hi = t_hi + 60*60*24*embargo_days
        keep = []
        for i in train_idx:
            s, e = intervals[i]
            if (e < t_lo) or (s > emb_hi):
                keep.append(i)
        train_idx_purged = np.array(keep, dtype=int)
        yield np.sort(train_idx_purged), np.sort(test_idx)

def cv_purged_scores(X: pd.DataFrame, y: pd.Series,
                     mode: str = "cls",
                     n_splits: int = 5,
                     holding_days: int = 30,
                     embargo_days: int = 30):
    params = xgb_default_params(mode)
    model = XGBClassifier(**params) if mode == "cls" else XGBRegressor(**params)
    metrics = {}
    for fold, (itr, ite) in enumerate(
        purged_kfold(X.assign(EventTradeDate=X["EventTradeDate"]),
                     n_splits=n_splits,
                     date_col="EventTradeDate",
                     holding_days=holding_days,
                     embargo_days=embargo_days), 1
    ):
        Xtr = X.iloc[itr].drop(columns=["EventTradeDate","Ticker"])
        Xte = X.iloc[ite].drop(columns=["EventTradeDate","Ticker"])
        ytr, yte = y.iloc[itr], y.iloc[ite]
        model.fit(Xtr, ytr)
        if mode == "cls":
            proba = model.predict_proba(Xte)[:,1]
            auc = roc_auc_score(yte, proba)
            metrics[fold] = float(auc)
        else:
            pred = model.predict(Xte)
            rmse = float(np.sqrt(mean_squared_error(yte, pred)))
            metrics[fold] = rmse
    avg = float(np.mean(list(metrics.values()))) if metrics else float("nan")
    return avg, metrics

def train_holdout_and_save(X: pd.DataFrame, y: pd.Series, feats: List[str],
                           out_dir: str = "models",
                           mode: str = "cls"):
    os.makedirs(out_dir, exist_ok=True)
    i_tr, i_va, i_te = year_splits(X, date_col="EventTradeDate", train_end=2016, val_end=2018)
    Xtr, Xva, Xte = X.iloc[i_tr], X.iloc[i_va], X.iloc[i_te]
    ytr, yva, yte = y.iloc[i_tr], y.iloc[i_va], y.iloc[i_te]
    params = xgb_default_params(mode)
    model = XGBClassifier(**params) if mode == "cls" else XGBRegressor(**params)
    Xtv = pd.concat([Xtr, Xva]).drop(columns=["EventTradeDate","Ticker"])
    ytv = pd.concat([ytr, yva])
    Xte2 = Xte.drop(columns=["EventTradeDate","Ticker"])
    model.fit(Xtv, ytv)
    if mode == "cls":
        proba = model.predict_proba(Xte2)[:,1]
        auc = roc_auc_score(y.iloc[i_te], proba)
        acc = accuracy_score(y.iloc[i_te], (proba>=0.5).astype(int))
        test_metrics = {"AUC": float(auc), "ACC": float(acc)}
    else:
        pred = model.predict(Xte2)
        rmse = float(np.sqrt(mean_squared_error(y.iloc[i_te], pred)))
        r2   = float(r2_score(y.iloc[i_te], pred))
        test_metrics = {"RMSE": rmse, "R2": r2}
    imp = pd.DataFrame({"feature": feats, "gain": model.feature_importances_}).sort_values("gain", ascending=False)
    model.save_model(os.path.join(out_dir, f"xgb_{mode}.json"))
    imp.to_csv(os.path.join(out_dir, f"feature_importance_{mode}.csv"), index=False)
    with open(os.path.join(out_dir, f"test_metrics_{mode}.json"), "w") as f:
        json.dump(test_metrics, f, indent=2)
    return test_metrics


In [167]:
#üìà C√©lula 3 ‚Äî Event Study (SUBSTITUA A SUA)
import os, math
from typing import Dict, Tuple

def load_market_rf(mkt_csv: str, rf_csv: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    mkt = pd.read_csv(mkt_csv)
    mkt["Date"]  = coerce_date_any(mkt["Data"])
    mkt["Close"] = coerce_num_br(mkt["FechAjust"])
    mkt = mkt.dropna(subset=["Date","Close"])[["Date","Close"]].sort_values("Date").reset_index(drop=True)

    rf = pd.read_csv(rf_csv)
    rf["Date"] = coerce_date_any(rf["Data"])
    if "rf_daily" not in rf.columns:
        if "Var" in rf.columns:
            rf["rf_daily"] = coerce_num_br(rf["Var"])  # CDI di√°rio (decimal)
        else:
            # fallback neutro (0) se n√£o houver taxa di√°ria; ajuste se tiver outra coluna apropriada
            rf["rf_daily"] = 0.0
    rf = rf.dropna(subset=["Date"])[["Date","rf_daily"]].sort_values("Date").reset_index(drop=True)

    # *garante* dtype datetime64
    mkt["Date"] = pd.to_datetime(mkt["Date"], errors="coerce")
    rf["Date"]  = pd.to_datetime(rf["Date"],  errors="coerce")
    return mkt, rf

def _merge_stock_mkt_rf(px: pd.DataFrame, mkt: pd.DataFrame, rf: pd.DataFrame) -> pd.DataFrame:
    # px: Data, Close_i
    px = px.copy()
    px["Date"]   = pd.to_datetime(px["Date"], errors="coerce")
    px["Close_i"]= coerce_num_br(px["Close_i"]) if not pd.api.types.is_numeric_dtype(px["Close_i"]) else px["Close_i"]
    px = px.dropna(subset=["Date","Close_i"])

    # garantir tipos
    m = (px.merge(mkt, on="Date", suffixes=("_i","_m"))
            .merge(rf, on="Date", how="left")
            .sort_values("Date")
            .reset_index(drop=True))
    m["rf_daily"] = m["rf_daily"].fillna(method="ffill").fillna(0.0)
    m["ri"] = m["Close_i"].pct_change()
    m["rm"] = m["Close"].pct_change()
    return m

def _estimate_beta_and_sigma(m: pd.DataFrame, event_idx: int, est_len: int) -> Tuple[float, float, pd.DataFrame]:
    start = max(0, event_idx - est_len)
    end   = event_idx - 1
    win = m.iloc[start:end+1].dropna(subset=["ri","rm","rf_daily"]).copy()
    if len(win) < 30:
        return np.nan, np.nan, win
    x = (win["rm"] - win["rf_daily"]).values.reshape(-1,1)
    y = (win["ri"] - win["rf_daily"]).values.reshape(-1,1)
    beta = np.linalg.lstsq(x, y, rcond=None)[0].ravel()[0]
    win["E_ri"] = win["rf_daily"] + beta * (win["rm"] - win["rf_daily"])
    win["eps"]  = (win["ri"] - win["E_ri"])
    sigma = float(win["eps"].std(ddof=1))
    return float(beta), sigma, win

def _event_window_ar(m: pd.DataFrame, event_idx: int, beta: float, L: int = 30) -> pd.Series:
    seg = m.iloc[event_idx:event_idx+L].dropna(subset=["ri","rm","rf_daily"]).copy()
    if seg.empty:
        return pd.Series(dtype=float)
    seg["E_ri"] = seg["rf_daily"] + beta * (seg["rm"] - seg["rf_daily"])
    seg["AR"]   = seg["ri"] - seg["E_ri"]
    seg["t"]    = range(len(seg))
    return seg.set_index("t")["AR"]

def aar_caar_by_quintile(events_csv: str,
                         final_dir: str,
                         mkt_csv: str,
                         rf_csv: str,
                         signal_col: str = "STD_EPS_EarningsSurprise",
                         holding_days: int = 30,
                         estimation_window: int = 252,
                         min_estimation: int = 60) -> Dict[str, pd.DataFrame]:
    mkt, rf = load_market_rf(mkt_csv, rf_csv)
    ev = read_events_csv(events_csv)  # robusto
    ev = ev.dropna(subset=["EventTradeDate"]).copy()

    if signal_col not in ev.columns:
        signal_col = "STD_LPA" if "STD_LPA" in ev.columns else ev.filter(like="EarningsSurprise").columns[0]

    ev["Year"] = ev["EventTradeDate"].dt.year
    ev["Q"] = ev.groupby("Year")[signal_col].transform(lambda s: pd.qcut(s.rank(method="first"), 5, labels=False)+1)

    out = {}
    for q in [1,2,3,4,5]:
        sub = ev[ev["Q"]==q].copy()
        if sub.empty:
            continue
        ars, sars = [], []
        for _, r in sub.iterrows():
            tkr = r["Ticker"]
            t1  = r["EventTradeDate"]

            p = os.path.join(final_dir, f"{tkr}.final.csv")
            if not os.path.exists(p):
                continue
            px = pd.read_csv(p)
            px["Date"]    = coerce_date_any(px["Data"])
            px["Close_i"] = coerce_num_br(px["Close"])
            px = px.dropna(subset=["Date","Close_i"])[["Date","Close_i"]].sort_values("Date").reset_index(drop=True)

            m = _merge_stock_mkt_rf(px, mkt, rf)
            ds = m["Date"].values
            eidx = int(np.searchsorted(ds, np.array(pd.Timestamp(t1), dtype="datetime64[ns]")))
            est_len = min(estimation_window, eidx)
            if est_len < min_estimation:
                continue

            beta, sigma, _ = _estimate_beta_and_sigma(m, eidx, est_len)
            if not np.isfinite(beta) or not np.isfinite(sigma) or sigma==0:
                continue
            ar = _event_window_ar(m, eidx, beta, L=holding_days)
            if ar.empty:
                continue
            ars.append(ar)
            sars.append(ar / sigma)

        if not ars:
            continue

        L = holding_days
        M_ar   = pd.DataFrame({i: s.reindex(range(L)) for i, s in enumerate(ars)})
        M_sar  = pd.DataFrame({i: s.reindex(range(L)) for i, s in enumerate(sars)})

        AAR  = M_ar.mean(axis=1, skipna=True).rename("AAR")
        CAAR = AAR.cumsum().rename("CAAR")

        n_t = M_ar.count(axis=1).astype(float)

        # Patell
        Z_pat = (M_sar.mean(axis=1) * np.sqrt(n_t)).rename("PatellZ")
        P_pat = Z_pat.apply(lambda z: 2*(1-0.5*(1+math.erf(abs(z)/np.sqrt(2)))))  # two-sided

        # BMP
        std_cs = M_ar.std(axis=1, ddof=1)
        T_bmp  = (AAR / (std_cs / np.sqrt(n_t))).replace([np.inf, -np.inf], np.nan).rename("BMP_t")
        P_bmp  = T_bmp.apply(lambda z: 2*(1-0.5*(1+math.erf(abs(z)/np.sqrt(2)))))  # approx normal

        # Corrado (rank)
        ranks = M_ar.rank(axis=1, method="average")
        Z_cor = ((ranks - (1 + ranks.shape[1])/2.0) / ranks.std(axis=1, ddof=1)).mean(axis=1)
        Z_cor = (Z_cor * np.sqrt(n_t)).rename("CorradoZ")
        P_cor = Z_cor.apply(lambda z: 2*(1-0.5*(1+math.erf(abs(z)/np.sqrt(2)))))

        out[f"Q{q}"] = pd.concat([AAR, CAAR, Z_pat, P_pat, T_bmp, P_bmp, Z_cor, P_cor], axis=1)

    return out



In [168]:
#C√©lula 4 ‚Äî Backtest simples (equal-weight, custos, filtros)
import os, math
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Tuple, List, Dict

import numpy as np
import pandas as pd

@dataclass
class BTConfig:
    events_csv: str = "dataset/final/pead_event_dataset_2010_2019.csv"
    final_dir: str  = "dataset/final"
    out_dir: str    = "dataset/backtest"
    holding_days: int = 30
    quantile: float = 0.2
    long_short: bool = False
    signal_col: str = "STD_EPS_EarningsSurprise"
    slippage_bps: float = 5.0
    commission_bps: float = 1.0
    min_price: float = 1.0
    vol_filter_quantile: float = 0.1
    use_market_hedge: bool = False  # manter False por enquanto

def _ensure_dir(d: str):
    Path(d).mkdir(parents=True, exist_ok=True)

def _resolve_signal_col(df: pd.DataFrame, pref: str) -> str:
    if pref in df.columns:
        return pref
    if "STD_LPA" in df.columns:
        return "STD_LPA"
    cand = [c for c in df.columns if c.startswith("STD_EPS") or c.endswith("EarningsSurprise")]
    if cand: return cand[0]
    raise ValueError("Nenhuma coluna de sinal encontrada (ex.: STD_EPS_EarningsSurprise ou STD_LPA).")

def _load_price_final(ticker: str, final_dir: str) -> pd.DataFrame:
    p = Path(final_dir) / f"{ticker}.final.csv"
    if not p.exists(): return pd.DataFrame()
    px = pd.read_csv(p, parse_dates=["Data"], dayfirst=True)
    px = px.sort_values("Data").dropna(subset=["Close"]).reset_index(drop=True)
    return px

def _entry_exit_indices(px: pd.DataFrame, t1: pd.Timestamp, holding: int) -> Optional[Tuple[int, int]]:
    if px.empty: return None
    ds = px["Data"].values
    idx = np.searchsorted(ds, np.array(t1, dtype="datetime64[ns]"))
    if idx >= len(px): return None
    idx_entry = int(idx)
    idx_exit  = int(min(idx_entry + holding - 1, len(px) - 1))
    return idx_entry, idx_exit

def _apply_roundtrip_cost(ret_series: pd.Series, roundtrip_bps: float) -> pd.Series:
    if ret_series.empty or roundtrip_bps <= 0: return ret_series
    r = ret_series.copy()
    r.iloc[0] = r.iloc[0] - (roundtrip_bps / 10000.0)
    return r

def _drawdown_stats(series: pd.Series) -> Tuple[float, float]:
    if series.empty: return 0.0, 0.0
    eq = (1.0 + series.fillna(0)).cumprod()
    peak = eq.cummax()
    dd = (eq / peak) - 1.0
    maxdd = dd.min()
    end = dd.idxmin()
    start = (eq.loc[:end].idxmax() if end in eq.index else eq.idxmax())
    duration = (pd.Timestamp(end) - pd.Timestamp(start)).days if (isinstance(end, pd.Timestamp) and isinstance(start, pd.Timestamp)) else 0
    return float(maxdd), float(duration)

def build_trades_and_pnl(cfg: BTConfig) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[str, float]]:
    _ensure_dir(cfg.out_dir)
    ev = pd.read_csv(cfg.events_csv, parse_dates=["AnnounceDate", "EventTradeDate"], dayfirst=True)
    ev = ev.sort_values(["EventTradeDate", "Ticker"]).reset_index(drop=True)

    sigcol = _resolve_signal_col(ev, cfg.signal_col)
    ev["Year"] = ev["EventTradeDate"].dt.year

    def _label_side(g: pd.DataFrame) -> pd.DataFrame:
        q_hi = g[sigcol].quantile(1 - cfg.quantile)
        q_lo = g[sigcol].quantile(cfg.quantile)
        if cfg.long_short:
            g["Side"] = np.where(g[sigcol] >= q_hi, 1, np.where(g[sigcol] <= q_lo, -1, 0))
        else:
            g["Side"] = np.where(g[sigcol] >= q_hi, 1, 0)
        return g

    ev = ev.groupby("Year", group_keys=False).apply(_label_side)
    ev = ev[ev["Side"] != 0].reset_index(drop=True)

    roundtrip_bps = 2.0 * (cfg.slippage_bps + cfg.commission_bps)
    trade_rows = []
    pnl_rows: Dict[pd.Timestamp, List[float]] = {}

    for _, row in ev.iterrows():
        tkr = row["Ticker"]
        t1  = row["EventTradeDate"]
        side = int(row["Side"])
        beta = float(row["Beta"]) if "Beta" in row and not pd.isna(row["Beta"]) else np.nan

        px = _load_price_final(tkr, cfg.final_dir)
        if px.empty: continue

        idxs = _entry_exit_indices(px, t1, cfg.holding_days)
        if idxs is None: continue
        i_entry, i_exit = idxs

        px_entry = float(px.iloc[i_entry]["Close"])
        if px_entry < cfg.min_price: continue
        if "Vol" in px.columns:
            v_hist = px["Vol"].dropna()
            v_thr = v_hist.quantile(cfg.vol_filter_quantile) if not v_hist.empty else 0.0
            if float(px.iloc[i_entry].get("Vol", v_thr)) < float(v_thr):
                continue

        seg = px.loc[i_entry:i_exit, ["Data", "Close"]].copy().reset_index(drop=True)
        seg["ret"] = seg["Close"].pct_change()
        seg = seg.dropna().reset_index(drop=True)  # come√ßa em T1+1
        seg["ret"] = _apply_roundtrip_cost(seg["ret"], roundtrip_bps)
        seg["ret"] = side * seg["ret"]

        for _, r in seg.iterrows():
            d = pd.Timestamp(r["Data"])
            pnl_rows.setdefault(d, []).append(float(r["ret"]))

        gross = float(seg["ret"].sum() + (roundtrip_bps/10000.0 if side != 0 else 0.0))
        net   = float(seg["ret"].sum())
        trade_rows.append({
            "Ticker": tkr,
            "AnnounceDate": row.get("AnnounceDate"),
            "EventTradeDate": t1,
            "EntryDate": seg["Data"].iloc[0] - pd.Timedelta(days=1),
            "ExitDate": seg["Data"].iloc[-1],
            "Side": side,
            "Signal": float(row[sigcol]),
            "EntryPrice": px_entry,
            "GrossRet": gross,
            "NetRet": net,
            "Beta": beta if not math.isnan(beta) else None
        })

    if not pnl_rows:
        raise RuntimeError("Nenhum trade v√°lido foi gerado. Verifique sinal, liquidez e cobertura de pre√ßos.")

    pnl_df = (pd.DataFrame([{"Date": d, "PortRet": np.mean(v)} for d, v in pnl_rows.items()]
        ).sort_values("Date").reset_index(drop=True))

    daily = pnl_df["PortRet"].values
    ann_factor = 252.0
    avg = np.nanmean(daily)
    std = np.nanstd(daily, ddof=0)
    cagr = float((1.0 + pnl_df["PortRet"]).prod() ** (ann_factor / len(pnl_df)) - 1.0) if len(pnl_df) > 0 else 0.0
    sharpe = float(np.sqrt(ann_factor) * avg / std) if std > 0 else 0.0
    maxdd, dd_dur = _drawdown_stats(pnl_df.set_index("Date")["PortRet"])
    hit = float((pnl_df["PortRet"] > 0).mean())

    metrics = {"N_trades": len(trade_rows), "Daily_mean": avg, "Daily_std": std,
               "Sharpe": sharpe, "CAGR": cagr, "MaxDD": maxdd, "MaxDD_days": dd_dur, "Hit_ratio": hit}

    _ensure_dir(cfg.out_dir)
    trades_df = pd.DataFrame(trade_rows)
    trades_df.to_csv(Path(cfg.out_dir) / "pead_backtest_trades.csv", index=False)
    pnl_out = pnl_df.copy()
    pnl_out["Equity"] = (1.0 + pnl_out["PortRet"]).cumprod()
    pnl_out.to_csv(Path(cfg.out_dir) / "pead_backtest_daily_pnl.csv", index=False)

    return trades_df, pnl_out, metrics


In [None]:
# Caminhos padr√£o
EVENTS = "dataset/final/pead_event_dataset_2010_2019.csv"
FINAL_DIR = "dataset/final"
IBOV = "dataset/prices/IBOV.SA.csv"
CDI  = "dataset/prices/CDI.SA.csv"

# ===== 1) Modelagem com Purged K-Fold =====
X, y, feats = prepare_xy(EVENTS, target_mode="cls", min_estimation=60)
avg_auc, auc_by_fold = cv_purged_scores(X, y, mode="cls", n_splits=5, holding_days=30, embargo_days=30)
print("[CV Purged] AUC m√©dio:", avg_auc, "| por fold:", auc_by_fold)

# ===== 2) Holdout temporal (Test=2019) =====
metrics_test = train_holdout_and_save(X, y, feats, out_dir="models", mode="cls")
print("[Holdout 2019] M√©tricas:", metrics_test)

# ===== 3) Event study por quintis (AAR/CAAR + Patell/BMP/Corrado) =====
out = aar_caar_by_quintile(
    events_csv=EVENTS,
    final_dir=FINAL_DIR,
    mkt_csv=IBOV,
    rf_csv=CDI,
    signal_col="STD_EPS_EarningsSurprise",
    holding_days=30,
    estimation_window=252,
    min_estimation=60
)
import os
os.makedirs("dataset/event_study", exist_ok=True)
for k, dfq in out.items():
    dfq.to_csv(f"dataset/event_study/aar_caar_tests_{k}.csv", index_label="t")
    print(f"[EventStudy] {k} salvo com shape {dfq.shape}")

# ===== 4) Backtest com custos e filtros =====
cfg = BTConfig(
    events_csv=EVENTS,
    final_dir=FINAL_DIR,
    out_dir="dataset/backtest",
    holding_days=30,
    quantile=0.2,
    long_short=False,
    signal_col="STD_EPS_EarningsSurprise",
    slippage_bps=5,
    commission_bps=1,
    min_price=1.0,
    vol_filter_quantile=0.1,
    use_market_hedge=False
)
trades, pnl, met = build_trades_and_pnl(cfg)
print("[Backtest] M√©tricas:", met)
display(trades.head())
display(pnl.tail())


## Teste 2

In [None]:
# =========================
# Se√ß√£o "Teste" (vers√£o aprimorada)
# =========================

import os
import warnings
import pandas as pd
import numpy as np

# 0) Warnings menos verbosos (mant√©m erros reais vis√≠veis)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning, message="Parsing dates in %Y-%m-%d")

# 1) Caminhos padr√£o
EVENTS_RAW = "dataset/final/pead_event_dataset_2010_2019.csv"
FINAL_DIR  = "dataset/final"
IBOV       = "dataset/prices/IBOV.SA.csv"
CDI        = "dataset/prices/CDI.SA.csv"

# 2) Carrega e SANEIA eventos (datas e colunas essenciais)
def load_and_sanitize_events(path: str) -> pd.DataFrame:
    # For√ßa leitura com dayfirst=False (formato ISO YYYY-MM-DD)
    ev = pd.read_csv(
        path,
        parse_dates=["AnnounceDate", "EventTradeDate"],
        dayfirst=False
    )
    # Normaliza nomes esperados
    required_cols = ["Ticker", "EventTradeDate"]
    for c in required_cols:
        if c not in ev.columns:
            raise ValueError(f"[Eventos] Coluna obrigat√≥ria ausente: {c}")

    # Garante tipos corretos
    ev["Ticker"] = ev["Ticker"].astype(str).str.strip()
    ev = ev.dropna(subset=["Ticker", "EventTradeDate"]).copy()

    # Cria coluna Year (se n√£o existir)
    if "Year" not in ev.columns:
        ev["Year"] = ev["EventTradeDate"].dt.year

    # Opcional: assert m√≠nimo de janelas de estima√ß√£o, se existir a coluna
    if "EstimationLen" in ev.columns:
        ev = ev.loc[ev["EstimationLen"].fillna(0) >= 60].copy()

    return ev

ev = load_and_sanitize_events(EVENTS_RAW)

# 3) Pr√©-filtro de ELEGIBILIDADE para o BACKTEST
#    Mant√©m apenas eventos cujo ticker tem pre√ßos no intervalo [T1, T2]
#    usando os CSVs de pre√ßos por ticker em: dataset/prices_processed/<TICKER>.csv
HOLDING_DAYS = 30
PRICES_DIR   = "dataset/prices_processed"  # ajuste se seu reposit√≥rio for outro (ex.: dataset/prices)

def load_prices_for_ticker(tkr: str) -> pd.DataFrame | None:
    """
    Tenta abrir dataset/prices_processed/<TICKER>.csv.
    Aceita 'Date' ou 'Data' como coluna de data e v√°rias colunas de pre√ßo.
    Retorna DF com ['Date','Adj Close','Close','WClose'] quando poss√≠vel.
    """
    csv_path = os.path.join(PRICES_DIR, f"{tkr}.csv")
    if not os.path.exists(csv_path):
        return None
    df = pd.read_csv(csv_path)
    # Detecta coluna de data
    date_col = "Date" if "Date" in df.columns else ("Data" if "Data" in df.columns else None)
    if date_col is None:
        return None
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    df = df.dropna(subset=[date_col]).sort_values(date_col)
    df = df.rename(columns={date_col: "Date"})
    return df

def has_window_prices(tkr: str, t1: pd.Timestamp, t2: pd.Timestamp) -> bool:
    px = load_prices_for_ticker(tkr)
    if px is None:
        return False
    # Verifica se h√° ao menos 1 preg√£o no intervalo
    mask = (px["Date"] >= t1) & (px["Date"] <= t2)
    return bool(mask.any())

def filter_events_with_prices(ev: pd.DataFrame, holding_days: int) -> pd.DataFrame:
    kept = []
    miss_cnt = 0
    for _, row in ev.iterrows():
        tkr = str(row["Ticker"]).strip()
        t0  = pd.to_datetime(row["EventTradeDate"])
        # Conven√ß√£o: entrar em T+1 e carregar por HOLDING_DAYS
        t1  = t0 + pd.Timedelta(days=1)
        t2  = t1 + pd.Timedelta(days=holding_days - 1)
        if has_window_prices(tkr, t1, t2):
            kept.append(row)
        else:
            miss_cnt += 1
    kept = pd.DataFrame(kept).reset_index(drop=True)
    print(f"[Backtest Pre-filter] Eventos eleg√≠veis: {len(kept)} | ignorados por janela sem pre√ßo: {miss_cnt}")
    return kept

ev_bt = filter_events_with_prices(ev, HOLDING_DAYS)

# Salva uma c√≥pia SANEADA e FILTRADA para o backtest
os.makedirs(FINAL_DIR, exist_ok=True)
EVENTS_CLEAN = os.path.join(FINAL_DIR, "pead_event_dataset_2010_2019_sanitized.csv")
EVENTS_BT    = os.path.join(FINAL_DIR, "pead_event_dataset_2010_2019_sanitized_backtest.csv")
ev.to_csv(EVENTS_CLEAN, index=False)
ev_bt.to_csv(EVENTS_BT, index=False)

# 4) ===== 1) Modelagem com Purged K-Fold =====
# Nota: usamos o EVENTS_CLEAN (datas saneadas), n√£o o RAW
X, y, feats = prepare_xy(EVENTS_CLEAN, target_mode="cls", min_estimation=60)
print(f"[Target balance] n={len(y)} | %positivos={100.0 * (y==1).mean():.1f}% | %negativos={100.0 * (y==0).mean():.1f}%")

avg_auc, auc_by_fold = cv_purged_scores(
    X, y, mode="cls", n_splits=5, holding_days=HOLDING_DAYS, embargo_days=30
)
print("[CV Purged] AUC m√©dio:", avg_auc, "| por fold:", auc_by_fold)

# 5) ===== 2) Holdout temporal (Test=2019) =====
metrics_test = train_holdout_and_save(X, y, feats, out_dir="models", mode="cls")
print("[Holdout 2019] M√©tricas:", metrics_test)

# 6) ===== 3) Event study por quintis (AAR/CAAR + Patell/BMP/Corrado) =====
out = aar_caar_by_quintile(
    events_csv=EVENTS_CLEAN,          # usa o saneado p/ evitar warning de dayfirst
    final_dir=FINAL_DIR,
    mkt_csv=IBOV,
    rf_csv=CDI,
    signal_col="STD_EPS_EarningsSurprise",
    holding_days=HOLDING_DAYS,
    estimation_window=252,
    min_estimation=60
)
os.makedirs("dataset/event_study", exist_ok=True)
for k, dfq in out.items():
    dfq.to_csv(f"dataset/event_study/aar_caar_tests_{k}.csv", index_label="t")
    print(f"[EventStudy] {k} salvo com shape {dfq.shape}")

# 7) ===== 4) Backtest com custos e filtros =====
# Aqui usamos o EVENTS_BT (pr√©-filtrado para garantir que cada evento tem pre√ßos na janela).
cfg = BTConfig(
    events_csv=EVENTS_BT,
    final_dir=FINAL_DIR,
    out_dir="dataset/backtest",
    holding_days=HOLDING_DAYS,
    quantile=0.2,
    long_short=False,
    signal_col="STD_EPS_EarningsSurprise",
    slippage_bps=5,
    commission_bps=1,
    min_price=1.0,
    vol_filter_quantile=0.1,
    use_market_hedge=False
)

os.makedirs("dataset/backtest", exist_ok=True)
try:
    trades, pnl, met = build_trades_and_pnl(cfg)
except IndexError as e:
    # Em caso de alguma ocorr√™ncia residual, relaxa o filtro de volume e re-tenta
    print(f"[Backtest] IndexError detectado ({e}). Re-tentando com filtros mais brandos...")
    cfg = BTConfig(
        events_csv=EVENTS_BT,
        final_dir=FINAL_DIR,
        out_dir="dataset/backtest",
        holding_days=HOLDING_DAYS,
        quantile=0.2,
        long_short=False,
        signal_col="STD_EPS_EarningsSurprise",
        slippage_bps=5,
        commission_bps=1,
        min_price=0.5,            # relaxa
        vol_filter_quantile=0.0,  # desliga corte de liquidez
        use_market_hedge=False
    )
    trades, pnl, met = build_trades_and_pnl(cfg)

print("[Backtest] M√©tricas:", met)
display(trades.head())
display(pnl.tail())
