In [52]:
# BIBLIOTECAS
# =====================
from __future__ import annotations
import os
import io
import math
import json
import time
import enum
import warnings
from dataclasses import dataclass
from typing import List, Dict, Optional, Tuple
from dotenv import load_dotenv
import re

import numpy as np
import pandas as pd
import requests




import pandas_ta as ta
from collections import OrderedDict


In [99]:
#Utils
# -------------------------
# Utils de I/O e diret√≥rios
# -------------------------

DEFAULT_DIRS = ["dataset", "dataset/prices", "dataset/prices_processed",
                "dataset/fundamental", "dataset/final"]

def ensure_dirs(paths: List[str] = DEFAULT_DIRS) -> None:
    for p in paths:
        os.makedirs(p, exist_ok=True)

# -------------------------
# Utils de parsing num√©rico
# -------------------------

#def _parse_publication_dates(series: pd.Series) -> pd.Series:
#    """
#    Converte a coluna de datas de publica√ß√£o para datetime,
#    assumindo SEMPRE padr√£o brasileiro (DD/MM/AAAA) quando houver ambiguidade.
#
#    Regras:
#    - Se estiver no padr√£o ISO 'YYYY-MM-DD', usamos isso direto (n√£o √© amb√≠guo).
#    - Se estiver no padr√£o brasileiro 'DD/MM/YYYY', interpretamos como dia/m√™s/ano.
#    - Se vier em qualquer outro formato, tentamos parse com dayfirst=True.
#    - No final, retornamos datetime normalizado (sem hora).
#    """
#
#    s = series.astype(str).str.strip()
#
#    # 1) tenta ISO claro: 2024-03-31
#    iso_mask = s.str.match(r"^\d{4}-\d{2}-\d{2}$")
#    out_iso = pd.to_datetime(
#        s.where(iso_mask),
#        format="%Y-%m-%d",
#        errors="coerce"
#    )
#
#    # 2) tenta BR claro: 31/03/2024
#    br_mask = s.str.match(r"^\d{2}/\d{2}/\d{4}$")
#    out_br = pd.to_datetime(
#        s.where(br_mask),
#        format="%d/%m/%Y",
#        dayfirst=True,
#        errors="coerce"
#    )
#
#    # 3) come√ßa com ISO e preenche lacunas com BR
#    out = out_iso.fillna(out_br)
#
#    # 4) fallback gen√©rico:
#    #    qualquer coisa que sobrou a gente interpreta assumindo padr√£o brasileiro (dayfirst=True)
#    still_nat = out.isna()
#    if still_nat.any():
#        out_fallback = pd.to_datetime(
#            s[still_nat],
#            errors="coerce",
#            dayfirst=True   # <- for√ßa sem√¢ntica brasileira
#        )
#        out.loc[still_nat] = out_fallback
#
#    # 5) normaliza para "apenas a data" (zera hora)
#    out = out.dt.normalize()
#
#    return out

def to_float_smart(x):
    """
    Converte strings tipo '1.234,56' (BR) ou '1,234.56' (US) e variantes em float.
    Trata negativos e milhares. Retorna NaN se n√£o der.
    """
    import re
    if pd.isna(x):
        return np.nan
    if isinstance(x, (int, float, np.number)):
        return float(x)

    s = str(x).strip()
    if s in {"", "-", "--", "nan", "NaN", "None", "NULL"}:
        return np.nan

    # mant√©m apenas d√≠gitos, sinais e separadores
    s = re.sub(r"[^0-9\-\.,]", "", s)

    has_dot   = "." in s
    has_comma = "," in s

    try:
        if has_dot and has_comma:
            # decide pelo separador mais √† direita
            if s.rfind(",") > s.rfind("."):
                # BR: 1.234,56 -> 1234.56
                s = s.replace(".", "").replace(",", ".")
            else:
                # US: 1,234.56 -> 1234.56
                s = s.replace(",", "")
            return float(s)

        if has_comma and not has_dot:
            # BR decimal: 1234,56 -> 1234.56
            return float(s.replace(",", "."))

        if has_dot and not has_comma:
            # Pode ser decimal (um ponto) ou milhares (v√°rios pontos)
            if s.count(".") == 1:
                return float(s)  # 1234.56
            else:
                # 109.641.290.194 -> 109641290194
                return float(s.replace(".", ""))

        # S√≥ d√≠gitos e talvez sinal
        return float(s)
    except Exception:
        return np.nan

    
def to_int_smart(x) -> float:
    """Converte texto para inteiro removendo n√£o-d√≠gitos."""
    if x is None or (isinstance(x, float) and np.isnan(x)): return np.nan
    s = re.sub(r"[^0-9\-]", "", str(x))
    try:
        return int(s)
    except Exception:
        return np.nan
 
def catalog_return(row, x, name_return):
    val = row.get(name_return, np.nan)
    std = row.get(f"Rolling_std_{name_return}", np.nan)
    if pd.isna(val) or pd.isna(std) or std == 0: return 0
    if val > x * std:   return 1
    if val < -x * std:  return -1
    return 0

#def rsi_wilder(close: pd.Series, window: int = 14) -> pd.Series:
#    close = close.astype(float)
#    delta = close.diff()
#    gain = delta.clip(lower=0)
#    loss = -delta.clip(upper=0)
#    # Wilder: alpha = 1/window
#    avg_gain = gain.ewm(alpha=1/window, adjust=False).mean()
#    avg_loss = loss.ewm(alpha=1/window, adjust=False).mean()
#    rs = avg_gain / (avg_loss + 1e-12)
#    rsi = 100 - (100 / (1 + rs))
#    return rsi

In [100]:
#priceprocessing
class PriceProcessing:
    def __init__(self, df_prices: pd.DataFrame, ticker: str):
        self.ticker = ticker
        self.df = df_prices.copy()

        # Datas
        self.df["Data"] = pd.to_datetime(self.df["Data"], dayfirst=True, errors="coerce", format="%Y-%m-%d")
        # üîß NOVO: remove linhas com Data = NaT (evita erro no merge_asof)
        self.df = self.df[~self.df["Data"].isna()].copy()

        # N√∫meros
        float_cols = ["FechAjust","FechHist","AbertAjust","MinAjust","MedAjust","MaxAjust","Var","Fator"]
        for c in float_cols:
            if c in self.df.columns: self.df[c] = self.df[c].apply(to_float_smart)
        for c in ["Vol","Neg"]:
            if c in self.df.columns: self.df[c] = self.df[c].apply(to_int_smart)

        # Ordena/dedup
        self.df = self.df.sort_values("Data").drop_duplicates("Data").reset_index(drop=True)

        # Close
        #self.df["Close"] = self.df["FechAjust"] if ("FechAjust" in self.df and self.df["FechAjust"].notna().any()) else self.df.get("FechHist", np.nan)
        if "FechAjust" in self.df.columns and self.df["FechAjust"].notna().any():
            self.df["Close"] = self.df["FechAjust"]
        else:
            #if "AbertAjust" in self.df.columns:
            #    self.df["Close"] = self.df["AbertAjust"].shift(-1)
            #else:
                self.df["Close"] = np.nan   
                
        # Limpa linhas sem Close
        
        before = len(self.df)
        close_num = self.df["Close"]
        mask = close_num.notna() & (close_num != 0)
        dfp = self.df.loc[mask].copy()
        after = len(dfp)
        removed = before - after
        pct = 0.0 if before == 0 else (removed / before) * 100
        
        print(f"{ticker}: Linhas removidas (Close NaN/0): {removed} de {before} = {pct:.2f}%")
        self.df = dfp

        # Fechamento ponderado por volume
        if "Vol" in self.df.columns and self.df["Vol"].notna().any():
            vol_mean = self.df["Vol"].replace(0, np.nan).mean()
            if vol_mean and not np.isnan(vol_mean):
                self.df["FechPonderado"] = self.df["Close"] * self.df["Vol"] / vol_mean
            else:
                # fallback caso vol_mean vire NaN
                self.df["FechPonderado"] = self.df["Close"]
        else:
            self.df["FechPonderado"] = np.nan

        self.df["Close"] = self.df["Close"].apply(to_float_smart)

    def add_momentum_indicators(self):
        """
        Adiciona indicadores de 'momento':
        - M√©dias m√≥veis simples (5, 50, 200)
        - RSI (9, 30)
        - Raz√µes de MAs (5/50, 5/200, 50/200)
        - Momentum (1M, 3M, 6M, 12M) em log-return acumulado
        """
        df = self.df

        # MAs
        df["MA5"]   = ta.sma(df["Close"], length=5)
        df["MA50"]  = ta.sma(df["Close"], length=50)
        df["MA200"] = ta.sma(df["Close"], length=200)

        # RSI
        df["RSI9"]  = ta.rsi(df["Close"], lenght=9)
        df["RSI14"]  = ta.rsi(df["Close"], lenght=14)
        df["RSI30"] = ta.rsi(df["Close"], lenght=30)

        # Raz√µes
        df["MA5_50"]   = df["MA5"]   / (df["MA50"].replace(0, np.nan))
        df["MA5_200"]  = df["MA5"]   / (df["MA200"].replace(0, np.nan))
        df["MA50_200"] = df["MA50"]  / (df["MA200"].replace(0, np.nan))

        # Momentum cl√°ssico (opcional; comente se n√£o quiser)
        # ~21, ~63, ~126, ~252 preg√µes
        df["MOM_1M"] = ta.mom(df["Close"], length=21)
        df["MOM_3M"] = ta.mom(df["Close"], length=63)
        df["MOM_6M"] = ta.mom(df["Close"], length=126)
        df["MOM_12M"] = ta.mom(df["Close"], length=252) 

        self.df = df            

    def create_return_by_period(self, name_return: str, period: int, column_name: str = "Close", remove_nan=False):
        self.df[name_return] = np.log(self.df[column_name] / self.df[column_name].shift(period))
        if remove_nan:
            self.df.dropna(subset=[name_return], inplace=True)
        
        if name_return == "Daily_Return":
            #Proximo retorno
            self.df["ret_t0_t1"] = self.df["Daily_Return"].shift(-1)
            
    
    def create_return_by_period(self, name_return: str, period: int, pre_event: bool = False, column_name: str = "Close", remove_nan=False):
        px = self.df[column_name].astype(float)
        if pre_event:
            num = px.shift(1)
            den = px.shift(1 + period)
        else:
            num = px
            den = px.shift(period)
    
        self.df[name_return] = np.log(num/den)
        if remove_nan:
            self.df.dropna(subset=[name_return], inplace=True)    

   
    def create_rolling_std(self, name_return: str, window: int = 22):
        """
        Calcula o desvio padr√£o em uma janela m√≥vel.
        Ex.: window=22 ‚Üí volatilidade de 22 per√≠odos.
        """
        self.df[f"Rolling_std_{name_return}"] = (
            self.df[name_return].rolling(window=window).std()
    )    

    def create_indicator(self, name_return: str, factor: float = 0.1):
        self.df[f"Indicator_{name_return}"] = self.df.apply(lambda r: catalog_return(r, factor, name_return), axis=1)
    
    def set_event(self, fund_dir: str = "dataset/fundamental") -> None:
        """
        Cria a coluna self.df["event"] = 1 se houve publica√ß√£o de resultado naquela data,
        sen√£o 0.

        A data de publica√ß√£o vem de dataset/fundamental/<TICKER>.SA.csv
        na coluna 'Data_Publicacao'.
        """

        # Monta o caminho para o arquivo fundamental correspondente
        # Ex.: dataset/fundamental/ABEV3.SA.csv
        fund_path = os.path.join(fund_dir, f"{self.ticker}.SA.csv")

        # Se n√£o existir dado fundamental pra esse papel (empresa antiga, delistada etc.),
        # n√£o falha: simplesmente marca tudo como 0.
        if not os.path.exists(fund_path):
            self.df["event"] = 0
            return

        df_fund = pd.read_csv(fund_path)

        # Se n√£o houver coluna de data de publica√ß√£o, tamb√©m fallback = 0
        if "Data_Publicacao" not in df_fund.columns:
            self.df["event"] = 0
            return

        # Converte datas de publica√ß√£o para datetime normalizado (sem hora)
        #df_fund["Data_Publicacao"] = _parse_publication_dates(df_fund["Data_Publicacao"])

        #df_fund["Data_Publicacao"] = pd.to_datetime(df_fund["Data_Publicacao"], errors="coerce", dayfirst=True, format="%Y-%m-%d")
        #pub_dates = set(
        #    df_fund["Data_Publicacao"]
        #    .dropna()
        #    .dt.normalize()
        #    .values
        #)
#
        ## Normaliza as datas de pre√ßo pra s√≥ comparar YYYY-MM-DD
        #self.df["Data"] = pd.to_datetime(self.df["Data"], errors="coerce", dayfirst=True, format="%Y-%m-%d")
        #data_normalized = self.df["Data"].dt.normalize().values
        ##data_normalized = self.df["Data"].values
#
        ## Marca 1 se houve resultado publicado naquela data, sen√£o 0
        #self.df["event"] = np.array(
        #    [1 if d in pub_dates else 0 for d in data_normalized],
        #    dtype=int
        #)
        df_fund["Data_Publicacao"] = pd.to_datetime(
           df_fund["Data_Publicacao"],
            errors="coerce",
            dayfirst=True
        )
        
        pub_dates = df_fund["Data_Publicacao"].dropna().dt.normalize()
        
        self.df["Data"] = pd.to_datetime(
            self.df["Data"],
            errors="coerce",
            dayfirst=True
        )
        
        self.df["Data_norm"] = self.df["Data"].dt.normalize()
        self.df["event"] = self.df["Data_norm"].isin(pub_dates).astype(int)


    def finalize(self) -> pd.DataFrame:
        cols = ["Data","Close","FechPonderado","Vol","Neg","Var"]
        cols += [c for c in self.df.columns if c.startswith(("Daily_","ret_t0_t1", "Week_","Month_"))]
        cols += [c for c in self.df.columns if c.startswith(("Rolling_std_","Indicator_", "event"))]        
        cols += [c for c in self.df.columns if c.startswith(("MOM","RSI", "MA"))]
        cols = [c for c in cols if c in self.df.columns]
        out = self.df[cols].copy()
        out.insert(0, "Ticker", self.ticker)
        return out
        

In [101]:
#DataPrepPrices
class DataPrepPrices:
    def __init__(self,
                 prices_dir: str = "dataset/prices",
                 fund_dir: str = "dataset/fundamental",
                 out_prices_dir: str = "dataset/prices_processed",
                 out_final_dir: str = "dataset/final"):
        self.prices_dir = prices_dir
        self.fund_dir = fund_dir
        self.out_prices_dir = out_prices_dir
        self.out_final_dir = out_final_dir
        ensure_dirs([prices_dir, fund_dir, out_prices_dir, out_final_dir])

    @staticmethod
    def _ticker_from_price_filename(fname: str) -> str:
        # "AZUL4.SA.csv" -> "AZUL4"
        base = os.path.basename(fname)
        if base.endswith(".csv"):
            base = base[:-4]
        return base.replace(".SA", "")


    def process_one(self, 
                    price_csv_path: str,
                    indicator_factor: float = 0.1,
                    save_intermediate_prices: bool = True,                
                    only_events: bool = False) -> Optional[pd.DataFrame]:
        
        """
        Processa um √∫nico CSV de pre√ßo:
        - calcula retornos (daily/week/month)
        - calcula volatilidade rolling
        - cria indicadores categ√≥ricos
        - cria flag de evento (publica√ß√£o de resultado)
        - salva vers√£o processada
        - retorna dataframe final desse ticker
        """

        base = os.path.basename(price_csv_path).upper()
        if base in {"IBOV.SA.CSV", "CDI.SA.CSV"}:
            return pd.DataFrame()  # proxies ‚Äî n√£o s√£o pap√©is

        # Pre√ßo
        dfp = pd.read_csv(price_csv_path, sep=None, engine="python", dtype=str)      
    
        tkr = self._ticker_from_price_filename(price_csv_path)
        price = PriceProcessing(dfp, tkr)

        # Retornos e indicadores
        price.create_return_by_period("Daily_Return", 1)
        price.create_return_by_period("Week_Return", 5)
        price.create_return_by_period("Month_Return", 22)
        
        price.create_rolling_std("Daily_Return", window=21)
        price.create_rolling_std("Week_Return", window=65)
        price.create_rolling_std("Month_Return", window=252)
        
        # indicadores discretos    
        price.create_indicator("Daily_Return", indicator_factor)
        price.create_indicator("Week_Return", indicator_factor)
        price.create_indicator("Month_Return", indicator_factor)   
        
        #Momento
        price.add_momentum_indicators()
        
        #Eventos
        price.set_event(fund_dir=self.fund_dir)


        df_price_feat = price.finalize()
        
        if save_intermediate_prices:
            df_price_feat.to_csv(os.path.join(self.out_prices_dir, os.path.basename(price_csv_path)), index=False)

        # Garante Ticker correto (sem duplicar)
        if "Ticker" in df_price_feat.columns:
            df_price_feat["Ticker"] = tkr
        else:
            df_price_feat.insert(0, "Ticker", tkr)
        return df_price_feat
    
    def process_all(self,
                indicator_factor: float = 0.1,
                    save_intermediate_prices: bool = True,                
                    only_events: bool = False) -> pd.DataFrame:
        """Processa todos os arquivos em dataset/prices e devolve consolidado."""
        all_final = []
        for fn in os.listdir(self.prices_dir):
            if not fn.lower().endswith(".csv"):
                continue
            try:
                path = os.path.join(self.prices_dir, fn)
                df_final = self.process_one(
                    path,
                    indicator_factor=indicator_factor,
                    save_intermediate_prices=save_intermediate_prices,                
                    only_events=only_events
                )
                if df_final is not None and not df_final.empty:
                    all_final.append(df_final.assign(Ticker=self._ticker_from_price_filename(fn)))
            except Exception as ex:
                print(f"Erro no ticker de {fn}: {ex}")
                continue
        if not all_final:
            return pd.DataFrame()
        df_all = pd.concat(all_final, ignore_index=True)
        # Ordena por Ticker e Data
        df_all = df_all.sort_values(["Ticker", "Data"]).reset_index(drop=True)
        # salva consolidado
        df_all.to_csv(os.path.join(self.out_final_dir, "final_price_process.csv"), index=False)
        return df_all
    

In [102]:
if __name__ == "__main__":
    from pathlib import Path

    # 0) Garante que todas as pastas base existem
    ensure_dirs([
        "dataset/prices",
        "dataset/fundamental",
        "dataset/prices_processed",
        "dataset/final"
    ])

    # 1) Pr√©-processa todos os pap√©is
    pipeline = DataPrepPrices(
        prices_dir="dataset/prices",
        fund_dir="dataset/fundamental",
        out_prices_dir="dataset/prices_processed",
        out_final_dir="dataset/final"
    )

    df_all_final = pipeline.process_all(
        indicator_factor=0.1,
        save_intermediate_prices=True,
        only_events=False   # se quiser s√≥ linhas nas datas de evento, troque para True
    )

    print("Step 1 OK ‚Äî final_price_process.csv salvo em dataset/final/")
    

ABEV3: Linhas removidas (Close NaN/0): 998 de 2513 = 39.71%
Step 1 OK ‚Äî final_price_process.csv salvo em dataset/final/


  df_fund["Data_Publicacao"] = pd.to_datetime(


In [104]:
df = pd.read_csv("dataset/prices_processed/ABEV3.SA.csv", dtype=str)
df["event"].sum()

'000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000000000000000000000000001000000000000000000000000000000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000100000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000000000000100000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000010000000000000000000000000000000000000000000000000000000000000010000000000000000000

In [71]:
df_fund = pd.read_csv("dataset/fundamental/ABEV3.SA.csv")

In [76]:
if "Data_Publicacao" not in df_fund.columns:
        df["event"] = 0
        print("nok")
else:
    print("ok")
    

ok


In [77]:
df_fund["Data_Publicacao"] = pd.to_datetime(df_fund["Data_Publicacao"], errors="coerce", dayfirst=True, format="%Y-%m-%d")

In [78]:
pub_dates = set(
            df_fund["Data_Publicacao"]
            .dropna()
            .dt.normalize()
            .values
        )

In [79]:
pub_dates

{numpy.datetime64('2013-10-31T00:00:00.000000000'),
 numpy.datetime64('2014-03-24T00:00:00.000000000'),
 numpy.datetime64('2014-05-07T00:00:00.000000000'),
 numpy.datetime64('2014-07-31T00:00:00.000000000'),
 numpy.datetime64('2014-10-31T00:00:00.000000000'),
 numpy.datetime64('2015-02-26T00:00:00.000000000'),
 numpy.datetime64('2015-05-06T00:00:00.000000000'),
 numpy.datetime64('2015-07-30T00:00:00.000000000'),
 numpy.datetime64('2015-10-30T00:00:00.000000000'),
 numpy.datetime64('2016-02-25T00:00:00.000000000'),
 numpy.datetime64('2016-05-04T00:00:00.000000000'),
 numpy.datetime64('2016-07-29T00:00:00.000000000'),
 numpy.datetime64('2016-10-28T00:00:00.000000000'),
 numpy.datetime64('2017-05-04T00:00:00.000000000'),
 numpy.datetime64('2017-07-27T00:00:00.000000000'),
 numpy.datetime64('2017-10-26T00:00:00.000000000'),
 numpy.datetime64('2018-05-09T00:00:00.000000000'),
 numpy.datetime64('2018-07-05T00:00:00.000000000'),
 numpy.datetime64('2018-07-26T00:00:00.000000000'),
 numpy.datet

In [80]:
df["Data"] = pd.to_datetime(df["Data"], errors="coerce", dayfirst=True, format="%Y-%m-%d")

In [81]:
data_normalized = df["Data"].dt.normalize().values

In [84]:
df["event"] = np.array(
    [1 if d in pub_dates else 0 for d in data_normalized],
    dtype=int
        )

In [91]:
pub_dates

<DatetimeArray>
['2020-02-27 00:00:00', '2019-10-25 00:00:00', '2019-07-25 00:00:00',
 '2019-05-07 00:00:00', '2019-02-28 00:00:00', '2018-10-25 00:00:00',
 '2018-07-26 00:00:00', '2018-05-09 00:00:00', '2018-07-05 00:00:00',
 '2017-10-26 00:00:00', '2017-07-27 00:00:00', '2017-05-04 00:00:00',
 '2016-10-28 00:00:00', '2016-07-29 00:00:00', '2016-05-04 00:00:00',
 '2016-02-25 00:00:00', '2015-10-30 00:00:00', '2015-07-30 00:00:00',
 '2015-05-06 00:00:00', '2015-02-26 00:00:00', '2014-10-31 00:00:00',
 '2014-07-31 00:00:00', '2014-05-07 00:00:00', '2014-03-24 00:00:00',
 '2013-10-31 00:00:00']
Length: 25, dtype: datetime64[ns]

In [92]:
data_normalized

array(['2013-11-11T00:00:00.000000000', '2013-11-12T00:00:00.000000000',
       '2013-11-13T00:00:00.000000000', ...,
       '2019-12-27T00:00:00.000000000', '2019-12-30T00:00:00.000000000',
       '2020-01-02T00:00:00.000000000'], dtype='datetime64[ns]')

In [95]:
len([df["event"]==1])

1

In [93]:
import numpy as np
import pandas as pd

# 1) Converte e normaliza as datas de publica√ß√£o
df_fund["Data_Publicacao"] = pd.to_datetime(
    df_fund["Data_Publicacao"],
    errors="coerce",
    dayfirst=True
)
pub_dates = (
    df_fund["Data_Publicacao"]
    .dropna()
    .dt.normalize()
)

# 2) Converte e normaliza as datas de pre√ßo
df["Data"] = pd.to_datetime(
    df["Data"],
    errors="coerce",
    dayfirst=True
)
df["Data_norm"] = df["Data"].dt.normalize()

# 3) DEBUG: ver interse√ß√£o pra voc√™ n√£o ficar maluco
pub_unique   = pub_dates.unique()
price_unique = df["Data_norm"].unique()

intersection = set(pub_unique) & set(price_unique)

print("‚¨á Datas de publica√ß√£o (unique):")
print(sorted(pub_unique))

print("\n‚¨á Faixa de datas de pre√ßo:")
print("min:", df["Data_norm"].min(), "max:", df["Data_norm"].max())

print("\n‚¨á Interse√ß√£o entre publica√ß√£o e pre√ßos:")
print(sorted(intersection))
print("Qtd de datas em comum:", len(intersection))

# 4) Finalmente, marca os eventos
df["event"] = df["Data_norm"].isin(pub_unique).astype(int)

print("\nQtd de dias de preg√£o marcados como event=1:", df["event"].sum())
print(df.loc[df["event"] == 1, ["Data", "Data_norm"]].head(20))


‚¨á Datas de publica√ß√£o (unique):
[Timestamp('2013-10-31 00:00:00'), Timestamp('2014-03-24 00:00:00'), Timestamp('2014-05-07 00:00:00'), Timestamp('2014-07-31 00:00:00'), Timestamp('2014-10-31 00:00:00'), Timestamp('2015-02-26 00:00:00'), Timestamp('2015-05-06 00:00:00'), Timestamp('2015-07-30 00:00:00'), Timestamp('2015-10-30 00:00:00'), Timestamp('2016-02-25 00:00:00'), Timestamp('2016-05-04 00:00:00'), Timestamp('2016-07-29 00:00:00'), Timestamp('2016-10-28 00:00:00'), Timestamp('2017-05-04 00:00:00'), Timestamp('2017-07-27 00:00:00'), Timestamp('2017-10-26 00:00:00'), Timestamp('2018-05-09 00:00:00'), Timestamp('2018-07-05 00:00:00'), Timestamp('2018-07-26 00:00:00'), Timestamp('2018-10-25 00:00:00'), Timestamp('2019-02-28 00:00:00'), Timestamp('2019-05-07 00:00:00'), Timestamp('2019-07-25 00:00:00'), Timestamp('2019-10-25 00:00:00'), Timestamp('2020-02-27 00:00:00')]

‚¨á Faixa de datas de pre√ßo:
min: 2013-11-11 00:00:00 max: 2020-01-02 00:00:00

‚¨á Interse√ß√£o entre publica√

In [98]:
df["event"].sum()

23