In [30]:

import os
import numpy as np
import pandas as pd
import random
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [80]:
# -------------------- Paths --------------------
# Output folder for the CSV summaries; created up front so later cells can
# write into it without checking.
OUT_DIR = "results_6_1"
os.makedirs(OUT_DIR, exist_ok=True)

# Pre-processed input tables (relative to the notebook's working directory).
TRAIN_CSV = "pead_preproc/train_processed.csv"
TEST_CSV = "pead_preproc/test_processed.csv"

In [61]:
# -------------------- Config --------------------
RANDOM_STATE = 42
# Seed BOTH global generators used in this notebook: NumPy's and the stdlib
# `random` module. The GA picks parents / does crossover with `random`, so
# seeding NumPy alone leaves the search non-reproducible.
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

TARGET = "CAR_30D"                 # regression target column
EVENT_DATE_COL = "EventTradeDate"  # used for the per-year partitions
ID_COLS = ["Ticker", "EventTradeDate", "AnnounceDate", "SectorName", "SectorID", "EstimationLen", "FundSource", "Data", "Empresa"]

LEAK_PREFIX = ("CAR_", "AR_", "RET_")   # columns with these prefixes must not be used as features
# To ban additional columns, add them here:
BAN_COLS = {TARGET}


In [33]:
FEATS = [
    # ===== Fundamentals =====
    # Every base metric appears three times, grouped by suffix:
    #   _MET      -> level
    #   _Q_Change -> quarter-over-quarter change
    #   _Y_Change -> year-over-year change
    f"{base}{suffix}"
    for suffix in ("_MET", "_Q_Change", "_Y_Change")
    for base in (
        "RL", "LL", "EBITDA",
        "Preco_Abertura", "Preco_Fechamento",
        "LPA", "ROA", "ROE", "MEB",
        "CRESC_RL_12M", "CRESC_LL_12M", "CRESC_EBITDA_12M",
        "CAPEX", "FCO", "FCF",
        "Divida_Liquida", "PL", "Divida_Bruta",
        "AT", "DVA_Despesas_Fin",
        "PC", "PNC", "Outros_PC",
        "LUB",
    )
] + [
    # ===== EPS surprise features =====
    "EPS_EarningsSurprise",
    "EPS_Earnings_Surprise_Backward_Diff",
    "EPS_Earnings_Surprise_Backward_Ave_Diff",
] + [
    # ===== Momentum & technical indicators =====
    "MA5", "MA50", "MA200",
    "RSI9", "RSI14", "RSI30",
    "MA5_50", "MA5_200", "MA50_200",
    "MOM_1M", "MOM_3M", "MOM_6M", "MOM_12M",
]


In [81]:
# Função de otimização GA simples
# (exemplo realista mas leve para 100 runs)
# ===============================

import random

def ga_optimize_xgb(X_train, y_train, generations=5, pop_size=10, verbose=True,
                    random_state=None):
    """
    Tune XGBoost hyper-parameters with a small genetic algorithm (GA).

    Each candidate is scored by its mean 5-fold cross-validated accuracy on
    the training data. Per generation the top half of the population
    survives and the rest is refilled with children produced by uniform
    crossover of two survivors plus an occasional single-gene mutation.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training features; converted with ``np.asarray`` so positional
        fold indexing works.
    y_train : array-like of shape (n_samples,)
        Binary class labels.
    generations : int, default 5
        Number of select/reproduce rounds.
    pop_size : int, default 10
        Number of candidates kept in the population (must be >= 1).
    verbose : bool, default True
        Print one progress line per generation.
    random_state : int or None, default None
        Seed for the GA draws, the CV fold shuffling and the XGBoost models.
        When ``None`` a seed is drawn from NumPy's global generator, so a
        prior ``np.random.seed(...)`` still makes the search reproducible.

    Returns
    -------
    dict
        Best hyper-parameter set found, holding native Python ints/floats
        taken from the search grid.

    Raises
    ------
    ValueError
        If ``pop_size`` is smaller than 1 (and, from scikit-learn, if there
        are fewer than 5 training samples).
    """
    if pop_size < 1:
        raise ValueError("pop_size must be >= 1")

    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    if random_state is None:
        # Derive the seed from the global NumPy RNG so that the notebook-level
        # np.random.seed(...) keeps governing reproducibility.
        random_state = int(np.random.randint(0, 2**31 - 1))
    rng = random.Random(random_state)

    # --------------------------
    # Search space (from the paper)
    # --------------------------
    SEARCH_SPACE = {
        "max_depth":        [3, 4, 5, 6, 7],
        "learning_rate":    [0.01, 0.03, 0.05, 0.07, 0.1],
        "subsample":        [0.6, 0.7, 0.8, 0.9, 1.0],
        "colsample_bytree": [0.6, 0.7, 0.8, 0.9, 1.0],
        "min_child_weight": [1, 2, 3, 5, 7],
        "gamma":            [0, 0.01, 0.05, 0.1, 0.2],
    }
    gene_names = list(SEARCH_SPACE)

    def random_params():
        # rng.choice returns the list element itself, i.e. a plain int/float
        # rather than a NumPy scalar.
        return {k: rng.choice(v) for k, v in SEARCH_SPACE.items()}

    # The folds are built ONCE and shared by every candidate; otherwise two
    # candidates would be ranked on different random splits.
    folds = list(
        KFold(n_splits=5, shuffle=True, random_state=random_state).split(X_train)
    )

    # --------------------------
    # CV fitness
    # --------------------------
    # With fixed folds and a fixed model seed the score of a parameter set is
    # deterministic, so it is computed once and reused (survivors and
    # duplicate children are not retrained).
    score_cache = {}

    def fitness(params):
        key = tuple(params[k] for k in gene_names)
        if key not in score_cache:
            fold_acc = []
            for tr, va in folds:
                model = XGBClassifier(
                    n_estimators=200,
                    objective="binary:logistic",
                    eval_metric="logloss",
                    random_state=random_state,
                    **params
                )
                model.fit(X_train[tr], y_train[tr])
                preds = model.predict(X_train[va])
                fold_acc.append(accuracy_score(y_train[va], preds))
            score_cache[key] = float(np.mean(fold_acc))
        return score_cache[key]

    # --------------------------
    # Initialisation
    # --------------------------
    population = [random_params() for _ in range(pop_size)]
    n_survivors = max(1, pop_size // 2)

    # --------------------------
    # Evolution
    # --------------------------
    for gen in range(generations):
        if verbose:
            print(f"    [GA] geração {gen+1}/{generations}", flush=True)

        # Selection: keep the top half (stable sort, so ties keep their order).
        ranked = sorted(population, key=fitness, reverse=True)
        survivors = ranked[:n_survivors]

        # Reproduction
        children = []
        while len(children) < pop_size - len(survivors):
            if len(survivors) >= 2:
                p1, p2 = rng.sample(survivors, 2)
            else:
                # Tiny populations: a lone survivor is cloned and relies on
                # mutation for variation instead of crashing in sample().
                p1 = p2 = survivors[0]

            # Uniform crossover: each gene comes from either parent.
            child = {k: (p1[k] if rng.random() < 0.5 else p2[k]) for k in gene_names}

            # Mutation: with probability 0.20 redraw one gene from its grid.
            if rng.random() < 0.20:
                mut_key = rng.choice(gene_names)
                child[mut_key] = rng.choice(SEARCH_SPACE[mut_key])

            children.append(child)

        population = survivors + children

    # Best individual of the final population (first one wins on ties).
    return dict(max(population, key=fitness))

In [82]:
# Load the data
# ===============================

df_train = pd.read_csv(TRAIN_CSV)
df_test  = pd.read_csv(TEST_CSV)

df = pd.concat([df_train, df_test], ignore_index=True)

# Rows without a target cannot be labelled: `NaN > 0` evaluates to False, so
# they would silently become class 0. Drop them before building the label.
df = df.dropna(subset=[TARGET]).reset_index(drop=True)

df["Year"] = pd.to_datetime(df[EVENT_DATE_COL]).dt.year
df["SectorName"] = df["SectorName"].astype(str)

# Binary target: 1 when the cumulative abnormal return is positive, else 0.
df["CAR_sign_bin"] = (df[TARGET] > 0).astype(int)

# Test years for the section 6.1 experiment.
# Adjustment: the paper uses 2015–2018, but this dataset covers 2017–2019.
YEARS = [2017, 2018, 2019]

In [None]:
# MAIN LOOP – 100 runs per test year
# ===============================

N_RUNS = 100

# results_year[year]           -> list of test accuracies, one per run
# results_sector[year][sector] -> list of test accuracies, one per run
results_year = {y: [] for y in YEARS}
results_sector = {y: {} for y in YEARS}

for year in YEARS:

    print(f"\n=== ANO {year} ===")

    # Walk-forward split: train on every event before `year`, test on `year`.
    df_train_y = df[df["Year"] < year].copy()
    df_test_y  = df[df["Year"] == year].copy()

    # A year with no earlier events (or no events at all) cannot be evaluated;
    # skip it instead of crashing inside the GA's cross-validation.
    if df_train_y.empty or df_test_y.empty:
        print(
            f"  [pulado] sem dados suficientes para {year} "
            f"(treino={len(df_train_y)}, teste={len(df_test_y)})",
            flush=True,
        )
        continue

    X_train = df_train_y[FEATS].values
    y_train = df_train_y["CAR_sign_bin"].values

    X_test  = df_test_y[FEATS].values
    y_test  = df_test_y["CAR_sign_bin"].values
    sector_test = df_test_y["SectorName"].values

    # Boolean row masks per sector; identical for every run, so build them once.
    sector_masks = {s: (sector_test == s) for s in np.unique(sector_test)}

    # Hyper-parameters are searched once per year and reused by all runs.
    best_params_year = ga_optimize_xgb(X_train, y_train, generations=5, pop_size=10, verbose=True)

    for run in range(N_RUNS):
        print(f"  Run {run+1}/{N_RUNS}", flush=True)

        # A different seed per run: without it every run trains the exact same
        # model and the reported std is 0. The seed only has an effect when
        # subsample / colsample_bytree are below 1.
        model = XGBClassifier(
            n_estimators=300,
            objective="binary:logistic",
            eval_metric="logloss",
            random_state=RANDOM_STATE + run,
            **best_params_year,
        )
        model.fit(X_train, y_train)

        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds)

        results_year[year].append(acc)

        # Per-sector accuracy on the same predictions
        for s, mask in sector_masks.items():
            acc_s = accuracy_score(y_test[mask], preds[mask])
            results_sector[year].setdefault(s, []).append(acc_s)

In [None]:
# Save results
# ===============================

# Per-year summary: mean and standard deviation (np.std default, ddof=0) of
# the accuracies collected across runs.
yearly_means = [np.mean(results_year[yr]) for yr in YEARS]
yearly_stds = [np.std(results_year[yr]) for yr in YEARS]
df_year = pd.DataFrame({
    "Year": YEARS,
    "Accuracy_mean": yearly_means,
    "Accuracy_std": yearly_stds,
})
yearly_path = os.path.join(OUT_DIR, "accuracy_yearly.csv")
df_year.to_csv(yearly_path, index=False)


# Per-sector, per-year summary: one record for each (year, sector) pair that
# has recorded accuracies.
rows = [
    {
        "Year": yr,
        "SectorName": sector,
        "Accuracy_mean": np.mean(sector_accs),
        "Accuracy_std": np.std(sector_accs),
    }
    for yr in YEARS
    for sector, sector_accs in results_sector[yr].items()
]
df_sector = pd.DataFrame(rows)
sector_path = os.path.join(OUT_DIR, "accuracy_sector_yearly.csv")
df_sector.to_csv(sector_path, index=False)

print("\nResultados salvos em:", OUT_DIR)