In [2]:
# -*- coding: utf-8 -*-
"""
Replicação Seção 6.2 do paper:
- GA + XGBRegressor para prever CAR_30D
- Ranking por previsão dentro de cada trimestre
- Moving portfolios de 100 ações (Fig. 3)
- Portfólios por quantis (Q1–Q5)
- Loop automático para todos os trimestres 2017–2019

Pré-requisito:
  - Diretório pead_preproc com:
        train_processed.csv (2010–2016)
        test_processed.csv  (2017–2019)
"""

import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error


In [20]:
# Model input columns, in a fixed order.
# The 24 fundamental metrics each appear three times: as a level ("_MET"),
# as a quarter-over-quarter change ("_Q_Change") and as a year-over-year
# change ("_Y_Change"). Groups are laid out suffix by suffix, followed by the
# EPS-surprise features and the momentum / technical indicators.
FEATS = [
    f"{base}_{suffix}"
    for suffix in ("MET", "Q_Change", "Y_Change")
    for base in (
        "RL", "LL", "EBITDA",
        "Preco_Abertura", "Preco_Fechamento",
        "LPA", "ROA", "ROE", "MEB",
        "CRESC_RL_12M", "CRESC_LL_12M", "CRESC_EBITDA_12M",
        "CAPEX", "FCO", "FCF",
        "Divida_Liquida", "PL", "Divida_Bruta",
        "AT", "DVA_Despesas_Fin",
        "PC", "PNC", "Outros_PC",
        "LUB",
    )
] + [
    # EPS surprise features
    "EPS_EarningsSurprise",
    "EPS_Earnings_Surprise_Backward_Diff",
    "EPS_Earnings_Surprise_Backward_Ave_Diff",
] + [
    # Momentum and technical indicators
    "MA5", "MA50", "MA200",
    "RSI9", "RSI14", "RSI30",
    "MA5_50", "MA5_200", "MA50_200",
    "MOM_1M", "MOM_3M", "MOM_6M", "MOM_12M",
]


In [3]:
# -------------------- SETTINGS --------------------
# Folder holding the pre-processed CSVs and the two files read from it.
BASE_PRE = "pead_preproc"
TRAIN_CSV, TEST_CSV = (
    os.path.join(BASE_PRE, csv_name)
    for csv_name in ("train_processed.csv", "test_processed.csv")
)

# Folder that receives the ranking CSVs and PNGs; created if it is missing.
OUT_DIR = "results_6_2"
os.makedirs(OUT_DIR, exist_ok=True)

In [None]:
# === Load the pre-processed splits and derive Year/Quarter ===
df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)

# Stack both splits into one frame, parse the event date and derive the
# calendar year and quarter of each event from it.
df_all = pd.concat([df_train, df_test], ignore_index=True).assign(
    EventTradeDate=lambda frame: pd.to_datetime(frame["EventTradeDate"]),
    Year=lambda frame: frame["EventTradeDate"].dt.year,
    Quarter=lambda frame: frame["EventTradeDate"].dt.quarter,
)

In [None]:
# === (optional) GA + CV (Fig. 2) for the REGRESSOR; callers can skip it with use_ga=False ===
def ga_optimize_xgb_reg(X_train, y_train, generations=5, pop_size=10, verbose=False,
                        random_state=None):
    """
    Search XGBRegressor hyper-parameters with a small genetic algorithm.

    A candidate's fitness is its mean RMSE over a shuffled 5-fold
    cross-validation (lower is better). RMSE is computed as sqrt(MSE) so the
    code also works with scikit-learn versions whose ``mean_squared_error``
    has no ``squared`` argument.

    The folds are drawn once and shared by every candidate, so candidates are
    compared on the same splits. Fitness values are cached per parameter
    combination, so survivors and duplicated candidates are not re-fitted.

    Parameters
    ----------
    X_train : array-like
        Feature matrix; converted with ``np.asarray`` and indexed by row.
    y_train : array-like
        Target values, one per row of ``X_train``.
    generations : int, default 5
        Number of select / reproduce rounds.
    pop_size : int, default 10
        Number of candidates kept in the population (must be >= 1).
    verbose : bool, default False
        Print progress and the winning parameters.
    random_state : int or None, default None
        Seed for the GA draws, the fold shuffling and the XGBRegressor.
        ``None`` keeps using NumPy's global random state.

    Returns
    -------
    dict
        Parameter name -> chosen value (plain Python ints/floats) for the
        candidate with the lowest mean RMSE in the final population.

    Raises
    ------
    ValueError
        If ``pop_size`` is smaller than 1.
    """
    if pop_size < 1:
        raise ValueError("pop_size must be >= 1")

    # Positional row indexing below requires plain arrays.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    SEARCH_SPACE = {
        "max_depth":        [3,4,5,6,7],
        "learning_rate":    [0.01, 0.03, 0.05, 0.07, 0.1],
        "subsample":        [0.6, 0.7, 0.8, 0.9, 1.0],
        "colsample_bytree": [0.6, 0.7, 0.8, 0.9, 1.0],
        "min_child_weight": [1,2,3,5,7],
        "gamma":            [0, 0.01, 0.05, 0.1, 0.2],
    }
    param_names = list(SEARCH_SPACE)

    # The np.random module and RandomState expose the same randint/rand API,
    # so the unseeded path keeps drawing from the global state.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    def pick(options):
        # Index into the list so the result is the native Python value,
        # not a NumPy scalar.
        return options[rng.randint(len(options))]

    def random_params():
        return {name: pick(options) for name, options in SEARCH_SPACE.items()}

    # One set of shuffled folds for the whole search.
    splitter = KFold(n_splits=5, shuffle=True, random_state=random_state)
    folds = list(splitter.split(X_train))

    fitness_cache = {}

    def fitness(params):
        key = tuple(params[name] for name in param_names)
        if key not in fitness_cache:
            rmses = []
            for tr, va in folds:
                model = XGBRegressor(
                    n_estimators=200,
                    objective="reg:squarederror",
                    random_state=random_state,
                    **params
                )
                model.fit(X_train[tr], y_train[tr])
                pred = model.predict(X_train[va])
                rmses.append(np.sqrt(mean_squared_error(y_train[va], pred)))
            fitness_cache[key] = float(np.mean(rmses))  # lower is better
        return fitness_cache[key]

    # Initial population
    population = [random_params() for _ in range(pop_size)]
    # Always keep at least one survivor so reproduction has a parent to use.
    n_keep = max(1, pop_size // 2)

    for gen in range(generations):
        if verbose:
            print(f"[GA-REG] geração {gen+1}/{generations}", flush=True)

        scores = [fitness(p) for p in population]
        # Stable sort on the score only: ties keep their population order.
        order = sorted(range(len(population)), key=scores.__getitem__)
        survivors = [population[i] for i in order[:n_keep]]

        # Uniform crossover of two survivors (drawn with replacement), plus a
        # 20% chance of re-drawing one parameter from its allowed values.
        children = []
        while len(children) < pop_size - len(survivors):
            p1 = survivors[rng.randint(len(survivors))]
            p2 = survivors[rng.randint(len(survivors))]
            child = {k: (p1[k] if rng.rand() < 0.5 else p2[k]) for k in param_names}
            if rng.rand() < 0.2:
                mk = param_names[rng.randint(len(param_names))]
                child[mk] = pick(SEARCH_SPACE[mk])
            children.append(child)

        population = survivors + children

    # Pick the best candidate (lowest RMSE); cached scores are reused here.
    final_scores = [fitness(p) for p in population]
    best_params  = dict(population[int(np.argmin(final_scores))])
    if verbose:
        print("[GA-REG] melhor RMSE:", min(final_scores))
        print("[GA-REG] best_params:", best_params)
    return best_params

In [None]:
# === Core routine: rank by predicted CAR_30D and plot the REALISED CAR_30D ===
def rank_and_plot_quarter(year, quarter, use_ga=True, save=True, show=True):
    """
    Train on past events, rank one quarter's events by predicted CAR_30D and
    plot their realised CAR_30D in ranking order.

    Reads the module-level ``df_all`` (needs the columns in ``FEATS`` plus
    ``EventTradeDate``, ``Year``, ``Quarter`` and ``CAR_30D``) and writes into
    ``OUT_DIR`` when ``save`` is true.

    Parameters
    ----------
    year, quarter : int
        Calendar year and quarter (1-4) selecting the events to rank.
    use_ga : bool, default True
        Tune hyper-parameters with ``ga_optimize_xgb_reg``; otherwise use a
        fixed parameter set.
    save : bool, default True
        Write ``ranking_{year}_Q{quarter}.csv`` and ``.png`` to ``OUT_DIR``.
    show : bool, default True
        Display the figure with ``plt.show()``.

    Returns
    -------
    pandas.DataFrame or None
        The quarter's rows sorted by ``CAR_pred`` (descending) with a 1-based
        ``Rank`` column, or ``None`` when the quarter has no events or there
        is no usable history before it.
    """
    # Events of the requested quarter (these are the rows being ranked).
    in_quarter = (df_all["Year"] == year) & (df_all["Quarter"] == quarter)
    df_q = df_all.loc[in_quarter].copy()
    if df_q.empty:
        print(f"[{year} Q{quarter}] Sem eventos.")
        return None

    # Training history: every event dated strictly before the quarter's first event.
    # NOTE(review): if CAR_30D is measured over a window that starts at
    # EventTradeDate, the labels of events shortly before q_start are only
    # realised after the quarter begins — confirm whether an embargo is needed.
    q_start = df_q["EventTradeDate"].min()
    df_hist = df_all[df_all["EventTradeDate"] < q_start]
    # Rows without a target value cannot be used to fit the regressor.
    df_hist = df_hist.dropna(subset=["CAR_30D"])
    if df_hist.empty:
        print(f"[{year} Q{quarter}] Sem histórico suficiente antes do trimestre.")
        return None

    # X, y
    X_tr = df_hist[FEATS].to_numpy()
    y_tr = df_hist["CAR_30D"].to_numpy()
    X_te = df_q[FEATS].to_numpy()

    # Hyper-parameters: GA + CV (Fig. 2), or a light fixed set for debugging.
    if use_ga:
        params = ga_optimize_xgb_reg(X_tr, y_tr, generations=5, pop_size=10, verbose=True)
    else:
        params = {"max_depth":5,"learning_rate":0.05,"subsample":0.8,
                  "colsample_bytree":0.8,"min_child_weight":3,"gamma":0.05}

    # Final model and prediction
    reg = XGBRegressor(n_estimators=300, objective="reg:squarederror", **params)
    reg.fit(X_tr, y_tr)
    df_q["CAR_pred"] = reg.predict(X_te)

    # RANKING: highest predicted CAR_30D first.
    df_rank = df_q.sort_values("CAR_pred", ascending=False).reset_index(drop=True)
    df_rank["Rank"] = np.arange(1, len(df_rank)+1)

    # PLOT: one bar per event in ranking order, height = realised CAR_30D in %.
    fig, ax = plt.subplots(figsize=(12, 5))
    try:
        ax.bar(df_rank["Rank"], df_rank["CAR_30D"] * 100)
        ax.axhline(0, color="black", linewidth=1)
        ax.set_title(f"Ranking por CAR_30D predito — Exibe CAR_30D REAL | {year} Q{quarter}", fontsize=13)
        ax.set_xlabel("Rank (ordenado por CAR_30D predito)", fontsize=10)
        ax.set_ylabel("CAR_30D REAL (%)", fontsize=10)
        fig.tight_layout()

        if save:
            # Make sure the target folder exists even if it was removed
            # after the configuration cell ran.
            os.makedirs(OUT_DIR, exist_ok=True)
            csv_path = os.path.join(OUT_DIR, f"ranking_{year}_Q{quarter}.csv")
            png_path = os.path.join(OUT_DIR, f"ranking_{year}_Q{quarter}.png")
            df_rank.to_csv(csv_path, index=False)
            fig.savefig(png_path, dpi=150)
            print("Salvos:", csv_path, "e", png_path)
        if show:
            plt.show()
    finally:
        # Release the figure on every path (shown, not shown, or on error) so
        # repeated calls do not accumulate open figures.
        plt.close(fig)

    return df_rank


In [None]:
# Run one specific quarter (this call: 2017 Q2) with GA tuning enabled.
rank_and_plot_quarter(year=2017, quarter=2, use_ga=True)

In [None]:
# Every quarter from 2017 to 2019, in chronological order; results are saved
# to disk and the figures are not displayed.
for y, q in [(yr, qt) for yr in range(2017, 2020) for qt in range(1, 5)]:
    rank_and_plot_quarter(y, q, use_ga=True, save=True, show=False)
print("Concluído.")