In [10]:

import os
import numpy as np
import pandas as pd
import random
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from collections import defaultdict

In [11]:
# -------------------- Paths --------------------
# Inputs: the two pre-processed CSV files read by the loading cell.
TRAIN_CSV, TEST_CSV = (
    "pead_preproc/train_processed.csv",
    "pead_preproc/test_processed.csv",
)

# Output: folder that receives every CSV written by this notebook.
OUT_DIR = "results_6_1"
os.makedirs(name=OUT_DIR, exist_ok=True)  # no-op when the folder already exists

In [12]:
# -------------------- Config --------------------

TOP_K = 5  # how many top-ranked features are recorded for each run

RANDOM_STATE = 42
# Seed BOTH generators. The GA cell draws from `np.random` and from the
# stdlib `random` module (parent selection, crossover), so seeding NumPy
# alone leaves the search unreproducible.
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

TARGET = "CAR_30D"                 # raw target column; it is binarised by sign when the data is loaded
EVENT_DATE_COL = "EventTradeDate"  # event date column, used to derive the yearly partitions
# Identifier / metadata columns (not referenced by the visible cells).
ID_COLS = ["Ticker", "EventTradeDate", "AnnounceDate", "SectorName", "SectorID", "EstimationLen", "FundSource", "Data", "Empresa" ]

LEAK_PREFIX  = ("CAR_", "AR_", "RET_")   # column prefixes that must not be used as features
# To ban further columns, add them here:
BAN_COLS = set([TARGET])


In [13]:
# Feature columns used by the model.
# ORDER MATTERS: the model is fitted on a bare array, and positions are later
# translated back to these names by index (f0 -> FEATS[0], f1 -> FEATS[1], ...).

# Fundamental indicators; every one exists in three variants (see suffixes).
_FUNDAMENTAL_BASES = [
    "RL", "LL", "EBITDA",
    "Preco_Abertura", "Preco_Fechamento",
    "LPA", "ROA", "ROE", "MEB",
    "CRESC_RL_12M", "CRESC_LL_12M", "CRESC_EBITDA_12M",
    "CAPEX", "FCO", "FCF",
    "Divida_Liquida", "PL", "Divida_Bruta",
    "AT", "DVA_Despesas_Fin",
    "PC", "PNC", "Outros_PC",
    "LUB",
]

# MET = level, Q_Change = quarter-over-quarter, Y_Change = year-over-year.
_FUNDAMENTAL_SUFFIXES = ["MET", "Q_Change", "Y_Change"]

# EPS surprise features
_EPS_SURPRISE_FEATS = [
    "EPS_EarningsSurprise",
    "EPS_Earnings_Surprise_Backward_Diff",
    "EPS_Earnings_Surprise_Backward_Ave_Diff",
]

# Momentum & technical indicators
_TECHNICAL_FEATS = [
    "MA5", "MA50", "MA200",
    "RSI9", "RSI14", "RSI30",
    "MA5_50", "MA5_200", "MA50_200",
    "MOM_1M", "MOM_3M", "MOM_6M", "MOM_12M",
]

# All level columns first, then all Q changes, then all Y changes.
FEATS = (
    [
        f"{base}_{suffix}"
        for suffix in _FUNDAMENTAL_SUFFIXES
        for base in _FUNDAMENTAL_BASES
    ]
    + _EPS_SURPRISE_FEATS
    + _TECHNICAL_FEATS
)


In [14]:
# Função de otimização GA simples
# (exemplo realista mas leve para 100 runs)
# ===============================

import random

def ga_optimize_xgb(X_train, y_train, generations=5, pop_size=10, verbose=True):
    """
    Tune XGBoost hyper-parameters with a small genetic algorithm (GA).

    A candidate's fitness is the mean accuracy of an ``XGBClassifier``
    (200 trees, binary logistic objective) over a 5-fold cross-validation of
    the training data. Each generation keeps the better half of the
    population and refills it with children produced by uniform crossover of
    two surviving parents, followed by a 20% chance of mutating one gene.

    Parameters
    ----------
    X_train : array-like, indexable by row after ``np.asarray``
        Training features.
    y_train : array-like
        Binary training labels aligned with ``X_train``.
    generations : int, default 5
        Number of selection/reproduction rounds.
    pop_size : int, default 10
        Number of candidates kept in the population.
    verbose : bool, default True
        Print one progress line per generation.

    Returns
    -------
    dict
        Best parameter set of the final population; values are plain Python
        ``int``/``float`` taken from the search grid.

    Raises
    ------
    ValueError
        If ``pop_size`` is smaller than 1.

    Notes
    -----
    * All randomness comes from NumPy's global generator, so a single
      ``np.random.seed(...)`` makes the search repeatable.
    * The CV folds are drawn once and shared by every candidate, so
      candidates are compared on identical splits.
    * Fitness values are cached per parameter set. This assumes fitting the
      same parameters on the same folds gives the same score (the classifier
      is created without an explicit seed, i.e. with the library default).
    """
    if pop_size < 1:
        raise ValueError("pop_size must be at least 1")

    # Accept DataFrames / lists as well as ndarrays: the folds index by position.
    X_arr = np.asarray(X_train)
    y_arr = np.asarray(y_train)

    # --------------------------
    # Search space (from the paper)
    # --------------------------
    SEARCH_SPACE = {
        "max_depth":        [3,4,5,6,7],
        "learning_rate":    [0.01, 0.03, 0.05, 0.07, 0.1],
        "subsample":        [0.6, 0.7, 0.8, 0.9, 1.0],
        "colsample_bytree": [0.6, 0.7, 0.8, 0.9, 1.0],
        "min_child_weight": [1,2,3,5,7],
        "gamma":            [0, 0.01, 0.05, 0.1, 0.2],
    }
    param_names = list(SEARCH_SPACE)

    def pick(values):
        # .item() converts the NumPy scalar returned by choice() into a plain
        # Python number, so the result is hashable/serialisable everywhere.
        return np.random.choice(values).item()

    def random_params():
        return {name: pick(SEARCH_SPACE[name]) for name in param_names}

    # --------------------------
    # CV fitness
    # --------------------------
    # One shuffled 5-fold split, reused for every candidate.
    folds = list(KFold(n_splits=5, shuffle=True).split(X_arr))
    fitness_cache = {}

    def fitness(params):
        key = tuple(params[name] for name in param_names)
        if key not in fitness_cache:
            fold_acc = []
            for tr, va in folds:
                model = XGBClassifier(
                    n_estimators=200,
                    objective="binary:logistic",
                    eval_metric="logloss",
                    **params
                )
                model.fit(X_arr[tr], y_arr[tr])
                preds = model.predict(X_arr[va])
                fold_acc.append(accuracy_score(y_arr[va], preds))
            fitness_cache[key] = float(np.mean(fold_acc))
        return fitness_cache[key]

    # --------------------------
    # Initialisation
    # --------------------------
    population = [random_params() for _ in range(pop_size)]
    # Always keep at least one survivor so tiny populations still evolve.
    n_survivors = max(1, pop_size // 2)

    # --------------------------
    # Evolution
    # --------------------------
    for gen in range(generations):
        if verbose:
            print(f"    [GA] geração {gen+1}/{generations}", flush=True)

        scores = [fitness(p) for p in population]

        # Selection: keep the top half (stable sort, so ties keep their order)
        ranked = sorted(zip(scores, population), key=lambda pair: -pair[0])
        survivors = [p for _, p in ranked[:n_survivors]]

        # Reproduction
        children = []
        while len(children) < pop_size - len(survivors):
            if len(survivors) >= 2:
                i1, i2 = np.random.choice(len(survivors), size=2, replace=False)
                p1, p2 = survivors[int(i1)], survivors[int(i2)]
            else:
                # A single survivor: the child can only differ through mutation.
                p1 = p2 = survivors[0]

            # Uniform crossover: each gene comes from either parent, 50/50
            child = {
                name: (p1 if np.random.rand() < 0.5 else p2)[name]
                for name in param_names
            }

            # Mutation: with 20% probability redraw one gene from its grid
            if np.random.rand() < 0.20:
                mut_key = param_names[np.random.randint(len(param_names))]
                child[mut_key] = pick(SEARCH_SPACE[mut_key])

            children.append(child)

        population = survivors + children

    # Best individual of the final population (cached scores are reused)
    final_scores = [fitness(p) for p in population]
    best_params = population[int(np.argmax(final_scores))]

    return best_params

In [15]:
# Load the data
# ===============================

df_train = pd.read_csv(TRAIN_CSV)
df_test  = pd.read_csv(TEST_CSV)

# The two files are pooled; the split actually used is rebuilt per year from
# the event date in the main loop.
df = pd.concat([df_train, df_test], ignore_index=True)

# Rows without a target would be silently labelled 0 by the `> 0` comparison
# below (NaN > 0 is False), so drop them before building the label.
df = df.dropna(subset=[TARGET]).reset_index(drop=True)

df["Year"] = pd.to_datetime(df[EVENT_DATE_COL]).dt.year
df["SectorName"] = df["SectorName"].astype(str)

# binary target: 1 when the target CAR is strictly positive, else 0
df["CAR_sign_bin"] = (df[TARGET] > 0).astype(int)

# evaluation years (paper 6.1)
YEARS = [2017, 2018, 2019]

In [16]:
# maps f0, f1, ... -> feature name (for XGBoost's get_score output)
fmap = dict((f"f{idx}", name) for idx, name in enumerate(FEATS))

# Table 6 accumulators (ALL stocks, per year):
# year -> feature -> number of times in the top-K / summed normalised importance
importance_counts_all = dict((yr, defaultdict(int)) for yr in YEARS)
importance_sum_all = dict((yr, defaultdict(float)) for yr in YEARS)

# Tables 7–15 accumulators (per sector and year); the inner dicts are filled
# lazily: dict[year][sector] -> defaultdict(int) / defaultdict(float)
importance_counts_sector = dict((yr, {}) for yr in YEARS)
importance_sum_sector = dict((yr, {}) for yr in YEARS)

In [17]:
# MAIN LOOP – 100 RUNS PER YEAR
# ===============================

n_runs = 100  # models fitted per year with the tuned hyper-parameters

results_year = {y: [] for y in YEARS}
results_sector = {y: {} for y in YEARS}

for year in YEARS:

    print(f"\n=== ANO {year} ===")

    # Start this year's importance accumulators from scratch, so re-running
    # the cell does not add onto counts left by a previous execution.
    importance_counts_all[year] = defaultdict(int)
    importance_sum_all[year] = defaultdict(float)
    importance_counts_sector[year] = {}
    importance_sum_sector[year] = {}

    # Expanding window: train on every event before `year`, test on `year`.
    df_train_y = df[df["Year"] < year].copy()
    df_test_y  = df[df["Year"] == year].copy()

    if df_train_y.empty or df_test_y.empty:
        # Nothing to fit or nothing to score: leave this year's results empty.
        print(f"  [skip] no train or test rows for {year}", flush=True)
        continue

    X_train = df_train_y[FEATS].values
    y_train = df_train_y["CAR_sign_bin"].values

    X_test  = df_test_y[FEATS].values
    y_test  = df_test_y["CAR_sign_bin"].values
    sector_test = df_test_y["SectorName"].values

    # Sector membership is the same for every run: build the masks once.
    sectors_in_year = np.unique(sector_test)
    sector_masks = {s: (sector_test == s) for s in sectors_in_year}

    # Hyper-parameters are tuned once per year and shared by all runs.
    best_params_year = ga_optimize_xgb(X_train, y_train, generations=5, pop_size=10, verbose=True)

    # ----------- RUNS -------------
    for run in range(n_runs):
        print(f"  Run {run+1}/{n_runs}", flush=True)
        model = XGBClassifier(
            n_estimators=200,
            objective="binary:logistic",
            eval_metric="logloss",
            # A different seed per run: without it every run would use the
            # library's default seed and simply refit the same model.
            random_state=RANDOM_STATE + run,
            **best_params_year,
        )
        model.fit(X_train, y_train)

        preds = model.predict(X_test)
        results_year[year].append(accuracy_score(y_test, preds))

        # Per-sector accuracy
        for s, mask in sector_masks.items():
            acc_s = accuracy_score(y_test[mask], preds[mask])
            results_sector[year].setdefault(s, []).append(acc_s)

        # ---------- IMPORTANCES FOR TABLES 6–15 ----------
        raw_scores = model.get_booster().get_score(importance_type="gain")

        # f0, f1, ... -> feature name (unknown keys are kept as they are)
        scores_named = {fmap.get(k, k): float(v) for k, v in raw_scores.items()}

        # Skip the bookkeeping when there is nothing to normalise.
        total_gain = sum(scores_named.values())
        if not scores_named or total_gain <= 0:
            continue

        # normalise so the importances of this run sum to 1
        scores_named = {f: g / total_gain for f, g in scores_named.items()}

        # TOP_K for the "all stocks" group (Table 6)
        top_feats_all = sorted(scores_named.items(), key=lambda x: -x[1])[:TOP_K]
        for feat, imp_norm in top_feats_all:
            importance_counts_all[year][feat] += 1
            importance_sum_all[year][feat]    += imp_norm

        # TOP_K per sector (Tables 7–15).
        # The SAME global ranking is credited to every sector, because the
        # model is global; the paper trains one model per group, this is the
        # lighter approximation.
        for s in sectors_in_year:
            counts_s = importance_counts_sector[year].setdefault(s, defaultdict(int))
            sums_s = importance_sum_sector[year].setdefault(s, defaultdict(float))
            for feat, imp_norm in top_feats_all:
                counts_s[feat] += 1
                sums_s[feat]   += imp_norm


=== ANO 2017 ===
    [GA] geração 1/5


    [GA] geração 2/5
    [GA] geração 3/5
    [GA] geração 4/5
    [GA] geração 5/5
  Run 1/100
  Run 2/100
  Run 3/100
  Run 4/100
  Run 5/100
  Run 6/100
  Run 7/100
  Run 8/100
  Run 9/100
  Run 10/100
  Run 11/100
  Run 12/100
  Run 13/100
  Run 14/100
  Run 15/100
  Run 16/100
  Run 17/100
  Run 18/100
  Run 19/100
  Run 20/100
  Run 21/100
  Run 22/100
  Run 23/100
  Run 24/100
  Run 25/100
  Run 26/100
  Run 27/100
  Run 28/100
  Run 29/100
  Run 30/100
  Run 31/100
  Run 32/100
  Run 33/100
  Run 34/100
  Run 35/100
  Run 36/100
  Run 37/100
  Run 38/100
  Run 39/100
  Run 40/100
  Run 41/100
  Run 42/100
  Run 43/100
  Run 44/100
  Run 45/100
  Run 46/100
  Run 47/100
  Run 48/100
  Run 49/100
  Run 50/100
  Run 51/100
  Run 52/100
  Run 53/100
  Run 54/100
  Run 55/100
  Run 56/100
  Run 57/100
  Run 58/100
  Run 59/100
  Run 60/100
  Run 61/100
  Run 62/100
  Run 63/100
  Run 64/100
  Run 65/100
  Run 66/100
  Run 67/100
  Run 68/100
  Run 69/100
  Run 70/100
  Run 71/100
  

In [18]:
# Save results
# ===============================

# Mean / std of the run accuracies, one row per year
df_year = pd.DataFrame(
    [(y, np.mean(results_year[y]), np.std(results_year[y])) for y in YEARS],
    columns=["Year", "Accuracy_mean", "Accuracy_std"],
)
df_year.to_csv(os.path.join(OUT_DIR, "accuracy_yearly.csv"), index=False)


# Mean / std of the run accuracies, one row per (year, sector)
rows = [
    {
        "Year": y,
        "SectorName": s,
        "Accuracy_mean": np.mean(vals),
        "Accuracy_std": np.std(vals),
    }
    for y in YEARS
    for s, vals in results_sector[y].items()
]
df_sector = pd.DataFrame(rows)
df_sector.to_csv(os.path.join(OUT_DIR, "accuracy_sector_yearly.csv"), index=False)

print("\nResultados salvos em:", OUT_DIR)


Resultados salvos em: results_6_1


In [20]:
# Nominal number of runs per year (the main loop executes this many); the
# importance tables below reference it when averaging the accumulated scores.
N_RUNS = 100

In [None]:
def _runs_completed(year):
    """Number of runs actually recorded for `year`; falls back to N_RUNS when none are."""
    done = len(results_year.get(year, []))
    return done if done > 0 else N_RUNS


def _importance_rows(counts, sums, n_runs, **labels):
    """Build one table row per feature from a (counts, sums) accumulator pair.

    `labels` (e.g. Year=..., SectorName=...) become the leading columns.
    The mean is taken over `n_runs`, so a run in which the feature was not in
    the top-K contributes zero.
    """
    return [
        {
            **labels,
            "Feature": feat,
            "OccurrenceCount": count,
            "NormalizedImportanceMean": sums[feat] / float(n_runs),
        }
        for feat, count in counts.items()
    ]


# ---------------------- TABLE 6: ALL STOCKS (per year) ----------------------
rows_all = []

for year in YEARS:
    # Divide by the runs that really finished, so an interrupted loop does
    # not deflate the means.
    rows_all.extend(
        _importance_rows(
            importance_counts_all[year],
            importance_sum_all[year],
            _runs_completed(year),
            Year=year,
        )
    )

# Explicit columns keep the sort below valid even when nothing was accumulated.
df_all_imp = pd.DataFrame(
    rows_all,
    columns=["Year", "Feature", "OccurrenceCount", "NormalizedImportanceMean"],
)

# order within each year: most frequent first, then highest mean importance
df_all_imp = df_all_imp.sort_values(
    ["Year", "OccurrenceCount", "NormalizedImportanceMean"],
    ascending=[True, False, False]
)

# take the TOP 5 of each year (F1..F5)
table6_parts = []
for year in YEARS:
    sub = df_all_imp[df_all_imp["Year"] == year].head(5).copy()
    # add the F1..F5 labels
    sub["F_label"] = [f"F{i}" for i in range(1, len(sub)+1)]
    table6_parts.append(sub)

table6 = pd.concat(table6_parts, ignore_index=True)
table6_path = os.path.join(OUT_DIR, "table6_all_stocks_top5.csv")
table6.to_csv(table6_path, index=False)
print("Tabela 6 salva em:", table6_path)


# ---------------------- TABLES 7–15: PER SECTOR ----------------------
# each (sector, year) pair becomes its own file (top 5 features)
for year in YEARS:
    for sector, counts_dict in importance_counts_sector[year].items():
        rows_sec = _importance_rows(
            counts_dict,
            importance_sum_sector[year][sector],
            _runs_completed(year),
            Year=year,
            SectorName=sector,
        )
        if not rows_sec:
            continue

        # .copy() so the label column is added to an independent frame,
        # not to a slice of the sorted one.
        df_sec = pd.DataFrame(rows_sec).sort_values(
            ["OccurrenceCount", "NormalizedImportanceMean"],
            ascending=[False, False]
        ).head(5).copy()  # top 5 of the sector in that year
        df_sec["F_label"] = [f"F{i}" for i in range(1, len(df_sec)+1)]

        # file name pattern: table_sector_<name>_<year>.csv
        safe_sector = sector.replace(" ", "_").replace("/", "_")
        fname = f"table_sector_{safe_sector}_{year}.csv"
        fpath = os.path.join(OUT_DIR, fname)
        df_sec.to_csv(fpath, index=False)
        print(f"Tabela setor {sector} ano {year} salva em:", fpath)


Tabela 6 salva em: results_6_1\table6_all_stocks_top5.csv
Tabela setor Bens Industriais ano 2017 salva em: results_6_1\table_sector_Bens_Industriais_2017.csv
Tabela setor Consumo Cíclico ano 2017 salva em: results_6_1\table_sector_Consumo_Cíclico_2017.csv
Tabela setor Consumo Não Cíclico ano 2017 salva em: results_6_1\table_sector_Consumo_Não_Cíclico_2017.csv
Tabela setor Financeiro e Outros ano 2017 salva em: results_6_1\table_sector_Financeiro_e_Outros_2017.csv
Tabela setor Materiais Básicos ano 2017 salva em: results_6_1\table_sector_Materiais_Básicos_2017.csv
Tabela setor Petróleo, Gás e Biocombustíveis ano 2017 salva em: results_6_1\table_sector_Petróleo,_Gás_e_Biocombustíveis_2017.csv
Tabela setor Saúde ano 2017 salva em: results_6_1\table_sector_Saúde_2017.csv
Tabela setor Tecnologia da Informação ano 2017 salva em: results_6_1\table_sector_Tecnologia_da_Informação_2017.csv
Tabela setor Telecomunicações ano 2017 salva em: results_6_1\table_sector_Telecomunicações_2017.csv
Tabela

NameError: name 'TABLES_6_15_TEXT' is not defined