In [8]:
# ===============================================================
# 01_b_processing_v19_test.py
# ===============================================================
# Constrói dataset TEST v19 (para previsão final)
# Usa APENAS dados permitidos pelo professor (coaches, players_teams, teams)
# NUNCA usa rank, vitórias, pontos ou estatísticas do ano atual.
# ===============================================================

import pandas as pd
import numpy as np
import os

print(">>> [v19 TEST] A processar test set...")

# ---------------------------------------------------------------
# 1) Test-set inputs (files supplied by the professor)
# ---------------------------------------------------------------
players_teams_test = pd.read_csv("test/players_teams.csv")
teams_test = pd.read_csv("test/teams.csv")
# Loaded for completeness; kept only in case it is needed later.
coaches_test = pd.read_csv("test/coaches.csv")

# ---------------------------------------------------------------
# 2) Historical inputs (source of the T-1 information)
# ---------------------------------------------------------------
players_teams_hist = pd.read_csv("data/players_teams.csv")
teams_hist = pd.read_csv("data/teams.csv")
awards_hist = pd.read_csv("data/awards_players.csv")

# Force "year" to a plain integer column in every frame that has one.
# Values that cannot be parsed become missing and their rows are dropped
# in place before the final cast to int.
for frame in (
    players_teams_hist,
    players_teams_test,
    teams_hist,
    teams_test,
    awards_hist,
):
    if "year" not in frame.columns:
        continue
    parsed_year = pd.to_numeric(frame["year"], errors="coerce")
    frame["year"] = parsed_year.astype("Int64")
    frame.dropna(subset=["year"], inplace=True)
    frame["year"] = frame["year"].astype(int)

# The season to predict is read from the first row of the test teams file;
# the season right before it (T-1) is the one features are derived from.
test_year = teams_test["year"].iat[0]
prev_year = test_year - 1

print(f"Preparar previsão para o ano {test_year} (usando T-1 = {prev_year})")

# ===============================================================
# 3) Compute player_rating_prev from the historical data
# ===============================================================
cols_needed = ["playerID","year","points","rebounds","assists","steals","blocks",
               "turnovers","minutes","threeMade","threeAttempted","fgMade",
               "fgAttempted","PF","GP"]

# Any expected column that the historical file lacks is created as all zeros
# so the aggregation below never fails on a missing key.
missing_cols = [col for col in cols_needed if col not in players_teams_hist.columns]
for col in missing_cols:
    players_teams_hist[col] = 0

# One row per (player, season): every counting stat is summed across the
# rows that player has in that season.
summed_stats = [
    "minutes", "points", "rebounds", "assists", "steals", "blocks",
    "turnovers", "threeMade", "threeAttempted", "fgMade", "fgAttempted",
    "PF", "GP",
]
agg = players_teams_hist.groupby(["playerID", "year"], as_index=False).agg(
    **{stat: (stat, "sum") for stat in summed_stats}
)

# Rating: weighted box-score production divided by (minutes / 36 + 1).
rating_numerator = (
      agg["points"]
    + agg["rebounds"] * 1.2
    + agg["assists"] * 1.5
    + agg["steals"] * 3
    + agg["blocks"] * 3
    - agg["turnovers"] * 2
    - agg["PF"] * 0.5
)
rating_denominator = agg["minutes"] / 36 + 1
agg["player_rating"] = rating_numerator / rating_denominator

# Keep only player-seasons with more than 50 minutes played.
played_enough = agg["minutes"] > 50
agg = agg[played_enough]

# T-1 lookup table: one row per player who qualified in prev_year.
prev_map = agg.loc[
    agg["year"] == prev_year, ["playerID", "player_rating", "minutes"]
].rename(columns={"player_rating": "player_rating_prev", "minutes": "minutes_prev"})

# ===============================================================
# 4) Integrate T-1 awards
# ===============================================================
# Select prev_year awards, excluding "Coach of the Year", in one step and take
# an explicit copy: a new column is added below, and writing to a filtered
# slice of awards_hist triggers pandas' SettingWithCopyWarning.
is_prev_player_award = (awards_hist["year"] == prev_year) & (
    awards_hist["award"] != "Coach of the Year"
)
awards_prev = awards_hist.loc[is_prev_player_award].copy()

# Weight of each named award; any award not listed here counts as 1.0.
award_weights = {
    "Most Valuable Player": 5,
    "WNBA Finals Most Valuable Player": 4,
    "Defensive Player of the Year": 3,
    "Rookie of the Year": 2,
}

awards_prev["award_weight"] = awards_prev["award"].map(award_weights).fillna(1.0)

# Despite its name, "num_awards" is the SUM OF WEIGHTS per player, not a count.
award_counts = (
    awards_prev.groupby("playerID")["award_weight"]
    .sum().reset_index()
    .rename(columns={"award_weight":"num_awards"})
)

# Players without any award in prev_year get 0, which leaves their rating
# unchanged; every weighted award point boosts the rating by 30%.
prev_map = prev_map.merge(award_counts, on="playerID", how="left")
prev_map["num_awards"] = prev_map["num_awards"].fillna(0)
prev_map["player_rating_prev"] *= (1 + 0.3 * prev_map["num_awards"])

# ===============================================================
# 5) Test-season roster (from test/players_teams) with T-1 player info
# ===============================================================
# One row per (player, team, year) in the test roster, enriched with the
# player's T-1 rating, minutes and award score where available.
roster_test = players_teams_test[["playerID","tmID","year"]].drop_duplicates()
roster_test = roster_test.merge(prev_map, on="playerID", how="left")

# Players with no T-1 entry get the mean T-1 rating, zero minutes and zero
# awards.
mean_rating_prev = prev_map["player_rating_prev"].mean()

# Assign the filled columns back instead of calling
# `roster_test[col].fillna(..., inplace=True)`: that form is chained
# assignment, which pandas deprecates and which has no effect on the parent
# frame once Copy-on-Write is enabled.
roster_test["player_rating_prev"] = roster_test["player_rating_prev"].fillna(mean_rating_prev)
roster_test["minutes_prev"] = roster_test["minutes_prev"].fillna(0)
roster_test["num_awards"] = roster_test["num_awards"].fillna(0)

# ===============================================================
# 6️⃣ Construir features por equipa (somente as 4)
# ===============================================================
def build_team_features(df):
    """Summarise one team's roster into two features.

    Parameters
    ----------
    df : pandas.DataFrame
        Roster rows for a single team. The columns read are
        ``player_rating_prev``, ``minutes_prev`` and ``num_awards``.

    Returns
    -------
    pandas.Series
        ``avg_player_rating`` is the mean of ``player_rating_prev`` weighted
        by ``minutes_prev``; ``team_total_awards`` is the sum of
        ``num_awards``.
    """
    minutes = df["minutes_prev"]
    # A player with zero or missing minutes still counts, with weight 1.0.
    no_minutes = minutes.isna() | (minutes == 0)
    weights = np.where(no_minutes, 1.0, minutes)

    features = {
        "avg_player_rating": np.average(df["player_rating_prev"], weights=weights),
        "team_total_awards": df["num_awards"].sum(),
    }
    return pd.Series(features)

# Per-team features. Only the columns build_team_features reads are handed to
# apply(), and the group keys come back as ordinary columns via reset_index().
# This replaces `groupby(as_index=False).apply(...).reset_index()`, which left
# a stray "index" column in the result, plus a no-op drop of "level_2".
team_stats = (
    roster_test.groupby(["tmID","year"])[
        ["player_rating_prev", "minutes_prev", "num_awards"]
    ]
    .apply(build_team_features)
    .reset_index()
)

# ELITE RATIO T-1
# A player is "elite" when their T-1 rating is at or above the 90th percentile
# of all T-1 ratings in prev_map.
elite_cutoff = prev_map["player_rating_prev"].quantile(0.90)
prev_map["is_elite"] = (prev_map["player_rating_prev"] >= elite_cutoff).astype(int)

# Share of elite players per team. Roster players absent from prev_map get a
# missing is_elite after the left merge; the group mean skips missing values,
# so those players do not count towards the ratio.
elite_ratio = (
    roster_test.merge(prev_map[["playerID","is_elite"]], on="playerID", how="left")
    .groupby(["tmID","year"], as_index=False)["is_elite"]
    .mean()
    .rename(columns={"is_elite":"elite_ratio_prev"})
)

team_stats = team_stats.merge(elite_ratio, on=["tmID","year"], how="left")
# Assign back rather than `team_stats[col].fillna(0, inplace=True)`: chained
# inplace fillna is deprecated and does nothing under pandas Copy-on-Write.
team_stats["elite_ratio_prev"] = team_stats["elite_ratio_prev"].fillna(0)

# ===============================================================
# 7) margin_prev from the historical teams table
# ===============================================================
teams_hist = teams_hist.sort_values(["tmID", "year"])

# Per-row lag columns: on the row for season Y these hold the team's values
# from its previous row (season Y-1 when no season is skipped). They are kept
# on teams_hist unchanged for any code that relies on them.
teams_hist["o_pts_prev"] = teams_hist.groupby("tmID")["o_pts"].shift(1)
teams_hist["d_pts_prev"] = teams_hist.groupby("tmID")["d_pts"].shift(1)
teams_hist["GP_prev"] = teams_hist.groupby("tmID")["GP"].shift(1)

teams_hist["margin_prev"] = (
    teams_hist["o_pts_prev"] - teams_hist["d_pts_prev"]
) / teams_hist["GP_prev"].replace(0, np.nan)

# For the TEST season T the feature must be the margin OF season T-1.
# The lagged "margin_prev" stored on the prev_year row describes season T-2,
# so selecting it (as the previous version did) was off by one season.
# Compute the margin directly from the prev_year rows' own totals instead.
prev_season = teams_hist.loc[
    teams_hist["year"] == prev_year, ["tmID", "o_pts", "d_pts", "GP"]
]
teams_prev_year = prev_season[["tmID"]].copy()
# GP == 0 is turned into a missing value so the division cannot yield inf.
teams_prev_year["margin_prev"] = (
    prev_season["o_pts"] - prev_season["d_pts"]
) / prev_season["GP"].replace(0, np.nan)

# ===============================================================
# 8) Final merge
# ===============================================================
# Start from the test teams, attach the roster-based features on
# (tmID, year) and the previous-season margin on tmID. Left joins keep every
# test team even when a feature is unavailable.
test_ready = (
    teams_test
    .merge(team_stats, on=["tmID","year"], how="left")
    .merge(teams_prev_year, on="tmID", how="left")
)

# Every value still missing after the merges is replaced by 0.
test_ready = test_ready.fillna(0)

# ===============================================================
# 9) Save the final TEST dataset
# ===============================================================
final_cols = [
    "tmID",
    "year",
    "confID",
    "margin_prev",
    "avg_player_rating",
    "team_total_awards",
    "elite_ratio_prev",
]

final_test = test_ready.loc[:, final_cols].copy()

out_path = "a/teams_test_v19.csv"
os.makedirs("a", exist_ok=True)  # make sure the output folder exists
final_test.to_csv(out_path, index=False)

print(f"\n✅ TEST v19 criado com sucesso → {out_path}")


>>> [v19 TEST] A processar test set...


FileNotFoundError: [Errno 2] No such file or directory: 'test/players_teams.csv'

In [9]:
# ===============================================================
# 04_a_predict_v19.py  (VERSÃO CORRETA)
# ===============================================================

import pandas as pd
import numpy as np
import pickle
import os

print("\n>>> [v19 TEST] Prever ranking do test set...\n")

# ---------------------------------------------------------------
# 1) Load the trained model
# ---------------------------------------------------------------
MODEL_PATH = "a/model_v19.pkl"
# NOTE(review): unpickling can execute arbitrary code; only load model files
# produced by this project.
with open(MODEL_PATH, "rb") as model_file:
    model = pickle.load(model_file)

print("✔ Modelo carregado.")

# ---------------------------------------------------------------
# 2) Load the already-processed test set
# ---------------------------------------------------------------
TEST_PATH = "a/teams_test_v19.csv"

# Fail early, with an explicit message, when the processed file is missing.
if os.path.exists(TEST_PATH):
    test = pd.read_csv(TEST_PATH)
else:
    raise FileNotFoundError("❌ ERRO: 'teams_test_v19.csv' não existe.\nCorre o script 02 primeiro.")

print("✔ Test set carregado.")
print(test.head())

# ---------------------------------------------------------------
# 3) Feature matrix
# ---------------------------------------------------------------
X_COLS = [
    "margin_prev",
    "avg_player_rating",
    "team_total_awards",
    "elite_ratio_prev",
]
X = test.loc[:, X_COLS]

# ---------------------------------------------------------------
# 4) Prediction
# ---------------------------------------------------------------
test["pred_raw"] = model.predict(X)

# Rank inside each conference: the lowest raw prediction gets rank 1, and
# tied predictions share the lowest rank of their group (method="min").
raw_by_conference = test.groupby("confID")["pred_raw"]
test["rank_pred"] = raw_by_conference.rank(method="min", ascending=True)

# ---------------------------------------------------------------
# 5) Save
# ---------------------------------------------------------------
OUTPUT_PATH = "a/predictions_test_v19.csv"

output_columns = ["tmID", "confID", "year", "rank_pred"]
OUTPUT = test[output_columns].sort_values(by=["confID", "rank_pred"])
OUTPUT.to_csv(OUTPUT_PATH, index=False)

print("\n✔ Previsões criadas com sucesso!")
print("Guardado em:", OUTPUT_PATH)
print("\nPreview:")
print(OUTPUT)



>>> [v19 TEST] Prever ranking do test set...

✔ Modelo carregado.


FileNotFoundError: ❌ ERRO: 'teams_test_v19.csv' não existe.
Corre o script 02 primeiro.