In [None]:
import pandas as pd, numpy as np, itertools
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.base import clone

In [None]:
# Load the dataset from a CSV file (relative path, resolved against the working directory)
df = pd.read_csv("base_fermentacao_suja_5000.csv")

In [None]:
# Clean the dataset: coerce every measured column to numeric (unparseable
# entries become NaN) and turn +/-inf into NaN as well.
cols = ["Tempo (h)", "Temperatura (°C)", "pH", "Glicose (g/L)", "Agitação (rpm)", "Rendimento Etanol (%)"]
df[cols] = df[cols].apply(pd.to_numeric, errors="coerce")
df = df.replace([np.inf, -np.inf], np.nan)

# Keep rows whose target is present and whose pH, glucose and time fall inside
# the accepted ranges. Comparisons against NaN are False, so rows with missing
# pH or glucose are removed here too.
df_limpa = (
    df.dropna(subset=["Tempo (h)"])  # the target must not be NaN
    .loc[
        lambda d: d["pH"].between(3.5, 6.0)
        & (d["Glicose (g/L)"] <= 300)
        & d["Tempo (h)"].between(12, 48)
    ]
)

In [None]:
# Build the design matrix (predictors) and the regression target "Tempo (h)",
# then hold out 30% of the rows as a test set (fixed seed for reproducibility).
features = [
    "Temperatura (°C)",
    "pH",
    "Glicose (g/L)",
    "Agitação (rpm)",
    "Rendimento Etanol (%)",
]
X = df_limpa.loc[:, features].copy()
y = df_limpa["Tempo (h)"].values

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.30,
)

In [None]:
# Shared 5-fold splitter (shuffled, fixed seed) so every candidate feature
# subset is scored on exactly the same folds.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Preprocessing + ordinary least squares pipeline used as the template for
# every fit in this notebook.
base_est = Pipeline(steps=[
    # keep_empty_features=True has to live on the FIRST imputer: without it the
    # median imputer drops any column that is entirely NaN in the fitted data,
    # which changes the number of columns (or leaves none) before the later
    # steps can do anything about it. With the flag, such a column is kept and
    # filled with 0.
    ("imp_med", SimpleImputer(strategy="median", keep_empty_features=True)),
    # No NaN reaches this step after the median imputation; it is kept as a
    # pass-through so the pipeline's step names stay unchanged.
    ("imp_const", SimpleImputer(strategy="constant", fill_value=0, keep_empty_features=True)),
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("linreg", LinearRegression())
])

def rmse_cv(cols):
    """Return the mean cross-validated RMSE of the base pipeline on a feature subset.

    Parameters
    ----------
    cols : list of str
        Column names of ``X_train`` to use as predictors.

    Returns
    -------
    float
        Average over the folds of ``cv`` of the per-fold RMSE (square root of
        each fold's MSE).

    Raises
    ------
    Exception
        Any error raised while fitting or scoring a fold is propagated
        (``error_score="raise"``).
    """
    fold_neg_mse = cross_val_score(
        base_est,
        X_train[cols],
        y_train,
        scoring="neg_mean_squared_error",
        cv=cv,
        error_score="raise",
    )
    # The scorer returns negated MSE values; flip the sign before the root.
    fold_rmse = np.sqrt(-fold_neg_mse)
    return fold_rmse.mean()

def try_rmse_cv(cols, verbose=False):
    """Best-effort wrapper around ``rmse_cv`` that never raises.

    Parameters
    ----------
    cols : list of str
        Feature subset to evaluate.
    verbose : bool, default False
        When True, print a warning describing why the evaluation failed.

    Returns
    -------
    float
        The cross-validated RMSE, or ``np.inf`` when the evaluation raised or
        produced a NaN/infinite value, so failed subsets always rank last.
    """
    try:
        score = rmse_cv(cols)
        # Treat a non-finite score exactly like a failed evaluation.
        if not np.isfinite(score):
            raise ValueError("rmse_cv retornou NaN/Inf")
        return float(score)
    except Exception as err:
        if verbose:
            print(f"[WARN] Falha no CV para cols={cols}: {err}")
    return np.inf

In [None]:
# ===== Best Subset =====
# Exhaustively score every non-empty combination of features (smallest subsets
# first) and keep the one with the lowest cross-validated RMSE. Only a strictly
# lower score replaces the incumbent, so the earliest subset wins ties.
best_subset_result = {"features": None, "rmse_cv": np.inf}
all_results_subset = []
subset_sizes = range(1, len(features) + 1)
for comb in itertools.chain.from_iterable(
    itertools.combinations(features, k) for k in subset_sizes
):
    score = try_rmse_cv(list(comb))
    all_results_subset.append((comb, score))
    if score < best_subset_result["rmse_cv"]:
        best_subset_result = {"features": list(comb), "rmse_cv": score}

# Fallback: no subset produced a finite score, so re-score the single features
# verbosely (to surface the failure reasons) and take the lowest-ranked one.
if best_subset_result["features"] is None:
    singles = sorted(
        ((f, try_rmse_cv([f], verbose=True)) for f in features),
        key=lambda t: t[1],
    )
    top_feature, top_rmse = singles[0]
    best_subset_result = {"features": [top_feature], "rmse_cv": top_rmse}
# ===== Forward (seeded) =====
# Candidates are held in a list in `features` order rather than a set. Set
# iteration order for strings depends on hash randomisation and can change
# between kernel restarts, which would make the seed choice on ties (and the
# pick among near-ties inside the 1e-6 tolerance) non-reproducible.
remaining = list(features)

# Seed the selection with the single feature that has the lowest CV RMSE.
single_scores = [(f, try_rmse_cv([f])) for f in remaining]
single_scores.sort(key=lambda t: t[1])  # stable sort: ties keep `features` order
forward_selected = [single_scores[0][0]]
best_score = single_scores[0][1]
remaining.remove(forward_selected[0])

# Greedily add the feature that lowers the CV RMSE the most; stop as soon as
# no candidate improves the score by more than the 1e-6 tolerance.
improved = True
while improved and remaining:
    improved = False
    cand_best = None
    cand_score = best_score
    for f in remaining:
        cols = forward_selected + [f]
        score = try_rmse_cv(cols)
        if score < cand_score - 1e-6:
            cand_score, cand_best, improved = score, f, True
    if improved:
        forward_selected.append(cand_best)
        remaining.remove(cand_best)
        best_score = cand_score

forward_result = {"features": forward_selected, "rmse_cv": best_score}

# ===== Backward (resilient) =====
# Start from the full feature set.
current = features.copy()
best_score = try_rmse_cv(current)

# Recovery phase: while the current set cannot be scored (non-finite RMSE),
# replace it with the best-scoring set obtained by leaving one feature out.
while len(current) > 1 and not np.isfinite(best_score):
    candidates = [
        (reduced, try_rmse_cv(reduced))
        for reduced in ([c for c in current if c != f] for f in current)
    ]
    # min() returns the first minimum, matching a stable sort's first element.
    current, best_score = min(candidates, key=lambda t: t[1])

# Elimination phase: repeatedly drop the feature whose removal lowers the CV
# RMSE the most, as long as the gain is at least the 1e-6 tolerance.
while len(current) > 1:
    best_drop, cand_score = None, best_score
    for f in tuple(current):
        trial_cols = [c for c in current if c != f]
        trial_score = try_rmse_cv(trial_cols)
        if trial_score <= cand_score - 1e-6:
            best_drop, cand_score = f, trial_score
    if not best_drop:
        break  # no removal helped: stop
    current.remove(best_drop)
    best_score = cand_score

backward_result = {"features": current, "rmse_cv": best_score}

# Report the feature set and CV RMSE chosen by each selection strategy
print()
for label, result in (
    ("[BestSubset]", best_subset_result),
    ("[Forward]", forward_result),
    ("[Backward]", backward_result),
):
    print(label, result)



[BestSubset] {'features': ['Temperatura (°C)', 'pH', 'Glicose (g/L)', 'Agitação (rpm)', 'Rendimento Etanol (%)'], 'rmse_cv': 5.8244295461680125}
[Forward] {'features': ['Rendimento Etanol (%)', 'Temperatura (°C)', 'Glicose (g/L)', 'Agitação (rpm)', 'pH'], 'rmse_cv': 5.8244295461680125}
[Backward] {'features': ['Temperatura (°C)', 'pH', 'Glicose (g/L)', 'Agitação (rpm)', 'Rendimento Etanol (%)'], 'rmse_cv': 5.8244295461680125}


In [None]:
def fit_and_eval(cols, name):
    """Fit a fresh clone of the base pipeline on the training split and score it on the test split.

    Parameters
    ----------
    cols : list of str
        Feature columns to use; must be non-empty.
    name : str
        Label stored under the ``"modelo"`` key of the result.

    Returns
    -------
    dict
        Keys ``"modelo"``, ``"variaveis"``, ``"RMSE_test"``, ``"MAE_test"``
        and ``"R2_test"``.

    Raises
    ------
    AssertionError
        If ``cols`` is empty.
    """
    assert cols and len(cols) > 0, "Lista de variáveis está vazia."

    model = clone(base_est).fit(X_train[cols], y_train)
    y_hat = model.predict(X_test[cols])

    # RMSE is the square root of the MSE (no 'squared' keyword is passed).
    rmse = float(mean_squared_error(y_test, y_hat) ** 0.5)

    metrics = {
        "modelo": name,
        "variaveis": cols,
        "RMSE_test": rmse,
        "MAE_test": mean_absolute_error(y_test, y_hat),
        "R2_test": r2_score(y_test, y_hat),
    }
    return metrics

# Score on the hold-out set the feature set chosen by each selection strategy
res_best = fit_and_eval(best_subset_result["features"], "Best Subset")
res_forward = fit_and_eval(forward_result["features"], "Forward")
res_backward = fit_and_eval(backward_result["features"], "Backward")

# Rank the three strategies by test RMSE (lowest first)
comparacao = pd.DataFrame([res_best, res_forward, res_backward]).sort_values("RMSE_test")
print("\n=== Comparação (ordem por RMSE_test) ===")
print(comparacao[["modelo", "RMSE_test", "MAE_test", "R2_test", "variaveis"]])

# Refit the winning configuration and keep its predictions and residuals
melhor = comparacao.iloc[0]
cols_melhor = melhor["variaveis"]
est_melhor = clone(base_est).fit(X_train[cols_melhor], y_train)
y_pred = est_melhor.predict(X_test[cols_melhor])
residuos = y_test - y_pred
print(f"\nModelo vencedor: {melhor['modelo']} | Variáveis: {cols_melhor}")
# The MAE figure must come from mean_absolute_error; the square root of the MSE
# is the RMSE and would just repeat the first number under the wrong label.
print(f"RMSE={melhor['RMSE_test']:.2f} | MAE={mean_absolute_error(y_test, y_pred):.2f} | R2={r2_score(y_test, y_pred):.3f}")


=== Comparação (ordem por RMSE_test) ===
        modelo  RMSE_test  MAE_test   R2_test  \
0  Best Subset     5.8753   4.72845  0.683474   
1      Forward     5.8753   4.72845  0.683474   
2     Backward     5.8753   4.72845  0.683474   

                                           variaveis  
0  [Temperatura (°C), pH, Glicose (g/L), Agitação...  
1  [Rendimento Etanol (%), Temperatura (°C), Glic...  
2  [Temperatura (°C), pH, Glicose (g/L), Agitação...  

Modelo vencedor: Best Subset | Variáveis: ['Temperatura (°C)', 'pH', 'Glicose (g/L)', 'Agitação (rpm)', 'Rendimento Etanol (%)']
RMSE=5.88 | MAE=5.88 | R2=0.683


In [None]:
# Refit the final model on the selected variables
final_est = clone(base_est).fit(X_train[cols_melhor], y_train)

# Pull out the fitted regression and scaler steps
lin = final_est.named_steps["linreg"]
scaler = final_est.named_steps["scaler"]

coef_pad = lin.coef_
intercept_pad = lin.intercept_
scales = scaler.scale_
means = scaler.mean_

# The coefficients are paired with the variable names by position below. If a
# preprocessing step removed a column (e.g. an imputer dropping an all-NaN
# feature), zip() would silently truncate and attach coefficients to the wrong
# names, so fail loudly instead.
if len(coef_pad) != len(cols_melhor):
    raise ValueError(
        f"Número de coeficientes ({len(coef_pad)}) difere do número de variáveis "
        f"({len(cols_melhor)}); o pipeline descartou colunas."
    )

# Undo the standardisation:
#   y = b0 + sum_j c_j * (x_j - mean_j) / scale_j
#   => beta_j = c_j / scale_j  and  intercept = b0 - sum_j beta_j * mean_j
betas = coef_pad / scales
intercept = intercept_pad - np.sum(betas * means)

# Print the fitted equation in terms of the original variables
print("\nFunção do modelo ajustado:")
print(f"Tempo (h) = {intercept:.3f}", end="")
for var, b in zip(cols_melhor, betas):
    print(f" + ({b:.3f} * {var})", end="")
print()


Função do modelo ajustado:
Tempo (h) = 10.908 + (-0.344 * Temperatura (°C)) + (-0.368 * pH) + (-0.027 * Glicose (g/L)) + (-0.010 * Agitação (rpm)) + (0.925 * Rendimento Etanol (%))
