# Feature selection and model training

In [None]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, make_scorer, mean_squared_error, r2_score
import joblib

In [None]:
# Step up one directory so the relative paths used in later cells resolve
# (presumably to the project root - confirm against the notebook's location).
# NOTE(review): re-running this cell moves up one more level each time.
os.chdir(os.pardir)
print(os.getcwd())

c:\Users\valen\Desktop\etl_workshop003


In [None]:
# Load the merged dataset; the path is relative to the current working directory.
df_model = pd.read_csv(filepath_or_buffer='data/merged_data.csv')

In [None]:
# Target column, plus the columns deliberately left out of the feature matrix.
target_column = 'happiness_score'
excluded_columns = [target_column, 'region', 'perceptions_of_corruption', 'generosity', 'year']

X = df_model.drop(columns=excluded_columns)
y = df_model[target_column]

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#### Data Preprocessing

***One-Hot Encoding for Categorical Features***

Standardize the numeric features and apply One-Hot Encoding to the categorical feature (`country`) so that every model input is in numerical format.

In [None]:
# Columns routed to each transformer. Columns not listed here are dropped,
# because ColumnTransformer is left at its default `remainder`.
numeric_features = ['gdp_per_capita', 'social_support', 'healthy_life_expectancy', 'freedom']
categorical_features = ['country']

# Numeric columns are standardized; the categorical column is one-hot encoded.
# handle_unknown='ignore' lets categories unseen during fit be encoded as
# all zeros instead of raising at transform time.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit on the training split only, then apply the fitted transformer to the test split.
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

In [None]:
# Candidate regressors, keyed by the label used in the printed reports.
models = dict(
    LinearRegression=LinearRegression(),
    RidgeCV=RidgeCV(alphas=[0.1, 1.0, 10.0]),
    RandomForest=RandomForestRegressor(n_estimators=100, random_state=42),
)

# Shuffled 5-fold splitter; the fixed seed gives every model the same folds.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Filled by the cross-validation cell: model label -> mean/std of each metric.
results = {}

In [None]:
# Cross-validate every candidate on the training split only; the held-out
# test split is not touched in this cell.
print("📊 VALIDACIÓN CRUZADA (solo en el 80% de entrenamiento):\n")

for name, model in models.items():
    # Wrapping the preprocessor in the pipeline means it is re-fitted on the
    # training part of each fold, never on the validation part.
    pipeline = Pipeline(steps=[('preprocessing', preprocessor), ('regressor', model)])

    print(f"🔎 Evaluando modelo: {name}")
    fold_scores = {"R2": [], "MAE": [], "RMSE": []}

    for fold_no, (train_idx, val_idx) in enumerate(cv.split(X_train), start=1):
        # cv.split yields positional indices, hence .iloc.
        pipeline.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
        y_val = y_train.iloc[val_idx]
        y_pred = pipeline.predict(X_train.iloc[val_idx])

        r2 = r2_score(y_val, y_pred)
        mae = mean_absolute_error(y_val, y_pred)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))

        fold_scores["R2"].append(r2)
        fold_scores["MAE"].append(mae)
        fold_scores["RMSE"].append(rmse)

        print(f" Fold {fold_no}: R2={r2:.4f}, MAE={mae:.4f}, RMSE={rmse:.4f}")

    # Collapse the per-fold scores into mean and standard deviation per metric.
    summary = {}
    for metric, values in fold_scores.items():
        summary[f"{metric}_mean"] = np.mean(values)
        summary[f"{metric}_std"] = np.std(values)
    results[name] = summary

print("\n📈 RESUMEN VALIDACIÓN CRUZADA:")
for name, res in results.items():
    print(f"\nModelo: {name}")
    for metric in ("R2", "MAE", "RMSE"):
        # Left-pad the metric label to four characters so the arrows line up.
        print(
            f" {metric:<4} → Promedio: {res[metric + '_mean']:.4f}"
            f" | Desviación: {res[metric + '_std']:.4f}"
        )



In [None]:
# Pick the candidate with the highest mean cross-validated R2.
best_model_name, _ = max(results.items(), key=lambda entry: entry[1]['R2_mean'])
best_model = models[best_model_name]
print(f"\n✅ Mejor modelo según validación cruzada: {best_model_name}")

In [None]:
# Rebuild the pipeline around the selected regressor and train it on the
# whole training split (cross-validation only ever fitted it on fold subsets).
final_steps = [('preprocessing', preprocessor), ('regressor', best_model)]
final_pipeline = Pipeline(steps=final_steps)
final_pipeline.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline once on the held-out test split.
y_pred_test = final_pipeline.predict(X_test)

r2_test = r2_score(y_test, y_pred_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
# RMSE is the square root of the mean squared error.
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print("\n📊 EVALUACIÓN FINAL en el 20% de TEST:")
for label, value in (("R2", r2_test), ("MAE", mae_test), ("RMSE", rmse_test)):
    print(f" {label}: {value:.4f}")

In [None]:
# Persist the fitted pipeline (preprocessing + regressor) to disk.
model_path = 'model/trained_model.pkl'

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(final_pipeline, model_path)

# Report the path that was actually written.
print(f"\n📦 Modelo '{best_model_name}' entrenado y guardado como '{model_path}'")