# Feature Selection and Model Training

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, make_scorer, mean_squared_error, r2_score
import joblib

In [2]:
# Switch the working directory to the parent folder so that relative paths
# such as 'data/merged_data.csv' resolve (assumes the kernel starts inside a
# sub-folder of the project root — TODO confirm).
# The target is computed only once per kernel session: a bare os.chdir("../")
# climbs one more level every time this cell is re-run, which silently breaks
# every relative path used afterwards.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
os.chdir(PROJECT_ROOT)
print(os.getcwd())

c:\Users\valen\Desktop\etl_workshop003


In [3]:
# Read data/merged_data.csv into a DataFrame; the path is relative, so it is
# resolved against the current working directory.
df_model = pd.read_csv('data/merged_data.csv')

In [4]:
# Target: the happiness score column.
y = df_model['happiness_score']

# Predictors: every column except the target and the four columns that are
# left out of the model.
X = df_model.drop(
    columns=['happiness_score', 'region', 'perceptions_of_corruption', 'generosity', 'year']
)

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### Data Preprocessing

***One-Hot Encoding for Categorical Features***

Apply one-hot encoding to the categorical features to convert them into a numerical format, and standardize the numeric features so they share a common scale.

In [5]:
# Columns routed to each preprocessing branch.
categorical_features = ['country']
numeric_features = ['gdp_per_capita', 'social_support', 'healthy_life_expectancy', 'freedom']

# Standardize the numeric columns and one-hot encode the categorical one.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Fit on the training split only, then apply the fitted transform to the test split.
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

In [6]:
# Accumulator for the cross-validation summaries.
results = {}

# 5-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Candidate regressors to compare, keyed by display name.
models = dict(
    LinearRegression=LinearRegression(),
    RidgeCV=RidgeCV(alphas=[0.1, 1.0, 10.0]),
    RandomForest=RandomForestRegressor(n_estimators=100, random_state=42),
)

In [7]:
# Manual K-fold cross-validation of every candidate model, using only the
# training split. The printed messages previously contained mojibake (UTF-8
# text decoded with a single-byte code page); they are restored here.
print("📊 VALIDACIÓN CRUZADA (solo en el 80% de entrenamiento):\n")

for name, model in models.items():
    # Preprocessing lives inside the pipeline, so the scaler and the encoder
    # are re-fitted on each fold's training part and never see the
    # validation rows of that fold.
    pipeline = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('regressor', model)
    ])

    print(f"🔎 Evaluando modelo: {name}")
    r2s, maes, rmses = [], [], []  # per-fold metrics for this model

    for i, (train_idx, val_idx) in enumerate(cv.split(X_train)):
        # cv.split yields positional indices, hence .iloc rather than .loc.
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        pipeline.fit(X_tr, y_tr)
        y_pred = pipeline.predict(X_val)

        r2 = r2_score(y_val, y_pred)
        mae = mean_absolute_error(y_val, y_pred)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))

        r2s.append(r2)
        maes.append(mae)
        rmses.append(rmse)

        print(f" Fold {i+1}: R2={r2:.4f}, MAE={mae:.4f}, RMSE={rmse:.4f}")

    # Mean and standard deviation of each metric across the folds
    # (np.std defaults to the population standard deviation, ddof=0).
    results[name] = {
        "R2_mean": np.mean(r2s),
        "R2_std": np.std(r2s),
        "MAE_mean": np.mean(maes),
        "MAE_std": np.std(maes),
        "RMSE_mean": np.mean(rmses),
        "RMSE_std": np.std(rmses)
    }

# Per-model summary of the cross-validation metrics.
print("\n📈 RESUMEN VALIDACIÓN CRUZADA:")
for name, res in results.items():
    print(f"\nModelo: {name}")
    print(f" R2   → Promedio: {res['R2_mean']:.4f} | Desviación: {res['R2_std']:.4f}")
    print(f" MAE  → Promedio: {res['MAE_mean']:.4f} | Desviación: {res['MAE_std']:.4f}")
    print(f" RMSE → Promedio: {res['RMSE_mean']:.4f} | Desviación: {res['RMSE_std']:.4f}")


📊 VALIDACIÓN CRUZADA (solo en el 80% de entrenamiento):

🔎 Evaluando modelo: LinearRegression
 Fold 1: R2=0.9163, MAE=0.2065, RMSE=0.3229
 Fold 2: R2=0.9372, MAE=0.2014, RMSE=0.2926
 Fold 3: R2=0.9390, MAE=0.1923, RMSE=0.2739
 Fold 4: R2=0.9175, MAE=0.2093, RMSE=0.3270
 Fold 5: R2=0.8903, MAE=0.2397, RMSE=0.3630
🔎 Evaluando modelo: RidgeCV
 Fold 1: R2=0.9078, MAE=0.2243, RMSE=0.3389
 Fold 2: R2=0.9340, MAE=0.2082, RMSE=0.3000
 Fold 3: R2=0.9289, MAE=0.2079, RMSE=0.2957
 Fold 4: R2=0.9487, MAE=0.1814, RMSE=0.2579
 Fold 5: R2=0.9027, MAE=0.2359, RMSE=0.3420
🔎 Evaluando modelo: RandomForest
 Fold 1: R2=0.8270, MAE=0.3518, RMSE=0.4643
 Fold 2: R2=0.8547, MAE=0.3502, RMSE=0.4450
 Fold 3: R2=0.8380, MAE=0.3478, RMSE=0.4463
 Fold 4: R2=0.8637, MAE=0.3294, RMSE=0.4203
 Fold 5: R2=0.7780, MAE=0.3919, RMSE=0.5165

📈 RESUMEN VALIDACIÓN CRUZADA:

Modelo: LinearRegression
 R2   → Promedio: 0.9201 | Desviación: 0.0176
 MAE  → Promedio: 0.2098 | Desviación: 0.0160
 RMSE → Pro

In [8]:
# Select the candidate with the highest mean cross-validated R2.
best_model_name = max(results, key=lambda x: results[x]['R2_mean'])
# Same estimator instance that is stored in `models` (not a copy).
best_model = models[best_model_name]
# Message text restored from mojibake to the intended UTF-8 characters.
print(f"\n✅ Mejor modelo según validación cruzada: {best_model_name}")


✅ Mejor modelo según validación cruzada: RidgeCV


In [9]:
# Chain the shared preprocessor with the selected regressor.
final_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('regressor', best_model),
])

# Refit on the whole training split. The call is left as the cell's last
# expression so the notebook displays the fitted pipeline.
final_pipeline.fit(X_train, y_train)

In [10]:
# Score the refitted pipeline on the held-out test split.
y_pred_test = final_pipeline.predict(X_test)
r2_test = r2_score(y_test, y_pred_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

# Header text restored from mojibake; it has no placeholders, so the f-prefix
# is dropped.
print("\n📊 EVALUACIÓN FINAL en el 20% de TEST:")
print(f" R2: {r2_test:.4f}")
print(f" MAE: {mae_test:.4f}")
print(f" RMSE: {rmse_test:.4f}")


📊 EVALUACIÓN FINAL en el 20% de TEST:
 R2: 0.9502
 MAE: 0.1839
 RMSE: 0.2487


In [11]:
# Persist the fitted pipeline (preprocessing + regressor) to disk.
model_path = 'model/trained_model.pkl'
# joblib.dump does not create missing folders, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(final_pipeline, model_path)
# Report the path that was actually written; the previous message named
# 'modelo_entrenado.pkl', a file this cell never creates.
print(f"\n📦 Modelo '{best_model_name}' entrenado y guardado como '{model_path}'")


📦 Modelo 'RidgeCV' entrenado y guardado como 'modelo_entrenado.pkl'
