In [6]:
# Importar bibliotecas
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Load the happiness dataset from the project's data directory
data = pd.read_csv('../data/proccesed/final_happiness_data.csv')

# Numeric predictors used by this baseline model
feature_columns = [
    'GDP_per_Capita',
    'Healthy_life_expectancy',
    'Freedom',
    'Perceptions_of_corruption',
    'Generosity',
]

# Feature matrix (X) and regression target (y)
X = data[feature_columns]
y = data['Happiness_Score']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply them to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the linear regression on the scaled training features
model = LinearRegression().fit(X_train_scaled, y_train)

# Predict on the scaled test features and compute the evaluation metrics
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R²): {r2}")


Mean Squared Error (MSE): 0.36670545843598357
R-squared (R²): 0.706296235925179


In [None]:
# Importar bibliotecas
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# 1. Load the happiness dataset from the project's data directory
data = pd.read_csv('../data/proccesed/final_happiness_data.csv')

# Column groups, named once so the feature selection and the preprocessor stay in sync
categorical_columns = ['Country', 'Region']
numeric_columns = [
    'GDP_per_Capita',
    'Healthy_life_expectancy',
    'Freedom',
    'Perceptions_of_corruption',
    'Generosity',
]

# Feature matrix (X) and regression target (y)
X = data[categorical_columns + numeric_columns]
y = data['Happiness_Score']

# 2. Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# 3. Preprocessing: standardize the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' lets the encoder tolerate categories that were not seen during fit.
# NOTE(review): 'Country' is used as a feature; if the data holds several rows per country,
# a random split lets the model learn per-country offsets and may inflate the test score —
# confirm this is intended.
numeric_step = ('num', StandardScaler(), numeric_columns)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# 4. Chain the preprocessing and the regression model into a single estimator
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
])

# 5. Fit the preprocessing and the model together on the training split
pipeline.fit(X_train, y_train)

# 6. Predict on the raw test features (the pipeline applies the preprocessing itself)
y_pred = pipeline.predict(X_test)

# 7. Compute the evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R²): {r2}")


Mean Squared Error (MSE): 0.06785667973560428
R-squared (R²): 0.945651852740431


In [None]:
import os
import pickle

# Destination of the serialized pipeline (preprocessing + regression model)
model_path = '../models/final_happiness_model.pkl'

# Create the target directory first: open(..., 'wb') raises FileNotFoundError
# when the parent directory does not exist (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Save the trained pipeline to a .pkl file
with open(model_path, 'wb') as file:
    pickle.dump(pipeline, file)


In [14]:
import pickle

# Reload the persisted pipeline from the PKL file.
# NOTE(review): pickle.load can execute arbitrary code while deserializing; only load
# files from a trusted source — presumably this is the file written by the save cell.
with open('../models/final_happiness_model.pkl', 'rb') as model_file:
    pipeline = pickle.load(model_file)

# Predict on the held-out features (X_test and y_test come from the earlier split cell)
y_pred_test = pipeline.predict(X_test)

# Build a new frame holding the test features plus the predicted and actual scores;
# assign returns a copy, so X_test itself is left unchanged.
test_results = X_test.assign(
    Predicted_Happiness_Score=y_pred_test,
    Actual_Happiness_Score=y_test.values,
)

# Save the results without the index column so they can be sent to Kafka
test_results.to_csv('../data/test_results.csv', index=False)
