In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Load the training and test sets (adjust the paths if the CSV files live elsewhere).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

In [2]:
# Strip surrounding whitespace from the column names first, so every later step
# (imputation, drop, encoding) refers to the same cleaned names in both frames.
train_data.columns = train_data.columns.str.strip()
test_data.columns = test_data.columns.str.strip()

# Impute missing numeric values with the TRAINING means in both frames.
# The test set must be transformed with statistics learned from the training
# set only; using its own means would make train-time and inference-time
# preprocessing inconsistent. Keys of the means Series that are absent from a
# frame are simply ignored by fillna.
# NOTE(review): if the target column is numeric and has gaps in train, it is
# mean-filled here as well; consider dropping those rows instead.
train_numeric_means = train_data.select_dtypes(include=[np.number]).mean()
train_data = train_data.fillna(train_numeric_means)
test_data = test_data.fillna(train_numeric_means)

# Drop columns that are not used as features.
train_data = train_data.drop(columns=['Unnamed: 0', 'Country'])
test_data = test_data.drop(columns=['Unnamed: 0', 'Country'])

# One-hot encode 'Status' with the category levels observed in the training
# set. Sharing the levels guarantees that both frames end up with the same
# dummy columns and that drop_first removes the same level in each, even when
# a level is absent from one of the frames.
status_levels = sorted(train_data['Status'].dropna().unique())
train_data['Status'] = pd.Categorical(train_data['Status'], categories=status_levels)
test_data['Status'] = pd.Categorical(test_data['Status'], categories=status_levels)
train_data = pd.get_dummies(train_data, columns=['Status'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['Status'], drop_first=True)

In [3]:
def add_engineered_features(df):
    """Return a copy of ``df`` with the derived feature columns appended.

    Added columns, in order:
      * ``Log_GDP``: ``log(1 + GDP)``.
      * ``Infant_under5_ratio``: infant deaths divided by under-five deaths + 1.
      * ``Expenditure_per_GDP``: percentage expenditure divided by GDP + 1.
      * ``Socioeconomic_index``: income composition of resources times schooling.

    The input frame is not modified. Keeping the formulas in one place
    guarantees that train and test are always transformed identically.
    """
    return df.assign(
        Log_GDP=np.log1p(df['GDP']),
        # The +1 in the denominators presumably guards against division by zero.
        Infant_under5_ratio=df['infant deaths'] / (df['under-five deaths'] + 1),
        Expenditure_per_GDP=df['percentage expenditure'] / (df['GDP'] + 1),
        Socioeconomic_index=df['Income composition of resources'] * df['Schooling'],
    )


# Apply the same feature engineering to both data sets.
train_data = add_engineered_features(train_data)
test_data = add_engineered_features(test_data)

In [4]:
# Outlier removal based on the interquartile range (IQR).
def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the column used to detect outliers.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences. Must be
        non-negative.

    Returns
    -------
    pandas.DataFrame
        The retained rows, with their original index labels. Rows whose
        ``column`` value is NaN are dropped, because NaN never compares as
        being inside the fences.

    Raises
    ------
    ValueError
        If ``factor`` is negative.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    q1, q3 = df[column].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # between() is inclusive on both ends, matching `>= lower` and `<= upper`.
    return df[df[column].between(lower_bound, upper_bound)]

# Rank every column by its absolute correlation with "Life expectancy".
correlation_target = train_data.corr()['Life expectancy'].abs().sort_values(ascending=False)
# Take the 5 most correlated features. The target is excluded by label rather
# than by position, so it can never end up in the list (a positional slice
# silently assumes the target sorts first, which ties would break).
top_features = correlation_target.drop('Life expectancy').head(5).index.tolist()
print("Variables más correlacionadas:", top_features)

# Filter outliers one feature at a time; each pass recomputes the IQR fences
# on the rows that survived the previous pass.
# NOTE: this reassigns train_data from itself, so re-running the cell filters
# the already-filtered frame again.
for feature in top_features:
    train_data = remove_outliers(train_data, feature)

Variables más correlacionadas: ['Socioeconomic_index', 'Schooling', 'Adult Mortality', 'Income composition of resources', 'HIV/AIDS']


In [5]:
# Target vector and feature matrix (every column except the target).
y = train_data['Life expectancy']
X = train_data.drop(columns=['Life expectancy'])

# Hold out 20% of the rows for validation; the seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Hyperparameter search space for the random forest
# (4 * 4 * 4 * 3 = 192 candidate combinations).
param_grid = dict(
    n_estimators=[100, 150, 200, 250],
    max_depth=[15, 20, 25, 30],
    min_samples_split=[2, 4, 6, 10],
    min_samples_leaf=[1, 2, 3],
)

# Exhaustive 3-fold cross-validated search scored by R², using all CPU cores.
rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Keep the best estimator found by the search and report its hyperparameters.
best_rf = grid_search.best_estimator_
print("Mejores hiperparámetros:", grid_search.best_params_)

Fitting 3 folds for each of 192 candidates, totalling 576 fits
Mejores hiperparámetros: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [7]:
# Evaluate the tuned model on the held-out validation split.
y_pred = best_rf.predict(X_val)
r2 = r2_score(y_val, y_pred)
print(f'R² en el conjunto de validación: {r2:.4f}')

# Predict on the test set using exactly the columns the model was trained on,
# in the same order. Selecting by X_train.columns drops any extra test columns
# and raises a KeyError naming the missing ones if a feature is absent,
# instead of relying on the two frames happening to line up.
test_features = test_data[X_train.columns]
test_predictions = best_rf.predict(test_features)

R² en el conjunto de validación: 0.9114


In [8]:
# Build the Kaggle submission: one row per test prediction, with sequential
# IDs starting at 1.
n_rows = len(test_predictions)
submission = pd.DataFrame(
    {
        'ID': range(1, n_rows + 1),
        'Life expectancy': test_predictions,
    }
)

# Write the file without the DataFrame index and confirm on stdout.
submission.to_csv('submission_rf_refined.csv', index=False)
print("Archivo de predicciones generado: submission_rf_refined.csv")

Archivo de predicciones generado: submission_rf_refined.csv
