In [14]:
# ===========================================
# Improved Regression Pipeline with Ensemble
# ===========================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# 1. Load Data
# Read the training table, the test table and the submission template.
# The paths are relative, so the files are looked up in the current
# working directory.
train, test, sample_submission = map(
    pd.read_csv,
    ("train.csv", "test.csv", "sample_submission.csv"),
)

# 2. Separate features and target
# "id" is treated as a row identifier rather than a predictor, so it is
# removed from BOTH frames when present. Dropping it only from the test
# frame would leave train and test with different feature columns and make
# scaler.transform() fail.
y = train["target"]
X = train.drop(columns=["target"]).drop(columns=["id"], errors="ignore")
test_features = test.drop(columns=["id"], errors="ignore")

# Fail early, with a readable message, if the two frames do not describe the
# same features; then put the test columns in the training column order.
mismatched_columns = X.columns.symmetric_difference(test_features.columns)
if len(mismatched_columns) > 0:
    raise ValueError(
        "train/test feature columns differ: "
        f"{sorted(map(str, mismatched_columns))}"
    )
test_features = test_features[X.columns]

# 3. Train-validation split
# Split BEFORE scaling so the validation rows never influence the scaler's
# mean/std. Fitting the scaler on all rows first leaks validation statistics
# into training and makes the validation RMSE optimistic.
X_train_raw, X_valid_raw, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4. Scale features
# Fit on the training split only, then apply the same transform everywhere.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_valid = scaler.transform(X_valid_raw)
test_scaled = scaler.transform(test_features)

# Full training matrix in the same scaled space. Nothing below in this
# section needs it; it is kept so code that still refers to X_scaled
# continues to work.
X_scaled = scaler.transform(X)

# 5. Define models
# One linear baseline plus two tree ensembles. Both tree models use a fixed
# random_state so repeated runs build the same trees.
linear_model = LinearRegression()
forest_model = RandomForestRegressor(
    n_estimators=300,
    max_depth=15,
    random_state=42,
)
boosting_model = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
)

# Insertion order matters only for the order of the printed report.
models = {
    "LinearRegression": linear_model,
    "RandomForest": forest_model,
    "GradientBoosting": boosting_model,
}

# 6. Train & Validate
# Fit every model on the training split, report its RMSE on the validation
# split, and keep the validation predictions for the ensemble below.
valid_preds = {}
for label, estimator in models.items():
    estimator.fit(X_train, y_train)
    holdout_pred = estimator.predict(X_valid)
    holdout_mse = mean_squared_error(y_valid, holdout_pred)
    print(f"{label} RMSE: {np.sqrt(holdout_mse):.4f}")
    valid_preds[label] = holdout_pred

# 7. Ensemble (average predictions)
# Stack the per-model predictions as rows and take the unweighted mean of
# each column, i.e. one averaged prediction per validation row.
stacked_valid = np.vstack(list(valid_preds.values()))
ensemble_valid = stacked_valid.mean(axis=0)
ensemble_mse = mean_squared_error(y_valid, ensemble_valid)
ensemble_rmse = np.sqrt(ensemble_mse)
print(f"\nEnsemble RMSE: {ensemble_rmse:.4f}")

# 8. Final prediction on test set (average of models)
# Each already-fitted model predicts the scaled test matrix; the final
# prediction is the unweighted per-row mean across models.
test_preds = [fitted.predict(test_scaled) for fitted in models.values()]
final_test_pred = np.mean(test_preds, axis=0)

# 9. Create submission
# assign() returns a new frame, so the loaded template is left untouched;
# the "target" column is set to the averaged test predictions.
submission = sample_submission.assign(target=final_test_pred)
submission.to_csv("submission.csv", index=False)

# Confirm the write and show the first rows as a quick sanity check.
print("\n✅ Improved submission file saved as submission.csv")
print(submission.head())


LinearRegression RMSE: 17.2640
RandomForest RMSE: 20.6356
GradientBoosting RMSE: 12.3238

Ensemble RMSE: 15.1598

✅ Improved submission file saved as submission.csv
   id      target
0   1  190.173973
1   2  191.086382
2   3  174.356416
3   4  179.858810
4   5  169.728403
