In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import xgboost as xgb

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

test_ID = test["Id"]
y = train["SalePrice"]
train.drop(["SalePrice", "Id"], axis=1, inplace=True)
test.drop("Id", axis=1, inplace=True)

data = pd.concat([train, test], axis=0)

data.fillna(data.mean(numeric_only=True), inplace=True)
for col in data.select_dtypes(include='object').columns:
    data[col] = data[col].fillna(data[col].mode()[0])

for col in data.select_dtypes(include='object').columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

X_train = data[:len(y)]
X_test = data[len(y):]

xgb_model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
xgb_model.fit(X_train, y)

ridge_model = Ridge(alpha=10)
ridge_model.fit(X_train, y)

xgb_preds = xgb_model.predict(X_test)
ridge_preds = ridge_model.predict(X_test)

final_preds = 0.8 * xgb_preds + 0.2 * ridge_preds

submission = pd.DataFrame({
    "Id": test_ID,
    "SalePrice": final_preds
})
submission = submission.sort_values(by="Id")
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv created successfully (sorted by Id)")


✅ submission.csv created successfully (sorted by Id)


In [17]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)
rmse = np.sqrt(-cross_val_score(
    xgb_model, X_train, y,
    scoring="neg_mean_squared_error",
    cv=cv
))

print(f"📈 Cross-Validated RMSE: {rmse.mean():.4f}")


📈 Cross-Validated RMSE: 27992.5172
