In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# End-to-end baseline: load the training data, validate a linear model on a
# hold-out fold, then refit on every training row and write test predictions
# to submission.csv.

train_df = pd.read_csv("train.csv")

# Numeric columns used as model inputs.
features = [
    'GrLivArea',
    'BedroomAbvGr',
    'BsmtFullBath',
    'BsmtHalfBath',
    'FullBath',
    'HalfBath'
]

# Rows without a target cannot be used for supervised training.
train_df = train_df.dropna(subset=['SalePrice'])
# Keep only rows with GrLivArea below 4500.
# NOTE(review): presumably this removes extreme-size outliers -- confirm the
# threshold against the data description.
train_df = train_df[train_df['GrLivArea'] < 4500]

# The model is trained on log1p(SalePrice); predictions are mapped back with
# expm1 before being written out.
y_train = np.log1p(train_df['SalePrice'])

# --- Validation -------------------------------------------------------------
# Split the raw rows first so that the imputation medians and the scaler
# statistics applied to the validation fold are computed from the training
# fold only (fitting them on all rows would leak validation information).
X_tr_raw, X_val_raw, y_tr, y_val = train_test_split(
    train_df[features], y_train, test_size=0.2, random_state=42
)

fold_medians = X_tr_raw.median()
val_scaler = StandardScaler()
X_tr = val_scaler.fit_transform(X_tr_raw.fillna(fold_medians))
X_val = val_scaler.transform(X_val_raw.fillna(fold_medians))

val_model = LinearRegression()
val_model.fit(X_tr, y_tr)

val_preds = val_model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
print(f"Validation RMSE (log scale): {rmse:.4f}")

# --- Final fit --------------------------------------------------------------
# Refit preprocessing and the model on all cleaned training rows so the
# submission model uses every labelled example, not just the 80% fold.
train_medians = train_df[features].median()
X_train = train_df[features].fillna(train_medians)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = LinearRegression()
model.fit(X_train_scaled, y_train)

# --- Test predictions -------------------------------------------------------
test_df = pd.read_csv("test.csv")
# Missing test values are filled with the training medians, and the scaler
# fitted on the training data is reused (never refit on the test set).
X_test = test_df[features].fillna(train_medians)
X_test_scaled = scaler.transform(X_test)

test_preds_log = model.predict(X_test_scaled)
test_preds = np.expm1(test_preds_log)  # invert the log1p target transform

submission = pd.DataFrame({
    "Id": test_df["Id"],
    "SalePrice": test_preds
})
submission.to_csv("submission.csv", index=False)
print("submission.csv file has been created with predicted SalePrices.")

Validation RMSE (log scale): 0.2536
submission.csv file has been created with predicted SalePrices.
