In [106]:
import warnings

import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.base import BaseEstimator
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


In [107]:
def prepare_data(
    train: pd.DataFrame, test: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series]:
    """Build model-ready feature matrices and the log-transformed target.

    The feature columns of ``train`` and ``test`` are stacked so that both
    frames receive exactly the same preprocessing:

    1. numeric columns whose skewness exceeds 0.75 are replaced by ``log1p``
       of their values;
    2. missing numeric values are filled with the column mean computed over
       the stacked data;
    3. the remaining non-numeric columns are one-hot encoded with
       ``pd.get_dummies`` (a missing category becomes an all-zero row).

    Parameters
    ----------
    train:
        Training frame. Its first and last columns are excluded from the
        features (presumably the identifier and the target -- confirm against
        the dataset layout). It must contain a ``"SalePrice"`` column.
    test:
        Test frame. Its first column is excluded from the features.

    Returns
    -------
    tuple
        ``(train_features, test_features, y)`` where the two feature frames
        share the same columns and ``y`` is ``log1p(train["SalePrice"])``.
    """
    # Capture the split point once, before anything is re-sliced.
    n_train = train.shape[0]
    data = pd.concat((train.iloc[:, 1:-1], test.iloc[:, 1:]))

    # Select by numeric kind rather than "not object": bool or string-typed
    # columns must not be fed to the skewness computation.
    numeric_feats = data.select_dtypes(include="number").columns
    skewness = pd.Series(
        {col: skew(data[col].dropna()) for col in numeric_feats}, dtype=float
    )
    skewed_feats = skewness[skewness > 0.75].index
    if len(skewed_feats) > 0:
        data[skewed_feats] = np.log1p(data[skewed_feats])

    # numeric_only=True is required: the frame still holds categorical columns
    # here, and recent pandas raises instead of silently skipping them.
    data = data.fillna(data.mean(numeric_only=True))
    data = pd.get_dummies(data)

    y = np.log1p(train["SalePrice"])
    # Positional slicing: the stacked frame carries duplicate index labels.
    return data.iloc[:n_train], data.iloc[n_train:], y

from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

def evaluate(
    model: BaseEstimator,
    name: str,
    X: pd.DataFrame,
    y: pd.Series,
    reps: int = 100,
    random_state=None,
) -> float:
    """Estimate a model's hold-out RMSE by repeated random 80/20 splits.

    For each repetition the data is split with ``test_size=0.2``, ``model`` is
    refit on the larger part and scored on the held-out part. The mean RMSE
    over all repetitions is printed (scaled by 100 and suffixed with ``%``)
    together with ``model.alpha_`` when the fitted model exposes it.

    ``ConvergenceWarning`` is silenced for the duration of the call.

    Parameters
    ----------
    model:
        Estimator exposing ``fit`` and ``predict``. It is refit in place on
        every repetition.
    name:
        Label printed in front of the score.
    X, y:
        Features and target to split.
    reps:
        Number of random splits to average over; must be at least 1.
    random_state:
        Optional integer seed. When given, repetition ``i`` uses the seed
        ``random_state + i`` so the whole evaluation is reproducible and every
        model can be scored on identical splits. ``None`` (the default) keeps
        the unseeded behaviour.

    Returns
    -------
    float
        The RMSE averaged over ``reps`` splits.

    Raises
    ------
    ValueError
        If ``reps`` is smaller than 1.
    """
    if reps < 1:
        raise ValueError(f"reps must be a positive integer, got {reps!r}")

    rmse_sum = 0
    # Public stdlib equivalent of sklearn's private ignore_warnings helper.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=ConvergenceWarning)
        for i in range(reps):
            # A distinct seed per repetition keeps the splits different from
            # one another while remaining reproducible.
            seed = None if random_state is None else random_state + i
            X_fit, X_val, y_fit, y_val = train_test_split(
                X, y, test_size=0.2, random_state=seed
            )
            model.fit(X_fit, y_fit)
            y_pred = model.predict(X_val)
            rmse_sum += np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_avg = rmse_sum / reps

    print(
        name,
        "\trmse=",
        "{:.3f}%".format(rmse_avg * 100),
        " alpha=",
        model.alpha_ if hasattr(model, "alpha_") else "null",
    )
    return rmse_avg


In [108]:
# Read the raw CSVs, then derive the model-ready feature matrices and the
# log-transformed training target from them.
test = pd.read_csv("datasets/test.csv")
train = pd.read_csv("datasets/train.csv")
X_train, X_test, y_train = prepare_data(train=train, test=test)

In [109]:
from sklearn.linear_model import (
    Ridge,
    RidgeCV,
    ElasticNet,
    ElasticNetCV,
    LassoCV,
    LassoLars,
    LassoLarsCV,
    Lasso,
)

# Candidate estimators, each paired with the label printed by evaluate().
models = [
    (Ridge(alpha=10), "Ridge a=10"),
    (LassoLars(alpha=0.000258), "LassoLars a=0.000258"),
]

# Keep the candidate with the lowest average hold-out RMSE. Starting from
# +inf (rather than an arbitrary threshold of 1) guarantees a winner is picked
# whatever the scale of the error.
rmse_min = float("inf")
best_model = None
for model, name in models:
    rmse = evaluate(model, name, X_train, y_train)
    if rmse < rmse_min:
        rmse_min = rmse
        best_model = model

if best_model is None:
    # Only reachable when the list is empty or every score is NaN/inf.
    raise RuntimeError("no candidate model produced a finite RMSE")

# evaluate() leaves each model fitted on its last random split, so refit the
# winner on the full training set before predicting on the test features.
best_model.fit(X_train, y_train)
y_test = best_model.predict(X_test)


Ridge a=10 	rmse= 13.128%  alpha= null
LassoLars a=0.000258 	rmse= 12.927%  alpha= null


In [110]:
# The model is trained on log1p(SalePrice), so predictions are mapped back
# with expm1. They are recomputed from best_model instead of transforming
# y_test in place: re-running this cell must not apply expm1 a second time.
y_test = np.expm1(best_model.predict(X_test))
submission = pd.DataFrame({"Id": test["Id"], "SalePrice": y_test})
submission.to_csv("datasets/submission.csv", index=False)
