In [222]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV, Lasso
from scipy.stats import skew


In [223]:
def prepare_data(
    train: pd.DataFrame,
    test: pd.DataFrame,
    skew_threshold: float = 0.75,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series]:
    """Build model-ready feature matrices and a log-scaled target.

    The feature columns of ``train`` (every column except the first and the
    last) and of ``test`` (every column except the first) are stacked so that
    both splits receive identical transformations and end up with the same
    dummy columns. Presumably the dropped columns are the row id and the
    target; the code selects them purely by position.

    Processing steps, applied to the stacked frame:

    1. ``log1p`` every numeric column whose skewness exceeds
       ``skew_threshold``.
    2. Fill missing numeric values with the column mean (computed after
       step 1, over train and test together).
    3. One-hot encode the remaining columns with ``pd.get_dummies``.

    Args:
        train: Training table; must contain a ``"SalePrice"`` column.
        test: Test table with the same feature columns as ``train``.
        skew_threshold: Skewness above which a numeric column is
            log-transformed. Defaults to 0.75.

    Returns:
        A ``(train_features, test_features, target)`` tuple where the feature
        frames are the first ``len(train)`` and the remaining rows of the
        processed frame, and ``target`` is ``log1p(train["SalePrice"])``.

    Raises:
        KeyError: If ``train`` has no ``"SalePrice"`` column.
    """
    # Capture everything that depends on the raw training frame up front so
    # the split below does not rely on a rebound name.
    n_train = train.shape[0]
    target = np.log1p(train["SalePrice"])

    data = pd.concat((train.iloc[:, 1:-1], test.iloc[:, 1:]))

    # select_dtypes is used instead of `dtype != "object"` so that columns
    # with a dedicated string dtype are not mistaken for numeric ones.
    numeric_cols = data.select_dtypes(include="number").columns
    if len(numeric_cols) > 0:
        skewness = data[numeric_cols].apply(lambda col: skew(col.dropna()))
        skewed_cols = skewness[skewness > skew_threshold].index
        if len(skewed_cols) > 0:
            data[skewed_cols] = np.log1p(data[skewed_cols])

    # numeric_only=True is required: without it, DataFrame.mean raises on the
    # string columns that are still present at this point (pandas >= 2.0).
    data = data.fillna(data.mean(numeric_only=True))
    data = pd.get_dummies(data)

    return data.iloc[:n_train], data.iloc[n_train:], target


In [224]:
# Read the raw training and test tables from the local datasets directory.
train = pd.read_csv("datasets/train.csv")
test = pd.read_csv("datasets/test.csv")

# Turn the raw tables into feature matrices plus the log1p-scaled target.
X_train, X_test, y_train = prepare_data(train, test)

In [225]:
# Fit an L2-regularised linear model. Ridge.fit returns the estimator itself,
# so construction and fitting are chained into one expression.
model = Ridge(alpha=10).fit(X_train, y_train)

# Predictions come out on the same log1p scale as y_train.
y_test = model.predict(X_test)

In [226]:
# Map the log1p-scale predictions back to prices. The value is recomputed from
# the model instead of transforming y_test in place, so re-running this cell
# never applies expm1 twice.
y_test = np.expm1(model.predict(X_test))

# Pair each test-set Id with its predicted price and write the submission file.
submission = pd.DataFrame({"Id": test["Id"], "SalePrice": y_test})
submission.to_csv("datasets/submission.csv", index=False)
