In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from xgboost import XGBRegressor

In [2]:
# Load the raw training table from train.csv (relative path, resolved against the working directory).
df = pd.read_csv("train.csv")

In [3]:
# Feature matrix: every column of df except the SalePrice target.
# NOTE(review): any identifier-like column present in df stays in X as a feature — confirm that is intended.
X = df.drop(columns="SalePrice")

In [4]:
# Target vector: the SalePrice column of the raw table.
y = df["SalePrice"]

In [5]:
# Split the feature columns by dtype so each group gets its own missing-value
# treatment in the following cells.
# The pandas dtype predicates are used instead of comparing against the literal
# names 'int64' / 'float64' / 'object': those literals silently skip other numeric
# widths (int32, float32, nullable Int64/Float64) and pandas' dedicated string
# dtype, leaving such columns with their NaNs untouched.
numerical_cols = [
    col for col in X.columns
    # pandas treats bool as numeric; exclude it so bool columns are not median-imputed
    if pd.api.types.is_numeric_dtype(X[col].dtype)
    and not pd.api.types.is_bool_dtype(X[col].dtype)
]
# Text-like columns: plain object columns as well as string-dtype columns.
categorical_cols = [
    col for col in X.columns
    if pd.api.types.is_string_dtype(X[col].dtype)
]

In [6]:
# Missing-value imputation for the numeric columns: NaNs are replaced with the column median.
# Note: the imputer is fitted on all rows of X, i.e. before the train/test split,
# so the medians are computed from rows that later end up in the test set as well.
num_imputer = SimpleImputer(strategy='median')
num_imputer.fit(X[numerical_cols])
X[numerical_cols] = num_imputer.transform(X[numerical_cols])

In [7]:
# Categorical columns: replace missing entries with the explicit label 'Missing'
# so that absence is kept as its own value for the one-hot encoding step.
X[categorical_cols] = X[categorical_cols].fillna('Missing')

In [8]:
# One-hot encoding: expand the categorical (text) columns of X into indicator columns.
X_encoded = pd.get_dummies(X)

In [9]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.30,
    random_state=42,
)

In [10]:
# Target transformation: the models are trained on log(1 + SalePrice) rather than the raw price.
# Despite the "_scaled" suffix, these variables hold log1p-transformed targets, not standardized ones.
y_train_scaled, y_test_scaled = np.log1p(y_train), np.log1p(y_test)

In [11]:
# Z-score normalization: create the scaler here; it is fitted in the next cell on X_train.
scaler = StandardScaler()

In [12]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to both splits so no test-set statistics influence the scaling.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# ---------------------- Linear --------------------------------

In [13]:
# Baseline model: LinearRegression with its default settings.
model = LinearRegression()

In [14]:
# Fit on the scaled training features against the log1p target, then predict the test split.
# fit() returns the estimator itself, so the two steps are chained.
# (The variable name is kept as-is because the back-transformation cell reads it.)
prediciton = model.fit(X_train_scaled, y_train_scaled).predict(X_test_scaled)

In [41]:
# Invert the log1p target transformation so the predictions are back on the SalePrice scale.
pred_expm = np.expm1(prediciton)

In [16]:
# Mean squared error on the original price scale: raw y_test vs. back-transformed predictions.
MSE = mean_squared_error(y_true=y_test, y_pred=pred_expm)

In [31]:
# Root mean squared error: square root of the MSE computed on the price scale.
RMSE = np.sqrt(MSE)

In [32]:
# Report the test RMSE of the linear-regression baseline.
print(RMSE)

30945.54732167222


In [27]:
# ---------------------- Ridge --------------------------------

In [28]:
# Ridge regression with regularization strength alpha=20.0.
ridge_model = Ridge(alpha=20.0)

In [29]:
# Train the ridge model on the standardized features against the log1p-transformed target.
ridge_model.fit(X=X_train_scaled, y=y_train_scaled)

In [30]:
# Predict the test split; the outputs are on the log1p scale the model was trained on.
prediction_ridge = ridge_model.predict(X_test_scaled)

In [37]:
# Invert the log1p target transformation so the ridge predictions are back on the SalePrice scale.
prediction_ridge_ex = np.expm1(prediction_ridge)

In [38]:
# Mean squared error of the ridge model on the original price scale (overwrites the previous MSE).
MSE = mean_squared_error(y_true=y_test, y_pred=prediction_ridge_ex)

In [39]:
# Root mean squared error of the ridge model (overwrites the previous RMSE).
RMSE = np.sqrt(MSE)

In [42]:
# Report the test RMSE of the ridge model.
print(RMSE)

21911.281536787006


In [None]:
# ---------------------- Lasso --------------------------------

In [43]:
# Lasso regression with alpha=0.1; max_iter is set to 10000.
# NOTE(review): the recorded test RMSE for this model (~50005) is far worse than the
# ridge result (~21911); alpha=0.1 may be too strong a penalty for a log1p-scale
# target — worth tuning before drawing conclusions.
lasso_model = Lasso(
    alpha=0.1,
    max_iter=10000,
)

In [44]:
# Train the lasso model on the standardized features against the log1p-transformed target.
lasso_model.fit(X=X_train_scaled, y=y_train_scaled)

In [45]:
# Predict the test split; the outputs are on the log1p scale the model was trained on.
prediction_lasso = lasso_model.predict(X_test_scaled)

In [46]:
# Invert the log1p target transformation so the lasso predictions are back on the SalePrice scale.
prediction_lasso_ex = np.expm1(prediction_lasso)

In [47]:
# Mean squared error of the lasso model on the original price scale (overwrites the previous MSE).
MSE = mean_squared_error(y_true=y_test, y_pred=prediction_lasso_ex)

In [48]:
# Root mean squared error of the lasso model (overwrites the previous RMSE).
RMSE = np.sqrt(MSE)

In [49]:
# Report the test RMSE of the lasso model.
print(RMSE)

50004.99688193788


In [None]:
# ---------------------- XGBoost --------------------------------

In [50]:
# Configure the XGBoost regressor. It is only constructed here; training happens in the next cell.
xgb_model = XGBRegressor(
    n_estimators=2000,
    learning_rate=0.01,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42  # same seed value as the train/test split
)

In [52]:
# Train the XGBoost model on the standardized features against the log1p-transformed target.
xgb_model.fit(X_train_scaled, y_train_scaled)

In [53]:
# Predict the test split; the outputs are on the log1p scale the model was trained on.
prediction_xgb = xgb_model.predict(X_test_scaled)

In [54]:
# Invert the log1p target transformation so the XGBoost predictions are back on the SalePrice scale.
prediction_xgb_ex = np.expm1(prediction_xgb)

In [56]:
# Mean squared error of the XGBoost model on the original price scale (overwrites the previous MSE).
MSE = mean_squared_error(y_true=y_test, y_pred=prediction_xgb_ex)

In [57]:
# Root mean squared error of the XGBoost model (overwrites the previous RMSE).
RMSE = np.sqrt(MSE)

In [58]:
# Report the test RMSE of the XGBoost model.
print(RMSE)

24142.667955302702
