In [1]:
# Importing the required libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error

In [2]:
# Loading the dataset
# "sale_date" is parsed to datetime so the .dt accessors further down work.
house_sales = pd.read_csv("house_sales.csv", parse_dates=["sale_date"])

# Cleaning and preparing data for training
# Every cleaned column is assigned back to the frame rather than calling
# `.replace(..., inplace=True)` / `.fillna(..., inplace=True)` on a selected
# column: under pandas Copy-on-Write the selected column is a separate object,
# so the in-place form would leave `house_sales` itself unchanged.

# Relabel the "--" city entries as "Unknown".
house_sales["city"] = house_sales["city"].replace({"--": "Unknown"})

# Expand the abbreviated house-type labels to their full names.
dict_type = {"Det.": "Detached", "Semi": "Semi-detached", "Terr.": "Terraced"}
house_sales["house_type"] = house_sales["house_type"].replace(dict_type)

# Fill missing "months_listed" values with the column mean (rounded to 1 decimal).
mean_months_listed = round(house_sales["months_listed"].mean(), 1)
house_sales["months_listed"] = house_sales["months_listed"].fillna(mean_months_listed)

# Strip the literal " sq.m." suffix (regex disabled) and convert to float.
house_sales["area"] = house_sales["area"].str.replace(' sq.m.', "", regex=False).astype(float)

# Categorical dtypes for the two label columns.
house_sales["city"] = house_sales["city"].astype("category")
house_sales["house_type"] = house_sales["house_type"].astype("category")

# Numeric year / month features derived from the sale date.
house_sales["sale_year"] = house_sales["sale_date"].dt.year
house_sales["sale_month"] = house_sales["sale_date"].dt.month

# One-hot encode the categorical columns.
house_sales = pd.get_dummies(house_sales)

In [3]:
# Build the feature matrix and target vector, then hold out a test set
# Columns excluded from the features (includes the target, "sale_price").
non_feature_columns = ["house_id", "sale_price", "sale_date", "months_listed"]
X = house_sales.drop(columns=non_feature_columns)
y = house_sales["sale_price"]

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [4]:
# Model 1: Linear Regression
linreg = LinearRegression()
linreg.fit(X_train, y_train)

# Score (R^2 for scikit-learn regressors) and predictions on the held-out test set.
linreg_score = linreg.score(X_test, y_test)
y_pred1 = linreg.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which was
# deprecated in scikit-learn 1.4 and is rejected by later releases.
linreg_rmse = np.sqrt(mean_squared_error(y_test, y_pred1))

# Rounded values used in the model comparison table.
s1 = round(linreg_score, 3)
r1 = round(linreg_rmse, 1)

In [5]:
# Model 2: Ridge Tuning
# Shuffled 5-fold splitter with a fixed seed; the later grid searches reuse `kf`.
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# Candidate regularisation strengths for the search.
param_grid = {"alpha": [0.001, 0.01, 0.1, 1, 2.5, 5, 10]}
ridge = Ridge()

ridge_cv = GridSearchCV(estimator=ridge, param_grid=param_grid, cv=kf)
ridge_cv.fit(X_train, y_train)

# Report the selected alpha and its cross-validated score.
print(ridge_cv.best_params_, round(ridge_cv.best_score_, 3))

{'alpha': 1} 0.964


In [6]:
# Model 2: Ridge Score
# Take alpha from the grid search result instead of a hand-copied constant, so
# this cell stays in sync if the data or the search grid changes.
ridge = Ridge(alpha=ridge_cv.best_params_["alpha"])
ridge.fit(X_train, y_train)

# Score (R^2 for scikit-learn regressors) and predictions on the held-out test set.
ridge_score = ridge.score(X_test, y_test)
y_pred2 = ridge.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which was
# deprecated in scikit-learn 1.4 and is rejected by later releases.
ridge_rmse = np.sqrt(mean_squared_error(y_test, y_pred2))

# Rounded values used in the model comparison table.
s2 = round(ridge_score, 3)
r2 = round(ridge_rmse, 1)

In [7]:
# Model 3: Lasso Tuning
# Search alpha over 10, 20, ..., 990 using the shared K-fold splitter `kf`.
param_grid = {"alpha": np.arange(10, 1000, 10)}
lasso = Lasso()

lasso_cv = GridSearchCV(estimator=lasso, param_grid=param_grid, cv=kf)
lasso_cv.fit(X_train, y_train)

# Report the selected alpha and its cross-validated score.
print(lasso_cv.best_params_, round(lasso_cv.best_score_, 3))

{'alpha': 120} 0.964


In [8]:
# Model 3: Lasso Score
# Take alpha from the grid search result instead of a hand-copied constant, so
# this cell stays in sync if the data or the search grid changes.
lasso = Lasso(alpha=lasso_cv.best_params_["alpha"])
lasso.fit(X_train, y_train)

# Score (R^2 for scikit-learn regressors) and predictions on the held-out test set.
lasso_score = lasso.score(X_test, y_test)
y_pred3 = lasso.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which was
# deprecated in scikit-learn 1.4 and is rejected by later releases.
lasso_rmse = np.sqrt(mean_squared_error(y_test, y_pred3))

# Rounded values used in the model comparison table.
s3 = round(lasso_score, 3)
r3 = round(lasso_rmse, 1)

In [9]:
# Model 4: Decision Tree Regressor Tuning
# Search min_samples_leaf as a fraction of the training samples, from 0.005 up
# to (but excluding) 0.5 in steps of 0.005, using the shared splitter `kf`.
param_grid2 = {"min_samples_leaf": np.arange(0.005, 0.5, 0.005)}
dt = DecisionTreeRegressor(random_state=1)

dt_cv = GridSearchCV(estimator=dt, param_grid=param_grid2, cv=kf)
dt_cv.fit(X_train, y_train)

# Report the selected leaf size and its cross-validated score.
print(dt_cv.best_params_, round(dt_cv.best_score_, 3))

{'min_samples_leaf': 0.005} 0.979


In [10]:
# Model 4: Decision Tree Regressor Score
# Take min_samples_leaf from the grid search result instead of a hand-copied
# constant, so this cell stays in sync if the data or the search grid changes.
dt = DecisionTreeRegressor(
    min_samples_leaf=dt_cv.best_params_["min_samples_leaf"],
    random_state=1,
)
dt.fit(X_train, y_train)

# Score (R^2 for scikit-learn regressors) and predictions on the held-out test set.
dt_score = dt.score(X_test, y_test)
y_pred4 = dt.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which was
# deprecated in scikit-learn 1.4 and is rejected by later releases.
dt_rmse = np.sqrt(mean_squared_error(y_test, y_pred4))

# Rounded values used in the model comparison table.
s4 = round(dt_score, 3)
r4 = round(dt_rmse, 1)

In [11]:
# Model 5: Random Forest Regressor
# 300 trees with a fixed seed so the fitted forest is reproducible.
rf = RandomForestRegressor(n_estimators=300, random_state=1)
rf.fit(X_train, y_train)

# Score (R^2 for scikit-learn regressors) and predictions on the held-out test set.
rf_score = rf.score(X_test, y_test)
y_pred5 = rf.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which was
# deprecated in scikit-learn 1.4 and is rejected by later releases.
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred5))

# Rounded values used in the model comparison table.
s5 = round(rf_score, 3)
r5 = round(rf_rmse, 1)

In [12]:
# Collect each model's rounded test score and RMSE into one comparison table
model_names = [
    "Linear Regression",
    "Ridge",
    "Lasso",
    "Decision Tree Regressor",
    "Random Forest Regressor",
]
test_scores = [s1, s2, s3, s4, s5]
test_rmses = [r1, r2, r3, r4, r5]

# Column order: model, score, rmse — one row per model, in the order fitted above.
model_dict = {"model": model_names, "score": test_scores, "rmse": test_rmses}
model_df = pd.DataFrame(model_dict)
print(model_df)

                     model  score     rmse
0        Linear Regression  0.958  24923.6
1                    Ridge  0.958  24922.1
2                    Lasso  0.958  25010.7
3  Decision Tree Regressor  0.976  18890.8
4  Random Forest Regressor  0.978  18194.1
