# 房價預測模型

In [4]:
# Data Processing
import polars as pl
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, root_mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
import joblib

# Visualisation
import matplotlib.pyplot as plt

In [5]:
# load data
dat = pd.read_csv('../data/data.csv')

# Split the data into features (X) and target (y)
x = dat.drop(columns=['neighbor_avg_ping', 'ping'])
y = dat[['ping']]

# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

train_hex = x_train['hex_id']
test_hex = x_test['hex_id']

x_train = x_train.drop(['hex_id'], axis=1)
x_test = x_test.drop(['hex_id'], axis=1)

In [6]:
# Train the model
rf = RandomForestRegressor(n_estimators = 1000, max_depth = 10, n_jobs=-1, random_state = 65)

rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("R^2 of the RF model:", r2)
print("MSE of the RF model:", mse)
print("RMSE of the RF model:", rmse)

  return fit_method(estimator, *args, **kwargs)


R^2 of the RF model: 0.7071771375156479
MSE of the RF model: 157.56450664531943
RMSE of the RF model: 12.552470141184141


In [7]:
param_dist = {'n_estimators': randint(200,1000),
            'max_depth': randint(1,20)}

# Use random search to find the best hyperparameters
search = RandomizedSearchCV(rf, 
                            param_distributions = param_dist, 
                            n_iter = 5, 
                            cv = 5,
                            n_jobs = -1,
                            random_state = 65)

# Fit the random search object to the data
search.fit(x_train, y_train)

rf_bst = search.best_estimator_

y_bst_pred = rf_bst.predict(x_test)

r2_bst = r2_score(y_test, y_bst_pred)
mse_bst = mean_squared_error(y_test, y_bst_pred)
rmse_bst = root_mean_squared_error(y_test, y_bst_pred)

print("The parameters of the best RF:", search.best_params_)
print("R^2 of the RF with weighted district encoding:", r2_bst)
print("MSE of the RF with weighted district encoding:", mse_bst)
print("RMSE of the RF with weighted district encoding:", rmse_bst)

  return fit_method(estimator, *args, **kwargs)


The parameters of the best RF: {'max_depth': 15, 'n_estimators': 317}
R^2 of the RF with weighted district encoding: 0.7505635536697766
MSE of the RF with weighted district encoding: 134.21879108733728
RMSE of the RF with weighted district encoding: 11.585283383989244


In [8]:
# save the model
joblib.dump(rf_bst, '../model/rf_buy_new.joblib', compress = ('lzma', 3))

['../model/rf_buy_new.joblib']

In [None]:
# Create a series containing feature importances from the model and feature names from the training data
importances = pd.Series(rf_bst.feature_importances_, index=x_train.columns).sort_values(ascending=True)

# Plot a simple bar chart
importances.plot.barh()

plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.title('Feature Importances of RF Model for House Price Prediction')

# Show the plot
plt.tight_layout()  # Adjust layout to fit everything
plt.show()

print(importances)

In [16]:
# H3 Visualization
pred = pd.Series(y_bst_pred, name = 'pred_ping').reset_index(drop=True)
test = pd.concat([test_hex.astype(str), x_test.astype(float), y_test.astype(float)], axis = 1).reset_index(drop=True)
test_pred = pd.concat([test, pred], axis=1)

test_pred = pl.from_pandas(test_pred).with_columns(
    (pl.col('pred_ping') - pl.col('ping')).alias('diff')
)

# test_pred_weight.head()
test_pred.write_csv('../data/data_buy_diff.csv')

# 房租預測模型

In [17]:
# Data Processing
import polars as pl
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, root_mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
import joblib

# Visualisation
import matplotlib.pyplot as plt

In [9]:
# load data
dat = pd.read_csv('../data/data.csv')

# Split the data into features (X) and target (y)
x = dat.drop(columns=['mean_rent', 'neighbor_avg_ping', 'ping'])
y = dat[['mean_rent']]

# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

train_hex = x_train['hex_id']
test_hex = x_test['hex_id']

x_train = x_train.drop(['hex_id'], axis=1)
x_test = x_test.drop(['hex_id'], axis=1)

In [10]:
# Train the model
rf = RandomForestRegressor(n_estimators = 1000, max_depth = 10, n_jobs=-1, random_state = 65)

rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("R^2 of the RF model:", r2)
print("MSE of the RF model:", mse)
print("RMSE of the RF model:", rmse)

  return fit_method(estimator, *args, **kwargs)


R^2 of the RF model: 0.9254632424991005
MSE of the RF model: 6410.922499428313
RMSE of the RF model: 80.06823652003528


In [11]:
param_dist = {'n_estimators': randint(200,1000),
            'max_depth': randint(1,20)}

# Use random search to find the best hyperparameters
search = RandomizedSearchCV(rf, 
                            param_distributions = param_dist, 
                            n_iter = 5, 
                            cv = 5,
                            n_jobs = -1,
                            random_state = 65)

# Fit the random search object to the data
search.fit(x_train, y_train)

rf_bst = search.best_estimator_

y_bst_pred = rf_bst.predict(x_test)

r2_bst = r2_score(y_test, y_bst_pred)
mse_bst = mean_squared_error(y_test, y_bst_pred)
rmse_bst = root_mean_squared_error(y_test, y_bst_pred)

print("The parameters of the best RF:", search.best_params_)
print("R^2 of the RF with weighted district encoding:", r2_bst)
print("MSE of the RF with weighted district encoding:", mse_bst)
print("RMSE of the RF with weighted district encoding:", rmse_bst)

  return fit_method(estimator, *args, **kwargs)


The parameters of the best RF: {'max_depth': 15, 'n_estimators': 317}
R^2 of the RF with weighted district encoding: 0.992733449918499
MSE of the RF with weighted district encoding: 624.9975310524503
RMSE of the RF with weighted district encoding: 24.99995062100024


In [12]:
# save the model
joblib.dump(rf_bst, '../model/rf_rent_new.joblib', compress = ('lzma', 3))

['../model/rf_rent_new.joblib']

In [None]:
# Create a series containing feature importances from the model and feature names from the training data
importances = pd.Series(rf_bst.feature_importances_, index=x_train.columns).sort_values(ascending=True)

# Plot a simple bar chart
importances.plot.barh()

plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.title('Feature Importances of RF Model for House Price Prediction')

# Show the plot
plt.tight_layout()  # Adjust layout to fit everything
plt.show()

print(importances)

In [23]:
# H3 Visualization
pred = pd.Series(y_bst_pred, name = 'pred_rent').reset_index(drop=True)
test = pd.concat([test_hex.astype(str), x_test.astype(float), y_test.astype(float)], axis = 1).reset_index(drop=True)
test_pred = pd.concat([test, pred], axis=1)

test_pred = pl.from_pandas(test_pred).with_columns(
    (pl.col('pred_rent') - pl.col('mean_rent')).alias('diff')
)

# test_pred_weight.head()
test_pred.write_csv('../data/data_rent_diff.csv')