# **Prediction Model – Xgboost**

In [36]:
# Import packages
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from matplotlib.ticker import FuncFormatter

### Original Daan data model

In [37]:
# Read the data
data_0 = pd.read_csv('data/daan_data_cleaned.csv')
data_0.head()

Unnamed: 0,土地移轉總面積平方公尺,交易年月日,總樓層數,建物移轉總面積平方公尺,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,建物現況格局-隔間,有無管理組織,單價元平方公尺,...,土地,建物,車位,建物年齡,都市土地使用分區_商,都市土地使用分區_都市,主要建材_加強磚造,主要建材_鋼筋混凝土造,主要建材_鋼骨造,主要建材_鋼骨鋼筋混凝土造
0,36.93,1100101,6,154.21,4,2,2,1,1,237728.0,...,2,1,0,13868,0,0,0,1,0,0
1,17.43,1100103,6,86.96,1,1,1,1,0,270239.0,...,1,1,0,8570,0,0,0,1,0,0
2,22.55,1100103,16,264.96,3,2,2,1,1,263738.0,...,1,1,1,14566,0,0,0,1,0,0
3,25.0,1100103,10,213.74,6,1,2,1,0,220735.0,...,1,1,0,14559,1,0,0,1,0,0
4,30.99,1100104,5,114.37,3,2,2,1,0,139897.0,...,1,1,0,12278,0,0,0,1,0,0


> **Split the data into testing and training**

In [38]:
data_0['交易年月日'] = data_0['交易年月日'].astype(str)
data_sorted = data_0.sort_values(by='交易年月日')

# Calculate the index at which to split the data (80% for training, 20% for testing)
split_index = int(len(data_sorted) * 0.8)

# Split the DataFrame into training and testing sets
training_data = data_sorted[:split_index]
test_data = data_sorted[split_index:]

# Verify the split
print(f"Training data shape: {training_data.shape}")
print(f"Test data shape: {test_data.shape}")

Training data shape: (3562, 75)
Test data shape: (891, 75)


In [39]:
training_data['交易年月日'] = training_data['交易年月日'].astype(float)
test_data['交易年月日'] = test_data['交易年月日'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data['交易年月日'] = training_data['交易年月日'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['交易年月日'] = test_data['交易年月日'].astype(float)


> **Train the data**

In [40]:
X_train = training_data.drop(['單價元平方公尺', '交易年月日'], axis=1)
y_train = training_data['單價元平方公尺']
X_test = test_data.drop(['單價元平方公尺', '交易年月日'], axis=1)
y_test = test_data['單價元平方公尺']

# Initialize the XGBoost regressor model
model = xgb.XGBRegressor(objective='reg:squarederror', seed=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 9178169933.81965
R^2 Score: 0.1373480817820527


### Feature selected model

In [41]:
# Read the data
data = pd.read_csv('data/daan_selected.csv')
data.head()

Unnamed: 0,建物型態_透天厝,車位類別_坡道平面,建物現況格局-房,總樓層數,車位類別_其他,交易標的_房地(土地+建物)+車位,休閒娛樂,鄰避設施,各級學校,主要用途_商業用,...,主要用途_其他,建物型態_公寓(5樓含以下無電梯),附屬建物面積,民間機構,有無管理組織,鄰數_現有門牌,都市土地使用分區_都市,車位類別_升降平面,單價元平方公尺,交易年月日
0,0,0,4,6,0,0,0.0,0.0,0.0,0,...,0,0,0.0,0.0,1,1026,0,0,237728.0,1100101
1,0,0,1,6,0,0,4132.0,1647.0,594.0,0,...,0,0,0.0,5333.0,0,1026,0,0,270239.0,1100103
2,0,1,3,16,0,1,3476.0,1832.0,532.0,0,...,0,0,0.0,5206.0,1,1026,0,0,263738.0,1100103
3,0,0,6,10,0,0,2616.0,1529.0,474.0,0,...,0,0,0.0,2635.0,0,1026,0,0,220735.0,1100103
4,0,0,3,5,0,0,1244.0,845.0,210.0,0,...,1,1,0.0,1932.0,0,1026,0,0,139897.0,1100104


> **Split the data into testing and training**

In [42]:
data['交易年月日'] = data['交易年月日'].astype(str)
data_sorted = data.sort_values(by='交易年月日')

# Calculate the index at which to split the data (80% for training, 20% for testing)
split_index = int(len(data_sorted) * 0.8)

# Split the DataFrame into training and testing sets
training_data = data_sorted[:split_index]
test_data = data_sorted[split_index:]

# Verify the split
print(f"Training data shape: {training_data.shape}")
print(f"Test data shape: {test_data.shape}")

Training data shape: (3562, 44)
Test data shape: (891, 44)


In [43]:
training_data['交易年月日'] = training_data['交易年月日'].astype(float)
test_data['交易年月日'] = test_data['交易年月日'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data['交易年月日'] = training_data['交易年月日'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['交易年月日'] = test_data['交易年月日'].astype(float)


> **Train the model**

In [44]:
X_train = training_data.drop(['單價元平方公尺', '交易年月日'], axis=1)
y_train = training_data['單價元平方公尺']
X_test = test_data.drop(['單價元平方公尺', '交易年月日'], axis=1)
y_test = test_data['單價元平方公尺']

# Initialize the XGBoost regressor model
model = xgb.XGBRegressor(objective='reg:squarederror', seed=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 8024938614.768773
R^2 Score: 0.24573975645158397
