In [1]:
# #############################################################################
# Parameter settings for this exercise -- do not modify
seed = 0   # random seed, passed as random_state to train_test_split below
# #############################################################################
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd 

# Empty results table: one column per reported metric, one row per model
# (rows are appended by the evaluation cell further down).
dct = {col: [] for col in ('模型', '細節', 'RMSE (test)',
                           'R2 (train)', 'adj. R2 (train)',
                           'R2 (test)', 'adj. R2 (test)')}
df_eval = pd.DataFrame(dct)

# Load the Taipei house-price dataset and one-hot encode the district
# column ('行政區') in a single step.
df = pd.get_dummies(pd.read_csv('Taipei_house.csv'), columns=['行政區'])

# Collapse the parking-space category ('車位類別') to a binary flag:
# 0 when the value is '無' (none), 1 for anything else.
df['車位類別'] = df['車位類別'].ne('無').astype(int)
df.head(3)

Unnamed: 0,土地面積,建物總面積,屋齡,樓層,總樓層,用途,房數,廳數,衛數,電梯,車位類別,交易日期,經度,緯度,總價,行政區_信義區,行政區_大安區,行政區_文山區,行政區_松山區
0,33.81,109.42,38.996009,3,5,0,3,2,2,0,0,2019/10/6,121.552517,25.004507,1000,0,0,1,0
1,32.19,163.53,23.78146,3,11,0,3,2,2,1,1,2019/4/28,121.559133,24.983199,2100,0,0,1,0
2,60.25,204.79,0.744711,1,10,0,3,2,3,1,0,2019/10/23,121.576052,24.988665,6720,0,0,1,0


In [2]:
# Compute the adjusted R-squared
def adj_R2(r2, n, k):
    """Return R-squared penalised for model size.

    Evaluates ``r2 - (k - 1) / (n - k) * (1 - r2)``, which is algebraically
    ``1 - (1 - r2) * (n - 1) / (n - k)``.

    Parameters
    ----------
    r2 : float
        Plain coefficient of determination.
    n : int
        Number of observations (callers in this notebook pass ``X.shape[0]``).
    k : int
        Model-size term (callers in this notebook pass ``X.shape[1]``).
        Must differ from ``n``, otherwise the division fails.
    """
    shrinkage = (k - 1) / (n - k)
    return r2 - shrinkage * (1 - r2)

from sklearn.metrics import mean_squared_error
def measurement(model, X_train, X_test):
    """Collect the evaluation metrics of a fitted regressor.

    The targets are NOT parameters: the function reads the notebook-level
    ``y_train`` and ``y_test`` produced by the train/test split cell, so
    that cell must have been run first.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``score``.
    X_train, X_test : 2-D feature matrices (anything with ``.shape``) in the
        same representation the model was fitted on.

    Returns
    -------
    list
        ``[rmse, r2_train, adj_r2_train, r2_test, adj_r2_test]`` where the
        RMSE is rounded to 0 decimals and every R-squared value to 4.
    """
    y_pred = model.predict(X_test)
    rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)), 0)

    # Score each split exactly once; the unrounded value feeds both the
    # reported R2 and the adjusted R2 (previously each score was computed
    # twice, doubling the prediction passes over the data).
    raw_r2_train = model.score(X_train, y_train)
    raw_r2_test = model.score(X_test, y_test)

    r2_train = round(raw_r2_train, 4)
    adj_r2_train = round(adj_R2(raw_r2_train,
                                X_train.shape[0], X_train.shape[1]), 4)
    r2_test = round(raw_r2_test, 4)
    adj_r2_test = round(adj_R2(raw_r2_test,
                               X_test.shape[0], X_test.shape[1]), 4)
    return [rmse, r2_train, adj_r2_train, r2_test, adj_r2_test]

In [3]:
# Split into a training set (80%) and a test set (20%)
target = '總價'
# Predictors: every column except the target, the transaction date and the
# longitude / latitude coordinates.
features = df.columns.drop([target, '交易日期', '經度', '緯度'])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.2,
    random_state=seed,
)
X_train.shape

(9943, 15)

In [4]:
# Parallel registries: the estimators to evaluate and their
# [model name, detail] labels used as the first two result columns.
lst_model = []
lst_info = []

# Multiple linear regression (all parameters left at their defaults)
# #########################################################################
# The four features '行政區_信義區', '行政區_大安區', '行政區_文山區',
# '行政區_松山區' are produced by one-hot encoding; if your column names
# differ, adjust them accordingly.
# #########################################################################
from sklearn import linear_model
lst_model += [linear_model.LinearRegression()]
lst_info += [['多元迴歸', '15 features']]

In [5]:
# Ridge regression; every parameter is default except the one listed below
# #########################################################################
# alpha=10
# #########################################################################
lst_model += [linear_model.Ridge(alpha=10)]
lst_info += [['Ridge', '15 features']]

In [6]:
# Polynomial regression; every parameter is default except the one listed below
# #########################################################################
# degree=2
# #########################################################################
from sklearn.preprocessing import PolynomialFeatures
poly_fea = PolynomialFeatures(degree=2)
# Fit the expansion on the training data only ...
X_train_poly = poly_fea.fit_transform(X_train)
# ... and merely apply it to the test data. The transformer must not be
# refitted on held-out data (the original called fit_transform here).
X_test_poly = poly_fea.transform(X_test)

lst_model.append(linear_model.LinearRegression())
lst_info.append(['多項式迴歸','deg=2'])

In [7]:
# Polynomial regression + L1 regularisation (Lasso); every parameter is
# default except the one listed below
# #########################################################################
# alpha=10
# #########################################################################
lst_model += [linear_model.Lasso(alpha=10)]
lst_info += [['多項式迴歸+L1正規化', 'deg=2']]

In [8]:
# Fit every registered model and append its metrics to df_eval.
idx = df_eval.shape[0]   # first free row label in the results table
for i, (estimator, info) in enumerate(zip(lst_model, lst_info)):
    # Choose the design matrices per model. The original reassigned the
    # global X_train / X_test to the polynomial arrays with no else-branch,
    # so any plain model listed after a polynomial one (or any re-run of
    # this cell) was silently fitted on the polynomial features.
    if '多項式' in info[0]:
        X_fit, X_eval = X_train_poly, X_test_poly
    else:
        X_fit, X_eval = X_train, X_test

    model = estimator.fit(X_fit, y_train)
    row = info + measurement(model, X_fit, X_eval)
    df_eval.loc[idx+i] = row

print('對訓練集的最大 Adjusted R-squared: %.4f' % max(df_eval['adj. R2 (train)']))
print('對測試集的最小 RMSE:%d' % min(df_eval['RMSE (test)']))
# Label slice :1 is inclusive, i.e. rows 0 and 1 (the two non-polynomial
# models when the table starts out empty).
print('兩個模型對測試集的最大 Adjusted R-squared: %.4f' % 
      max(df_eval.loc[:1, 'adj. R2 (test)']))

對訓練集的最大 Adjusted R-squared: 0.9252
對測試集的最小 RMSE:801
兩個模型對測試集的最大 Adjusted R-squared: 0.8046


In [9]:
df_eval

Unnamed: 0,模型,細節,RMSE (test),R2 (train),adj. R2 (train),R2 (test),adj. R2 (test)
0,多元迴歸,15 features,1069.0,0.8106,0.8103,0.8056,0.8045
1,Ridge,15 features,1069.0,0.8106,0.8103,0.8057,0.8046
2,多項式迴歸,deg=2,807.0,0.9262,0.9252,0.8891,0.8828
3,多項式迴歸+L1正規化,deg=2,801.0,0.9227,0.9216,0.8909,0.8846


In [10]:
# Prediction
# Refit the best model (highest adjusted R2 on the test set) on ALL of the
# data, then predict the price of one new listing.
X = df[features]
y = df[target]
X_poly = poly_fea.fit_transform(X)

# Order of the values in `new` (must match `features`):
#features= ['土地面積', '建物總面積', '屋齡', '樓層', '總樓層', '用途', 
#           '房數', '廳數', '衛數', '電梯', '車位類別', 
#           '行政區_信義區', '行政區_大安區', '行政區_文山區','行政區_松山區']
new = np.array([36, 99, 32, 4, 4, 0, 3, 2, 1, 0, 0, 0, 0, 0, 1]).reshape(1, -1)
df_new = pd.DataFrame(new, columns=features)
# Apply the expansion already fitted on X; do not refit the transformer on
# the single sample being predicted.
df_poly_fea = poly_fea.transform(df_new)

lst = df_eval['adj. R2 (test)'].tolist()
idx = lst.index(max(lst))
# Decide the feature representation from the model's own label (the same
# rule the evaluation cell uses) instead of a hard-coded position threshold.
if '多項式' in lst_info[idx][0]:
    model = lst_model[idx].fit(X_poly, y)
    y_new = model.predict(df_poly_fea)
else:
    model = lst_model[idx].fit(X, y)
    y_new = model.predict(df_new)

# predict() returns a 1-element array; index it explicitly because implicit
# array-to-scalar conversion inside '%d' is deprecated by NumPy.
print('房價預測結果：%d' % y_new[0])

房價預測結果：1546
