# RandomForestRegressor

## step1: 準備訓練資料

In [1]:
import pandas as pd

train_df = pd.read_csv("house/train.csv", encoding="utf-8")
test_df = pd.read_csv("house/test.csv", encoding="utf-8")

In [2]:
# 如果 train_df 和 test_df 做 One-Hot Encoding 後，欄位數量不同時，可以用 align 把欄位配對起來
# pd.DataFrame.align(fill_value=0)

In [3]:
# 把 train_df 和 test_df 合併起來一起做資料預處理
total_df = pd.concat([train_df, test_df], axis=0)

In [4]:
# total_df = x_train + x_test，是要處理的全部資料
total_df = total_df.drop(["SalePrice"], axis=1)
total_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [5]:
cnt = total_df.isna().sum()  # 統計每個欄位缺失值數量
cnt[cnt > 0]                 # 僅列出有缺失值的

MSZoning           4
LotFrontage      486
Alley           2721
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        24
MasVnrArea        23
BsmtQual          81
BsmtCond          82
BsmtExposure      82
BsmtFinType1      79
BsmtFinSF1         1
BsmtFinType2      80
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
Electrical         1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu     1420
GarageType       157
GarageYrBlt      159
GarageFinish     159
GarageCars         1
GarageArea         1
GarageQual       159
GarageCond       159
PoolQC          2909
Fence           2348
MiscFeature     2814
SaleType           1
dtype: int64

In [6]:
# 填補缺失值：數值型，填中位數
med = total_df.median().drop(["MSSubClass"]) # MSSubClass 是以數字顯示的類別型態特徵
total_df = total_df.fillna(med)

In [7]:
# 填補缺失值：類別型，填出現最多的選項
# most = total_df["??"].value_counts().idxmax()
# total_df["??"] = total_df["??"].fillna(most)

In [8]:
# 省略「填補類別型缺失值」的步驟，直接做 One-Hot Encoding
# 對類別型特徵做 One-Hot Encoding
total_df = pd.get_dummies(total_df)
total_df = pd.get_dummies(total_df, columns=["MSSubClass"])
total_df = total_df.drop(["Id"], axis=1)
total_df

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_150,MSSubClass_160,MSSubClass_180,MSSubClass_190
0,65.0,8450,7,5,2003,2003,196.0,706.0,0.0,150.0,...,0,0,0,0,0,0,0,0,0,0
1,80.0,9600,6,8,1976,1976,0.0,978.0,0.0,284.0,...,0,0,0,0,0,0,0,0,0,0
2,68.0,11250,7,5,2001,2002,162.0,486.0,0.0,434.0,...,0,0,0,0,0,0,0,0,0,0
3,60.0,9550,7,5,1915,1970,0.0,216.0,0.0,540.0,...,1,0,0,0,0,0,0,0,0,0
4,84.0,14260,8,5,2000,2000,350.0,655.0,0.0,490.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,21.0,1936,4,7,1970,1970,0.0,0.0,0.0,546.0,...,0,0,0,0,0,0,0,1,0,0
1455,21.0,1894,4,5,1970,1970,0.0,252.0,0.0,294.0,...,0,0,0,0,0,0,0,1,0,0
1456,160.0,20000,5,7,1960,1996,0.0,1224.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1457,62.0,10441,5,5,1992,1992,0.0,337.0,0.0,575.0,...,0,0,0,1,0,0,0,0,0,0


In [9]:
# 確認是否還有缺失值
cnt = total_df.isna().sum()  # 統計每個欄位缺失值數量
cnt[cnt > 0]                 # 僅列出有缺失值的

Series([], dtype: int64)

In [10]:
# 資料預處理完後，再分割 total_df 取出 x_train 和 x_test

train_df.shape[0]  # 取得 train_d 的 row 數量

x_train = total_df.iloc[:train_df.shape[0]]  # 訓練題目：total_df 前 1460 筆
x_test  = total_df.iloc[train_df.shape[0]:]  # 測試題目：total_df 後 1459 筆
y_train = train_df["SalePrice"]              # 訓練答案
testid  = test_df["Id"] 

## step2: 建立訓練模型

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

reg = RandomForestRegressor()

params = {
    "max_depth":range(5, 20),
    "n_estimators":range(20, 150, 10)
}

cv = GridSearchCV(reg, params, scoring="r2", cv=10, n_jobs=4)
cv.fit(x_train, y_train)

print("best params: ", cv.best_params_)
print("best score: ", cv.best_score_)

best params:  {'max_depth': 16, 'n_estimators': 100}
best score:  0.8667409763533923


In [12]:
# 看起來最佳的參數
reg = RandomForestRegressor(max_depth=8, n_estimators=75)
reg.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=8, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=75, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

## step3: 利用模型預測

In [13]:
pre = reg.predict(x_test)
result_df = pd.DataFrame({
    "Id":testid,
    "SalePrice":pre
})
result_df.to_csv("house/predict_result_RF.csv", index=False, encoding="utf-8")
result_df

Unnamed: 0,Id,SalePrice
0,1461,128084.761798
1,1462,153168.886753
2,1463,181057.519066
3,1464,181337.788430
4,1465,199926.978963
...,...,...
1454,2915,88921.459334
1455,2916,88979.815373
1456,2917,150551.055967
1457,2918,116745.031796
