In [6]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn import svm
from sklearn.model_selection import GridSearchCV, train_test_split

In [7]:
#学習データとテストデータの読み込み
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [8]:
df_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [9]:
#予測に使わないIdを保持・データフレームから削除
df_train_index = df_train["Id"]
df_test_index = df_test["Id"]
df_train.drop(["Id"], axis=1, inplace=True)
df_test.drop(["Id"], axis=1, inplace=True)

In [5]:
df_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
5,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,Inside,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,307000
7,60,RL,,10382,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2008,WD,Abnorml,129900
9,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,Corner,...,0,,,,0,1,2008,WD,Normal,118000


In [10]:
#目的変数であるSalePriceを別に取っておく
y_train = df_train["SalePrice"].values
df_train.drop(["SalePrice"], axis=1, inplace=True)

In [11]:
#学習用データとテストデータを一度統合する
df_all = pd.concat((df_train,df_test)).reset_index(drop=True)

In [12]:
#データ数
print(df_train.shape)
print(df_test.shape)

(1460, 79)
(1459, 79)


In [13]:
#欠損値の個数確認 / 2919データ
df_all.isnull().sum()[df_all.isnull().sum() != 0].sort_values(ascending=False)

PoolQC          2909
MiscFeature     2814
Alley           2721
Fence           2348
FireplaceQu     1420
LotFrontage      486
GarageFinish     159
GarageYrBlt      159
GarageQual       159
GarageCond       159
GarageType       157
BsmtExposure      82
BsmtCond          82
BsmtQual          81
BsmtFinType2      80
BsmtFinType1      79
MasVnrType        24
MasVnrArea        23
MSZoning           4
BsmtFullBath       2
BsmtHalfBath       2
Utilities          2
Functional         2
Exterior2nd        1
Exterior1st        1
SaleType           1
BsmtFinSF1         1
BsmtFinSF2         1
BsmtUnfSF          1
Electrical         1
KitchenQual        1
GarageCars         1
GarageArea         1
TotalBsmtSF        1
dtype: int64

In [14]:
#欠損値の補完
#以下はNaN = NAかNoneの特徴量リスト。よって欠損値をそれぞれNAとNoneで補完する。
df_all["PoolQC"].fillna('NA', inplace=True)
df_all["MiscFeature"].fillna('None', inplace=True)
df_all["Alley"].fillna('NA', inplace=True)
df_all["Fence"].fillna('NA', inplace=True)
df_all["FireplaceQu"].fillna('NA', inplace=True)
df_all["GarageQual"].fillna('NA', inplace=True)
df_all["GarageFinish"].fillna('NA', inplace=True)
df_all["GarageCond"].fillna('NA', inplace=True)
df_all["GarageType"].fillna('NA', inplace=True)
df_all["BsmtCond"].fillna('NA', inplace=True)
df_all["BsmtExposure"].fillna('NA', inplace=True)
df_all["BsmtQual"].fillna('NA', inplace=True)
df_all["BsmtFinType2"].fillna('NA', inplace=True)
df_all["BsmtFinType1"].fillna('NA', inplace=True)
df_all["MasVnrType"].fillna('None', inplace=True)

#以下はNaN = 0の特徴量リスト。例えば地下なら、地下がないんだから0。みたいな。
df_all["GarageYrBlt"].fillna(0, inplace=True) # ガレージ築年数を0にするのも不思議な気はしますが、そもそもガレージがないので他に妥当な数字が思いつかず。
df_all["MasVnrArea"].fillna(0, inplace=True)
df_all["BsmtHalfBath"].fillna(0, inplace=True)
df_all["BsmtFullBath"].fillna(0, inplace=True)
df_all["TotalBsmtSF"].fillna(0, inplace=True)
df_all["BsmtUnfSF"].fillna(0, inplace=True)
df_all["BsmtFinSF2"].fillna(0, inplace=True)
df_all["BsmtFinSF1"].fillna(0, inplace=True)
df_all["GarageArea"].fillna(0, inplace=True)
df_all["GarageCars"].fillna(0, inplace=True)

#欠損レコード数が少なく、大半が一つの値をとっているためあまりに予測の役に立たなさそうな特徴量は単純に最頻値を代入
df_all["MSZoning"].fillna('RL', inplace=True)
df_all["Functional"].fillna('Typ', inplace=True)
df_all["Utilities"].fillna("AllPub", inplace=True)
df_all['SaleType'] = df_all['SaleType'].fillna(df_all['SaleType'].mode()[0])
df_all['Exterior2nd'] = df_all['Exterior2nd'].fillna(df_all['Exterior2nd'].mode()[0])
df_all['Exterior1st'] = df_all['Exterior1st'].fillna(df_all['Exterior1st'].mode()[0])
df_all['KitchenQual'] = df_all['KitchenQual'].fillna(df_all['KitchenQual'].mode()[0])
df_all['Electrical'] = df_all['Electrical'].fillna(df_all['Electrical'].mode()[0])

#LotFrontage - Linear feet of street connected to property
#これは補完方法が明らかかつ簡単で、近くのStreet名=Neighborhoodでグループし平均を取れば良い精度で補完できそう。
f = lambda x: x.fillna(x.mean())
df_all["LotFrontage"] = df_all.groupby("Neighborhood")["LotFrontage"].transform(f)

In [15]:
#欠損値がすべて補完されているか確認
df_all.isnull().sum()[df_all.isnull().sum() != 0].sort_values(ascending=False)

Series([], dtype: int64)

In [16]:
#One Hot Encoding
df_all = pd.get_dummies(df_all)

In [17]:
#特徴量数の確認
df_all.shape

(2919, 302)

In [18]:
#学習用データとテストデータを切り分ける
ntrain = df_train.shape[0]
train = df_all[:ntrain]
test = df_all[ntrain:]
y = y_train
X = train.loc[:, train.columns != 'SalePrice']
#30%でチューニングを行う
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

In [19]:
#モデルの呼び出し
lasso = Lasso()
rf = RandomForestRegressor()
svr = svm.SVR()

In [29]:
#グリッドサーチ用パラメータの設定
# lasso_parameters = {'alpha':[0.1, 0.5, 1]}

rf_parameters= {'n_estimators':[100, 500, 2000], 'max_depth':[3, 5, 10]}

# svr_parameters = {'C':[1e-1, 1e+1, 1e+3], 'epsilon':[0.05, 0.1, 0.3]}

#グリッドサーチ
# lasso_gs = GridSearchCV(lasso, lasso_parameters)
# lasso_gs.fit(X_train,y_train)

rf_gs = GridSearchCV(rf, rf_parameters)
rf_gs.fit(X_train,y_train)

# svr_gs = GridSearchCV(svr, svr_parameters)
# svr_gs.fit(X_train,y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [100, 500, 2000], 'max_depth': [3, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [30]:
#ラッソ回帰
# y_pred = lasso_gs.predict(X_test)
# print("ラッソ回帰でのRMSE:",np.sqrt(mean_squared_error(y_test, y_pred)))

#ランダムフォレスト
y_pred2 = rf_gs.predict(X_test)
print("ランダムフォレストでのRMSE:",np.sqrt(mean_squared_error(y_test, y_pred2)))

#SVR
# y_pred3 = svr_gs.predict(X_test)
# print("SVRでのRMSE:",np.sqrt(mean_squared_error(y_test, y_pred3)))

ランダムフォレストでのRMSE: 24809.73572988657


In [32]:
df_visualize = pd.concat((pd.DataFrame(y_test), np.round(pd.DataFrame(y_pred2))), axis=1)
df_visualize.columns = ['実際の値','予測値']
df_visualize.head(10)

Unnamed: 0,実際の値,予測値
0,205000,202032.0
1,345000,340978.0
2,173900,180076.0
3,93500,91371.0
4,265900,242035.0
5,212000,196506.0
6,221000,204417.0
7,102000,111179.0
8,290000,284050.0
9,140000,148798.0


In [35]:
y_pred_final = rf_gs.predict(test)
submission = pd.concat((df_test_index, pd.DataFrame(y_pred_final)), axis=1)
submission.columns = ['Id', 'SalePrice']
#確認
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,127951.884044
1,1462,159969.818205
2,1463,171449.305928
3,1464,183162.125629
4,1465,209851.421496


In [36]:
submission.to_csv("sample1.csv",sep=',',index=False)