In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ames-housing-dataset/AmesHousing.csv
/kaggle/input/housingdata/housingdata.csv
/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [7]:
#学習データ・テストデータの読み込み
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [8]:
#float64のカラム3つを削除し、ランダムフォレストオブジェトで重要度の低い特徴量を削減。
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

#学習データを特徴量と目的変数に分ける
train_x = train.drop(['Id','SalePrice','LotFrontage','MasVnrArea','GarageYrBlt'], axis=1)
train_y = train['SalePrice']
#テストデータ
test_x = test.drop(['Id','LotFrontage','MasVnrArea','GarageYrBlt'], axis=1)
test_y_answer = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv').drop(['Id'], axis=1)

#それぞれのカテゴリ変数にlabel encodingを適用する
x = pd.concat([train_x, test_x], axis=0)
cat_cols = x.dtypes[train_x.dtypes=='object'].index.tolist()
for c in cat_cols:
#学習データに基づいてどう変換するかを定める
    le = LabelEncoder()
    le.fit(x[c].fillna('NaN'))
    
    #学習データ・テストデータを変換する
    train_x[c] = le.transform(train_x[c].fillna('NaN'))
    test_x[c] = le.transform(test_x[c].fillna('NaN'))

#ランダムフォレストオブジェトの生成(決定木の個数=500)
forest = RandomForestClassifier(n_estimators=500, random_state=1)
#モデルを適合
forest.fit(train_x, train_y)
#特徴量の重要度を抽出
importances = forest.feature_importances_
#重要度の降順で特徴量のインデックスを抽出
indices = np.argsort(importances)[::-1]
#重要度の降順で特徴量の名称、重要度を表示
for f in range(train_x.shape[1]):
    print("%2d) %-*s %f" %
         (f + 1, 30, train_x.columns[indices[f]], importances[indices[f]])
         )

 1) LotArea                        0.041329
 2) GrLivArea                      0.040556
 3) 1stFlrSF                       0.038600
 4) GarageArea                     0.038137
 5) BsmtUnfSF                      0.037665
 6) TotalBsmtSF                    0.037616
 7) YearBuilt                      0.033952
 8) MoSold                         0.033581
 9) YearRemodAdd                   0.032154
10) BsmtFinSF1                     0.031500
11) OpenPorchSF                    0.027963
12) WoodDeckSF                     0.026037
13) Neighborhood                   0.025435
14) YrSold                         0.023691
15) TotRmsAbvGrd                   0.022272
16) 2ndFlrSF                       0.021182
17) Exterior2nd                    0.019766
18) Exterior1st                    0.019309
19) OverallQual                    0.018859
20) BsmtFinType1                   0.017564
21) OverallCond                    0.016075
22) FireplaceQu                    0.015439
23) BedroomAbvGr                

In [9]:
#bestなモデルに編集していく
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


train_x_2 = train_x[train_x.columns[indices[np.arange(27)]]]
test_x_2 = test_x[test_x.columns[indices[np.arange(27)]]]
#バリデーション
tr_x, va_x, tr_y, va_y = train_test_split(train_x_2, train_y,
                                         test_size=0.25, random_state=71, shuffle=True)

#特徴量と目的変数をxgboostのデータ構造に変換する
dtrain = xgb.DMatrix(tr_x, label=tr_y)
dvalid = xgb.DMatrix(va_x, label=va_y)
dtest = xgb.DMatrix(test_x_2)

#ハイパーパラメータの設定
params = {'objective': 'reg:squarederror', 'random_state': 71}
num_round=50
#学習の実行
#バリデーションデータもモデルに渡し、学習の進行とともにスコアがどう変わるかモニタリングする
#watchlistには学習データおよびバリデーションデータをセットする
watchlist = [(dtrain, 'train'), (dvalid, 'eval') ]
model = xgb.train(params, dtrain, num_round, early_stopping_rounds=20, evals=watchlist)

va_pred = model.predict(dvalid)
print('バリデーションデータ予測値:')
print(va_pred)
#平均平方二乗誤差でスコアを算出
score = np.sqrt(mean_squared_error(va_y, va_pred))
print('平均平方二乗誤差')
print(score)

#予測(二値の予測値)
test_pred = model.predict(dtest)
print('test予測値:')
print(test_pred)
#平均平方二乗誤差でスコアを算出
score_test = np.sqrt(mean_squared_error(test_y_answer, test_pred))
print('test平均平方二乗誤差')
print(score_test)

[0]	train-rmse:142182.65314	eval-rmse:141286.72626
[1]	train-rmse:102897.95056	eval-rmse:103162.64598
[2]	train-rmse:75096.88650	eval-rmse:76918.77664
[3]	train-rmse:55456.95963	eval-rmse:59127.01066
[4]	train-rmse:41728.29741	eval-rmse:48611.29635
[5]	train-rmse:31884.58681	eval-rmse:41286.15316
[6]	train-rmse:25028.34725	eval-rmse:37552.98345
[7]	train-rmse:20288.37973	eval-rmse:34775.90865
[8]	train-rmse:17060.81651	eval-rmse:33479.87466
[9]	train-rmse:14649.26467	eval-rmse:32414.31173
[10]	train-rmse:13124.88266	eval-rmse:31862.83466
[11]	train-rmse:11963.89744	eval-rmse:31723.28861
[12]	train-rmse:11221.42968	eval-rmse:31641.26040
[13]	train-rmse:10655.88906	eval-rmse:31283.00922
[14]	train-rmse:10135.98417	eval-rmse:31093.45536
[15]	train-rmse:9742.38619	eval-rmse:30943.37007
[16]	train-rmse:9445.13180	eval-rmse:31018.66678
[17]	train-rmse:9137.45411	eval-rmse:30950.84931
[18]	train-rmse:8924.08790	eval-rmse:30963.01436
[19]	train-rmse:8725.53075	eval-rmse:30906.36122
[20]	train-

In [12]:
#ここから読み込む
#float64のカラムのNaNを0に置換し、ランダムフォレストオブジェトで重要の高い順に特徴量を表示。
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

#学習データ・テストデータの読み込み
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
#学習データを特徴量と目的変数に分ける
train_x = train.drop(['Id','SalePrice'], axis=1)
train_y = train['SalePrice']
#テストデータ
test_x = test.drop(['Id'], axis=1)
test_y_answer = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv').drop(['Id'], axis=1)


#それぞれのカテゴリ変数にlabel encodingを適用する
x = pd.concat([train_x, test_x], axis=0)
cat_cols = x.dtypes[train_x.dtypes=='object'].index.tolist()
#floatの特徴量をintに変換する
train_x[['LotFrontage','MasVnrArea','GarageYrBlt']] = train_x[['LotFrontage','MasVnrArea','GarageYrBlt']].fillna(0).astype(int)
test_x[['LotFrontage','MasVnrArea','GarageYrBlt']] =test_x[['LotFrontage','MasVnrArea','GarageYrBlt']].fillna(0).astype(int)
for c in cat_cols:
#学習データに基づいてどう変換するかを定める
    le = LabelEncoder()
    le.fit(x[c].fillna('NaN'))
    #学習データ・テストデータを変換する
    train_x[c] = le.transform(train_x[c].fillna('NaN'))
    test_x[c] = le.transform(test_x[c].fillna('NaN'))

#ランダムフォレストオブジェトの生成(決定木の個数=500)
forest = RandomForestClassifier(n_estimators=500, random_state=1)
#モデルを適合
forest.fit(train_x, train_y)
#特徴量の重要度を抽出
importances = forest.feature_importances_
#重要度の降順で特徴量のインデックスを抽出
indices = np.argsort(importances)[::-1]
#重要度の降順で特徴量の名称、重要度を表示
for f in range(train_x.shape[1]):
    print("%2d) %-*s %f" %
         (f + 1, 30, train_x.columns[indices[f]], importances[indices[f]])
         )

 1) LotArea                        0.038527
 2) GrLivArea                      0.037582
 3) 1stFlrSF                       0.035614
 4) BsmtUnfSF                      0.035088
 5) GarageArea                     0.034717
 6) TotalBsmtSF                    0.034379
 7) YearBuilt                      0.030571
 8) LotFrontage                    0.030519
 9) MoSold                         0.030191
10) BsmtFinSF1                     0.029362
11) GarageYrBlt                    0.029068
12) YearRemodAdd                   0.028189
13) OpenPorchSF                    0.025271
14) WoodDeckSF                     0.023985
15) Neighborhood                   0.023565
16) YrSold                         0.021617
17) MasVnrArea                     0.020946
18) TotRmsAbvGrd                   0.020356
19) 2ndFlrSF                       0.020337
20) Exterior2nd                    0.018586
21) OverallQual                    0.017521
22) Exterior1st                    0.017423
23) BsmtFinType1                

In [13]:
#予測をcsvで保存して提出する
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
kf = KFold(n_splits=4, shuffle=True, random_state=71)

#ハイパーパラメータの設定
score_list = []
params = {
        'booster':'gbtree',
        'objective': 'reg:squarederror',
        'eta':0.01,
        'gamma':0.0,
        'alpha':0.0,
        'lambda':1.0,
        'min_child_weight':2,
        'max_depth':6,
        'subsample':0.8,
        'colsample_bytree':0.8,
        'colsample_bylevel':0.4,
        'random_state': 71
        }
num_round=1000

train_x_2 = train_x
test_x_2 = test_x

#バリデーション
for tr_idx, va_idx in kf.split(train_x_2):
    tr_x, va_x = train_x_2.iloc[tr_idx], train_x_2.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

#特徴量と目的変数をxgboostのデータ構造に変換する
    dtrain = xgb.DMatrix(tr_x, label=tr_y)
    dvalid = xgb.DMatrix(va_x, label=va_y)

#学習の実行
#バリデーションデータもモデルに渡し、学習の進行とともにスコアがどう変わるかモニタリングする
#watchlistには学習データおよびバリデーションデータをセットする
    watchlist = [(dtrain, 'train'), (dvalid, 'eval') ]
    model = xgb.train(params, dtrain, num_round)

    va_pred = model.predict(dvalid)
#平均平方二乗誤差でスコアを算出
    score = np.sqrt(mean_squared_error(va_y, va_pred))
    score_list.append(score)
    continue
        
print(np.mean(score_list))
    
#test予測(二値の予測値)
dtrain = xgb.DMatrix(train_x_2, label=train_y)
dtest = xgb.DMatrix(test_x_2)

model = xgb.train(params, dtrain, num_round)
test_pred = model.predict(dtest)

data2 = {
    'Id'       : test['Id'],
    'Saleprice': test_pred
}
df2 = pd.DataFrame(data2)
print(df2)

df2.to_csv("test6.csv", index = False)

25619.333069088043
        Id      Saleprice
0     1461  127439.148438
1     1462  157320.328125
2     1463  183088.781250
3     1464  188072.531250
4     1465  188530.781250
...    ...            ...
1454  2915   82648.085938
1455  2916   86053.242188
1456  2917  158930.984375
1457  2918  117507.367188
1458  2919  219117.078125

[1459 rows x 2 columns]


In [14]:
#手動でパラメータチューニングする場合
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
kf = KFold(n_splits=4, shuffle=True, random_state=71)

#ハイパーパラメータの設定
for i in [0.0,0.1,0.2,0.3,0.4]:
    score_list = []
    params = {
        'booster':'gbtree',
        'objective': 'reg:squarederror',
        'eta':0.01,
        'gamma':i,
        'alpha':0.0,
        'lambda':1.0,
        'min_child_weight':2,
        'max_depth':6,
        'subsample':0.8,
        'colsample_bytree':0.8,
        'colsample_bylevel':0.4,
        'random_state': 71
        }
    num_round=1000

    train_x_2 = train_x
    test_x_2 = test_x

#バリデーション
    for tr_idx, va_idx in kf.split(train_x_2):
        tr_x, va_x = train_x_2.iloc[tr_idx], train_x_2.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

#特徴量と目的変数をxgboostのデータ構造に変換する
        dtrain = xgb.DMatrix(tr_x, label=tr_y)
        dvalid = xgb.DMatrix(va_x, label=va_y)

#学習の実行
#バリデーションデータもモデルに渡し、学習の進行とともにスコアがどう変わるかモニタリングする
#watchlistには学習データおよびバリデーションデータをセットする
        watchlist = [(dtrain, 'train'), (dvalid, 'eval') ]
        model = xgb.train(params, dtrain, num_round)

        va_pred = model.predict(dvalid)
#平均平方二乗誤差でスコアを算出
        score = np.sqrt(mean_squared_error(va_y, va_pred))
        score_list.append(score)
        continue
        
    print("i = %s" %(i))    
    print(np.mean(score_list))
    
#test予測(二値の予測値)
    dtrain = xgb.DMatrix(train_x_2, label=train_y)
    dtest = xgb.DMatrix(test_x_2)

    model = xgb.train(params, dtrain, num_round)
    test_pred = model.predict(dtest)

    data2 = {
        'Id'       : test['Id'],
        'Saleprice': test_pred
    }
    df2 = pd.DataFrame(data2)
    print(df2)

    df2.to_csv("test%s.csv" %(i), index = False)

i = 0.0
25619.333069088043
        Id      Saleprice
0     1461  127439.148438
1     1462  157320.328125
2     1463  183088.781250
3     1464  188072.531250
4     1465  188530.781250
...    ...            ...
1454  2915   82648.085938
1455  2916   86053.242188
1456  2917  158930.984375
1457  2918  117507.367188
1458  2919  219117.078125

[1459 rows x 2 columns]
i = 0.1
25619.333069088043
        Id      Saleprice
0     1461  127439.148438
1     1462  157320.328125
2     1463  183088.781250
3     1464  188072.531250
4     1465  188530.781250
...    ...            ...
1454  2915   82648.085938
1455  2916   86053.242188
1456  2917  158930.984375
1457  2918  117507.367188
1458  2919  219117.078125

[1459 rows x 2 columns]
i = 0.2
25619.333069088043
        Id      Saleprice
0     1461  127439.148438
1     1462  157320.328125
2     1463  183088.781250
3     1464  188072.531250
4     1465  188530.781250
...    ...            ...
1454  2915   82648.085938
1455  2916   86053.242188
1456  2917 

In [None]:
#グリッドサーチでパラメータチューニング
import itertools
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
kf = KFold(n_splits=4, shuffle=True, random_state=71)

#ハイパーパラメータの設定
param_space = {
    'max_depth':[2,3,4,5,6,7,8,9,10],
    'min_child_weight':[1,2,3,4],
    }
param_combinations = itertools.product(param_space['max_depth'],param_space['min_child_weight'])
parameters = []
scores = []
score_list = []
num_round=1000
train_x_2 = train_x
test_x_2 = test_x

for  max_depth, min_child_weight in param_combinations:
    params = {
        'booster':'gbtree',
        'objective': 'reg:squarederror',
        'eta':0.01,
        'gamma':0.0,
        'alpha':0.0,
        'lambda':1.0,
        'max_depth':max_depth,
        'min_child_weight':min_child_weight,
        'subsample':0.8,
        'colsample_bytree':0.8,
        'colsample_bylevel':0.4,
        'random_state': 71
        }

#バリデーション
    for tr_idx, va_idx in kf.split(train_x_2):
        tr_x, va_x = train_x_2.iloc[tr_idx], train_x_2.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

#特徴量と目的変数をxgboostのデータ構造に変換する
        dtrain = xgb.DMatrix(tr_x, label=tr_y)
        dvalid = xgb.DMatrix(va_x, label=va_y)

#学習の実行
#バリデーションデータもモデルに渡し、学習の進行とともにスコアがどう変わるかモニタリングする
#watchlistには学習データおよびバリデーションデータをセットする
        watchlist = [(dtrain, 'train'), (dvalid, 'eval') ]
        model = xgb.train(params, dtrain, num_round)

        va_pred = model.predict(dvalid)
#平均平方二乗誤差でスコアを算出
        score = np.sqrt(mean_squared_error(va_y, va_pred))
        score_list.append(score)
        continue
    print(f'max_depth: %s, min_chind_weight: %s'%(max_depth, min_child_weight))
    print(np.mean(score_list))
    
    parameters.append((max_depth,min_child_weight))
    scores.append(np.mean(score_list))

best_idx = np.argsort(scores)[0]
best_param = parameters[best_idx]
print(f'max_depth: {best_param[0]}, min_chind_weight: {best_param[1]}')

max_depth: 2, min_chind_weight: 1
28482.208006717312
max_depth: 2, min_chind_weight: 2
28728.275265094017
max_depth: 2, min_chind_weight: 3
28725.64919202962
max_depth: 2, min_chind_weight: 4
28825.05988001889
max_depth: 3, min_chind_weight: 1
28400.56497763006
max_depth: 3, min_chind_weight: 2
28131.04544673154
max_depth: 3, min_chind_weight: 3
27949.976328931574
max_depth: 3, min_chind_weight: 4
27863.42323178389
max_depth: 4, min_chind_weight: 1
27633.332800352495
max_depth: 4, min_chind_weight: 2
27442.261478368746
max_depth: 4, min_chind_weight: 3
27326.20903591865
max_depth: 4, min_chind_weight: 4
27246.572957171797
max_depth: 5, min_chind_weight: 1
27137.108771866348
max_depth: 5, min_chind_weight: 2
27021.492709141625
max_depth: 5, min_chind_weight: 3
26947.164351111973
max_depth: 5, min_chind_weight: 4
26900.975234852296
max_depth: 6, min_chind_weight: 1
26831.683195925852
max_depth: 6, min_chind_weight: 2
26764.33041110153
max_depth: 6, min_chind_weight: 3
26719.327553150764
