In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, KFold
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb

pd.set_option('display.max_rows', 100)
%pwd



'/Users/tady/.ghq/github.com/tadyjp/kaggle/house-prices-advanced-regression-techniques'

In [2]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
df_answer = pd.read_csv('data/sample_submission.csv')
df_test = pd.merge(df_test, df_answer, how='inner')
output_file = 'data/output.csv'

df_train.head(10).T


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Id,1,2,3,4,5,6,7,8,9,10
MSSubClass,60,20,60,70,60,50,20,60,50,190
MSZoning,RL,RL,RL,RL,RL,RL,RL,RL,RM,RL
LotFrontage,65,80,68,60,84,85,75,,51,50
LotArea,8450,9600,11250,9550,14260,14115,10084,10382,6120,7420
Street,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave
Alley,,,,,,,,,,
LotShape,Reg,Reg,IR1,IR1,IR1,IR1,Reg,IR1,Reg,Reg
LandContour,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl
Utilities,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub


In [3]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 0 to 1458
Data columns (total 81 columns):
Id               1459 non-null int64
MSSubClass       1459 non-null int64
MSZoning         1455 non-null object
LotFrontage      1232 non-null float64
LotArea          1459 non-null int64
Street           1459 non-null object
Alley            107 non-null object
LotShape         1459 non-null object
LandContour      1459 non-null object
Utilities        1457 non-null object
LotConfig        1459 non-null object
LandSlope        1459 non-null object
Neighborhood     1459 non-null object
Condition1       1459 non-null object
Condition2       1459 non-null object
BldgType         1459 non-null object
HouseStyle       1459 non-null object
OverallQual      1459 non-null int64
OverallCond      1459 non-null int64
YearBuilt        1459 non-null int64
YearRemodAdd     1459 non-null int64
RoofStyle        1459 non-null object
RoofMatl         1459 non-null object
Exterior1st      1458 non-

In [4]:
n_train = len(df_train)
df_both = pd.concat([df_train, df_test])

# Condition
conditions = ['Artery', 'Feedr', 'Norm', 'RRNn', 'RRAn', 'PosN', 'PosA', 'RRNe', 'RRAe']
for condition in conditions:
    df_both['Condition_' + condition] = (df_both['Condition1'] == condition) | (df_both['Condition2'] == condition)
df_both = df_both.drop(['Condition1', 'Condition2'], axis=1)

# NoRemod
df_both['NoRemod'] = df_both['YearBuilt'] == df_both['YearRemodAdd']

# Exterior
# print(df_both['Exterior2nd'].value_counts())
exteriors = [
    'AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd', 'HdBoard', 'ImStucc', 'MetalSd', 'Other',
    'Plywood', 'PreCast', 'Stone', 'Stucco', 'VinylSd', 'Wd Sdng', 'WdShing']

df_both['Exterior2nd'] = df_both['Exterior2nd'].replace('Wd Shng', 'WdShing')

for exterior in exteriors:
    df_both['Exterior_' + exterior] = (df_both['Exterior1st'] == exterior) | (df_both['Exterior2nd'] == exterior)
df_both = df_both.drop(['Exterior1st', 'Exterior2nd'], axis=1)

# ExterQual
df_both['ExterQual'] = df_both['ExterQual'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})

# ExterCond
df_both['ExterCond'] = df_both['ExterCond'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})

# Basement
df_both['NoBasement'] = df_both['BsmtQual'] == 'NA'
df_both['BsmtQual'] = df_both['BsmtQual'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})
df_both['BsmtCond'] = df_both['BsmtCond'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})
df_both['BsmtExposure'] = df_both['BsmtExposure'].map({'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1})
df_both['BsmtFinType1'] = df_both['BsmtFinType1'].map({'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 2, 'LwQ': 1, 'Unf': 0})
df_both['BsmtFinType2'] = df_both['BsmtFinType2'].map({'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 2, 'LwQ': 1, 'Unf': 0})

# Heating
df_both['HeatingQC'] = df_both['HeatingQC'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})

# CentralAir
df_both['CentralAir'] = df_both['CentralAir'] == 'Y'

# KitchenQual
df_both['KitchenQual'] = df_both['KitchenQual'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})

# FireplaceQu
df_both['FireplaceQu'] = df_both['FireplaceQu'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})

# Garage
df_both['NoGarage'] = df_both['GarageFinish'] == 'NA'
df_both['GarageQual'] = df_both['GarageQual'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})
df_both['GarageCond'] = df_both['GarageCond'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})

# Pool
df_both['NoPool'] = df_both['PoolQC'] == 'NA'
df_both['PoolQC'] = df_both['PoolQC'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2})

# Fence
df_both['NoFence'] = df_both['Fence'] == 'NA'
df_both['Fence'] = df_both['Fence'].map({'GdPrv': 5, 'MnPrv': 4, 'GdWo': 3, 'MnWw': 2})

# dummy
dummy_columns = [
    'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
    'Neighborhood', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'Foundation',
    'Heating', 'Electrical', 'Functional', 'GarageType', 'GarageFinish', 'PavedDrive', 'MiscFeature',
    'SaleType', 'SaleCondition']

for c in dummy_columns:
    df_both = pd.concat((df_both, pd.get_dummies(df_both[c], prefix=c, drop_first=True, dummy_na=True)), axis=1)
    df_both = df_both.drop(c, axis=1)

df_both = df_both.fillna(0)

df_both = df_both.drop('Id', axis=1)

df_train_processed = df_both.iloc[:n_train, :]
df_test_processed = df_both.iloc[n_train:, :]

df_train_processed.head(10).T.iloc[0:40, :]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
MSSubClass,60,20,60,70,60,50,20,60,50,190
LotFrontage,65,80,68,60,84,85,75,0,51,50
LotArea,8450,9600,11250,9550,14260,14115,10084,10382,6120,7420
OverallQual,7,6,7,7,8,5,8,7,7,5
OverallCond,5,8,5,5,5,5,5,6,5,6
YearBuilt,2003,1976,2001,1915,2000,1993,2004,1973,1931,1939
YearRemodAdd,2003,1976,2002,1970,2000,1995,2005,1973,1950,1950
MasVnrArea,196,0,162,0,350,0,186,240,0,0
ExterQual,4,3,4,3,4,3,4,3,3,3
ExterCond,3,3,3,3,3,3,3,3,3,3


In [5]:

df_train_processed.head(10).T.iloc[40:, :]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
WoodDeckSF,0,298,0,0,192,40,255,235,90,0
OpenPorchSF,61,0,42,35,84,30,57,204,0,4
EnclosedPorch,0,0,0,272,0,0,0,228,205,0
3SsnPorch,0,0,0,0,0,320,0,0,0,0
ScreenPorch,0,0,0,0,0,0,0,0,0,0
PoolArea,0,0,0,0,0,0,0,0,0,0
PoolQC,0,0,0,0,0,0,0,0,0,0
Fence,0,0,0,0,0,4,0,0,0,0
MiscVal,0,0,0,0,0,700,0,350,0,0
MoSold,2,5,9,2,12,10,8,11,4,1


In [6]:
df_train_processed.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleType_nan,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,57.623288,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.117123,3.39589,3.083562,...,0.083562,0.002055,0.867808,0.0,0.00274,0.008219,0.013699,0.820548,0.085616,0.0
std,42.300571,34.664304,9981.264932,1.382997,1.112799,30.202904,20.645407,180.731373,0.57428,0.351054,...,0.276824,0.045299,0.338815,0.0,0.052289,0.090317,0.116277,0.383862,0.279893,0.0
min,20.0,0.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,42.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,3.0,3.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,50.0,63.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,3.0,3.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,70.0,79.0,11601.5,7.0,6.0,2000.0,2004.0,164.25,4.0,3.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5.0,5.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0


In [7]:
df_train_processed.corr()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleType_nan,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan
MSSubClass,1.000000,-0.215023,-0.139781,0.032628,-0.059316,0.027850,0.040581,0.023573,0.016178,-0.064686,...,-0.045156,-0.014555,0.026359,,0.016241,0.030002,0.000983,0.024359,-0.051068,
LotFrontage,-0.215023,1.000000,0.100739,0.176561,-0.053457,0.036853,0.078686,0.105010,0.127864,-0.046807,...,0.183706,0.001366,-0.139867,,-0.004724,-0.009519,0.028489,-0.134115,0.184103,
LotArea,-0.139781,0.100739,1.000000,0.105806,-0.005636,0.014228,0.013788,0.103321,0.055570,0.014732,...,0.020039,-0.005722,-0.002292,,-0.013208,0.008966,-0.010781,0.005711,0.022635,
OverallQual,0.032628,0.176561,0.105806,1.000000,-0.091932,0.572323,0.550684,0.407252,0.726278,0.013953,...,0.327412,-0.057962,-0.225013,,-0.041677,-0.044950,-0.025515,-0.143282,0.323295,
OverallCond,-0.059316,-0.053457,-0.005636,-0.091932,1.000000,-0.375983,0.073741,-0.125694,-0.138942,0.389163,...,-0.156175,-0.050663,0.163684,,-0.038888,-0.033444,-0.023873,0.161642,-0.151659,
YearBuilt,0.027850,0.036853,0.014228,0.572323,-0.375983,1.000000,0.592855,0.311600,0.598160,-0.103925,...,0.346954,0.012122,-0.238463,,-0.045601,-0.010104,-0.035785,-0.158427,0.343895,
YearRemodAdd,0.040581,0.078686,0.013788,0.550684,0.073741,0.592855,1.000000,0.176529,0.587318,0.074745,...,0.325647,-0.011431,-0.182733,,-0.040294,-0.020727,-0.048056,-0.120577,0.322837,
MasVnrArea,0.023573,0.105010,0.103321,0.407252,-0.125694,0.311600,0.176529,1.000000,0.345329,-0.018065,...,0.165692,-0.025899,-0.128187,,-0.011783,-0.013748,-0.009535,-0.081539,0.162205,
ExterQual,0.016178,0.127864,0.055570,0.726278,-0.138942,0.598160,0.587318,0.345329,1.000000,0.009184,...,0.391048,-0.031292,-0.269804,,-0.036145,-0.049563,-0.050478,-0.184302,0.385961,
ExterCond,-0.064686,-0.046807,0.014732,0.013953,0.389163,-0.103925,0.074745,-0.018065,0.009184,1.000000,...,-0.071901,-0.010805,0.075646,,-0.049820,-0.000059,0.022311,0.055405,-0.072861,


In [8]:
X_train, y_train = df_train_processed.drop('SalePrice', axis=1), df_train_processed['SalePrice']


In [9]:
# def cross_validate(clf, data, label):
#     skf = KFold(n_splits=5, random_state=42)
#     scores = []
#     for train_ix, test_ix in skf.split(data, label):  # テストデータを分割し、順次処理
#         clf.fit(data.loc[train_ix], label.loc[train_ix])  # 予測モデルの構築
#         score = clf.score(data.loc[test_ix], label.loc[test_ix])  # 予測モデルの精度評価を検証データで行う。
#         scores.append(score)
#     return np.mean(scores), np.std(scores)


In [10]:
# import time
from sklearn.model_selection import GridSearchCV
# start_time = time.time()
# 
# estimator = lgb.LGBMRegressor(num_leaves=32)
# 
# param_grid_1 = {
#     'learning_rate': [0.03, 0.1, 0.3, 1],
#     'n_estimators': list(range(50, 100, 10))
# }
# 
# grid_search_1 = GridSearchCV(estimator, param_grid_1, cv=5, n_jobs=4, iid=False, verbose=1)
# 
# grid_search_1.fit(X_train, y_train)
# print('Showing results took {} secs.'.format(time.time() - start_time))
# 
# print('best_params:', grid_search_1.best_params_)
# print('best_score:', grid_search_1.best_score_)

In [11]:
import time
start_time = time.time()

estimator = lgb.LGBMRegressor(
    learning_rate=0.1,
    random_state=42,
    metric='rmse'
)
param_grid_2 = {
    'n_estimators': [40, 60, 80, 100],
#     'num_leaves': [20, 25, 32, 36],
#     'colsample_bytree': [0.2, 0.3, 0.4, 0.5],
#     'subsample': [0.5, 0.7],
#     'reg_lambda': [0.1, 0.3, 1.0],
}
grid_search_2 = GridSearchCV(estimator, param_grid_2, cv=5, n_jobs=4, iid=False, verbose=1)
grid_search_2.fit(X_train, y_train)

print('Showing results took {} secs.'.format(time.time() - start_time))
print('best_params:', grid_search_2.best_params_)
print('best_score:', grid_search_2.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    7.8s finished


Showing results took 8.669859886169434 secs.
best_params: {'n_estimators': 60}
best_score: 0.8680850529141656


In [13]:

print('best_params:', grid_search_2.best_params_)
print('cv_results:', grid_search_2.cv_results_)


model = lgb.LGBMRegressor(
    learning_rate=0.01,
    random_state=42,
    metric='rmse',
    n_estimators=grid_search_2.best_params_['n_estimators'],
#     num_leaves=grid_search_2.best_params_['num_leaves'],
#     colsample_bytree=grid_search_2.best_params_['colsample_bytree'],
#     subsample=grid_search_2.best_params_['subsample'],
#     reg_lambda=grid_search_2.best_params_['reg_lambda']
)
model.fit(X_train, y_train)

feature_importances = pd.DataFrame({
    'colmun': X_train.columns,
    'Importance': model.feature_importances_
})

feature_importances


best_params: {'n_estimators': 60}
cv_results: {'mean_fit_time': array([0.90480247, 1.18934889, 1.48491225, 1.86832776]), 'std_fit_time': array([0.07091443, 0.08222736, 0.1359078 , 0.18875439]), 'mean_score_time': array([0.04459457, 0.02812047, 0.03681312, 0.04459529]), 'std_score_time': array([0.02691973, 0.00664238, 0.02259103, 0.03419804]), 'param_n_estimators': masked_array(data=[40, 60, 80, 100],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_estimators': 40}, {'n_estimators': 60}, {'n_estimators': 80}, {'n_estimators': 100}], 'split0_test_score': array([0.88913821, 0.88901007, 0.88919239, 0.88688276]), 'split1_test_score': array([0.85338233, 0.84459997, 0.84075305, 0.8367954 ]), 'split2_test_score': array([0.86642604, 0.87941572, 0.8822192 , 0.88236632]), 'split3_test_score': array([0.89767765, 0.89312298, 0.89090807, 0.8915805 ]), 'split4_test_score': array([0.82905782, 0.83427653, 0.83597573, 0.83805623]), 'mean_t

Unnamed: 0,Importance,colmun
0,2,MSSubClass
1,9,LotFrontage
2,64,LotArea
3,227,OverallQual
4,23,OverallCond
5,68,YearBuilt
6,30,YearRemodAdd
7,19,MasVnrArea
8,20,ExterQual
9,0,ExterCond


In [14]:
X_test = df_test_processed.drop('SalePrice', axis=1)
print(X_test.info())

y_pred = model.predict(X_test)

# submission = pd.DataFrame({
#     'SalePrice': y_pred
# })
# print(submission)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 0 to 1458
Columns: 220 entries, MSSubClass to SaleCondition_nan
dtypes: bool(32), float64(22), int64(28), uint8(138)
memory usage: 823.5 KB
None


In [15]:
print(df_test.info())
submission = pd.DataFrame({
    'Id': df_test['Id'],
    'SalePrice': y_pred
})
submission.to_csv(output_file, index=False)
submission.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 0 to 1458
Data columns (total 81 columns):
Id               1459 non-null int64
MSSubClass       1459 non-null int64
MSZoning         1455 non-null object
LotFrontage      1232 non-null float64
LotArea          1459 non-null int64
Street           1459 non-null object
Alley            107 non-null object
LotShape         1459 non-null object
LandContour      1459 non-null object
Utilities        1457 non-null object
LotConfig        1459 non-null object
LandSlope        1459 non-null object
Neighborhood     1459 non-null object
Condition1       1459 non-null object
Condition2       1459 non-null object
BldgType         1459 non-null object
HouseStyle       1459 non-null object
OverallQual      1459 non-null int64
OverallCond      1459 non-null int64
YearBuilt        1459 non-null int64
YearRemodAdd     1459 non-null int64
RoofStyle        1459 non-null object
RoofMatl         1459 non-null object
Exterior1st      1458 non-

Unnamed: 0,Id,SalePrice
0,1461,155677.955835
1,1462,166113.268699
2,1463,178931.826906
3,1464,178036.689875
4,1465,196649.02856
5,1466,178873.735389
6,1467,175053.429689
7,1468,175802.163254
8,1469,182707.968303
9,1470,150170.312847
