In [19]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [2]:
# Load the training and test data
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Outlier handling: drop training rows that combine a very large living area
# (GrLivArea > 4000) with a low sale price (SalePrice < 300000).
# .copy() makes df_train an independent frame, so the column assignments made
# below and in later cells do not write into a slice of the loaded frame
# (which would raise SettingWithCopyWarning).
is_outlier = (df_train['GrLivArea'] > 4000) & (df_train['SalePrice'] < 300000)
df_train = df_train[~is_outlier].copy()

# Combine train and test features: every column except Id and SalePrice
# (leftmost column = Id, rightmost column = SalePrice).
df_all = pd.concat((df_train.loc[:, 'MSSubClass':'SaleCondition'],
                    df_test.loc[:, 'MSSubClass':'SaleCondition']))


def _total_porch_sf(frame):
    """Return the row-wise sum of the five porch/deck area columns of `frame`."""
    return (frame['EnclosedPorch'] + frame['OpenPorchSF'] + frame['WoodDeckSF']
            + frame['3SsnPorch'] + frame['ScreenPorch'])


# Feature engineering: adopt TotalHousePorchSF, which had by far the highest
# feature importance in the previous experiment.
df_all['TotalHousePorchSF'] = _total_porch_sf(df_all)
df_train['TotalHousePorchSF'] = _total_porch_sf(df_train)

In [20]:
# Visualise the correlation matrix of the training data.
# df_train still contains string (object) columns at this point. Numeric
# columns are selected explicitly because DataFrame.corr() on pandas >= 2.0
# no longer drops non-numeric columns by default and raises instead.
fig, ax = plt.subplots(1, 1, figsize=(30, 30))
numeric_corr = df_train.select_dtypes(include=[np.number]).corr()
sns.heatmap(numeric_corr, vmax=1, vmin=-1, center=0, annot=True, ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x11bbd9c18>

In [4]:
# Drop four features from the combined frame.
# NOTE(review): presumably these were judged redundant from the correlation
# heatmap in the previous cell - confirm.
# Reassigning (rather than inplace=True) and using errors='ignore' keeps the
# cell re-runnable: a second execution no longer raises KeyError because the
# columns are already gone.
df_all = df_all.drop(['1stFlrSF', 'GarageArea', 'TotRmsAbvGrd', 'GarageYrBlt'],
                     axis=1, errors='ignore')

In [5]:
# Histogram of the SalePrice column of the training data (30 bins)
sale_price = df_train["SalePrice"]
sale_price.hist(bins=30)

<matplotlib.axes._subplots.AxesSubplot at 0x115565240>

In [6]:
# Replace SalePrice with log(1 + SalePrice).
# NOTE: this overwrites the column in place, so the cell is not idempotent -
# executing it a second time applies log1p to already-transformed values.
log_sale_price = np.log1p(df_train["SalePrice"])
df_train["SalePrice"] = log_sale_price
# Check the distribution after the log transform
log_sale_price.hist(bins=30)

<matplotlib.axes._subplots.AxesSubplot at 0x115565240>

In [7]:
# Non-categorical features: columns of the combined frame whose dtype is not object
non_categoricals = df_all.dtypes[df_all.dtypes != "object"].index

# Skewness above which a feature is log-transformed.
# (The earlier comment said 0.7, but the value actually applied is 0.6.)
SKEW_THRESHOLD = 0.6

# Skewness is measured on the training rows only, ignoring missing values
skewed_feats = df_train[non_categoricals].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[skewed_feats > SKEW_THRESHOLD].index

# Apply log(1 + x) to the selected features in the combined train+test frame.
# NOTE: df_all is overwritten in place, so re-running this cell transforms
# the same columns again.
df_all[skewed_feats] = np.log1p(df_all[skewed_feats])

In [8]:
# One-hot encode the categorical columns of the combined frame
df_all = pd.get_dummies(df_all)

# Fill the remaining missing values with the mean of each column
# (means are taken over the combined train+test rows held in df_all)
column_means = df_all.mean()
df_all = df_all.fillna(column_means)

In [9]:
# Split the combined frame back into training rows and test rows:
# the first len(df_train) rows came from train.csv, the rest from test.csv.
n_train = len(df_train)
X = df_all.iloc[:n_train]
X_for_test = df_all.iloc[n_train:]
y = df_train["SalePrice"]

# Hold out 30% of the training rows for local evaluation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1111,
)

In [10]:
# Lasso regression fitted on the 70% training split
reg = Lasso(alpha=4e-4)
reg.fit(X=X_train, y=y_train)

Lasso(alpha=0.0004, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [11]:
# Lasso predictions for the hold-out split
y_pred = reg.predict(X=X_test)

In [12]:
# RMSE of the Lasso predictions on the hold-out split.
# Arguments follow the (y_true, y_pred) order of sklearn's signature, as the
# other RMSE cells in this notebook do; MSE is symmetric, so the value is
# unchanged.
lasso_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("ラッソ回帰でのRMSE:", lasso_rmse)

ラッソ回帰でのRMSE: 0.114673460116


In [13]:
# LightGBM dataset built from the 70% training split
lgb_train = lgb.Dataset(data=X_train, label=y_train)

In [14]:
# LightGBM hyper-parameters. 'num_iterations' sets the number of boosting
# rounds; the final-fit cell reuses this same dict.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2'},
    'num_leaves': 256,
    'learning_rate': 0.01,
    'num_iterations': 2000,
    'feature_fraction': 0.4,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
}

# The round count comes from params['num_iterations'] only. This call used to
# pass num_boost_round=1500 as well, but LightGBM gives the params entry
# precedence (and warns), so that argument had no effect and was misleading.
gbm = lgb.train(params, lgb_train)

# No validation set or early stopping is configured, so there is no
# meaningful best_iteration to select: predict with every trained tree.
# The DataFrame is passed directly, as in the final prediction cell.
y_test_pred_lgb = gbm.predict(X_test)

# RMSE on the hold-out split
print("LightGBMでのRMSE:",np.sqrt(mean_squared_error(y_test, y_test_pred_lgb)))



LightGBMでのRMSE: 0.117165784985


In [15]:
# Weighted average of the two models' hold-out predictions
# (0.7 Lasso + 0.3 LightGBM), scored with RMSE against y_test
blended_pred = y_pred*0.7 + y_test_pred_lgb*0.3
blended_rmse = np.sqrt(mean_squared_error(y_test, blended_pred))
print("LightGBM+Lassoのスタッキング時のRMSE:", blended_rmse)

LightGBM+Lassoのスタッキング時のRMSE: 0.110625710222


In [16]:
#全データで学習
reg.fit(X, y)
lgb_train_full = lgb.Dataset(X,y)
gbm = lgb.train(params, lgb_train_full, num_boost_round=1500)

In [17]:
#ラッソ・LightGBMの予測及びスタッキング
pred = np.expm1(reg.predict(X_for_test))
pred2 = np.expm1(gbm.predict(X_for_test))
pred3 = (pred*0.7+pred2*0.3)

In [18]:
# Assemble the submission table (test Ids alongside the blended predictions)
# and write it to CSV without the index column
submission_columns = {"id": df_test["Id"], "SalePrice": pred3}
solution = pd.DataFrame(submission_columns)
solution.to_csv("test_lasso.csv", index=False)