# Merge multiple models after preprocessing

cf) baseline kernel: https://www.kaggle.com/kcs93023/2019-ml-month-2nd-baseline
    모두의 딥러닝: http://www.yes24.com/Product/Goods/57736119?scode=032&OzSrank=1

## 1. Check Data

### import modules

In [None]:
import os
from os.path import join
from math import sqrt

from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn import svm
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np
import tensorflow as tf
import xgboost as xgb
import lightgbm as lgb

import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

### read data

In [None]:
# Load the competition train and test tables from the local input folder.
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')

# Quick sanity check: shapes of both tables, then the last rows of train.
print(test.shape, train.shape)
train.tail()

### Column Description

1. ID : 집을 구분하는 번호 
1. date : 집을 구매한 날짜 
1. price : 타겟 변수인 집의 가격 
1. bedrooms : 침실의 수
1. bathrooms : 침실당 화장실 개수
1. sqft_living : 주거 공간의 평방 피트
1. sqft_lot : 부지의 평방 피트
1. floors : 집의 층 수
1. waterfront : 집의 전방에 강이 흐르는지 유무 (a.k.a. 리버뷰)
1. view : 집이 얼마나 좋아 보이는지의 정도
1. condition : 집의 전반적인 상태
1. grade : King County grading 시스템 기준으로 매긴 집의 등급
1. sqft_above : 지하실을 제외한 평방 피트
1. sqft_basement : 지하실의 평방 피트
1. yr_built : 집을 지은 년도
1. yr_renovated : 집을 재건축한 년도
1. zipcode : 우편번호
1. lat : 위도
1. long : 경도
1. sqft_living15 : 2015년 기준 주거 공간의 평방 피트(집을 재건축했다면, 변화가 있을 수 있음)
1. sqft_lot15 : 2015년 기준 부지의 평방 피트(집을 재건축했다면, 변화가 있을 수 있음)

In [None]:
# Pull the target column out of the training frame: `pop` returns the price
# series and removes the column from `train` in one step.
y = train.pop('price')

In [None]:
# Remember how many rows belong to train so the stacked frame can be split
# back into train/test after the shared preprocessing.
train_len = train.shape[0]
# Stack train on top of test (row-wise).
data = pd.concat([train, test], axis=0)

## 2. PreProcessing

### Missing Data

In [None]:
# Draw missingno's matrix plot for the combined train+test frame.
msno.matrix(data)

In [None]:
# Print the number of missing values found in each column.
for col_name, n_missing in data.isnull().sum().items():
    print('{} : {}'.format(col_name, n_missing))

### manipulate id,date

In [None]:
# Drop the id column from the features, keeping the ids of the test rows
# (everything after the first `train_len` rows) for the submission files.
sub_id = data.pop('id')[train_len:]

In [None]:
# Keep only the first six characters of the date string (presumably YYYYMM --
# confirm against the raw data) and store them as an integer. A numeric column
# means the plots and estimators below no longer rely on implicit
# string-to-float coercion of the feature matrix.
data['date'] = data['date'].str[:6].astype(int)
print(len(data))
data.tail()

### Distribution

In [None]:
# One kernel-density panel per feature column, two panels per row.
# The grid is sized from the number of columns instead of a hard-coded
# 10x2 / "stop at 19" pair, so the cell keeps working if columns are added
# or removed. With 19 columns this is still a 10x2 grid of size (20, 60).
columns = data.columns
n_rows = (len(columns) + 1) // 2
fig, ax = plt.subplots(n_rows, 2, figsize=(20, 6 * n_rows), squeeze=False)

# zip stops at the shorter sequence, so a trailing unused axis stays empty.
for axis, column in zip(ax.ravel(), columns):
    sns.kdeplot(data[column], ax=axis)  # kernel density plot
    axis.set_title(column, fontsize=15)

### apply log-scaling
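Skewed columns: price, bedrooms, sqft_living, sqft_lot, sqft_above, sqft_basement.
Note: the code below log-scales only the five feature columns; `price` was already split off into `y` and is left unscaled.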

In [None]:
# Right-skewed features that get a log(1 + x) transform.
skew_columns = ['bedrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement']

# Transform all of them in one vectorised call; assigning the raw array keeps
# the write positional, exactly like the per-column version.
data[skew_columns] = np.log1p(data[skew_columns].values)

In [None]:
# Re-plot the densities of the log-scaled columns, two panels per row.
# The grid is derived from len(skew_columns) rather than a fixed 3x2 grid with
# a "stop at 5" counter; for the five columns above it is still 3x2, (10, 15).
n_rows = (len(skew_columns) + 1) // 2
fig, ax = plt.subplots(n_rows, 2, figsize=(10, 5 * n_rows), squeeze=False)

# zip stops once the columns run out, leaving any spare axis empty.
for axis, column in zip(ax.ravel(), skew_columns):
    sns.kdeplot(data[column], ax=axis)
    axis.set_title(column, fontsize=15)

### divide data

In [None]:
# Undo the earlier concat: the first `train_len` rows are the training
# features, the remaining rows are the test features to predict on.
x = data.iloc[:train_len]
sub = data.iloc[train_len:]

## 3. Modeling

### Except Keras

In [None]:
# Candidate regressors, seeded where the estimator accepts a seed.
gboost = GradientBoostingRegressor(random_state=2019)
xgboost = xgb.XGBRegressor(random_state=2019)
lightgbm = lgb.LGBMRegressor(random_state=2019)
lasso = Lasso(alpha=1.0, random_state=2019)
# Price is a continuous target, so use the support-vector *regressor* (SVR),
# not the SVC classifier. The instance gets its own name so the imported
# `svm` module is not overwritten; rebinding it made this cell fail when
# it was run a second time.
svr = svm.SVR(kernel='rbf', C=1, gamma=0.1)

# Models that take part in the blend.
models = [{'model':xgboost, 'name':'XGBoost'},
          {'model':gboost, 'name': 'GradientBoosting'},
          {'model':lightgbm, 'name':'LightGBM'}]
# Left out of the blend (original note: too slow and too low score):
#           {'model':lasso, 'name': 'Lasso'},
#           {'model':svr, 'name': 'SVM'}

In [None]:
def get_cv_score(models):
    """Print the mean 5-fold cross-validation score of each model.

    Each entry of ``models`` is a dict with a ``'model'`` estimator and a
    ``'name'`` label. Scores are computed on the notebook-level training data
    ``x`` / ``y`` with each estimator's default scorer.
    """
    # Pass the splitter itself to cross_val_score. Previously it was reduced
    # to an int via get_n_splits() and then handed to str.format(), so it was
    # never used. shuffle=True is required for random_state to take effect.
    kfold = KFold(n_splits=5, shuffle=True, random_state=2019)
    features = x.values
    for m in models:
        scores = cross_val_score(m['model'], features, y, cv=kfold)
        print("Model {} CV score : {:.4f}".format(m['name'], np.mean(scores)))

In [None]:
# Print a cross-validation score for every model in the blend list.
get_cv_score(models)

In [None]:
def AveragingBlending(models, x, y, sub_x):
    """Fit every model on (x, y) and return the mean of their predictions.

    ``models`` is a list of dicts whose ``'model'`` entry exposes ``fit`` and
    ``predict``. ``x`` and ``sub_x`` must expose ``.values``. All models are
    fitted first; afterwards each one predicts on ``sub_x`` and the
    predictions are averaged row by row with equal weight.
    """
    estimators = [entry['model'] for entry in models]

    # Training pass: every estimator sees the same feature matrix.
    for estimator in estimators:
        estimator.fit(x.values, y)

    # One column of predictions per estimator, then average across columns.
    per_model = [estimator.predict(sub_x.values) for estimator in estimators]
    stacked = np.column_stack(per_model)
    return stacked.mean(axis=1)

In [None]:
# Fit the blend on the training data and predict the test rows.
# Author's note: the kernel died when this ran alongside Keras training in another notebook.
y_pred = AveragingBlending(models, x, y, sub)

In [None]:
# Pair the saved test ids with the blended price predictions.
merged = pd.DataFrame(data={'id':sub_id, 'price':y_pred})

In [None]:
# Write the blended predictions to a CSV file, without the index column.
merged.to_csv('lgbm+XGB+gboost_after_preprocessing.csv', index=False)

### Keras

In [None]:
# Fix the random seeds of NumPy and TensorFlow.
seed = 0
np.random.seed(seed)
# TensorFlow 2.x replaced tf.set_random_seed with tf.random.set_seed; use
# whichever the installed version provides so the cell runs on both.
if hasattr(tf, 'random') and hasattr(tf.random, 'set_seed'):
    tf.random.set_seed(seed)
else:
    tf.set_random_seed(seed)

In [None]:
# Hold out 0.1% of the training rows as a tiny sanity-check set
# (author's note: 16 samples out of 15035).
split = train_test_split(x, y, test_size=0.001, random_state=seed)
X_train, X_test, Y_train, Y_test = split
print(*(part.shape for part in split))

In [None]:
# Fully connected regression network: 19 inputs -> 190 -> 380 -> 100 -> 50
# -> 25 -> 5 ReLU units -> 1 linear output.
model = Sequential()
model.add(Dense(190, input_dim=19, activation='relu'))
hidden_units = (380, 100, 50, 25, 5)
for units in hidden_units:
    model.add(Dense(units, activation='relu'))
# Single output unit with no activation: the predicted price.
model.add(Dense(1))

In [None]:
# Train with Adam on mean squared error (author's note: accuracy cannot be
# used as a metric here).
model.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
# Train on the training split for 100 epochs with mini-batches of 4 rows.
model.fit(X_train, Y_train, epochs=100, batch_size=4)

In [None]:
# Predict the held-out rows; flatten the model output to a 1-D array.
Y_prediction = model.predict(X_test).flatten()

# Peek at up to 5 samples. Slicing instead of indexing 0..4 avoids an
# IndexError when the hold-out split contains fewer than five rows.
for label, prediction in zip(Y_test.values[:5], Y_prediction[:5]):
    print("실제가격: {:.3f}, 예상가격: {:.3f},      오차: {:.3f}".format(label, prediction, abs(label - prediction)))

# Root-mean-squared error over the whole hold-out split; the bare name on the
# last line makes the notebook display the value.
RMSE = sqrt(mean_squared_error(Y_test, Y_prediction))
RMSE

In [None]:
# Predict prices for the test feature rows and flatten to a 1-D array.
test_predict = model.predict(sub).flatten()

In [None]:
# Pair the saved test ids with the Keras predictions.
# NOTE(review): this rebinds `sub`, which until now held the test feature rows.
sub = pd.DataFrame(data={'id':sub_id, 'price':test_predict})

In [None]:
# Write the Keras predictions to a CSV file, without the index column.
sub.to_csv('Keras_4th_add_more_layers.csv', index=False)

# RMSE score

* LightGBM : 121,182 - 04/07
* lgbm+XGB+gboost_after_preprocessing : 129,439 - 04/07
* merged model : 167,483 - 04/07
* 3rd keras : 281,574 - layer(40,20,5,1) - 04/07
* 4th keras : 281,639 - layer(19,190,380,100,50,25,5,1) - 04/07  / rmse = 182,448

keras - layer(19,190,380,100,50,25,5,1) - rmse=238,108