# Personal Project for ML/DL : [House Prices: Advanced Regression Techniques](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data)

# Index
## Step 1 : Preprocessing(이전 단계의 전처리 과정 중 Outlier 제거를 제외한 과정)
## Step2 : Set Deep Neural Network
## Step3 : Select optimal Network and Parameters

In [0]:
# from keras.callbacks import ModelCheckpoint
# from keras.models import Sequential
import keras
from keras.layers import Dense, Activation, Flatten
from keras.callbacks import TensorBoard
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error 
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
import os

# Widen pandas' display limits so wide/long frames are not truncated in cell output.
pd.set_option('display.max_columns', 400)
pd.set_option('display.max_rows', 200)

In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The CSV paths read and written later in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


## Step 1 : Preprocessing(이전 단계의 전처리 과정 중 Outlier 제거를 제외한 과정)
* Missing value 대체
* 상대적 우위를 갖는 categorical 변수들을 상대적 우위에 따라 숫자로 변환
* numeric value가 상대적 우위를 갖지 않는 numeric 변수들의 카테고리화
* 기존 변수의 조합/변형으로 새로운 변수 생성
* 모든 categorical 변수의 one-hot-encoding
* 여기서는 version1과 version2로 구분하지 않고 모든 변수 사용

In [0]:
def create_merge_frame(df_train, df_test):
    """Stack the train and test feature columns into one frame.

    The ``Id`` column is removed from both inputs and the ``SalePrice``
    target is removed from the training frame, so that both halves share the
    same feature columns before being concatenated (train rows first).

    Unlike an in-place drop, the inputs are left untouched, so the function
    can be called repeatedly (e.g. when a notebook cell is re-run) without
    raising ``KeyError`` on already-removed columns.

    Args:
        df_train: Training frame containing ``Id`` and ``SalePrice`` columns.
        df_test: Test frame containing an ``Id`` column.

    Returns:
        A new DataFrame with the train rows followed by the test rows and a
        fresh 0..n-1 index.

    Raises:
        KeyError: If a column to be removed is missing from its frame.
    """
    train_features = df_train.drop(columns=['Id', 'SalePrice'])
    test_features = df_test.drop(columns=['Id'])

    return pd.concat([train_features, test_features]).reset_index(drop=True)

In [0]:
# Quality scale shared by several ordinal columns, worst to best.
_QUALITY_SCALE = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
_FINISH_SCALE = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']

# Ordinal columns whose worst grade is encoded as 1 (the feature always exists).
_ORDINAL_FROM_ONE = {
    'LandSlope': ['Sev', 'Mod', 'Gtl'],
    'ExterQual': _QUALITY_SCALE,
    'ExterCond': _QUALITY_SCALE,
    'HeatingQC': _QUALITY_SCALE,
    'KitchenQual': _QUALITY_SCALE,
    'Electrical': ['Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr'],
    'Functional': ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
    'LotShape': ['IR3', 'IR2', 'IR1', 'Reg'],
    'LandContour': ['Low', 'HLS', 'Bnk', 'Lvl'],
    'Utilities': ['ELO', 'NoSeWa', 'NoSewr', 'AllPub'],
}

# Ordinal columns whose lowest code is 0 (the feature may be absent -> 'NA').
_ORDINAL_FROM_ZERO = {
    'BsmtQual': ['NA'] + _QUALITY_SCALE,
    'BsmtCond': ['NA'] + _QUALITY_SCALE,
    'FireplaceQu': ['NA'] + _QUALITY_SCALE,
    'GarageQual': ['NA'] + _QUALITY_SCALE,
    'GarageCond': ['NA'] + _QUALITY_SCALE,
    'BsmtExposure': ['NA', 'No', 'Mn', 'Av', 'Gd'],
    'BsmtFinType1': _FINISH_SCALE,
    'BsmtFinType2': _FINISH_SCALE,
    'CentralAir': ['N', 'Y'],
    'GarageFinish': ['NA', 'Unf', 'RFn', 'Fin'],
    'PoolQC': ['NA', 'Fa', 'TA', 'Gd', 'Ex'],
    'Fence': ['NA', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],
}


def _ordinal_codes(series, categories):
    """Return the position of each value in ``categories`` (-1 if not listed)."""
    # ``astype("category", categories=...)`` is no longer accepted by pandas;
    # an explicit CategoricalDtype is the supported spelling.
    dtype = pd.CategoricalDtype(categories=categories, ordered=True)
    return series.astype(dtype).cat.codes


def preprocessing_data(df_merge):
    """Impute, ordinal-encode, engineer features and one-hot encode the data.

    Steps, in order:
      1. Fill missing values: ``'NA'`` for categorical columns where a missing
         value is treated as "feature absent", ``0`` for the matching numeric
         columns, the mode for the remaining categorical columns and the
         median for ``LotFrontage``.
      2. Convert ordered categorical columns to integer codes following the
         category orders in ``_ORDINAL_FROM_ONE`` / ``_ORDINAL_FROM_ZERO``.
      3. Turn ``MSSubClass``, ``MoSold`` and ``YrSold`` into strings so they
         are one-hot encoded instead of being treated as magnitudes.
      4. Add the ``cr_*`` combined-area and existence-flag columns.
      5. One-hot encode every remaining categorical column.

    Fixes relative to the earlier implementation:
      * ``PoolQC`` and ``FireplaceQu`` are filled with ``'NA'`` (previously
        ``0`` and ``'None'``), so an absent pool/fireplace is encoded as 0
        rather than falling outside the category list and becoming -1.
      * ``KitchenQual`` is mode-filled like the other "always present"
        ordinal columns, so a missing value no longer becomes 0 on a scale
        that starts at 1.
      * Columns are filled by assignment instead of chained
        ``fillna(inplace=True)``, which does not update the frame under
        pandas Copy-on-Write.

    Args:
        df_merge: Combined train/test feature frame. It is modified in place
            (filled, encoded and extended with the ``cr_*`` columns).

    Returns:
        A new DataFrame produced by ``pd.get_dummies`` on the modified frame.
    """
    bsmt_labels = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
    bsmt_numbers = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']
    garage_labels = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
    garage_numbers = ['GarageYrBlt', 'GarageCars', 'GarageArea']

    # Missing means "the house does not have this feature".
    absent_label_cols = (['Alley'] + bsmt_labels + garage_labels
                         + ['PoolQC', 'FireplaceQu', 'Fence', 'MiscFeature'])
    absent_number_cols = bsmt_numbers + garage_numbers + ['MasVnrArea']

    for col in absent_label_cols:
        df_merge[col] = df_merge[col].fillna('NA')
    for col in absent_number_cols:
        df_merge[col] = df_merge[col].fillna(0)

    # Masonry veneer keeps the data set's own 'None' label (it is one-hot encoded).
    df_merge['MasVnrType'] = df_merge['MasVnrType'].fillna('None')

    # Remaining categorical columns with nulls are filled with the mode.
    mode_fill_cols = ['Electrical', 'Functional', 'Utilities', 'KitchenQual',
                      'MSZoning', 'Exterior1st', 'Exterior2nd', 'SaleType']
    for col in mode_fill_cols:
        df_merge[col] = df_merge[col].fillna(df_merge[col].mode()[0])

    # Remaining numeric column with nulls is filled with the median.
    df_merge['LotFrontage'] = df_merge['LotFrontage'].fillna(df_merge['LotFrontage'].median())

    # Ordered categoricals -> integers that follow the grade order.
    for col, categories in _ORDINAL_FROM_ONE.items():
        df_merge[col] = _ordinal_codes(df_merge[col], categories) + 1
    for col, categories in _ORDINAL_FROM_ZERO.items():
        df_merge[col] = _ordinal_codes(df_merge[col], categories)

    # Numeric columns whose values carry no ordering are treated as categories.
    for col in ('MSSubClass', 'MoSold', 'YrSold'):
        df_merge[col] = df_merge[col].astype(str)

    # New features built from combinations of existing columns.
    df_merge['cr_TotalPorchSF'] = (df_merge['WoodDeckSF'] + df_merge['OpenPorchSF']
                                   + df_merge['EnclosedPorch'] + df_merge['3SsnPorch']
                                   + df_merge['ScreenPorch'])
    df_merge['cr_TotalSF'] = (df_merge['GrLivArea'] + df_merge['TotalBsmtSF']
                              + df_merge['cr_TotalPorchSF'] + df_merge['PoolArea'])

    # Existence flags; Fence is already an ordinal code where 0 means 'NA'.
    df_merge['cr_ExistFence'] = (df_merge['Fence'] != 0).astype(int)
    df_merge['cr_ExistPool'] = (df_merge['PoolArea'] > 0).astype(int)
    df_merge['cr_ExistBsmt'] = (df_merge['TotalBsmtSF'] > 0).astype(int)
    df_merge['cr_ExistGarage'] = (df_merge['GarageArea'] > 0).astype(int)
    df_merge['cr_ExistFireplaces'] = (df_merge['Fireplaces'] > 0).astype(int)

    # One-hot encode every remaining categorical column.
    df_all = pd.get_dummies(df_merge)

    return df_all

In [5]:
# Load the raw competition data from the mounted Drive folder.
df_train = pd.read_csv('/content/drive/My Drive/Colab_Notebooks/__data/train.csv')
df_test = pd.read_csv('/content/drive/My Drive/Colab_Notebooks/__data/test.csv')

# Report (rows, columns) for each frame, one per line.
print(df_train.shape, df_test.shape, sep='\n')

(1460, 81)
(1459, 80)


In [0]:
# Capture the target first: the merged feature frame built next excludes SalePrice.
dfy = df_train["SalePrice"]
# Combine the train and test features (train rows first) so both get identical preprocessing.
df_merge = create_merge_frame(df_train, df_test)
# Impute, encode and one-hot encode the combined frame.
df_all = preprocessing_data(df_merge)

In [7]:
# The first len(dfy) rows of the combined frame are the training rows;
# everything after them is the test set.
train = df_all.iloc[:len(dfy)]
test = df_all.iloc[len(dfy):]

# Print each shape on its own line as a sanity check.
print(*(frame.shape for frame in (df_merge, df_all, dfy, train, test)), sep='\n')

(2919, 86)
(2919, 256)
(1460,)
(1460, 256)
(1459, 256)


## Step2 : Set Deep Neural Network
(1). `Sequential`로 모형 클래스 객체를 생성한다.

(2). `add` method로 layer를 추가한다.
- 입력 layer부터 순차적으로 추가한다.
- 첫 번째 layer는 `input_dim` parameter를 사용하여 입력 크기를 설정한다.
- 각 layer는 첫 번째 parameter로 출력 뉴런의 수를 입력한다.
- 각 layer는 `activation` parameter로 activation function을 지정한다. (activation function은 linear한 단순 합을 입력 받아 non-linearity를 제공한다.)
- `kernel_initializer` parameter로 각 layer의 초기 가중치를 설정하는 방법을 지정한다.
- 마지막 layer의 출력 뉴런의 수는 output의 종류에 맞게 설정한다.(여기서는 1)

(3). `compile` method로 모형을 완성한다.
- `loss` parameter로 loss function을 지정한다.
- `optimizer` parameter로 최적화 알고리즘을 지정한다.
- `metrics` parameter로 training 단계에서 기록할 성능 기준을 설정한다.

(4). `fit` method로 training을 진행한다.
- `epochs` parameter로 반복될 epoch의 수를 지정한다.
- `validation_split` parameter로 training data 중 검증(hold-out validation)에 사용될 비율을 지정한다.


In [0]:
def modeling_DL_algos(dense_1, h_num, dense_2, act_1, act2, optimizer,
                      epochs=500, validation_split=0.1):
  """Build, compile and fit a fully connected regression network.

  The network is an input Dense layer, ``h_num`` further Dense layers of
  equal width, and a single linear output unit. It is trained on the
  module-level ``train`` features and ``dfy`` target, which must be defined
  before this function is called.

  Args:
    dense_1: Number of units in the first Dense layer.
    h_num: Number of additional hidden Dense layers.
    dense_2: Number of units in each additional hidden layer.
    act_1: Activation for the first layer.
    act2: Activation for the additional hidden layers.
    optimizer: Optimizer passed to ``compile``.
    epochs: Number of training epochs (default 500, the previous fixed value).
    validation_split: Fraction of the training data passed to ``fit`` as
      ``validation_split`` (default 0.1, the previous fixed value).

  Returns:
    The fitted Sequential model.
  """
  dl_model = keras.models.Sequential()

  # Only the first layer declares the input width (one input per feature column).
  dl_model.add(Dense(dense_1, kernel_initializer='normal', input_dim=train.shape[1], activation=act_1))

  for _ in range(h_num):
    dl_model.add(Dense(dense_2, kernel_initializer='normal', activation=act2))

  # Single linear unit: the model outputs one unbounded value per row.
  dl_model.add(Dense(1, kernel_initializer='normal', activation='linear'))
  dl_model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mean_squared_error'])

  dl_model.fit(train, dfy, epochs=epochs, validation_split=validation_split)

  return dl_model

In [0]:
def store_DL_models(first_dense, h_num, other_dense, opt):
  """Predict on the test set and write a submission CSV.

  Uses the module-level ``dl_model`` and ``test`` objects; the four
  arguments are only used to build the output file name
  ``DL_<first_dense>_<h_num>_<other_dense>_<opt>.csv``.
  """
  predicted = dl_model.predict(test)

  # Ids are re-read from the raw test file.
  test_ids = pd.read_csv('/content/drive/My Drive/Colab_Notebooks/__data/test.csv').Id

  # Column 0 of the prediction array is used as the predicted SalePrice.
  submission = pd.DataFrame({'Id': test_ids, 'SalePrice': predicted[:, 0]})

  out_path = '/content/drive/My Drive/Colab_Notebooks/DL_sub/DL_{}_{}_{}_{}.csv'.format(first_dense, h_num, other_dense, opt)
  submission.to_csv(out_path, index=False)

In [0]:
# Short alias for TensorFlow's leaky ReLU function; the training cells pass it
# to modeling_DL_algos as a layer activation.
leaky_relu = tf.nn.leaky_relu

* hidden layer의 개수 : 3~5개
* 각 layer 출력 뉴런의 개수 : 128, 256, 512
* activation function : ReLU, Leaky ReLU
* optimizer : SGD, RMSProp, Adam

## Step3 : 최적의 Network와 parameter 선정

(1) SGD
  - learning_rate의 조절(0.00001 ~ 1)에도 prediction 시 nan값 발생
  

(2) Adam
  - best score : 0.15321
  - first layer : 출력 뉴런 128, leaky relu
  - hidden layer : 4개, 출력 뉴런 128, relu


(3) RMSProp
  - best score : 0.17380
  - first layer : 출력 뉴런 128, leaky relu
  - hidden layer : 4개, 출력 뉴런 128, leaky relu

In [0]:
%%time
# SGD run: 128-unit first layer and 4 hidden layers of 128 units, all leaky ReLU.
# The notes in this notebook report NaN predictions with SGD for learning rates from 0.00001 to 1.
sgd = keras.optimizers.SGD(lr=0.0005)
dl_model = modeling_DL_algos(128, 4, 128, leaky_relu, leaky_relu, sgd)
# Write the test-set predictions to DL_128_4_128_sgd.csv.
store_DL_models(128, 4, 128, 'sgd')

In [0]:
%%time
# Adam run: 128-unit leaky-ReLU first layer and 4 hidden layers of 128 ReLU units.
adam = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
dl_model = modeling_DL_algos(128, 4, 128, leaky_relu, 'relu', adam)
# Write the test-set predictions to DL_128_4_128_adam.csv.
store_DL_models(128, 4, 128, 'adam')

In [0]:
%%time
# RMSprop run: 128-unit first layer and 4 hidden layers of 128 units, all leaky ReLU.
rmsprop = keras.optimizers.RMSprop(lr=0.001, decay=0.0)
dl_model = modeling_DL_algos(128, 4, 128, leaky_relu, leaky_relu, rmsprop)
# Write the test-set predictions to DL_128_4_128_rmsprop.csv.
store_DL_models(128, 4, 128, 'rmsprop')