# E6. Kaggle Competition : House Price Prediction(2019)

## 1. 데이터 준비 및 전처리 작업

In [13]:
# 각종 라이브러리 호출
import os
import warnings
warnings.filterwarnings("ignore")
import missingno as msno
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
import numpy as np # 배열
import pandas as pd # 데이터 프레임
from os.path import join # 링크 결합
from sklearn.model_selection import train_test_split # 데이터셋 구분
from sklearn.metrics import mean_squared_error # RMSE 점수 계산
from xgboost import XGBRegressor # 모델 1
from lightgbm import LGBMRegressor # 모델 2
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor # 모델 3, 4
from sklearn.model_selection import GridSearchCV # 그리드 탐색

# 파일경로 설정
data_dir = os.getenv('HOME')+'/aiffel/kaggle_kakr_housing/data'

train_data_path = join(data_dir, 'train.csv')
test_data_path = join(data_dir, 'test.csv') 

train = pd.read_csv(train_data_path)
test = pd.read_csv(test_data_path)

# 데이터 전처리(1) - train 데이터
train['date'] = train['date'].apply(lambda i: i[:6]).astype(int) # date 정수형 데이터 전환
del train['id'] # id 제거

# 데이터 전처리(2) - price 컬럼 제거
y = train['price']
del train['price']

# 데이터 전처리(3) - test 데이터
test['date'] = test['date'].apply(lambda i: i[:6]).astype(int) # date 정수형 데이터 전환
del test['id'] # id 제거

# 로그변환을 통한 정규분포화
y = np.log1p(y)

In [14]:
# 모델 입력
random_state=2020 # 랜덤시드 값 고정(None 초기값)

gboost = GradientBoostingRegressor(random_state=random_state)
xgboost = XGBRegressor(random_state=random_state)
lightgbm = LGBMRegressor(random_state=random_state)
rdforest = RandomForestRegressor(random_state=random_state)

models = [gboost, xgboost, lightgbm, rdforest]

# 모델별 RMSE 값 확인
get_scores(models, train, y)

Unnamed: 0,RMSE
GradientBoostingRegressor,128360.196497
RandomForestRegressor,125487.071025
LGBMRegressor,111920.367359
XGBRegressor,110318.669566


## 2. 모델 튜닝


In [20]:
# 함수 적용 1(RMSE)
def rmse(y_test, y_pred):
    return np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(y_pred))) # 원래 데이터에 맞게 계산

# 함수 적용 2(모델별 RMSE 값 확인)
def get_scores(models, train, y):
    df = {}
    
    for model in models:
        # 모델 이름 획득
        model_name = model.__class__.__name__

        # train, test 데이터셋 분리
        X_train, X_test, y_train, y_test = train_test_split(train, y, random_state=random_state, test_size=0.2)

        # 모델 학습
        model.fit(X_train, y_train)
    
        # 예측
        y_pred = model.predict(X_test)

        # 예측 결과 rmse값 저장
        df[model_name] = rmse(y_test, y_pred)
    
        # data frame 저장
        score_df = pd.DataFrame(df, index=['RMSE']).T.sort_values('RMSE', ascending=False)
    
    return score_df

# 함수 적용 3(GridSearchCV)
'''
param_grid : 탐색할 파라미터의 종류 (딕셔너리로 입력)
scoring : 모델의 성능을 평가할 지표
cv : cross validation을 수행하기 위해 train 데이터셋을 나누는 조각의 개수
verbose : 그리드 탐색을 진행하면서 진행 과정을 출력해서 보여줄 메세지의 양 (숫자가 클수록 더 많은 메세지를 출력합니다.)
n_jobs : 그리드 탐색을 진행하면서 사용할 CPU의 개수
'''
def my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5):
    # GridSearchCV
    grid_model = GridSearchCV(model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, verbose=verbose, n_jobs=n_jobs)
    
    # 모델 피팅
    grid_model.fit(train, y)
    
    # 결과값 저wkd
    params = grid_model.cv_results_['params']
    score = grid_model.cv_results_['mean_test_score']
    
    # 데이터 프레임 생성
    results = pd.DataFrame(params)
    results['score'] = score
    
    # RMSLE 값 계산 후 정렬
    # results['RMSE'] = np.sqrt(-1 * results['score'])
    # results = results.rename(columns={'RMSE': 'RMSLE'})
    # results.sort_values(by=['RMSLE'], axis=0) # or results = results.sort_values('RMSLE')
    results['RMSLE'] = np.sqrt(-1 * results['score'])
    results = results.sort_values('RMSLE')
    
    return results

In [23]:
# 하이퍼 파라미터 및 모델 입력(LightGBM)
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [1, 5, 10],
    'learning_rate': [0.1, 0.5, 1],
    'min_child_weight': [1, 10, 20],
    'num_leaves': [31, 60, 90],
}

model1 = LGBMRegressor(random_state=random_state)

# 함수 구현(그리드 탐색)
my_GridSearch(model1, train, y, param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  52 tasks      | elapsed:    1.5s
[Parallel(n_jobs=5)]: Done 294 tasks      | elapsed:   14.2s
[Parallel(n_jobs=5)]: Done 515 tasks      | elapsed:   29.8s
[Parallel(n_jobs=5)]: Done 798 tasks      | elapsed:   48.8s
[Parallel(n_jobs=5)]: Done 1163 tasks      | elapsed:  1.2min
[Parallel(n_jobs=5)]: Done 1215 out of 1215 | elapsed:  1.2min finished


Unnamed: 0,learning_rate,max_depth,min_child_weight,n_estimators,num_leaves,score,RMSLE
78,0.1,10,20,150,31,-0.026458,0.162660
60,0.1,10,1,150,31,-0.026458,0.162660
69,0.1,10,10,150,31,-0.026458,0.162660
61,0.1,10,1,150,60,-0.026753,0.163564
70,0.1,10,10,150,60,-0.026753,0.163564
...,...,...,...,...,...,...,...
10,0.1,1,10,50,60,-0.073394,0.270914
9,0.1,1,10,50,31,-0.073394,0.270914
2,0.1,1,1,50,90,-0.073394,0.270914
1,0.1,1,1,50,60,-0.073394,0.270914


In [25]:
# 하이퍼 파라미터 및 모델 입력(xgboost)
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [1, 5, 10],
    'learning_rate': [0.1, 0.5, 1],
    'min_child_weight': [1, 10, 20],
    'gamma': [0, 10, 30]
}


model2 = LGBMRegressor(random_state=random_state)

# 함수 구현(그리드 탐색)
my_GridSearch(model2, train, y, param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  52 tasks      | elapsed:    1.8s
[Parallel(n_jobs=5)]: Done 294 tasks      | elapsed:   15.1s
[Parallel(n_jobs=5)]: Done 700 tasks      | elapsed:   36.1s
[Parallel(n_jobs=5)]: Done 1206 out of 1215 | elapsed:  1.0min remaining:    0.5s
[Parallel(n_jobs=5)]: Done 1215 out of 1215 | elapsed:  1.1min finished


Unnamed: 0,gamma,learning_rate,max_depth,min_child_weight,n_estimators,score,RMSLE
188,30,0.1,10,20,150,-0.026458,0.162660
185,30,0.1,10,10,150,-0.026458,0.162660
182,30,0.1,10,1,150,-0.026458,0.162660
101,10,0.1,10,1,150,-0.026458,0.162660
104,10,0.1,10,10,150,-0.026458,0.162660
...,...,...,...,...,...,...,...
87,10,0.1,1,20,50,-0.073394,0.270914
162,30,0.1,1,1,50,-0.073394,0.270914
165,30,0.1,1,10,50,-0.073394,0.270914
81,10,0.1,1,1,50,-0.073394,0.270914


In [17]:
# 함수 구현(모델 구현 및 저장)
def save_submission(model, train, y, test, model_name, rmsle=None):
    # 모델 학습 및 예측
    model.fit(train, y)
    prediction = model.predict(test)
    
    # 변환함수 적용
    prediction = np.expm1(prediction)
    
    # 데이터 프레임 불러들이기
    data_dir = os.getenv('HOME')+'/aiffel/kaggle_kakr_housing/data'
    submission_path = join(data_dir, 'sample_submission.csv')
    submission = pd.read_csv(submission_path)
    
    # 데이터 프레임 편집
    submission['price'] = prediction
    
    # 파일저장
    submission_csv_path = '{}/submission_{}_RMSLE_{}.csv'.format(data_dir, model_name, rmsle)
    submission.to_csv(submission_csv_path, index=False)
    
    print('{} saved!'.format(submission_csv_path))

In [26]:
model = LGBMRegressor(n_estimators=150, max_depth=10, learning_rate=0.1, min_child_weight=20, num_leaves=31, random_state=random_state)
save_submission(model, train, y, test, 'lgbm', rmsle='0.0162')

/aiffel/aiffel/kaggle_kakr_housing/data/submission_lgbm_RMSLE_0.0162.csv saved!
