In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
pip install catboost

Collecting catboost
  Downloading catboost-0.26.1-cp37-none-manylinux1_x86_64.whl (67.4 MB)
[K     |████████████████████████████████| 67.4 MB 30 kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.26.1


In [6]:
# pip install catboost

import os
import numpy as np
import pandas as pd
import gc # garbage collector 불필요하게 메모리를 잡는것을 방지 함

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import model_selection
from sklearn.model_selection import train_test_split

# 1	StandardScaler	기본 스케일. 평균과 표준편차 사용
# 2	MinMaxScaler	최대/최소값이 각각 1, 0이 되도록 스케일링
# 3	MaxAbsScaler	최대절대값과 0이 각각 1, 0이 되도록 스케일링
# 4	RobustScaler	중앙값(median)과 IQR(interquartile range) 사용. 아웃라이어의 영향을 최소화
from sklearn.preprocessing import RobustScaler

import lightgbm as lgb # 최신 알고리즘
import xgboost as xgb # 최신 알고리즘
from catboost import CatBoostRegressor # Colab에서 에러뜸

# Allows the use of display() for DataFrames
from IPython.display import display # 데이터프레임을 시각화해서 보여주기 위한 모듈

import warnings # 불필요한 경고 메세지 창 무시
warnings.filterwarnings('ignore') 

# check where data exists
os.listdir('./drive/MyDrive/machine_learning_data')

['train.csv',
 'test.csv',
 'friend.csv',
 'abalone.data',
 'train_data.csv',
 'Naver_opinion.txt',
 'dolphine.jpg',
 'NanumBarunGothic.ttf',
 'wc_result.png',
 'wc_result1.png',
 'test_csv.txt',
 'crawlings__210510.log',
 '.ipynb_checkpoints',
 'new_friend.csv',
 'new_friend_index_true.csv',
 'DataFrame_1.ipynb']

In [7]:
# Load Train and Test Data

train_df = pd.read_csv('./drive/MyDrive/machine_learning_data/train.csv')
test_df = pd.read_csv('./drive/MyDrive/machine_learning_data/test.csv')

In [9]:
# chcek data size

print('train_df.shape : ', train_df.shape)
print('test_df.shape :', test_df.shape) # test data라 train data와 다르게 target이 존재하지 않아서 columns 갯수가 차이가 존재

train_df.shape :  (4459, 4993)
test_df.shape : (49342, 4992)


In [10]:
train_df.head()

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,dc5a8f1d8,11d86fa6a,77c9823f2,8d6c2a0b2,4681de4fd,adf119b9a,cff75dd09,96f83a237,b8a716ebf,6c7a4567c,4fcfd2b4d,f3b9c0b95,71cebf11c,d966ac62c,68b647452,c88d108c9,ff7b471cd,d5308d8bc,0d866c3d7,bc3f77679,bd8f989f1,0eff5bf95,22ed6dba3,92b13ebba,c330f1a67,233c7c17c,2cb4d123e,eeac16933,87ffda550,...,969caa87a,00302fe51,1189ee335,ca04a07ca,f6f15ffa5,841704460,ea5ed6ff7,b1bb8eac3,8132d18b8,c24ea6548,cdfc2b069,2a879b4f7,6b119d8ce,98dea9e42,9f2471031,88458cb21,f40da20f4,7ad6b38bd,c901e7df1,8f55955dc,85dcc913d,5ca0b9b0c,eab8abf7a,8d8bffbae,2a1f6c7f9,9437d8b64,5831f4c76,2e84e09c5,d45fd5508,a165f5761,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0,1300000.0,...,0,0,1100000.0,0,0,0,0.0,0.0,14800000,0.0,1200000.0,0.0,0.0,0,0,0,0,0.0,4000000,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,2200000.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,2000000.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0


In [11]:
# Train and Test Data Info

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Columns: 4993 entries, ID to 9fc776466
dtypes: float64(1845), int64(3147), object(1)
memory usage: 169.9+ MB


In [14]:
# Data Preprocessing 데이터 전처리 -> 가장 중요한 부분 성능의 90퍼센트 차지
# 결측치 유무

train_df.isnull().sum().sum() # 결측치 X
test_df.isnull().sum() # test데이터 도 결측치 X

ID           0
48df886f9    0
0deb4b6a8    0
34b15f335    0
a8cb14b00    0
            ..
71b203550    0
137efaa80    0
fb36b89d9    0
7e293fbaf    0
9fc776466    0
Length: 4992, dtype: int64

In [16]:
# 모델 학습에 영향없는 상수 column 삭제

train_df.head()
colsToRemove = []
for col in train_df.columns:
  if col != "ID" and col != 'traget':
    if train_df[col].std() == 0: # 표준편차가 0이면 값이 다 같다는 뜻
      colsToRemove.append(col)

train_df.drop(colsToRemove,axis=1,inplace=True) # axis default 값 -> axis = 0 row 방향 , axis = 1 column 방향
test_df.drop(colsToRemove,axis=1,inplace=True)

In [17]:
print("삭제된 컬럼 수 : ",len(colsToRemove))
print("삭제된 컬럼 : ",colsToRemove)

삭제된 컬럼 수 :  256
삭제된 컬럼 :  ['d5308d8bc', 'c330f1a67', 'eeac16933', '7df8788e8', '5b91580ee', '6f29fbbc7', '46dafc868', 'ae41a98b6', 'f416800e9', '6d07828ca', '7ac332a1d', '70ee7950a', '833b35a7c', '2f9969eab', '8b1372217', '68322788b', '2288ac1a6', 'dc7f76962', '467044c26', '39ebfbfd9', '9a5ff8c23', 'f6fac27c8', '664e2800e', 'ae28689a2', 'd87dcac58', '4065efbb6', 'f944d9d43', 'c2c4491d5', 'a4346e2e2', '1af366d4f', 'cfff5b7c8', 'da215e99e', '5acd26139', '9be9c6cef', '1210d0271', '21b0a54cb', 'da35e792b', '754c502dd', '0b346adbd', '0f196b049', 'b603ed95d', '2a50e001c', '1e81432e7', '10350ea43', '3c7c7e24c', '7585fce2a', '64d036163', 'f25d9935c', 'd98484125', '95c85e227', '9a5273600', '746cdb817', '6377a6293', '7d944fb0c', '87eb21c50', '5ea313a8c', '0987a65a1', '2fb7c2443', 'f5dde409b', '1ae50d4c3', '2b21cd7d8', '0db8a9272', '804d8b55b', '76f135fa6', '7d7182143', 'f88e61ae6', '378ed28e0', 'ca4ba131e', '1352ddae5', '2b601ad67', '6e42ff7c7', '22196a84c', '0e410eb3d', '992e6d1d3', '90a742107'

In [29]:
df_col = train_df[value].columns


1


In [30]:
# 중복 column 삭제 (중복 row 아님)
# iloc vs loc 차이
# iloc는 인덱스와 컬럼을 리스트 배열로 선택하는 것 -> 슬라이싱하기 용이함
# loc는 인덱스와 컬럼을 문자로 선택하는 것 -> 특정 데이터 칼럼을 찾기 위해 사용


groups = train_df.columns.to_series().groupby(train_df.dtypes).groups # 타입별로 그룹화
print(groups.keys())

dup_col = []

for key,value in group.items():
  df_col = train_df[value].columns
  df = train_df[value]
  I_df_col = len(df_col)

  for i in range(I_df_col):
    i_df = df.iloc[:,i].values # iloc => 
   
    for j in range(i+1,I_df_col):
      j_df = df.iloc[:,j].values
      if np.array_equal(i_df,j_df):
        dup_col.append(df_col[i])

train_df = train_df.drop(dup_col,axis=1)
test_df = test_df.drop(dup_col,axis=1)

dict_keys([dtype('int64'), dtype('float64'), dtype('O')])


In [32]:
train_df.head() # 초기 train columns 값보다 200개이상 줄음

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,dc5a8f1d8,11d86fa6a,77c9823f2,8d6c2a0b2,4681de4fd,adf119b9a,cff75dd09,96f83a237,b8a716ebf,6c7a4567c,4fcfd2b4d,f3b9c0b95,71cebf11c,d966ac62c,68b647452,c88d108c9,ff7b471cd,0d866c3d7,bc3f77679,bd8f989f1,0eff5bf95,22ed6dba3,92b13ebba,233c7c17c,2cb4d123e,87ffda550,822e49b95,316b978cd,d04e16aed,...,969caa87a,00302fe51,1189ee335,ca04a07ca,f6f15ffa5,841704460,ea5ed6ff7,b1bb8eac3,8132d18b8,c24ea6548,cdfc2b069,2a879b4f7,6b119d8ce,98dea9e42,9f2471031,88458cb21,f40da20f4,7ad6b38bd,c901e7df1,8f55955dc,85dcc913d,5ca0b9b0c,eab8abf7a,8d8bffbae,2a1f6c7f9,9437d8b64,5831f4c76,2e84e09c5,d45fd5508,a165f5761,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0.0,0,0.0,0.0,0.0,0,1300000.0,0.0,0,0.0,...,0,0,1100000.0,0,0,0,0.0,0.0,14800000,0.0,1200000.0,0.0,0.0,0,0,0,0,0.0,4000000,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,2200000.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,2000000.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0


Data Scale & 정규화 작업은 필수 -> 예측 수렴하는데에 대한 속도가 빨라짐
단 항상 예측 성능 향상에 좋은 것은 아니다
보통 feature 데이터에 변환 작업은 다음과 같다.

StandardScaler, MinMaxScaler 등 스케일/정규화 작업 실시
스케일/정규화 수행한 데이터셋에 다항 특성 등을 적용하여 데이터 변환
2.은 1번 방법에 예측 성능 향상이 없을 경우 보통 사용

원래 값에 log 함수를 적용하면 보다 정규 분포에 가까운 형태로 분포 시킨다.
이러한 변환을 로그 변환이라 부른다.
1,2번 보다 로그 변환이 훨씬 많이 사용되는 변환 방법
그 이유는 1 방법은 예측 성능 향상에 크게 기대하기 어려움
2번 방법은 feature의 개수가 기하급수적으로 늘어날 수 있음
target 데이터에 변환 작업은 다음과 같다.

일반적으로 로그 변환을 적용한다.
target값을 정규분포나 다른 정규값으로 변환하면 변환된 값을 다시 복원하기 힘듦
무엇보다, 타켓 로그 변환을 통해서 예측 성능 향상의 사례가 많이 보고 됨

In [33]:
train_df

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,dc5a8f1d8,11d86fa6a,77c9823f2,8d6c2a0b2,4681de4fd,adf119b9a,cff75dd09,96f83a237,b8a716ebf,6c7a4567c,4fcfd2b4d,f3b9c0b95,71cebf11c,d966ac62c,68b647452,c88d108c9,ff7b471cd,0d866c3d7,bc3f77679,bd8f989f1,0eff5bf95,22ed6dba3,92b13ebba,233c7c17c,2cb4d123e,87ffda550,822e49b95,316b978cd,d04e16aed,...,969caa87a,00302fe51,1189ee335,ca04a07ca,f6f15ffa5,841704460,ea5ed6ff7,b1bb8eac3,8132d18b8,c24ea6548,cdfc2b069,2a879b4f7,6b119d8ce,98dea9e42,9f2471031,88458cb21,f40da20f4,7ad6b38bd,c901e7df1,8f55955dc,85dcc913d,5ca0b9b0c,eab8abf7a,8d8bffbae,2a1f6c7f9,9437d8b64,5831f4c76,2e84e09c5,d45fd5508,a165f5761,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0.0,0,0.0,0.0,0.0,0,1300000.0,0.0,0,0.0,...,0,0,1100000.0,0,0,0,0.0,0.0,14800000,0.0,1200000.0,0.0,0.0,0,0,0,0,0.0,4000000,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,2200000.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,2000000.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4454,ff85154c8,1065000.0,0.0,0,0.0,0,0,0,0,0,70000.0,0.0,0.0,0,0,6000000,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,6000000,0,0.0,0,0,0.0,0,0,0.0,0,0,920000.0,0.0,0.0,0.0,0,0,0,0,0,0,0
4455,ffb6b3f4f,48000.0,0.0,0,0.0,0,0,0,0,0,375000.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,375000.0,0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0.0,80000.0,0.0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,375000.0,0,0,0.0,0.0,0.0,80000.0,0,0,0,0,0,0,0
4456,ffcf61eb6,2800000.0,0.0,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,...,0,0,30000.0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,4291000.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0
4457,ffea67e98,10000000.0,0.0,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0.0,0,0.0,0.0,0.0,0,200000.0,1500000.0,0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0


In [34]:
# Prepare Data for Training & Test

x_train = train_df.drop(['ID','target'],axis=1)
y_train = np.log1p(train_df['target'].values)

x_test = test_df.drop(['ID'],axis=1)

# train_test_split => train,test,val
dev_x,val_x,dev_y,val_y = train_test_split(x_train,y_train,test_size=0.2,random_state=2021) 

In [42]:
# Build LigthGBM Model& Predict

params = {
    'objective':'regression',
    'metric' : 'rmse', #root MSE
    'num_leaves':40,
    'learning_rate':0.004,
    'bagging_fraction':0.6,
    'feature_fraction':0.6,
    'bagging_frequency':6,
    'bagging_seed':2021,
    'vervosity':-1, # 학습되는 상황을 한셋씩 계속 보여주겠다는 뜻
    'seed':2021
}

lgtrain = lgb.Dataset(dev_x,dev_y)
lgval = lgb.Dataset(val_x,val_y)

evals_result ={}
model = lgb.train(params=params,
                  train_set = lgtrain,
                  num_boost_round = 5000,
                  valid_sets = [lgtrain,lgval],
                  early_stopping_rounds = 100,
                  verbose_eval = 150,
                  evals_result = evals_result
                  )


TypeError: ignored

In [46]:
# np.log1p 를 다시 되돌리는 exp => np.expm1

prediction = model.predict(x_test,num_iteration=model.best_iteration)
pred_test_y = np.expm1(prediction)

In [47]:
pred_test_y

array([2001453.0320639 , 1701792.84486827, 1546757.21256508, ...,
        888222.92595353,  501018.104024  , 1939020.78174481])

In [53]:
# feature importance

gain = model.feature_importance('gain')
featureimp = pd.DataFrame({
    'feature':model.feature_name(),
    'split':model.feature_importance('split'),
    'gain':100*gain/gain.sum() # 비율로서 보여준다
}).sort_values('gain',ascending=False) # 비율중에서 잘한것들 위주로 보여준다

In [54]:
featureimp

Unnamed: 0,feature,split,gain
4130,f190486d6,882,9.849580
2375,58e2e02e6,849,5.811981
4020,15ace8c9f,512,3.433981
3465,eeb9cd3aa,581,2.989733
2614,9fd594eec,355,2.526893
...,...,...,...
1852,6cf2d8705,0,0.000000
1855,09f827f1c,0,0.000000
1857,4ca7f1312,0,0.000000
1858,ad566b17c,0,0.000000
