In [None]:
import pandas as pd
pd.set_option('display.max_columns',None)
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import os
import re

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
from scipy.special import erfc
from sklearn.dummy import DummyRegressor
from sklearn.svm import SVR

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, VotingRegressor

from lightgbm import LGBMRegressor

import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor

import warnings
import datawig
from category_encoders.ordinal import OrdinalEncoder

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import ADASYN # doctest: +NORMALIZE_WHITESPACE
from imblearn.over_sampling import SMOTENC

from sklearn.model_selection import StratifiedKFold

from sklearn.cluster import KMeans
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import log_loss

# Data load

In [None]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

### 결측치 처리

In [None]:
train.fillna('NaN', inplace=True) 
test.fillna('NaN', inplace=True)

## 데이터 전처리 / 파생변수 생성

&nbsp;

#### binary class 

In [1]:
# binary transform (0, 1)

train.gender = train.gender.replace(['F','M'],[0,1])  
test.gender = test.gender.replace(['F','M'],[0,1])

train.car = train.car.replace(['N','Y'],[0,1])  
test.car = test.car.replace(['N','Y'],[0,1])

train.reality = train.reality.replace(['N','Y'],[0,1])  
test.reality = test.reality.replace(['N','Y'],[0,1])

NameError: ignored

&nbsp;

#### days 변수 

양수변환

In [None]:
# 음수 -> 양수 변환

train.DAYS_BIRTH = -train.DAYS_BIRTH
test.DAYS_BIRTH = -test.DAYS_BIRTH

train.DAYS_EMPLOYED = -train.DAYS_EMPLOYED
test.DAYS_EMPLOYED = -test.DAYS_EMPLOYED

train.begin_month = -train.begin_month
test.begin_month = -test.begin_month

이상치 처리

In [None]:
# 365243 인 값 0으로 바꾸기

train.DAYS_EMPLOYED = train.DAYS_EMPLOYED.replace([-365243],[0])
test.DAYS_EMPLOYED = test.DAYS_EMPLOYED.replace([-365243],[0])

나이, 일한 기간 변수로 변환

In [None]:
# 360으로 나눠서 나이, 일한 년수 계산

train['EMPLOYED']= train.DAYS_EMPLOYED.apply(lambda x: abs(x/360) )
train['age']= train.DAYS_BIRTH.apply(lambda x: abs(x/360) ).astype(int)
train['begin_month']= train.begin_month.apply(lambda x: abs(x) ).astype(int)

test['EMPLOYED']= test.DAYS_EMPLOYED.apply(lambda x: abs(x/360) )
test['age']= test.DAYS_BIRTH.apply(lambda x: abs(x/360) ).astype(int)
test['begin_month']= test.begin_month.apply(lambda x: abs(x) ).astype(int)

&nbsp;

#### child num, family size

In [None]:
# child_num이 14, 19인 행 지우기
idx_child_drop = train[(train['child_num'] == 14)|(train['child_num'] == 19)].index
train = train.drop(idx_child_drop)

In [None]:
# child_num이 family_size보다 큰 행도 지우기

idx_child_drop2 = train[train['family_size'] < train['child_num']].index
train = train.drop(idx_child_drop2)

**child_num, faimily size PCA 변수 생성**

In [None]:
train_pca = train[['child_num', 'family_size']]
#train_pca_target = train['credit']     

test_pca = test[['child_num', 'family_size']]
#test_pca_target = test['credit']

In [None]:
train_pca = StandardScaler().fit_transform(train_pca)
test_pca  = StandardScaler().fit_transform(test_pca)

In [None]:
from sklearn.decomposition import PCA
pca_train = PCA(n_components=1) # 어차피 변수 2개라서 하나로만 진행
printcipalComponents_train = pca_train.fit_transform(train_pca)
principalDf_train = pd.DataFrame(data=printcipalComponents_train, columns = ['principal component'])

pca_test = PCA(n_components=1) # 어차피 변수 2개라서 하나로만 진행
printcipalComponents_test = pca_test.fit_transform(test_pca)
principalDf_test = pd.DataFrame(data=printcipalComponents_test, columns = ['principal component'])

In [None]:
train['cf_pca'] = principalDf_train # 기존 데이터 셋에 넣어주기
test['cf_pca'] = principalDf_test

&nbsp;

#### income / family size 변수 생성

In [None]:
train['if_ratio'] = train['income_total'] / train['family_size']
test['if_ratio'] = test['income_total'] / test['family_size']

#### 통합 핸드폰 관련 변수 생성

In [None]:
def try_t(data):
  data.loc[(data['work_phone'] == 0)&(data['phone'] == 0)&(data['email'] == 0), 'try'] = 0
  data.loc[(data['work_phone'] == 0)&(data['phone'] == 0)&(data['email'] == 1), 'try'] = 1
  data.loc[(data['work_phone'] == 0)&(data['phone'] == 1)&(data['email'] == 0), 'try'] = 2
  data.loc[(data['work_phone'] == 1)&(data['phone'] == 0)&(data['email'] == 0), 'try'] = 3
  data.loc[(data['work_phone'] == 0)&(data['phone'] == 1)&(data['email'] == 1), 'try'] = 4
  data.loc[(data['work_phone'] == 1)&(data['phone'] == 0)&(data['email'] == 1), 'try'] = 5
  data.loc[(data['work_phone'] == 1)&(data['phone'] == 1)&(data['email'] == 0), 'try'] = 6
  data.loc[(data['work_phone'] == 1)&(data['phone'] == 1)&(data['email'] == 1), 'try'] = 7

try_t(train)
try_t(test)

In [None]:
train.info()

&nbsp;

#### 대체된 열 / 미사용 변수 삭제

In [None]:
train = train.drop(['index', 'FLAG_MOBIL', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'child_num', 'family_size', 
                   'work_phone', 'phone', 'email'],axis=1)

test = test.drop(['index', 'FLAG_MOBIL', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'child_num', 'family_size', 
                   'work_phone', 'phone', 'email'],axis=1)

&nbsp;

### 스케일링 / 인코딩

log scailing - income_total

In [None]:
# train에 로그변환한 변수로 바꾸기
tr_it=train['income_total']
tr_it_log = np.log1p(tr_it)
train['income_total']= tr_it_log
# test set
te_it=test['income_total']
te_it_log = np.log1p(te_it)
test['income_total']= te_it_log

ordinal encoding

In [None]:
numerical_feats = train.dtypes[train.dtypes != "object"].index.tolist()
numerical_feats.remove('credit')
print("Number of Numerical features: ", len(numerical_feats))

categorical_feats = train.dtypes[train.dtypes == "object"].index.tolist()
print("Number of Categorical features: ", len(categorical_feats))

카테고리 변수에 ordinal encoding 적용

In [None]:
encoder = OrdinalEncoder(categorical_feats)
train[categorical_feats] = encoder.fit_transform(train[categorical_feats], train['credit'])
test[categorical_feats] = encoder.transform(test[categorical_feats])

standard scailing

income_total을 제외한 수치형 변수들에 standard scailing 진행

In [None]:
#numerical_feats.remove('income_total')
scaler = StandardScaler()
train[numerical_feats] = scaler.fit_transform(train[numerical_feats])
test[numerical_feats] = scaler.transform(test[numerical_feats])

&nbsp;

### 이유불명 결측값 처리

In [None]:
train = train.dropna(axis = 0)

&nbsp;

### 최종데이터셋

In [None]:
train
train.info()

In [None]:
test.info()

## smote_nc 적용버전

In [None]:
target = 'credit'
X = train.drop(target, axis=1)
y = train[target]
X_test = test

In [None]:
print('Original dataset shape %s' % Counter(y))


#Create the oversampler. For SMOTE-NC we need to pinpoint the column position where is the categorical features are. In this case, 'IsActiveMember' is positioned in the second column we input [1] as the parameter. If you have more than one categorical columns, just input all the columns position
smote_nc = SMOTENC(categorical_features=[0,1,2,5,6,7,8,9,10,11,12], random_state=0)
X_over, y_over = smote_nc.fit_resample(X, y)


print('Resampled dataset shape %s' % Counter(y_over))

Original dataset shape Counter({2.0: 16959, 1.0: 6266, 0.0: 3222})
Resampled dataset shape Counter({1.0: 16959, 2.0: 16959, 0.0: 16959})


## 모델링

&nbsp;

### catboost

In [None]:
n_est = 2000
seed = 42
n_fold = 10
n_class = 3

파라미터는 default값 사용

과적합 방지, 안정적인 성능을 위해 k-fold 실행

In [None]:
skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
folds=[]
for train_idx, valid_idx in skfold.split(X, y):
        folds.append((train_idx, valid_idx))

cat_pred = np.zeros((X.shape[0], n_class))
cat_pred_test = np.zeros((X_test.shape[0], n_class))
cat_cols = ['income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type']
for fold in range(n_fold):
  print(f'\n----------------- Fold {fold} -----------------\n')
  train_idx, valid_idx = folds[fold]
  X_train, X_valid, y_train, y_valid = X.iloc[train_idx], X.iloc[valid_idx], y.iloc[train_idx], y.iloc[valid_idx]
  train_data = Pool(data=X_train, label=y_train, cat_features=cat_cols)
  valid_data = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

  model_cat = CatBoostClassifier()
  model_cat.fit(train_data, eval_set=valid_data, use_best_model=True, early_stopping_rounds=100, verbose=100)
  
  cat_pred[valid_idx] = model_cat.predict_proba(X_valid)
  cat_pred_test += model_cat.predict_proba(X_test) / n_fold
  print(f'CV Log Loss Score: {log_loss(y_valid, cat_pred[valid_idx]):.6f}')
    
print(f'\tLog Loss: {log_loss(y, cat_pred):.6f}')


----------------- Fold 0 -----------------

Learning rate set to 0.115043
0:	learn: 1.0343027	test: 1.0338955	best: 1.0338955 (0)	total: 31.5ms	remaining: 31.5s
100:	learn: 0.7874667	test: 0.8001023	best: 0.8001023 (100)	total: 3.53s	remaining: 31.5s
200:	learn: 0.7572722	test: 0.7863649	best: 0.7863649 (200)	total: 7.59s	remaining: 30.2s
300:	learn: 0.7323850	test: 0.7776023	best: 0.7774700 (298)	total: 11.4s	remaining: 26.5s
400:	learn: 0.7080241	test: 0.7696839	best: 0.7696839 (400)	total: 15.8s	remaining: 23.6s
500:	learn: 0.6878145	test: 0.7644361	best: 0.7644361 (500)	total: 20.3s	remaining: 20.2s
600:	learn: 0.6683584	test: 0.7586164	best: 0.7586164 (600)	total: 24.6s	remaining: 16.3s
700:	learn: 0.6500365	test: 0.7539426	best: 0.7539426 (700)	total: 29s	remaining: 12.4s
800:	learn: 0.6344145	test: 0.7514044	best: 0.7514044 (800)	total: 33.2s	remaining: 8.25s
900:	learn: 0.6178028	test: 0.7495799	best: 0.7495622 (899)	total: 37.5s	remaining: 4.12s
999:	learn: 0.6031267	test: 0.

<catboost.core.CatBoostClassifier at 0x19b20237f40>

CV Log Loss Score: 0.746331

----------------- Fold 1 -----------------

Learning rate set to 0.115043
0:	learn: 1.0341752	test: 1.0346371	best: 1.0346371 (0)	total: 32.3ms	remaining: 32.3s
100:	learn: 0.7859058	test: 0.8012229	best: 0.8012157 (99)	total: 3.66s	remaining: 32.6s
200:	learn: 0.7534052	test: 0.7912024	best: 0.7912024 (200)	total: 7.31s	remaining: 29s
300:	learn: 0.7280748	test: 0.7848374	best: 0.7848374 (300)	total: 11.7s	remaining: 27.2s
400:	learn: 0.7049703	test: 0.7782416	best: 0.7782147 (399)	total: 16.2s	remaining: 24.2s
500:	learn: 0.6836040	test: 0.7748985	best: 0.7748383 (496)	total: 20.3s	remaining: 20.2s
600:	learn: 0.6643531	test: 0.7696984	best: 0.7695653 (595)	total: 24.4s	remaining: 16.2s
700:	learn: 0.6473196	test: 0.7663419	best: 0.7662030 (697)	total: 28.6s	remaining: 12.2s
800:	learn: 0.6299686	test: 0.7653578	best: 0.7652278 (752)	total: 33.4s	remaining: 8.29s
900:	learn: 0.6127583	test: 0.7638262	best: 0.7636904 (892)	total: 37.7s	remaining: 4.15s
999

<catboost.core.CatBoostClassifier at 0x19b20237a60>

CV Log Loss Score: 0.762005

----------------- Fold 2 -----------------

Learning rate set to 0.115043
0:	learn: 1.0352995	test: 1.0344783	best: 1.0344783 (0)	total: 33.2ms	remaining: 33.2s
100:	learn: 0.7881943	test: 0.7965490	best: 0.7965490 (100)	total: 3.56s	remaining: 31.7s
200:	learn: 0.7538671	test: 0.7831926	best: 0.7831248 (199)	total: 7.52s	remaining: 29.9s
300:	learn: 0.7272709	test: 0.7745970	best: 0.7745571 (299)	total: 12s	remaining: 27.8s
400:	learn: 0.7064836	test: 0.7675592	best: 0.7675402 (398)	total: 16s	remaining: 23.9s
500:	learn: 0.6850389	test: 0.7595654	best: 0.7595291 (498)	total: 20.5s	remaining: 20.4s
600:	learn: 0.6654988	test: 0.7577579	best: 0.7575976 (597)	total: 24.8s	remaining: 16.4s
700:	learn: 0.6479536	test: 0.7552631	best: 0.7551380 (695)	total: 29.2s	remaining: 12.4s
800:	learn: 0.6319551	test: 0.7514738	best: 0.7514738 (800)	total: 33s	remaining: 8.21s
900:	learn: 0.6157011	test: 0.7498792	best: 0.7498669 (899)	total: 36.7s	remaining: 4.04s
999:	l

<catboost.core.CatBoostClassifier at 0x19b20237820>

CV Log Loss Score: 0.746391

----------------- Fold 3 -----------------

Learning rate set to 0.115043
0:	learn: 1.0344374	test: 1.0333665	best: 1.0333665 (0)	total: 31.6ms	remaining: 31.6s
100:	learn: 0.7873031	test: 0.7985645	best: 0.7985645 (100)	total: 3.8s	remaining: 33.8s
200:	learn: 0.7564939	test: 0.7877674	best: 0.7877604 (199)	total: 8.21s	remaining: 32.6s
300:	learn: 0.7307907	test: 0.7803106	best: 0.7803106 (300)	total: 11.8s	remaining: 27.5s
400:	learn: 0.7071603	test: 0.7740516	best: 0.7736202 (383)	total: 16.3s	remaining: 24.3s
500:	learn: 0.6864749	test: 0.7694010	best: 0.7691865 (498)	total: 20.4s	remaining: 20.3s
600:	learn: 0.6651383	test: 0.7667647	best: 0.7667647 (600)	total: 24.5s	remaining: 16.2s
700:	learn: 0.6462522	test: 0.7642322	best: 0.7642322 (700)	total: 28.4s	remaining: 12.1s
800:	learn: 0.6287284	test: 0.7621538	best: 0.7619454 (797)	total: 32.3s	remaining: 8.04s
900:	learn: 0.6127687	test: 0.7596401	best: 0.7593897 (898)	total: 36.7s	remaining: 4.04s
9

<catboost.core.CatBoostClassifier at 0x19b202375b0>

CV Log Loss Score: 0.757336

----------------- Fold 4 -----------------

Learning rate set to 0.115043
0:	learn: 1.0348082	test: 1.0366168	best: 1.0366168 (0)	total: 30.9ms	remaining: 30.8s
100:	learn: 0.7833794	test: 0.8169602	best: 0.8169495 (99)	total: 3.36s	remaining: 29.9s
200:	learn: 0.7498405	test: 0.8037452	best: 0.8037452 (200)	total: 7.19s	remaining: 28.6s
300:	learn: 0.7252353	test: 0.7971014	best: 0.7971014 (300)	total: 11.4s	remaining: 26.5s
400:	learn: 0.7023587	test: 0.7934592	best: 0.7930414 (370)	total: 15.2s	remaining: 22.7s
500:	learn: 0.6804351	test: 0.7884974	best: 0.7884280 (499)	total: 18.9s	remaining: 18.9s
600:	learn: 0.6624269	test: 0.7858885	best: 0.7858535 (598)	total: 23.1s	remaining: 15.3s
700:	learn: 0.6420662	test: 0.7810363	best: 0.7810235 (696)	total: 26.9s	remaining: 11.5s
800:	learn: 0.6251018	test: 0.7777183	best: 0.7776510 (792)	total: 31.1s	remaining: 7.72s
900:	learn: 0.6102722	test: 0.7743571	best: 0.7742942 (898)	total: 35.2s	remaining: 3.87s
9

<catboost.core.CatBoostClassifier at 0x19b202375e0>

CV Log Loss Score: 0.773385

----------------- Fold 5 -----------------

Learning rate set to 0.115043
0:	learn: 1.0340829	test: 1.0346846	best: 1.0346846 (0)	total: 35.1ms	remaining: 35.1s
100:	learn: 0.7835730	test: 0.8049254	best: 0.8049254 (100)	total: 3.32s	remaining: 29.6s
200:	learn: 0.7500575	test: 0.7931703	best: 0.7931703 (200)	total: 7.61s	remaining: 30.2s
300:	learn: 0.7261283	test: 0.7863153	best: 0.7862896 (299)	total: 12s	remaining: 27.8s
400:	learn: 0.7037861	test: 0.7797752	best: 0.7796783 (399)	total: 16.2s	remaining: 24.1s
500:	learn: 0.6840794	test: 0.7749011	best: 0.7749011 (500)	total: 20.1s	remaining: 20.1s
600:	learn: 0.6651191	test: 0.7729836	best: 0.7728524 (587)	total: 24.2s	remaining: 16.1s
700:	learn: 0.6480070	test: 0.7713616	best: 0.7712854 (699)	total: 27.7s	remaining: 11.8s
800:	learn: 0.6303117	test: 0.7684475	best: 0.7680202 (781)	total: 31.1s	remaining: 7.74s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7680202043
bestIteratio

<catboost.core.CatBoostClassifier at 0x19b20237790>

CV Log Loss Score: 0.768020

----------------- Fold 6 -----------------

Learning rate set to 0.115043
0:	learn: 1.0352085	test: 1.0347448	best: 1.0347448 (0)	total: 34.3ms	remaining: 34.2s
100:	learn: 0.7881319	test: 0.8025186	best: 0.8025186 (100)	total: 3.2s	remaining: 28.5s
200:	learn: 0.7552430	test: 0.7903011	best: 0.7902821 (199)	total: 7.34s	remaining: 29.2s
300:	learn: 0.7278605	test: 0.7806939	best: 0.7806939 (300)	total: 11.4s	remaining: 26.5s
400:	learn: 0.7044460	test: 0.7751802	best: 0.7751802 (400)	total: 15.7s	remaining: 23.4s
500:	learn: 0.6834794	test: 0.7707507	best: 0.7707507 (500)	total: 19.7s	remaining: 19.6s
600:	learn: 0.6654387	test: 0.7695493	best: 0.7694843 (592)	total: 23.5s	remaining: 15.6s
700:	learn: 0.6470298	test: 0.7658978	best: 0.7658581 (698)	total: 27.5s	remaining: 11.7s
800:	learn: 0.6292476	test: 0.7623055	best: 0.7621518 (799)	total: 31.1s	remaining: 7.73s
900:	learn: 0.6139458	test: 0.7593406	best: 0.7593406 (900)	total: 34.8s	remaining: 3.82s
9

<catboost.core.CatBoostClassifier at 0x19b202376a0>

CV Log Loss Score: 0.756329

----------------- Fold 7 -----------------

Learning rate set to 0.115043
0:	learn: 1.0342853	test: 1.0335520	best: 1.0335520 (0)	total: 31.6ms	remaining: 31.6s
100:	learn: 0.7874502	test: 0.7915380	best: 0.7915380 (100)	total: 3.37s	remaining: 30s
200:	learn: 0.7533420	test: 0.7781210	best: 0.7780660 (199)	total: 6.86s	remaining: 27.3s
300:	learn: 0.7277870	test: 0.7700051	best: 0.7700051 (300)	total: 10.9s	remaining: 25.2s
400:	learn: 0.7066458	test: 0.7635345	best: 0.7635345 (400)	total: 14.2s	remaining: 21.2s
500:	learn: 0.6856068	test: 0.7582556	best: 0.7582556 (500)	total: 17.4s	remaining: 17.4s
600:	learn: 0.6662291	test: 0.7538991	best: 0.7538025 (595)	total: 20.8s	remaining: 13.8s
700:	learn: 0.6486422	test: 0.7504323	best: 0.7504323 (700)	total: 24.3s	remaining: 10.4s
800:	learn: 0.6315004	test: 0.7467306	best: 0.7464691 (798)	total: 27.5s	remaining: 6.83s
900:	learn: 0.6141501	test: 0.7447743	best: 0.7447743 (900)	total: 31.4s	remaining: 3.45s
99

<catboost.core.CatBoostClassifier at 0x19b20237b80>

CV Log Loss Score: 0.741725

----------------- Fold 8 -----------------

Learning rate set to 0.115043
0:	learn: 1.0341336	test: 1.0343548	best: 1.0343548 (0)	total: 31ms	remaining: 31s
100:	learn: 0.7853685	test: 0.8000339	best: 0.8000339 (100)	total: 3.3s	remaining: 29.4s
200:	learn: 0.7535296	test: 0.7876904	best: 0.7876823 (198)	total: 7.92s	remaining: 31.5s
300:	learn: 0.7283412	test: 0.7803910	best: 0.7803910 (300)	total: 11.3s	remaining: 26.3s
400:	learn: 0.7065717	test: 0.7744898	best: 0.7744898 (400)	total: 14.9s	remaining: 22.2s
500:	learn: 0.6868296	test: 0.7699989	best: 0.7699787 (499)	total: 18.7s	remaining: 18.6s
600:	learn: 0.6675020	test: 0.7670111	best: 0.7668552 (593)	total: 22.2s	remaining: 14.7s
700:	learn: 0.6485104	test: 0.7643483	best: 0.7642842 (672)	total: 25.5s	remaining: 10.9s
800:	learn: 0.6292531	test: 0.7627743	best: 0.7626838 (796)	total: 28.8s	remaining: 7.15s
900:	learn: 0.6121762	test: 0.7606049	best: 0.7606049 (900)	total: 32.1s	remaining: 3.52s
999:	

<catboost.core.CatBoostClassifier at 0x19b20237670>

CV Log Loss Score: 0.758868

----------------- Fold 9 -----------------

Learning rate set to 0.115043
0:	learn: 1.0341785	test: 1.0340822	best: 1.0340822 (0)	total: 31.4ms	remaining: 31.3s
100:	learn: 0.7843458	test: 0.7991214	best: 0.7991214 (100)	total: 3.59s	remaining: 31.9s
200:	learn: 0.7513504	test: 0.7891149	best: 0.7891007 (198)	total: 8.1s	remaining: 32.2s
300:	learn: 0.7243293	test: 0.7821097	best: 0.7821097 (300)	total: 12.5s	remaining: 29s
400:	learn: 0.7016558	test: 0.7765358	best: 0.7765358 (400)	total: 16.1s	remaining: 24s
500:	learn: 0.6805866	test: 0.7716903	best: 0.7716903 (500)	total: 19.7s	remaining: 19.6s
600:	learn: 0.6612055	test: 0.7694699	best: 0.7694699 (600)	total: 24s	remaining: 15.9s
700:	learn: 0.6428791	test: 0.7659496	best: 0.7657665 (696)	total: 27.9s	remaining: 11.9s
800:	learn: 0.6256736	test: 0.7621747	best: 0.7621073 (799)	total: 31.7s	remaining: 7.87s
900:	learn: 0.6061365	test: 0.7621178	best: 0.7615033 (841)	total: 35.7s	remaining: 3.93s
Stopped

<catboost.core.CatBoostClassifier at 0x19b20237dc0>

CV Log Loss Score: 0.761503
	Log Loss: 0.757190


In [None]:
submission=pd.read_csv('sample_submission.csv')
submission.iloc[:, 1:] = cat_pred_test
submission

Unnamed: 0,index,0,1,2
0,26457,0.108691,0.140215,0.751094
1,26458,0.141232,0.157840,0.700928
2,26459,0.119412,0.151644,0.728944
3,26460,0.128912,0.142789,0.728299
4,26461,0.091621,0.218364,0.690015
...,...,...,...,...
9995,36452,0.174974,0.191246,0.633779
9996,36453,0.101350,0.242709,0.655941
9997,36454,0.024520,0.078620,0.896860
9998,36455,0.127251,0.172769,0.699980


In [None]:
submission.to_csv('ml_cat_fin.csv', index=False)

&nbsp;

### RandomForest

In [None]:
from tensorflow.keras.utils import to_categorical

#### 파라미터 튜닝

In [None]:
seed = 30

params_rf = {'n_estimators': [110,130,150],
              'max_depth': [38,40,44,48,50],
              'min_samples_split': [9],
              'min_samples_leaf': [1,3,5,7,9]}

model_rf = RandomForestClassifier(random_state=seed)
    
search_rf = RandomizedSearchCV(estimator=model_rf, param_distributions=params_rf, scoring='neg_log_loss',
                                n_iter=30, cv = 5,random_state=seed)
search_rf.fit(X, y)
    

model_rf = RandomForestClassifier(n_estimators=search_rf.best_params_['n_estimators'],
                                  max_depth=search_rf.best_params_['max_depth'],
                                  min_samples_split=search_rf.best_params_['min_samples_split'],
                                  min_samples_leaf=search_rf.best_params_['min_samples_leaf'],
                                  random_state=seed)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=30),
                   n_iter=30,
                   param_distributions={'max_depth': [38, 40, 44, 48, 50],
                                        'min_samples_leaf': [1, 3, 5, 7, 9],
                                        'min_samples_split': [9],
                                        'n_estimators': [110, 130, 150]},
                   random_state=30, scoring='neg_log_loss')

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold

def Kfold(model):
    folds=StratifiedKFold(n_splits=5, shuffle=True, random_state=156)
    outcomes=[]
    sub=np.zeros((X_test.shape[0], 3))
    for n_fold, (train_index, val_index) in enumerate(folds.split(X, y)):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        model.fit(X_train, y_train)
        
        predictions=model.predict_proba(X_val)
        
        logloss=log_loss(to_categorical(y_val), predictions)
        outcomes.append(logloss)
        print(f"FOLD {n_fold} : logloss:{logloss}")
        
        sub+=model.predict_proba(X_test)
        
        
    mean_outcome=np.mean(outcomes)
    
    print("Mean:{}".format(mean_outcome))
    return sub/folds.n_splits

rf_submission = Kfold(model_rf)

FOLD 0 : logloss:0.7087393632612599
FOLD 1 : logloss:0.7110248159366424
FOLD 2 : logloss:0.7173359801606047
FOLD 3 : logloss:0.7177966429142233
FOLD 4 : logloss:0.7251270902038277
Mean:0.7160047784953115


In [None]:
rf_submission = pd.DataFrame(rf_submission)

rf_submission['index'] = submission['index']
#rf_submission = rf_submission.reindex(columns=['index','0','1','2'])

In [None]:
rf_submission

rf_submission.to_csv('ml_rf_fin.csv',index=False)

Unnamed: 0,0,1,2,index
0,0.105532,0.189752,0.704716,26457
1,0.189374,0.289610,0.521016,26458
2,0.076976,0.089298,0.833726,26459
3,0.084531,0.089776,0.825694,26460
4,0.122682,0.213724,0.663594,26461
...,...,...,...,...
9995,0.125306,0.154692,0.720002,36452
9996,0.210707,0.287825,0.501469,36453
9997,0.030615,0.104622,0.864763,36454
9998,0.213529,0.313131,0.473341,36455


### smote 버전

In [None]:
seed = 30

params_rf = {'n_estimators': [110,130,150],
              'max_depth': [38,40,44,48,50],
              'min_samples_split': [9],
              'min_samples_leaf': [1,3,5,7,9]}

model_rf = RandomForestClassifier(random_state=seed)
    
search_rf = RandomizedSearchCV(estimator=model_rf, param_distributions=params_rf, scoring='neg_log_loss',
                                n_iter=30, cv = 5,random_state=seed)
search_rf.fit(X_over, y_over)
    

model_rf = RandomForestClassifier(n_estimators=search_rf.best_params_['n_estimators'],
                                  max_depth=search_rf.best_params_['max_depth'],
                                  min_samples_split=search_rf.best_params_['min_samples_split'],
                                  min_samples_leaf=search_rf.best_params_['min_samples_leaf'],
                                  random_state=seed)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=30),
                   n_iter=30,
                   param_distributions={'max_depth': [38, 40, 44, 48, 50],
                                        'min_samples_leaf': [1, 3, 5, 7, 9],
                                        'min_samples_split': [9],
                                        'n_estimators': [110, 130, 150]},
                   random_state=30, scoring='neg_log_loss')

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold

def Kfold(model):
    folds=StratifiedKFold(n_splits=5, shuffle=True, random_state=156)
    outcomes=[]
    sub=np.zeros((X_test.shape[0], 3))
    for n_fold, (train_index, val_index) in enumerate(folds.split(X, y)):
        X_train, X_val = X_over.iloc[train_index], X_over.iloc[val_index]
        y_train, y_val = y_over.iloc[train_index], y_over.iloc[val_index]
        model.fit(X_train, y_train)
        
        predictions=model.predict_proba(X_val)
        
        logloss=log_loss(to_categorical(y_val), predictions)
        outcomes.append(logloss)
        print(f"FOLD {n_fold} : logloss:{logloss}")
        
        sub+=model.predict_proba(X_test)
        
        
    mean_outcome=np.mean(outcomes)
    
    print("Mean:{}".format(mean_outcome))
    return sub/folds.n_splits

rf_submission_nc = Kfold(model_rf)

FOLD 0 : logloss:0.7090487640333039
FOLD 1 : logloss:0.711749023755983
FOLD 2 : logloss:0.7176746146760286
FOLD 3 : logloss:0.718168701360285
FOLD 4 : logloss:0.7252969504047662
Mean:0.7163876108460733


In [None]:
rf_submission_nc = pd.DataFrame(rf_submission_nc)

rf_submission_nc['index'] = submission['index']
rf_submission_nc

Unnamed: 0,0,1,2,index
0,0.102560,0.186664,0.710776,26457
1,0.184391,0.287738,0.527872,26458
2,0.075715,0.086077,0.838208,26459
3,0.084916,0.090380,0.824703,26460
4,0.116445,0.216893,0.666663,26461
...,...,...,...,...
9995,0.121264,0.156484,0.722251,36452
9996,0.210103,0.290908,0.498989,36453
9997,0.031081,0.098489,0.870429,36454
9998,0.210210,0.319687,0.470103,36455


In [None]:
rf_submission_nc.to_csv('ml_rf_nc_fin.csv',index=False)