In [57]:
import pandas as pd
pd.set_option('display.max_columns',None)
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import os
import re

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
from scipy.special import erfc
from sklearn.dummy import DummyRegressor
from sklearn.svm import SVR

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, VotingRegressor

from lightgbm import LGBMRegressor

import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor

import warnings
import datawig
from category_encoders.ordinal import OrdinalEncoder

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import ADASYN # doctest: +NORMALIZE_WHITESPACE
from imblearn.over_sampling import SMOTENC

from sklearn.model_selection import StratifiedKFold

from sklearn.cluster import KMeans
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import log_loss

# Data load

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

### 결측치 처리

In [3]:
train.fillna('NaN', inplace=True) 
test.fillna('NaN', inplace=True)

## 데이터 전처리 / 파생변수 생성

&nbsp;

#### binary class 

In [4]:
# binary transform (0, 1)

train.gender = train.gender.replace(['F','M'],[0,1])  
test.gender = test.gender.replace(['F','M'],[0,1])

train.car = train.car.replace(['N','Y'],[0,1])  
test.car = test.car.replace(['N','Y'],[0,1])

train.reality = train.reality.replace(['N','Y'],[0,1])  
test.reality = test.reality.replace(['N','Y'],[0,1])

&nbsp;

#### days 변수 

양수변환

In [5]:
# 음수 -> 양수 변환

train.DAYS_BIRTH = -train.DAYS_BIRTH
test.DAYS_BIRTH = -test.DAYS_BIRTH

train.DAYS_EMPLOYED = -train.DAYS_EMPLOYED
test.DAYS_EMPLOYED = -test.DAYS_EMPLOYED

train.begin_month = -train.begin_month
test.begin_month = -test.begin_month

이상치 처리

In [6]:
# 365243 인 값 0으로 바꾸기

train.DAYS_EMPLOYED = train.DAYS_EMPLOYED.replace([-365243],[0])
test.DAYS_EMPLOYED = test.DAYS_EMPLOYED.replace([-365243],[0])

나이, 일한 기간 변수로 변환

In [7]:
# 360으로 나눠서 나이, 일한 년수 계산

train['EMPLOYED']= train.DAYS_EMPLOYED.apply(lambda x: abs(x/360) )
train['age']= train.DAYS_BIRTH.apply(lambda x: abs(x/360) ).astype(int)
train['begin_month']= train.begin_month.apply(lambda x: abs(x) ).astype(int)

test['EMPLOYED']= test.DAYS_EMPLOYED.apply(lambda x: abs(x/360) )
test['age']= test.DAYS_BIRTH.apply(lambda x: abs(x/360) ).astype(int)
test['begin_month']= test.begin_month.apply(lambda x: abs(x) ).astype(int)

&nbsp;

#### child num, family size

In [8]:
# child_num이 14, 19인 행 지우기
idx_child_drop = train[(train['child_num'] == 14)|(train['child_num'] == 19)].index
train = train.drop(idx_child_drop)

In [9]:
# child_num이 family_size보다 큰 행도 지우기

idx_child_drop2 = train[train['family_size'] < train['child_num']].index
train = train.drop(idx_child_drop2)

**child_num, faimily size PCA 변수 생성**

In [12]:
train_pca = train[['child_num', 'family_size']]
#train_pca_target = train['credit']       <-  이거는 혹시 왜 필요한가요?

test_pca = test[['child_num', 'family_size']]
#test_pca_target = test['credit']

In [13]:
train_pca = StandardScaler().fit_transform(train_pca)
test_pca  = StandardScaler().fit_transform(test_pca)

In [14]:
from sklearn.decomposition import PCA
pca_train = PCA(n_components=1) # 어차피 변수 2개라서 하나로만 진행
printcipalComponents_train = pca_train.fit_transform(train_pca)
principalDf_train = pd.DataFrame(data=printcipalComponents_train, columns = ['principal component'])

pca_test = PCA(n_components=1) # 어차피 변수 2개라서 하나로만 진행
printcipalComponents_test = pca_test.fit_transform(test_pca)
principalDf_test = pd.DataFrame(data=printcipalComponents_test, columns = ['principal component'])

In [16]:
train['cf_pca'] = principalDf_train # 기존 데이터 셋에 넣어주기
test['cf_pca'] = principalDf_test

&nbsp;

#### income / family size 변수 생성

In [21]:
train['if_ratio'] = train['income_total'] / train['family_size']
test['if_ratio'] = test['income_total'] / test['family_size']

#### 통합 핸드폰 관련 변수 생성

In [23]:
def try_t(data):
  data.loc[(data['work_phone'] == 0)&(data['phone'] == 0)&(data['email'] == 0), 'try'] = 0
  data.loc[(data['work_phone'] == 0)&(data['phone'] == 0)&(data['email'] == 1), 'try'] = 1
  data.loc[(data['work_phone'] == 0)&(data['phone'] == 1)&(data['email'] == 0), 'try'] = 2
  data.loc[(data['work_phone'] == 1)&(data['phone'] == 0)&(data['email'] == 0), 'try'] = 3
  data.loc[(data['work_phone'] == 0)&(data['phone'] == 1)&(data['email'] == 1), 'try'] = 4
  data.loc[(data['work_phone'] == 1)&(data['phone'] == 0)&(data['email'] == 1), 'try'] = 5
  data.loc[(data['work_phone'] == 1)&(data['phone'] == 1)&(data['email'] == 0), 'try'] = 6
  data.loc[(data['work_phone'] == 1)&(data['phone'] == 1)&(data['email'] == 1), 'try'] = 7

try_t(train)
try_t(test)

In [26]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26452 entries, 0 to 26456
Data columns (total 25 columns):
index            26452 non-null int64
gender           26452 non-null int64
car              26452 non-null int64
reality          26452 non-null int64
child_num        26452 non-null int64
income_total     26452 non-null float64
income_type      26452 non-null object
edu_type         26452 non-null object
family_type      26452 non-null object
house_type       26452 non-null object
DAYS_BIRTH       26452 non-null int64
DAYS_EMPLOYED    26452 non-null int64
FLAG_MOBIL       26452 non-null int64
work_phone       26452 non-null int64
phone            26452 non-null int64
email            26452 non-null int64
occyp_type       26452 non-null object
family_size      26452 non-null float64
begin_month      26452 non-null int32
credit           26452 non-null float64
EMPLOYED         26452 non-null float64
age              26452 non-null int32
cf_pca           26447 non-null float64
if

&nbsp;

#### 대체된 열 / 미사용 변수 삭제

In [27]:
train = train.drop(['index', 'FLAG_MOBIL', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'child_num', 'family_size', 
                   'work_phone', 'phone', 'email'],axis=1)

test = test.drop(['index', 'FLAG_MOBIL', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'child_num', 'family_size', 
                   'work_phone', 'phone', 'email'],axis=1)

&nbsp;

### 스케일링 / 인코딩

log scailing - income_total

In [None]:
# train에 로그변환한 변수로 바꾸기
tr_it=train['income_total']
tr_it_log = np.log1p(tr_it)
train['income_total']= tr_it_log
# test set
te_it=test['income_total']
te_it_log = np.log1p(te_it)
test['income_total']= te_it_log

ordinal encoding

In [28]:
numerical_feats = train.dtypes[train.dtypes != "object"].index.tolist()
numerical_feats.remove('credit')
print("Number of Numerical features: ", len(numerical_feats))

categorical_feats = train.dtypes[train.dtypes == "object"].index.tolist()
print("Number of Categorical features: ", len(categorical_feats))

Number of Numerical features:  10
Number of Categorical features:  5


In [32]:
encoder = OrdinalEncoder(categorical_feats)
train[categorical_feats] = encoder.fit_transform(train[categorical_feats], train['credit'])
test[categorical_feats] = encoder.transform(test[categorical_feats])

standard scailing

In [33]:
#numerical_feats.remove('income_total')
scaler = StandardScaler()
train[numerical_feats] = scaler.fit_transform(train[numerical_feats])
test[numerical_feats] = scaler.transform(test[numerical_feats])

&nbsp;

### 이유불명 결측값 처리

In [45]:
train = train.dropna(axis = 0)

&nbsp;

### 최종데이터셋

In [47]:
train
train.info()

Unnamed: 0,gender,car,reality,income_total,income_type,edu_type,family_type,house_type,occyp_type,begin_month,credit,EMPLOYED,age,cf_pca,if_ratio,try
0,-0.703421,-0.782331,-1.437548,0.149153,1,1,1,1,1,-1.215266,1.0,1.059101,-0.499873,-0.414431,0.002212,-0.722786
1,-0.703421,-0.782331,0.695629,0.590832,1,2,2,2,2,-1.275658,1.0,-0.277854,-1.099805,0.868285,-0.254013,-0.242043
2,1.421624,1.278231,0.695629,2.578387,2,1,1,2,3,-0.248987,2.0,0.943082,0.785693,-0.414431,1.693299,0.238699
3,-0.703421,-0.782331,0.695629,0.149153,1,2,1,2,4,0.656899,0.0,-0.044973,-0.242760,-0.414431,0.002212,0.238699
4,-0.703421,1.278231,0.695629,-0.292525,3,1,1,2,3,-0.007418,2.0,-0.039488,-0.242760,-0.414431,-0.305258,-0.722786
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26447,1.421624,-0.782331,0.695629,-0.866708,2,2,1,2,2,1.019253,2.0,-0.494703,-0.414169,2.151001,-1.043187,2.161669
26448,1.421624,-0.782331,0.695629,1.032511,1,1,1,2,2,-0.188595,0.0,-0.553345,0.100058,0.296293,0.617153,-0.722786
26449,-0.703421,-0.782331,-1.437548,-0.955044,2,2,1,2,1,-1.456836,1.0,0.092562,-1.271213,-0.414431,-0.766464,2.161669
26450,-0.703421,-0.782331,-1.437548,0.369993,1,2,4,2,2,0.656899,2.0,-0.682442,-1.356918,-0.986423,1.693299,0.238699


<class 'pandas.core.frame.DataFrame'>
Int64Index: 26447 entries, 0 to 26451
Data columns (total 16 columns):
gender          26447 non-null float64
car             26447 non-null float64
reality         26447 non-null float64
income_total    26447 non-null float64
income_type     26447 non-null int32
edu_type        26447 non-null int32
family_type     26447 non-null int32
house_type      26447 non-null int32
occyp_type      26447 non-null int32
begin_month     26447 non-null float64
credit          26447 non-null float64
EMPLOYED        26447 non-null float64
age             26447 non-null float64
cf_pca          26447 non-null float64
if_ratio        26447 non-null float64
try             26447 non-null float64
dtypes: float64(11), int32(5)
memory usage: 2.9 MB


In [37]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
gender          10000 non-null float64
car             10000 non-null float64
reality         10000 non-null float64
income_total    10000 non-null float64
income_type     10000 non-null int32
edu_type        10000 non-null int32
family_type     10000 non-null int32
house_type      10000 non-null int32
occyp_type      10000 non-null int32
begin_month     10000 non-null float64
EMPLOYED        10000 non-null float64
age             10000 non-null float64
cf_pca          10000 non-null float64
if_ratio        10000 non-null float64
try             10000 non-null float64
dtypes: float64(10), int32(5)
memory usage: 976.7 KB


## smote_nc 적용버전

In [48]:
target = 'credit'
X = train.drop(target, axis=1)
y = train[target]
X_test = test

In [51]:
print('Original dataset shape %s' % Counter(y))


#Create the oversampler. For SMOTE-NC we need to pinpoint the column position where is the categorical features are. In this case, 'IsActiveMember' is positioned in the second column we input [1] as the parameter. If you have more than one categorical columns, just input all the columns position
smote_nc = SMOTENC(categorical_features=[0,1,2,5,6,7,8,9,10,11,12], random_state=0)
X_over, y_over = smote_nc.fit_resample(X, y)


print('Resampled dataset shape %s' % Counter(y_over))

Original dataset shape Counter({2.0: 16959, 1.0: 6266, 0.0: 3222})
Resampled dataset shape Counter({1.0: 16959, 2.0: 16959, 0.0: 16959})


## 모델링

&nbsp;

### catboost

In [53]:
n_est = 2000
seed = 42
n_fold = 10
n_class = 3

In [58]:
skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
folds=[]
for train_idx, valid_idx in skfold.split(X, y):
        folds.append((train_idx, valid_idx))

cat_pred = np.zeros((X.shape[0], n_class))
cat_pred_test = np.zeros((X_test.shape[0], n_class))
cat_cols = ['income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type']
for fold in range(n_fold):
  print(f'\n----------------- Fold {fold} -----------------\n')
  train_idx, valid_idx = folds[fold]
  X_train, X_valid, y_train, y_valid = X.iloc[train_idx], X.iloc[valid_idx], y.iloc[train_idx], y.iloc[valid_idx]
  train_data = Pool(data=X_train, label=y_train, cat_features=cat_cols)
  valid_data = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

  model_cat = CatBoostClassifier()
  model_cat.fit(train_data, eval_set=valid_data, use_best_model=True, early_stopping_rounds=100, verbose=100)
  
  cat_pred[valid_idx] = model_cat.predict_proba(X_valid)
  cat_pred_test += model_cat.predict_proba(X_test) / n_fold
  print(f'CV Log Loss Score: {log_loss(y_valid, cat_pred[valid_idx]):.6f}')
    
print(f'\tLog Loss: {log_loss(y, cat_pred):.6f}')


----------------- Fold 0 -----------------

Learning rate set to 0.115043
0:	learn: 1.0343027	test: 1.0338955	best: 1.0338955 (0)	total: 31.5ms	remaining: 31.5s
100:	learn: 0.7874667	test: 0.8001023	best: 0.8001023 (100)	total: 3.53s	remaining: 31.5s
200:	learn: 0.7572722	test: 0.7863649	best: 0.7863649 (200)	total: 7.59s	remaining: 30.2s
300:	learn: 0.7323850	test: 0.7776023	best: 0.7774700 (298)	total: 11.4s	remaining: 26.5s
400:	learn: 0.7080241	test: 0.7696839	best: 0.7696839 (400)	total: 15.8s	remaining: 23.6s
500:	learn: 0.6878145	test: 0.7644361	best: 0.7644361 (500)	total: 20.3s	remaining: 20.2s
600:	learn: 0.6683584	test: 0.7586164	best: 0.7586164 (600)	total: 24.6s	remaining: 16.3s
700:	learn: 0.6500365	test: 0.7539426	best: 0.7539426 (700)	total: 29s	remaining: 12.4s
800:	learn: 0.6344145	test: 0.7514044	best: 0.7514044 (800)	total: 33.2s	remaining: 8.25s
900:	learn: 0.6178028	test: 0.7495799	best: 0.7495622 (899)	total: 37.5s	remaining: 4.12s
999:	learn: 0.6031267	test: 0.

<catboost.core.CatBoostClassifier at 0x19b20237f40>

CV Log Loss Score: 0.746331

----------------- Fold 1 -----------------

Learning rate set to 0.115043
0:	learn: 1.0341752	test: 1.0346371	best: 1.0346371 (0)	total: 32.3ms	remaining: 32.3s
100:	learn: 0.7859058	test: 0.8012229	best: 0.8012157 (99)	total: 3.66s	remaining: 32.6s
200:	learn: 0.7534052	test: 0.7912024	best: 0.7912024 (200)	total: 7.31s	remaining: 29s
300:	learn: 0.7280748	test: 0.7848374	best: 0.7848374 (300)	total: 11.7s	remaining: 27.2s
400:	learn: 0.7049703	test: 0.7782416	best: 0.7782147 (399)	total: 16.2s	remaining: 24.2s
500:	learn: 0.6836040	test: 0.7748985	best: 0.7748383 (496)	total: 20.3s	remaining: 20.2s
600:	learn: 0.6643531	test: 0.7696984	best: 0.7695653 (595)	total: 24.4s	remaining: 16.2s
700:	learn: 0.6473196	test: 0.7663419	best: 0.7662030 (697)	total: 28.6s	remaining: 12.2s
800:	learn: 0.6299686	test: 0.7653578	best: 0.7652278 (752)	total: 33.4s	remaining: 8.29s
900:	learn: 0.6127583	test: 0.7638262	best: 0.7636904 (892)	total: 37.7s	remaining: 4.15s
999

<catboost.core.CatBoostClassifier at 0x19b20237a60>

CV Log Loss Score: 0.762005

----------------- Fold 2 -----------------

Learning rate set to 0.115043
0:	learn: 1.0352995	test: 1.0344783	best: 1.0344783 (0)	total: 33.2ms	remaining: 33.2s
100:	learn: 0.7881943	test: 0.7965490	best: 0.7965490 (100)	total: 3.56s	remaining: 31.7s
200:	learn: 0.7538671	test: 0.7831926	best: 0.7831248 (199)	total: 7.52s	remaining: 29.9s
300:	learn: 0.7272709	test: 0.7745970	best: 0.7745571 (299)	total: 12s	remaining: 27.8s
400:	learn: 0.7064836	test: 0.7675592	best: 0.7675402 (398)	total: 16s	remaining: 23.9s
500:	learn: 0.6850389	test: 0.7595654	best: 0.7595291 (498)	total: 20.5s	remaining: 20.4s
600:	learn: 0.6654988	test: 0.7577579	best: 0.7575976 (597)	total: 24.8s	remaining: 16.4s
700:	learn: 0.6479536	test: 0.7552631	best: 0.7551380 (695)	total: 29.2s	remaining: 12.4s
800:	learn: 0.6319551	test: 0.7514738	best: 0.7514738 (800)	total: 33s	remaining: 8.21s
900:	learn: 0.6157011	test: 0.7498792	best: 0.7498669 (899)	total: 36.7s	remaining: 4.04s
999:	l

<catboost.core.CatBoostClassifier at 0x19b20237820>

CV Log Loss Score: 0.746391

----------------- Fold 3 -----------------

Learning rate set to 0.115043
0:	learn: 1.0344374	test: 1.0333665	best: 1.0333665 (0)	total: 31.6ms	remaining: 31.6s
100:	learn: 0.7873031	test: 0.7985645	best: 0.7985645 (100)	total: 3.8s	remaining: 33.8s
200:	learn: 0.7564939	test: 0.7877674	best: 0.7877604 (199)	total: 8.21s	remaining: 32.6s
300:	learn: 0.7307907	test: 0.7803106	best: 0.7803106 (300)	total: 11.8s	remaining: 27.5s
400:	learn: 0.7071603	test: 0.7740516	best: 0.7736202 (383)	total: 16.3s	remaining: 24.3s
500:	learn: 0.6864749	test: 0.7694010	best: 0.7691865 (498)	total: 20.4s	remaining: 20.3s
600:	learn: 0.6651383	test: 0.7667647	best: 0.7667647 (600)	total: 24.5s	remaining: 16.2s
700:	learn: 0.6462522	test: 0.7642322	best: 0.7642322 (700)	total: 28.4s	remaining: 12.1s
800:	learn: 0.6287284	test: 0.7621538	best: 0.7619454 (797)	total: 32.3s	remaining: 8.04s
900:	learn: 0.6127687	test: 0.7596401	best: 0.7593897 (898)	total: 36.7s	remaining: 4.04s
9

<catboost.core.CatBoostClassifier at 0x19b202375b0>

CV Log Loss Score: 0.757336

----------------- Fold 4 -----------------

Learning rate set to 0.115043
0:	learn: 1.0348082	test: 1.0366168	best: 1.0366168 (0)	total: 30.9ms	remaining: 30.8s
100:	learn: 0.7833794	test: 0.8169602	best: 0.8169495 (99)	total: 3.36s	remaining: 29.9s
200:	learn: 0.7498405	test: 0.8037452	best: 0.8037452 (200)	total: 7.19s	remaining: 28.6s
300:	learn: 0.7252353	test: 0.7971014	best: 0.7971014 (300)	total: 11.4s	remaining: 26.5s
400:	learn: 0.7023587	test: 0.7934592	best: 0.7930414 (370)	total: 15.2s	remaining: 22.7s
500:	learn: 0.6804351	test: 0.7884974	best: 0.7884280 (499)	total: 18.9s	remaining: 18.9s
600:	learn: 0.6624269	test: 0.7858885	best: 0.7858535 (598)	total: 23.1s	remaining: 15.3s
700:	learn: 0.6420662	test: 0.7810363	best: 0.7810235 (696)	total: 26.9s	remaining: 11.5s
800:	learn: 0.6251018	test: 0.7777183	best: 0.7776510 (792)	total: 31.1s	remaining: 7.72s
900:	learn: 0.6102722	test: 0.7743571	best: 0.7742942 (898)	total: 35.2s	remaining: 3.87s
9

<catboost.core.CatBoostClassifier at 0x19b202375e0>

CV Log Loss Score: 0.773385

----------------- Fold 5 -----------------

Learning rate set to 0.115043
0:	learn: 1.0340829	test: 1.0346846	best: 1.0346846 (0)	total: 35.1ms	remaining: 35.1s
100:	learn: 0.7835730	test: 0.8049254	best: 0.8049254 (100)	total: 3.32s	remaining: 29.6s
200:	learn: 0.7500575	test: 0.7931703	best: 0.7931703 (200)	total: 7.61s	remaining: 30.2s
300:	learn: 0.7261283	test: 0.7863153	best: 0.7862896 (299)	total: 12s	remaining: 27.8s
400:	learn: 0.7037861	test: 0.7797752	best: 0.7796783 (399)	total: 16.2s	remaining: 24.1s
500:	learn: 0.6840794	test: 0.7749011	best: 0.7749011 (500)	total: 20.1s	remaining: 20.1s
600:	learn: 0.6651191	test: 0.7729836	best: 0.7728524 (587)	total: 24.2s	remaining: 16.1s
700:	learn: 0.6480070	test: 0.7713616	best: 0.7712854 (699)	total: 27.7s	remaining: 11.8s
800:	learn: 0.6303117	test: 0.7684475	best: 0.7680202 (781)	total: 31.1s	remaining: 7.74s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7680202043
bestIteratio

<catboost.core.CatBoostClassifier at 0x19b20237790>

CV Log Loss Score: 0.768020

----------------- Fold 6 -----------------

Learning rate set to 0.115043
0:	learn: 1.0352085	test: 1.0347448	best: 1.0347448 (0)	total: 34.3ms	remaining: 34.2s
100:	learn: 0.7881319	test: 0.8025186	best: 0.8025186 (100)	total: 3.2s	remaining: 28.5s
200:	learn: 0.7552430	test: 0.7903011	best: 0.7902821 (199)	total: 7.34s	remaining: 29.2s
300:	learn: 0.7278605	test: 0.7806939	best: 0.7806939 (300)	total: 11.4s	remaining: 26.5s
400:	learn: 0.7044460	test: 0.7751802	best: 0.7751802 (400)	total: 15.7s	remaining: 23.4s
500:	learn: 0.6834794	test: 0.7707507	best: 0.7707507 (500)	total: 19.7s	remaining: 19.6s
600:	learn: 0.6654387	test: 0.7695493	best: 0.7694843 (592)	total: 23.5s	remaining: 15.6s
700:	learn: 0.6470298	test: 0.7658978	best: 0.7658581 (698)	total: 27.5s	remaining: 11.7s
800:	learn: 0.6292476	test: 0.7623055	best: 0.7621518 (799)	total: 31.1s	remaining: 7.73s
900:	learn: 0.6139458	test: 0.7593406	best: 0.7593406 (900)	total: 34.8s	remaining: 3.82s
9

<catboost.core.CatBoostClassifier at 0x19b202376a0>

CV Log Loss Score: 0.756329

----------------- Fold 7 -----------------

Learning rate set to 0.115043
0:	learn: 1.0342853	test: 1.0335520	best: 1.0335520 (0)	total: 31.6ms	remaining: 31.6s
100:	learn: 0.7874502	test: 0.7915380	best: 0.7915380 (100)	total: 3.37s	remaining: 30s
200:	learn: 0.7533420	test: 0.7781210	best: 0.7780660 (199)	total: 6.86s	remaining: 27.3s
300:	learn: 0.7277870	test: 0.7700051	best: 0.7700051 (300)	total: 10.9s	remaining: 25.2s
400:	learn: 0.7066458	test: 0.7635345	best: 0.7635345 (400)	total: 14.2s	remaining: 21.2s
500:	learn: 0.6856068	test: 0.7582556	best: 0.7582556 (500)	total: 17.4s	remaining: 17.4s
600:	learn: 0.6662291	test: 0.7538991	best: 0.7538025 (595)	total: 20.8s	remaining: 13.8s
700:	learn: 0.6486422	test: 0.7504323	best: 0.7504323 (700)	total: 24.3s	remaining: 10.4s
800:	learn: 0.6315004	test: 0.7467306	best: 0.7464691 (798)	total: 27.5s	remaining: 6.83s
900:	learn: 0.6141501	test: 0.7447743	best: 0.7447743 (900)	total: 31.4s	remaining: 3.45s
99

<catboost.core.CatBoostClassifier at 0x19b20237b80>

CV Log Loss Score: 0.741725

----------------- Fold 8 -----------------

Learning rate set to 0.115043
0:	learn: 1.0341336	test: 1.0343548	best: 1.0343548 (0)	total: 31ms	remaining: 31s
100:	learn: 0.7853685	test: 0.8000339	best: 0.8000339 (100)	total: 3.3s	remaining: 29.4s
200:	learn: 0.7535296	test: 0.7876904	best: 0.7876823 (198)	total: 7.92s	remaining: 31.5s
300:	learn: 0.7283412	test: 0.7803910	best: 0.7803910 (300)	total: 11.3s	remaining: 26.3s
400:	learn: 0.7065717	test: 0.7744898	best: 0.7744898 (400)	total: 14.9s	remaining: 22.2s
500:	learn: 0.6868296	test: 0.7699989	best: 0.7699787 (499)	total: 18.7s	remaining: 18.6s
600:	learn: 0.6675020	test: 0.7670111	best: 0.7668552 (593)	total: 22.2s	remaining: 14.7s
700:	learn: 0.6485104	test: 0.7643483	best: 0.7642842 (672)	total: 25.5s	remaining: 10.9s
800:	learn: 0.6292531	test: 0.7627743	best: 0.7626838 (796)	total: 28.8s	remaining: 7.15s
900:	learn: 0.6121762	test: 0.7606049	best: 0.7606049 (900)	total: 32.1s	remaining: 3.52s
999:	

<catboost.core.CatBoostClassifier at 0x19b20237670>

CV Log Loss Score: 0.758868

----------------- Fold 9 -----------------

Learning rate set to 0.115043
0:	learn: 1.0341785	test: 1.0340822	best: 1.0340822 (0)	total: 31.4ms	remaining: 31.3s
100:	learn: 0.7843458	test: 0.7991214	best: 0.7991214 (100)	total: 3.59s	remaining: 31.9s
200:	learn: 0.7513504	test: 0.7891149	best: 0.7891007 (198)	total: 8.1s	remaining: 32.2s
300:	learn: 0.7243293	test: 0.7821097	best: 0.7821097 (300)	total: 12.5s	remaining: 29s
400:	learn: 0.7016558	test: 0.7765358	best: 0.7765358 (400)	total: 16.1s	remaining: 24s
500:	learn: 0.6805866	test: 0.7716903	best: 0.7716903 (500)	total: 19.7s	remaining: 19.6s
600:	learn: 0.6612055	test: 0.7694699	best: 0.7694699 (600)	total: 24s	remaining: 15.9s
700:	learn: 0.6428791	test: 0.7659496	best: 0.7657665 (696)	total: 27.9s	remaining: 11.9s
800:	learn: 0.6256736	test: 0.7621747	best: 0.7621073 (799)	total: 31.7s	remaining: 7.87s
900:	learn: 0.6061365	test: 0.7621178	best: 0.7615033 (841)	total: 35.7s	remaining: 3.93s
Stopped

<catboost.core.CatBoostClassifier at 0x19b20237dc0>

CV Log Loss Score: 0.761503
	Log Loss: 0.757190


In [59]:
submission=pd.read_csv('sample_submission.csv')
submission.iloc[:, 1:] = cat_pred_test
submission

Unnamed: 0,index,0,1,2
0,26457,0.108691,0.140215,0.751094
1,26458,0.141232,0.157840,0.700928
2,26459,0.119412,0.151644,0.728944
3,26460,0.128912,0.142789,0.728299
4,26461,0.091621,0.218364,0.690015
...,...,...,...,...
9995,36452,0.174974,0.191246,0.633779
9996,36453,0.101350,0.242709,0.655941
9997,36454,0.024520,0.078620,0.896860
9998,36455,0.127251,0.172769,0.699980


In [61]:
submission.to_csv('ml_cat_fin.csv', index=False)

### smote 버전

In [62]:
skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
folds=[]
for train_idx, valid_idx in skfold.split(X_over, y_over):
        folds.append((train_idx, valid_idx))

cat_pred = np.zeros((X_over.shape[0], n_class))
cat_pred_test_nc = np.zeros((X_test.shape[0], n_class))
cat_cols = ['income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type']
for fold in range(n_fold):
  print(f'\n----------------- Fold {fold} -----------------\n')
  train_idx, valid_idx = folds[fold]
  X_train, X_valid, y_train, y_valid = X_over.iloc[train_idx], X_over.iloc[valid_idx], y_over.iloc[train_idx], y_over.iloc[valid_idx]
  train_data = Pool(data=X_train, label=y_train, cat_features=cat_cols)
  valid_data = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

  model_cat = CatBoostClassifier()
  model_cat.fit(train_data, eval_set=valid_data, use_best_model=True, early_stopping_rounds=100, verbose=100)
  
  cat_pred[valid_idx] = model_cat.predict_proba(X_valid)
  cat_pred_test_nc += model_cat.predict_proba(X_test) / n_fold
  print(f'CV Log Loss Score: {log_loss(y_valid, cat_pred[valid_idx]):.6f}')
    
print(f'\tLog Loss: {log_loss(y, cat_pred):.6f}')



----------------- Fold 0 -----------------

Learning rate set to 0.116558
0:	learn: 1.0856816	test: 1.0854006	best: 1.0854006 (0)	total: 48.5ms	remaining: 48.5s
100:	learn: 0.9109535	test: 0.9209863	best: 0.9209863 (100)	total: 6.56s	remaining: 58.4s
200:	learn: 0.8251711	test: 0.8485294	best: 0.8485294 (200)	total: 13.3s	remaining: 52.7s
300:	learn: 0.7695292	test: 0.8038491	best: 0.8038491 (300)	total: 20s	remaining: 46.5s
400:	learn: 0.7287393	test: 0.7713085	best: 0.7713085 (400)	total: 26.5s	remaining: 39.5s
500:	learn: 0.6946040	test: 0.7446072	best: 0.7446072 (500)	total: 32.6s	remaining: 32.5s
600:	learn: 0.6643090	test: 0.7229421	best: 0.7229421 (600)	total: 38.9s	remaining: 25.8s
700:	learn: 0.6406713	test: 0.7064192	best: 0.7064192 (700)	total: 44.6s	remaining: 19s
800:	learn: 0.6205704	test: 0.6919974	best: 0.6919974 (800)	total: 50.5s	remaining: 12.5s
900:	learn: 0.6010270	test: 0.6772650	best: 0.6772650 (900)	total: 57.1s	remaining: 6.28s
999:	learn: 0.5836423	test: 0.66

<catboost.core.CatBoostClassifier at 0x19b202377f0>

CV Log Loss Score: 0.666576

----------------- Fold 1 -----------------

Learning rate set to 0.116558
0:	learn: 1.0861036	test: 1.0855469	best: 1.0855469 (0)	total: 56.1ms	remaining: 56s
100:	learn: 0.9186259	test: 0.9190420	best: 0.9190420 (100)	total: 6.09s	remaining: 54.3s
200:	learn: 0.8289176	test: 0.8375386	best: 0.8375386 (200)	total: 12.3s	remaining: 49.1s
300:	learn: 0.7704060	test: 0.7877278	best: 0.7877278 (300)	total: 18.5s	remaining: 43.1s
400:	learn: 0.7321207	test: 0.7575089	best: 0.7575089 (400)	total: 24.1s	remaining: 35.9s
500:	learn: 0.6996505	test: 0.7330885	best: 0.7330885 (500)	total: 29.9s	remaining: 29.7s
600:	learn: 0.6707237	test: 0.7115592	best: 0.7115592 (600)	total: 35.5s	remaining: 23.6s
700:	learn: 0.6451596	test: 0.6942062	best: 0.6942062 (700)	total: 41.4s	remaining: 17.7s
800:	learn: 0.6215134	test: 0.6770839	best: 0.6770839 (800)	total: 47.2s	remaining: 11.7s
900:	learn: 0.6024883	test: 0.6640655	best: 0.6640655 (900)	total: 53.1s	remaining: 5.84s
99

<catboost.core.CatBoostClassifier at 0x19b202372b0>

CV Log Loss Score: 0.653995

----------------- Fold 2 -----------------

Learning rate set to 0.116558
0:	learn: 1.0866893	test: 1.0867151	best: 1.0867151 (0)	total: 56.3ms	remaining: 56.3s
100:	learn: 0.9104775	test: 0.9155194	best: 0.9155194 (100)	total: 5.56s	remaining: 49.5s
200:	learn: 0.8221760	test: 0.8369740	best: 0.8369740 (200)	total: 11.8s	remaining: 46.9s
300:	learn: 0.7672087	test: 0.7912292	best: 0.7912292 (300)	total: 17.5s	remaining: 40.7s
400:	learn: 0.7324766	test: 0.7647522	best: 0.7647522 (400)	total: 23.4s	remaining: 34.9s
500:	learn: 0.6993518	test: 0.7407635	best: 0.7407635 (500)	total: 29.2s	remaining: 29.1s
600:	learn: 0.6711783	test: 0.7198585	best: 0.7198585 (600)	total: 35.5s	remaining: 23.6s
700:	learn: 0.6461553	test: 0.6998772	best: 0.6998772 (700)	total: 41.9s	remaining: 17.9s
800:	learn: 0.6224377	test: 0.6837742	best: 0.6837628 (799)	total: 48.9s	remaining: 12.1s
900:	learn: 0.6015888	test: 0.6686616	best: 0.6686616 (900)	total: 55.7s	remaining: 6.12s


<catboost.core.CatBoostClassifier at 0x19b202375b0>

CV Log Loss Score: 0.659002

----------------- Fold 3 -----------------

Learning rate set to 0.116558
0:	learn: 1.0862175	test: 1.0863526	best: 1.0863526 (0)	total: 62.1ms	remaining: 1m 2s
100:	learn: 0.9144148	test: 0.9265179	best: 0.9265179 (100)	total: 6.79s	remaining: 1m
200:	learn: 0.8225973	test: 0.8488894	best: 0.8488894 (200)	total: 13.6s	remaining: 53.9s
300:	learn: 0.7689689	test: 0.8063578	best: 0.8063578 (300)	total: 19.8s	remaining: 45.9s
400:	learn: 0.7310532	test: 0.7747440	best: 0.7747440 (400)	total: 26.2s	remaining: 39.2s
500:	learn: 0.6973923	test: 0.7481678	best: 0.7481678 (500)	total: 32.9s	remaining: 32.8s
600:	learn: 0.6692787	test: 0.7269620	best: 0.7269620 (600)	total: 38.9s	remaining: 25.8s
700:	learn: 0.6447418	test: 0.7085566	best: 0.7085566 (700)	total: 45.2s	remaining: 19.3s
800:	learn: 0.6237556	test: 0.6946892	best: 0.6946892 (800)	total: 51.2s	remaining: 12.7s
900:	learn: 0.6039766	test: 0.6807864	best: 0.6807864 (900)	total: 57.3s	remaining: 6.3s
999:

<catboost.core.CatBoostClassifier at 0x19b1c26ad30>

CV Log Loss Score: 0.667900

----------------- Fold 4 -----------------

Learning rate set to 0.116558
0:	learn: 1.0855071	test: 1.0860505	best: 1.0860505 (0)	total: 62.7ms	remaining: 1m 2s
100:	learn: 0.9065830	test: 0.9235822	best: 0.9235822 (100)	total: 6.64s	remaining: 59.1s
200:	learn: 0.8230084	test: 0.8527409	best: 0.8527409 (200)	total: 13s	remaining: 51.7s
300:	learn: 0.7675727	test: 0.8064466	best: 0.8064466 (300)	total: 19.8s	remaining: 46s
400:	learn: 0.7292418	test: 0.7757972	best: 0.7757972 (400)	total: 27.1s	remaining: 40.5s
500:	learn: 0.6966006	test: 0.7500869	best: 0.7500869 (500)	total: 34.2s	remaining: 34s
600:	learn: 0.6699616	test: 0.7292972	best: 0.7292972 (600)	total: 41s	remaining: 27.2s
700:	learn: 0.6469608	test: 0.7112532	best: 0.7112532 (700)	total: 47.4s	remaining: 20.2s
800:	learn: 0.6253549	test: 0.6954183	best: 0.6954183 (800)	total: 54s	remaining: 13.4s
900:	learn: 0.6045272	test: 0.6810897	best: 0.6810786 (899)	total: 1m	remaining: 6.61s
999:	learn: 0

<catboost.core.CatBoostClassifier at 0x19b20237880>

CV Log Loss Score: 0.669314

----------------- Fold 5 -----------------

Learning rate set to 0.116558
0:	learn: 1.0866218	test: 1.0872020	best: 1.0872020 (0)	total: 52.7ms	remaining: 52.7s
100:	learn: 0.9068869	test: 0.9237988	best: 0.9237988 (100)	total: 6.46s	remaining: 57.5s
200:	learn: 0.8209834	test: 0.8512546	best: 0.8512546 (200)	total: 12.8s	remaining: 51.1s
300:	learn: 0.7652765	test: 0.8054406	best: 0.8054406 (300)	total: 19.8s	remaining: 46s
400:	learn: 0.7249483	test: 0.7719610	best: 0.7719349 (399)	total: 26.6s	remaining: 39.8s
500:	learn: 0.6905994	test: 0.7455697	best: 0.7455697 (500)	total: 32.5s	remaining: 32.4s
600:	learn: 0.6617981	test: 0.7252708	best: 0.7252708 (600)	total: 38.6s	remaining: 25.6s
700:	learn: 0.6398819	test: 0.7082710	best: 0.7082710 (700)	total: 44.8s	remaining: 19.1s
800:	learn: 0.6192107	test: 0.6920384	best: 0.6920384 (800)	total: 51s	remaining: 12.7s
900:	learn: 0.6011081	test: 0.6798448	best: 0.6798448 (900)	total: 57.6s	remaining: 6.32s
999:

<catboost.core.CatBoostClassifier at 0x19b20258c40>

CV Log Loss Score: 0.666920

----------------- Fold 6 -----------------

Learning rate set to 0.116558
0:	learn: 1.0855711	test: 1.0867514	best: 1.0867514 (0)	total: 54ms	remaining: 54s
100:	learn: 0.9089741	test: 0.9268196	best: 0.9268196 (100)	total: 6.46s	remaining: 57.5s
200:	learn: 0.8227120	test: 0.8496239	best: 0.8496239 (200)	total: 12.7s	remaining: 50.4s
300:	learn: 0.7667995	test: 0.8006115	best: 0.8006115 (300)	total: 18.9s	remaining: 43.9s
400:	learn: 0.7253216	test: 0.7681167	best: 0.7681167 (400)	total: 25.2s	remaining: 37.7s
500:	learn: 0.6915964	test: 0.7413989	best: 0.7413989 (500)	total: 31.6s	remaining: 31.5s
600:	learn: 0.6657341	test: 0.7223035	best: 0.7223035 (600)	total: 37.9s	remaining: 25.2s
700:	learn: 0.6413799	test: 0.7035477	best: 0.7035477 (700)	total: 44.5s	remaining: 19s
800:	learn: 0.6196869	test: 0.6887434	best: 0.6887434 (800)	total: 50.7s	remaining: 12.6s
900:	learn: 0.6001661	test: 0.6758237	best: 0.6758237 (900)	total: 57.1s	remaining: 6.27s
999:	l

<catboost.core.CatBoostClassifier at 0x19b202584f0>

CV Log Loss Score: 0.663739

----------------- Fold 7 -----------------

Learning rate set to 0.116558
0:	learn: 1.0869905	test: 1.0871105	best: 1.0871105 (0)	total: 53.7ms	remaining: 53.7s
100:	learn: 0.9042392	test: 0.9144474	best: 0.9144474 (100)	total: 6.73s	remaining: 59.9s
200:	learn: 0.8192149	test: 0.8371785	best: 0.8371785 (200)	total: 13.6s	remaining: 54.2s
300:	learn: 0.7675800	test: 0.7928348	best: 0.7928348 (300)	total: 20.1s	remaining: 46.7s
400:	learn: 0.7258986	test: 0.7569406	best: 0.7569406 (400)	total: 26.3s	remaining: 39.3s
500:	learn: 0.6912648	test: 0.7298486	best: 0.7298486 (500)	total: 32.8s	remaining: 32.7s
600:	learn: 0.6636498	test: 0.7091384	best: 0.7091384 (600)	total: 38.9s	remaining: 25.8s
700:	learn: 0.6405836	test: 0.6935469	best: 0.6935469 (700)	total: 45.4s	remaining: 19.4s
800:	learn: 0.6204666	test: 0.6803784	best: 0.6803784 (800)	total: 52.1s	remaining: 12.9s
900:	learn: 0.6012609	test: 0.6665258	best: 0.6665258 (900)	total: 58.8s	remaining: 6.46s


<catboost.core.CatBoostClassifier at 0x19b198cec10>

CV Log Loss Score: 0.655302

----------------- Fold 8 -----------------

Learning rate set to 0.116558
0:	learn: 1.0870573	test: 1.0870783	best: 1.0870783 (0)	total: 50.4ms	remaining: 50.4s
100:	learn: 0.9087553	test: 0.9206583	best: 0.9206583 (100)	total: 6.14s	remaining: 54.6s
200:	learn: 0.8247028	test: 0.8431920	best: 0.8431920 (200)	total: 12.2s	remaining: 48.6s
300:	learn: 0.7741073	test: 0.7998263	best: 0.7998263 (300)	total: 18.5s	remaining: 42.9s
400:	learn: 0.7322681	test: 0.7646738	best: 0.7646738 (400)	total: 25.5s	remaining: 38.1s
500:	learn: 0.7014175	test: 0.7392036	best: 0.7392036 (500)	total: 31.5s	remaining: 31.4s
600:	learn: 0.6689228	test: 0.7140543	best: 0.7140543 (600)	total: 38.3s	remaining: 25.4s
700:	learn: 0.6453039	test: 0.6965704	best: 0.6965038 (699)	total: 44.3s	remaining: 18.9s
800:	learn: 0.6236147	test: 0.6806481	best: 0.6806481 (800)	total: 50.2s	remaining: 12.5s
900:	learn: 0.6039382	test: 0.6665350	best: 0.6665246 (899)	total: 56.1s	remaining: 6.17s


<catboost.core.CatBoostClassifier at 0x19b20258d90>

CV Log Loss Score: 0.652517

----------------- Fold 9 -----------------

Learning rate set to 0.116558
0:	learn: 1.0870211	test: 1.0874724	best: 1.0874724 (0)	total: 49.2ms	remaining: 49.2s
100:	learn: 0.9110096	test: 0.9237743	best: 0.9237743 (100)	total: 6.04s	remaining: 53.8s
200:	learn: 0.8258288	test: 0.8471549	best: 0.8471549 (200)	total: 12.3s	remaining: 49s
300:	learn: 0.7709331	test: 0.8002341	best: 0.8002341 (300)	total: 18.5s	remaining: 42.9s
400:	learn: 0.7303913	test: 0.7665504	best: 0.7665504 (400)	total: 24.6s	remaining: 36.8s
500:	learn: 0.6981236	test: 0.7409063	best: 0.7409063 (500)	total: 30.8s	remaining: 30.7s
600:	learn: 0.6697719	test: 0.7197074	best: 0.7197074 (600)	total: 36.9s	remaining: 24.5s
700:	learn: 0.6444445	test: 0.7008503	best: 0.7008503 (700)	total: 43.2s	remaining: 18.4s
800:	learn: 0.6216626	test: 0.6849767	best: 0.6849767 (800)	total: 49.8s	remaining: 12.4s
900:	learn: 0.6011937	test: 0.6702384	best: 0.6702384 (900)	total: 56.6s	remaining: 6.22s
99

<catboost.core.CatBoostClassifier at 0x19b1c1dbc70>

CV Log Loss Score: 0.656819


ValueError: Found input variables with inconsistent numbers of samples: [50877, 26447]

In [None]:
submission=pd.read_csv('sample_submission.csv')
submission.iloc[:, 1:] = cat_pred_test_nc
submission

In [None]:
submission.to_csv('ml_cat_smote_fin.csv', index=False)

&nbsp;

### RandomForest

In [63]:
from tensorflow.keras.utils import to_categorical

#### 파라미터 튜닝

In [64]:
seed = 30

params_rf = {'n_estimators': [110,130,150],
              'max_depth': [38,40,44,48,50],
              'min_samples_split': [9],
              'min_samples_leaf': [1,3,5,7,9]}

model_rf = RandomForestClassifier(random_state=seed)
    
search_rf = RandomizedSearchCV(estimator=model_rf, param_distributions=params_rf, scoring='neg_log_loss',
                                n_iter=30, cv = 5,random_state=seed)
search_rf.fit(X, y)
    

model_rf = RandomForestClassifier(n_estimators=search_rf.best_params_['n_estimators'],
                                  max_depth=search_rf.best_params_['max_depth'],
                                  min_samples_split=search_rf.best_params_['min_samples_split'],
                                  min_samples_leaf=search_rf.best_params_['min_samples_leaf'],
                                  random_state=seed)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=30),
                   n_iter=30,
                   param_distributions={'max_depth': [38, 40, 44, 48, 50],
                                        'min_samples_leaf': [1, 3, 5, 7, 9],
                                        'min_samples_split': [9],
                                        'n_estimators': [110, 130, 150]},
                   random_state=30, scoring='neg_log_loss')

In [79]:
from sklearn.model_selection import KFold, StratifiedKFold

def Kfold(model):
    folds=StratifiedKFold(n_splits=5, shuffle=True, random_state=156)
    outcomes=[]
    sub=np.zeros((X_test.shape[0], 3))
    for n_fold, (train_index, val_index) in enumerate(folds.split(X, y)):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        model.fit(X_train, y_train)
        
        predictions=model.predict_proba(X_val)
        
        logloss=log_loss(to_categorical(y_val), predictions)
        outcomes.append(logloss)
        print(f"FOLD {n_fold} : logloss:{logloss}")
        
        sub+=model.predict_proba(X_test)
        
        
    mean_outcome=np.mean(outcomes)
    
    print("Mean:{}".format(mean_outcome))
    return sub/folds.n_splits

rf_submission = Kfold(model_rf)

FOLD 0 : logloss:0.7087393632612599
FOLD 1 : logloss:0.7110248159366424
FOLD 2 : logloss:0.7173359801606047
FOLD 3 : logloss:0.7177966429142233
FOLD 4 : logloss:0.7251270902038277
Mean:0.7160047784953115


In [80]:
rf_submission = pd.DataFrame(rf_submission)

rf_submission['index'] = submission['index']
#rf_submission = rf_submission.reindex(columns=['index','0','1','2'])

In [84]:
rf_submission

rf_submission.to_csv('ml_rf_fin.csv',index=False)

Unnamed: 0,0,1,2,index
0,0.105532,0.189752,0.704716,26457
1,0.189374,0.289610,0.521016,26458
2,0.076976,0.089298,0.833726,26459
3,0.084531,0.089776,0.825694,26460
4,0.122682,0.213724,0.663594,26461
...,...,...,...,...
9995,0.125306,0.154692,0.720002,36452
9996,0.210707,0.287825,0.501469,36453
9997,0.030615,0.104622,0.864763,36454
9998,0.213529,0.313131,0.473341,36455


### smote 버전

In [87]:
seed = 30

params_rf = {'n_estimators': [110,130,150],
              'max_depth': [38,40,44,48,50],
              'min_samples_split': [9],
              'min_samples_leaf': [1,3,5,7,9]}

model_rf = RandomForestClassifier(random_state=seed)
    
search_rf = RandomizedSearchCV(estimator=model_rf, param_distributions=params_rf, scoring='neg_log_loss',
                                n_iter=30, cv = 5,random_state=seed)
search_rf.fit(X_over, y_over)
    

model_rf = RandomForestClassifier(n_estimators=search_rf.best_params_['n_estimators'],
                                  max_depth=search_rf.best_params_['max_depth'],
                                  min_samples_split=search_rf.best_params_['min_samples_split'],
                                  min_samples_leaf=search_rf.best_params_['min_samples_leaf'],
                                  random_state=seed)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=30),
                   n_iter=30,
                   param_distributions={'max_depth': [38, 40, 44, 48, 50],
                                        'min_samples_leaf': [1, 3, 5, 7, 9],
                                        'min_samples_split': [9],
                                        'n_estimators': [110, 130, 150]},
                   random_state=30, scoring='neg_log_loss')

In [93]:
from sklearn.model_selection import KFold, StratifiedKFold

def Kfold(model):
    folds=StratifiedKFold(n_splits=5, shuffle=True, random_state=156)
    outcomes=[]
    sub=np.zeros((X_test.shape[0], 3))
    for n_fold, (train_index, val_index) in enumerate(folds.split(X, y)):
        X_train, X_val = X_over.iloc[train_index], X_over.iloc[val_index]
        y_train, y_val = y_over.iloc[train_index], y_over.iloc[val_index]
        model.fit(X_train, y_train)
        
        predictions=model.predict_proba(X_val)
        
        logloss=log_loss(to_categorical(y_val), predictions)
        outcomes.append(logloss)
        print(f"FOLD {n_fold} : logloss:{logloss}")
        
        sub+=model.predict_proba(X_test)
        
        
    mean_outcome=np.mean(outcomes)
    
    print("Mean:{}".format(mean_outcome))
    return sub/folds.n_splits

rf_submission_nc = Kfold(model_rf)

FOLD 0 : logloss:0.7090487640333039
FOLD 1 : logloss:0.711749023755983
FOLD 2 : logloss:0.7176746146760286
FOLD 3 : logloss:0.718168701360285
FOLD 4 : logloss:0.7252969504047662
Mean:0.7163876108460733


In [94]:
rf_submission_nc = pd.DataFrame(rf_submission_nc)

rf_submission_nc['index'] = submission['index']
rf_submission_nc

Unnamed: 0,0,1,2,index
0,0.102560,0.186664,0.710776,26457
1,0.184391,0.287738,0.527872,26458
2,0.075715,0.086077,0.838208,26459
3,0.084916,0.090380,0.824703,26460
4,0.116445,0.216893,0.666663,26461
...,...,...,...,...
9995,0.121264,0.156484,0.722251,36452
9996,0.210103,0.290908,0.498989,36453
9997,0.031081,0.098489,0.870429,36454
9998,0.210210,0.319687,0.470103,36455


In [95]:
rf_submission_nc.to_csv('ml_rf_nc_fin.csv',index=False)