In [1]:
import os
import pandas as pd 
import numpy as np
import multiprocessing # 여러 개의 일꾼 (cpu)들에게 작업을 분산시키는 역할
from multiprocessing import Pool 
from functools import partial # 함수가 받는 인자들 중 몇개를 고정 시켜서 새롭게 파생된 함수를 형성하는 역할
from data_loader import data_loader_v2 # 자체적으로 만든 data loader version 2.0 ([데이콘 15회 대회] 데이터 설명 및 데이터 불러오기 영상 참조)

from sklearn.ensemble import RandomForestClassifier
import joblib # 모델을 저장하고 불러오는 역
from datetime import datetime

import lightgbm as lgb


In [2]:
# Input locations, given relative to the directory the notebook is run from.
train_folder, test_folder = 'data/train/', 'data/test/'
train_label_path = 'data/train_label.csv'

In [3]:
# os.listdir() returns directory entries in arbitrary, filesystem-dependent order.
# Sorting makes the file order (and therefore the row order of the frames built
# from these lists) identical on every run and every machine.
train_list = sorted(os.listdir(train_folder))
test_list = sorted(os.listdir(test_folder))

# Label table; the first CSV column becomes the DataFrame index.
train_label = pd.read_csv(train_label_path, index_col=0)

In [4]:
# Number of distinct values in the 'label' column (missing values, if any, count as one).
num_class = train_label['label'].nunique(dropna=False)

In [5]:
# Assumes that the transition to state_B happens at the same time in every csv file.
# In reality, the transition time may differ from one csv file to another.
def data_loader_all_v2(func, files, folder='', train_label=None, event_time=10, nrows=60):
    """Load every entry of ``files`` with ``func`` and stack the results.

    Args:
        func: Per-file loader. It is called as
            ``func(file, folder=..., train_label=..., event_time=..., nrows=...)``
            and must return an object ``pd.concat`` accepts. It has to be
            picklable when the worker pool is used.
        files: Iterable of items handed to ``func`` one at a time.
        folder: Forwarded unchanged to every ``func`` call.
        train_label: Forwarded unchanged to every ``func`` call.
        event_time: Forwarded unchanged to every ``func`` call.
        nrows: Forwarded unchanged to every ``func`` call.

    Returns:
        The per-file results concatenated with ``pd.concat``, in the order of ``files``.

    Raises:
        ValueError: Raised by ``pd.concat`` when ``files`` is empty.
    """
    func_fixed = partial(func, folder=folder, train_label=train_label,
                         event_time=event_time, nrows=nrows)

    if __name__ == '__main__':
        # One worker per CPU. The context manager releases the pool even when a
        # worker raises, instead of leaking the worker processes.
        with Pool(processes=multiprocessing.cpu_count()) as pool:
            df_list = list(pool.imap(func_fixed, files))
    else:
        # Outside the main module no pool is started. Load serially so the
        # function still returns data instead of failing on an unbound `df_list`.
        df_list = [func_fixed(file) for file in files]

    return pd.concat(df_list)

In [6]:
# Build the training frame from every file in the training folder,
# reading 120 rows per file (the loader's own default is 60).
train = data_loader_all_v2(
    data_loader_v2,
    train_list,
    folder=train_folder,
    train_label=train_label,
    event_time=10,
    nrows=120,
)


In [7]:
# Split the loaded frame into the target column and the remaining feature columns.
y_train = train['label']
X_train = train.drop(columns=['label'])
X_train.shape  # shown as the cell output: (rows, feature columns)

(91080, 5121)

In [53]:
# Timestamp taken when this configuration cell runs, formatted like 20200128T023219.
model_ts = datetime.now().strftime('%Y%m%dT%H%M%S')
print('model_ts', model_ts)

initscore_filename = ''

# LightGBM training parameters.
# Options left disabled for reference: boosting = 'gbdt' / 'dart' / 'goss',
# scale_pos_weight=200, lambda_l1=200, lambda_l2=5, device_type='gpu', tree_learner='data'.
# NOTE(review): the LightGBM parameter docs describe `is_unbalance` as used only by the
# binary and multiclassova objectives, so it presumably has no effect here — confirm.
params = dict(
    num_leaves=5,
    max_depth=3,
    num_class=num_class,
    objective='multiclass',
    metric='multi_logloss',
    num_threads=14,
    learning_rate=0.01,
    is_unbalance=True,
    bagging_fraction=0.1,
    bagging_freq=10,
    feature_fraction=0.1,
    initscore_filename=initscore_filename,
)
print(params)

# Parameters handed to lgb.Dataset when the training data is built.
# Option left disabled for reference: enable_bundle=False.
data_params = dict(max_bin=127)
print(data_params)

# Boosting rounds requested per lgb.train call.
num_round = 500
print('num_round:', num_round)

model_ts 20200128T023219
{'num_leaves': 5, 'max_depth': 3, 'num_class': 198, 'objective': 'multiclass', 'metric': 'multi_logloss', 'num_threads': 14, 'learning_rate': 0.01, 'is_unbalance': True, 'bagging_fraction': 0.1, 'bagging_freq': 10, 'feature_fraction': 0.1, 'initscore_filename': ''}
{'max_bin': 127}
num_round: 500


In [55]:
# Load the object stored in this file (presumably a model saved by an earlier run — confirm).
# NOTE(review): joblib.load unpickles the file, and unpickling can execute arbitrary
# code, so only load model files that you created yourself or otherwise trust.
init_model = joblib.load('model/20200128T022935.model')

In [56]:
# 0.02x overfit 0.805803

# 0.08323 0.635796
# 0.174632 0.727734

In [64]:
# Start from the checkpoint loaded into `init_model`. The loop feeds `model` back into
# lgb.train as `init_model`, so `model` must exist before the first iteration; without
# this line the cell raises NameError on a freshly restarted kernel.
model = init_model

# The test features are identical for every round, so load them once instead of
# re-reading every test file on each iteration.
test = data_loader_all_v2(data_loader_v2, test_list, folder=test_folder, train_label=None, event_time=10, nrows=None)

# joblib.dump and DataFrame.to_csv do not create missing directories.
os.makedirs('model', exist_ok=True)
os.makedirs('submit', exist_ok=True)

# Each pass continues boosting from the previous pass's model, then saves a checkpoint
# and a submission file tagged with the timestamp and the final training loss.
for i in range(10):
    print(i)

    # NOTE(review): the Dataset is rebuilt every round, presumably because a constructed
    # Dataset cannot be attached to a different init_model once its raw data has been
    # freed — confirm before hoisting this out of the loop.
    train_set = lgb.Dataset(X_train, label=y_train, params=data_params)

    # The only validation set is the training set itself, so the logged loss and the
    # early-stopping criterion measure fit to the training data, not generalization.
    # NOTE(review): the early_stopping_rounds / verbose_eval / evals_result keyword
    # arguments belong to the pre-4.0 lgb.train signature; newer LightGBM releases
    # expect callbacks instead — confirm the installed version.
    evals_result = {}
    model = lgb.train(params, train_set, num_round, early_stopping_rounds=300,
                      valid_sets=[train_set],
                      verbose_eval=50,
                      evals_result=evals_result,
                      init_model=model,
                      )

    # Tag = current timestamp + last recorded training multi_logloss.
    model_tag = '{}_{}'.format(datetime.now().strftime('%Y%m%dT%H%M%S'), evals_result['training']['multi_logloss'][-1])
    print(model_tag)

    joblib.dump(model, 'model/{}.model'.format(model_tag))

    pred = model.predict(test)

    # Rows of `test` can share an index value; average the predictions of all rows
    # with the same id so the submission holds one row per id.
    submission = pd.DataFrame(data=pred)
    submission.index = test.index
    submission.index.name = 'id'
    submission = submission.sort_index()
    submission = submission.groupby('id').mean()

    submission.to_csv('submit/{}.csv'.format(model_tag), index=True)

0
Training until validation scores don't improve for 300 rounds
[1550]	training's multi_logloss: 0.113162
[1600]	training's multi_logloss: 0.109926
[1650]	training's multi_logloss: 0.106738
[1700]	training's multi_logloss: 0.103542
[1750]	training's multi_logloss: 0.100775
[1800]	training's multi_logloss: 0.0980215
[1850]	training's multi_logloss: 0.0952497
[1900]	training's multi_logloss: 0.092805
[1950]	training's multi_logloss: 0.09035
[2000]	training's multi_logloss: 0.0881891
Did not meet early stopping. Best iteration is:
[2000]	training's multi_logloss: 0.0881891
20200128T025414_0.08818913412660236
Training until validation scores don't improve for 300 rounds
[2050]	training's multi_logloss: 0.0861216
[2100]	training's multi_logloss: 0.0842805
[2150]	training's multi_logloss: 0.0824029
[2200]	training's multi_logloss: 0.0804162
[2250]	training's multi_logloss: 0.0787499
[2300]	training's multi_logloss: 0.0770176
[2350]	training's multi_logloss: 0.0752593
[2400]	training's multi_