In [1]:
import os
import pandas as pd 
import numpy as np
import multiprocessing # 여러 개의 일꾼 (cpu)들에게 작업을 분산시키는 역할
from multiprocessing import Pool 
from functools import partial # 함수가 받는 인자들 중 몇개를 고정 시켜서 새롭게 파생된 함수를 형성하는 역할
from data_loader import data_loader_v2 # 자체적으로 만든 data loader version 2.0 ([데이콘 15회 대회] 데이터 설명 및 데이터 불러오기 영상 참조)

from sklearn.ensemble import RandomForestClassifier
import joblib # 모델을 저장하고 불러오는 역
from datetime import datetime

import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm_notebook

from sklearn.metrics import log_loss

from tools import eval_summary, save_feature_importance, merge_preds, report





In [2]:
train_folder = 'data/train/'
test_folder = 'data/test/'
train_label_path = 'data/train_label.csv'

In [3]:
train_list = os.listdir(train_folder)
test_list = os.listdir(test_folder)
train_label = pd.read_csv(train_label_path, index_col=0)

In [4]:
num_class = len(train_label['label'].unique())

In [5]:
def data_loader_all(func, files, folder='', train_label=None, event_time=10, nrows=60, use_cols=[]):   
    func_fixed = partial(func, folder=folder, train_label=train_label, nrows=nrows, use_cols=use_cols)     
    if __name__ == '__main__':
        pool = Pool(processes=multiprocessing.cpu_count()) 
        df_list = list(pool.imap(func_fixed, files)) 
        pool.close()
        pool.join()        
    combined_df = pd.concat(df_list)    
    return combined_df



def _data_loader(file_name, folder='', train_label=None, nrows=60, use_cols=[]):
    file_id = int(file_name.split('.')[0]) 
    df = pd.read_csv(folder+file_name, nrows=nrows) 
    df = df.replace('.*', 0, regex=True).fillna(0) 
    
    df['id'] = file_id
    
        
    cols = ['time', 'id'] + use_cols
    
    df = df[cols]   
    
    df['time'] = df['time'].apply(lambda x: f't{x:03}')
    df_p = df.pivot('id', 'time', use_cols)
    df_p.columns = ['_'.join(col) for col in df_p.columns]
    
    
    ##
    
    if type(train_label) != type(None):
        label = train_label.loc[file_id]['label'] 
        df_p['label'] = label
    
    return df_p

In [6]:
event_time = 10
nrows = 60
use_cols = ['V3239', 'V3237', 'V3238']
train = data_loader_all(_data_loader, train_list, folder=train_folder, train_label=train_label, 
                           event_time=event_time, nrows=nrows, use_cols=use_cols)
print(train.shape)

(828, 181)


In [7]:
test = data_loader_all(_data_loader, test_list, folder=test_folder, train_label=None, 
                       event_time=10, nrows=None, use_cols=use_cols)
print(test.shape)


(720, 180)


In [8]:
train = train.reset_index()
train.head(5)

Unnamed: 0,id,V3239_t000,V3239_t001,V3239_t002,V3239_t003,V3239_t004,V3239_t005,V3239_t006,V3239_t007,V3239_t008,...,V3238_t051,V3238_t052,V3238_t053,V3238_t054,V3238_t055,V3238_t056,V3238_t057,V3238_t058,V3238_t059,label
0,105,16.564892,16.558286,16.558963,16.560084,16.563487,16.55968,16.555753,16.558892,16.560912,...,16.551255,16.562519,16.564556,16.555987,16.557727,16.551886,16.539071,16.541303,16.561505,77
1,617,16.563824,16.563763,16.563763,16.562532,16.559973,16.55808,16.557836,16.565339,16.561291,...,16.558306,16.567346,16.554973,16.552679,16.573336,16.569496,16.570331,16.553238,16.564369,114
2,12,16.564937,16.562374,16.56294,16.563478,16.557728,16.869688,16.841234,16.826543,16.810714,...,16.598243,16.582688,16.594089,16.605097,16.600743,16.591576,16.571638,16.595281,16.58368,132
3,465,16.55965,16.561563,16.563542,16.556449,16.559149,16.563376,16.558738,16.560629,16.558002,...,16.57017,16.570964,16.582383,16.56289,16.573013,16.561436,16.552583,16.56387,16.543884,4
4,313,16.561392,16.561237,16.555722,16.56512,16.560582,16.558743,16.557365,16.562224,16.559725,...,16.736279,16.721753,16.746977,16.678755,16.64438,16.662776,16.681025,16.665391,16.642454,22


In [9]:
fea_cols = [c for c in train.columns if c[0] == 'V']
len(fea_cols)

180

In [None]:
# zero_cols = joblib.load('zero_cols.bin')
# fea_cols = [c for c in fea_cols if c not in zero_cols]

In [None]:
# use_cols = joblib.load('use_cols.bin')
# fea_cols = use_cols[:500]

In [17]:
# fea_cols = cols

In [10]:
len(fea_cols)

180

In [11]:
train['label'].value_counts(dropna=False)

110    24
17     21
114    20
118    20
113    19
       ..
134     1
135     1
137     1
138     1
98      1
Name: label, Length: 198, dtype: int64

In [20]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=81511991154 % 2**32-1)

# X_train.shape, X_test.shape

In [12]:
model_ts = datetime.now().strftime('%Y%m%dT%H%M%S')
print('model_ts', model_ts)

initscore_filename = ''
params = {
    'boosting':'gbdt',
#     'boosting':'dart',
#     'boosting':'goss',
    'num_leaves': 7,
#     'max_depth': 3,
    'num_class':num_class,
    'objective': 'multiclass',
    'metric':'multi_logloss',
    'num_threads': 16,
    'learning_rate': 0.01,
    'is_unbalance': True,
#     'scale_pos_weight':200,
    'bagging_fraction':1.0,
#     'bagging_freq':5,
    'feature_fraction':1.0,
    'initscore_filename':initscore_filename,
#     'lambda_l1':200,
#     'lambda_l2':20,
    'device_type':'gpu',
#     'tree_learner':'data',

}
print(params)

data_params = {
#     'max_bin':127,
#     'enable_bundle': False,
}
print(data_params)

num_round = 5000
print('num_round:', num_round)

model_ts 20200206T141753
{'boosting': 'gbdt', 'num_leaves': 7, 'num_class': 198, 'objective': 'multiclass', 'metric': 'multi_logloss', 'num_threads': 16, 'learning_rate': 0.01, 'is_unbalance': True, 'bagging_fraction': 1.0, 'feature_fraction': 1.0, 'initscore_filename': '', 'device_type': 'gpu'}
{}
num_round: 5000


In [None]:
model = None
submit_csv = []

X_train = train[fea_cols]
y_train = train['label']

print(X_train.shape)

train_set = lgb.Dataset(X_train, label=y_train, params=data_params)
val_set = lgb.Dataset(X_test, label=y_test, params=data_params)

evals_result = {}
model = lgb.train(params, train_set, num_round, early_stopping_rounds=200, 
                        valid_sets=[train_set],
                        verbose_eval=50,
                        evals_result=evals_result,
#                             init_model=model,
                       )

In [15]:
model_tag ='{}_{}'.format(model_ts,
                             evals_result['training']['multi_logloss'][model.best_iteration-1]
                            )
print(model_tag)

joblib.dump(model, 'model/{}.model'.format(model_tag))


pred = model.predict(test[fea_cols])

submission = pd.DataFrame(data=pred)
submission.index = test.index
submission.index.name = 'id'
submission = submission.sort_index()
submission = submission.groupby('id').mean()

csv_path = 'submit/{}.csv'.format(model_tag)
submit_csv.append(csv_path)
submission.to_csv(    csv_path, index=True) 

print(submission.sum(axis=1))
print(submission)

20200206T141753_4.5229651113300277e-05
id
828     1.0
829     1.0
830     1.0
831     1.0
832     1.0
       ... 
1543    1.0
1544    1.0
1545    1.0
1546    1.0
1547    1.0
Length: 720, dtype: float64
               0             1             2             3             4    \
id                                                                           
828   1.173297e-07  1.952713e-07  1.136406e-07  1.471526e-07  2.953771e-07   
829   1.529063e-03  1.410625e-04  5.619636e-04  1.761505e-04  4.209348e-04   
830   1.909268e-04  1.787835e-04  1.944110e-04  1.975971e-04  2.657966e-03   
831   1.428303e-03  1.263564e-03  1.794180e-03  2.989128e-03  1.003132e-02   
832   6.525820e-04  6.719813e-04  1.319957e-03  5.807292e-04  2.994565e-03   
...            ...           ...           ...           ...           ...   
1543  5.444246e-04  1.134919e-03  4.661991e-04  3.273118e-04  4.529080e-04   
1544  9.840530e-08  8.555108e-08  9.708615e-08  1.620849e-07  1.165334e-07   
1545  8.343374e-03

In [13]:
model = None
submit_csv = []
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, random_state=81511991154 % 2**32-1, shuffle=True)

cv = 0
for train_index, valid_index in tqdm_notebook(skf.split(train.index, train['label'].values), total=n_splits, desc = 'CV'):
#     cnt += 1
#     print(cnt)
#     if cnt < 3:
#         continue

    
    X_train, X_test = train.loc[train_index, fea_cols], train.loc[valid_index, fea_cols] 
    y_train, y_test = train.loc[train_index,'label'], train.loc[valid_index, 'label']    
    
    print(X_train.shape, X_test.shape)
#     print(y_train.value_counts(dropna=False))
#     print(y_test.value_counts(dropna=False))
    
    
    train_set = lgb.Dataset(X_train, label=y_train, params=data_params)
    val_set = lgb.Dataset(X_test, label=y_test, params=data_params)

    evals_result = {}
    model = lgb.train(params, train_set, num_round, early_stopping_rounds=200, 
                            valid_sets=[train_set, val_set],
                            verbose_eval=50,
                            evals_result=evals_result,
#                             init_model=model,
                           )

    model_tag ='{}_{}_{}_{}'.format(model_ts, cv,
                                 evals_result['valid_1']['multi_logloss'][model.best_iteration-1],
                                 evals_result['training']['multi_logloss'][model.best_iteration-1]
                                )
    print(model_tag)

    joblib.dump(model, 'model/{}.model'.format(model_tag))
    
    
    pred = model.predict(test[fea_cols])

    submission = pd.DataFrame(data=pred)
    submission.index = test.index
    submission.index.name = 'id'
    submission = submission.sort_index()
    submission = submission.groupby('id').mean()

    csv_path = 'submit/{}.csv'.format(model_tag)
    submit_csv.append(csv_path)
    submission.to_csv(    csv_path, index=True) 
    
    print(submission.sum(axis=1))
    print(submission)
    cv += 1
#     break

HBox(children=(IntProgress(value=0, description='CV', max=10, style=ProgressStyle(description_width='initial')…



(734, 180) (94, 180)
Training until validation scores don't improve for 200 rounds
[50]	training's multi_logloss: 2.64769	valid_1's multi_logloss: 6.59114
[100]	training's multi_logloss: 1.62915	valid_1's multi_logloss: 6.59394
[150]	training's multi_logloss: 1.05809	valid_1's multi_logloss: 6.61689
[200]	training's multi_logloss: 0.70764	valid_1's multi_logloss: 6.6536
[250]	training's multi_logloss: 0.481514	valid_1's multi_logloss: 6.70741
Early stopping, best iteration is:
[81]	training's multi_logloss: 1.9418	valid_1's multi_logloss: 6.58614
20200206T141753_0_6.586139127087876_1.9418004339346369
id
828     1.0
829     1.0
830     1.0
831     1.0
832     1.0
       ... 
1543    1.0
1544    1.0
1545    1.0
1546    1.0
1547    1.0
Length: 720, dtype: float64
           0         1         2         3         4         5         6    \
id                                                                           
828   0.001980  0.000991  0.000991  0.002969  0.001980  0.002969  0.00452

KeyboardInterrupt: 

In [None]:
# submissions = [
# 'submit/20200201T193822_0.42165222105307115_0.03657010393259738.csv',
# 'submit/20200201T183544_0.4354487978488266_0.04354968619883053.csv',
# 'submit/20200201T173725_0.423066834354457_0.03614391993976106.csv',
# 'submit/20200201T163343_0.4287587567261741_0.042392138498467166.csv',
# 'submit/20200201T153529_0.42326652930531944_0.04098269988118284.csv',
# 'submit/20200201T143616_0.4258237823312355_0.04651153387555587.csv',
# 'submit/20200201T134047_0.43147156765580946_0.043682031170534714.csv',
# 'submit/20200201T125939_0.42958065644660504_0.040012625819045466.csv',
# 'submit/20200201T120739_0.42070899280425217_0.03477196302416032.csv',
# 'submit/20200201T110556_0.4181683365658109_0.043843902710973416.csv',
# ]


dfs = [pd.read_csv(s) for s in submit_csv]

df_submit = pd.concat(dfs)

df_submit = df_submit.groupby('id').mean()

df_submit.to_csv('submit/{}_e{}.csv'.format(model_ts, n_splits), index=True) 

In [None]:
df_submit.sum(1)

In [None]:
importance_type = 'split'
impt_dict = {k:v for k, v in zip(fea_cols, model.feature_importance(importance_type=importance_type))}
# sorted(impt_dict.items(), key=(lambda x:x[1]), reverse=True)
# zero_cols = []
for k, s in sorted(impt_dict.items(), key=(lambda x:x[1]), reverse=False):
    if s == 0:
        zero_cols.append(k)
# joblib.dump(impt_dict, f'model/{file_name}_{importance_type}.pkl')

len(zero_cols)

In [None]:
joblib.dump(zero_cols, 'zero_cols.bin')

In [None]:
# model = joblib.load('model/20200130T221520_2.4393985000913667_0.07225009557115544.model')

In [None]:
# pred = model.predict(test)

# submission = pd.DataFrame(data=pred)
# submission.index = test.index
# submission.index.name = 'id'
# submission = submission.sort_index()
# submission = submission.groupby('id').mean()

# submission.to_csv('submit/{}.csv'.format(model_tag), index=True) 
# model_tag

# submission.sum(axis=1)

# submission