In [1]:
import os
import pandas as pd 
import numpy as np
import multiprocessing # 여러 개의 일꾼 (cpu)들에게 작업을 분산시키는 역할
from multiprocessing import Pool 
from functools import partial # 함수가 받는 인자들 중 몇개를 고정 시켜서 새롭게 파생된 함수를 형성하는 역할
from data_loader import data_loader_v2 # 자체적으로 만든 data loader version 2.0 ([데이콘 15회 대회] 데이터 설명 및 데이터 불러오기 영상 참조)

from sklearn.ensemble import RandomForestClassifier
import joblib # 모델을 저장하고 불러오는 역
from datetime import datetime

import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm_notebook

from sklearn.metrics import log_loss

from tools import eval_summary, save_feature_importance, merge_preds, report





In [2]:
# Input locations, relative to the notebook's working directory.
train_folder, test_folder = 'data/train/', 'data/test/'
train_label_path = 'data/train_label.csv'

In [3]:
# os.listdir returns entries in arbitrary, filesystem-dependent order.
# Sorting makes the file order (and the row order of any frame built from
# these lists) reproducible across machines and runs.
train_list = sorted(os.listdir(train_folder))
test_list = sorted(os.listdir(test_folder))
# Label table; the first CSV column becomes the index.
train_label = pd.read_csv(train_label_path, index_col=0)

In [4]:
# Number of distinct values in the 'label' column (a missing value, if
# present, is counted as one value, exactly as len(unique()) would).
num_class = train_label['label'].nunique(dropna=False)

In [5]:
# Assumes every csv file switches to state_B at the same time (event_time).
# In reality the switch time may differ from one csv file to another.
def data_loader_all_v2(func, files, folder='', train_label=None, event_time=10, nrows=60):
    """Load every file with ``func`` and stack the results into one DataFrame.

    Parameters
    ----------
    func : callable
        Per-file loader, called as
        ``func(file, folder=..., train_label=..., event_time=..., nrows=...)``.
        Each call must return something ``pd.concat`` accepts.
    files : iterable
        Items handed to ``func`` one at a time.
    folder, train_label, event_time, nrows
        Forwarded unchanged to every ``func`` call.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-file results, in the order of ``files``.

    Raises
    ------
    ValueError
        If ``files`` is empty (there is nothing to concatenate).
    """
    func_fixed = partial(func, folder=folder, train_label=train_label, event_time=event_time, nrows=nrows)

    files = list(files)
    if not files:
        # Same exception type pd.concat([]) would raise, but without first
        # starting a worker pool.
        raise ValueError('No objects to concatenate: `files` is empty')

    if __name__ == '__main__':
        # Leave two cores free, but never ask for fewer than one worker:
        # Pool rejects a process count below 1.
        n_workers = max(1, multiprocessing.cpu_count() - 2)
        # The context manager tears the workers down if a loader raises.
        with Pool(processes=n_workers) as pool:
            df_list = list(pool.imap(func_fixed, files))
            pool.close()
            pool.join()
    else:
        # Not running as the main module: load serially instead of failing
        # on an unbound df_list.
        df_list = [func_fixed(f) for f in files]

    return pd.concat(df_list)

In [6]:
# event_time = 10
# nrows = 60
# train = data_loader_all_v2(data_loader_v2, train_list, folder=train_folder, train_label=train_label,
#                            event_time=event_time, nrows=nrows)
# print(train.shape)
# joblib.dump(train, 'data/df_train_{}_{}.pkl'.format(event_time, nrows))

In [7]:
# event_time = 10
# test = data_loader_all_v2(data_loader_v2, test_list, folder=test_folder, train_label=None, event_time=event_time, nrows=None)
# print(test.shape)
# joblib.dump(test, 'data/df_test_{}.pkl'.format(event_time))

In [8]:
# Load the cached train / test frames from disk.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files you
# produced yourself.
train, test = (
    joblib.load(pkl_path)
    for pkl_path in ('data/df_train_10_60.pkl', 'data/df_test_10.pkl')
)


In [9]:
# Move the existing index into an 'index' column and give the frame a plain
# 0..n-1 index. The guard keeps the cell idempotent: without it a re-run
# would add a 'level_0' column and a third run would raise.
if 'index' not in train.columns:
    train = train.reset_index()
train.head(5)

Unnamed: 0,index,V0000,V0001,V0002,V0003,V0004,V0005,V0006,V0007,V0008,...,V5112,V5113,V5114,V5115,V5116,V5117,V5118,V5119,V5120,label
0,105,30.464769,8.677597,8.702804,8.730314,8.710375,188.466110,192.279094,3.577269e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,5.235258e-08,85.4,0.0,77
1,105,30.464943,8.791777,8.741013,8.713725,8.719421,217.356293,180.249471,1.489698e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,-2.374557e-05,85.4,0.0,77
2,105,30.488713,8.727617,8.704063,8.735527,8.695147,211.251065,203.137411,-4.623827e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,6.323392e-07,85.4,0.0,77
3,105,30.480049,8.648655,8.703581,8.701050,8.712508,191.682448,229.797028,-4.555857e-20,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,-1.886027e-07,85.4,0.0,77
4,105,30.458851,8.775581,8.692660,8.668370,8.693597,171.733996,197.299448,2.670567e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,6.486860e-06,85.4,0.0,77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41395,412,30.464380,8.756510,8.704116,8.708070,8.695244,204.176503,161.976163,2.251255e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,9.459667e-06,85.4,0.0,19
41396,412,30.469418,8.795708,8.710370,8.675788,8.729044,179.953484,219.859216,4.935893e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,-3.433227e-05,85.4,0.0,19
41397,412,30.496433,8.718257,8.711385,8.743894,8.696282,183.269207,177.435280,2.561626e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,-1.931438e-05,85.4,0.0,19
41398,412,30.486438,8.765540,8.699353,8.699495,8.723896,193.128757,225.198348,-8.936770e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,7.236397e-06,85.4,0.0,19


In [10]:
# Feature columns are the ones whose name starts with 'V'. startswith() on
# string labels only, so an empty or non-string column label is skipped
# instead of raising.
fea_cols = [c for c in train.columns if isinstance(c, str) and c.startswith('V')]
len(fea_cols)

5121

#### 전처리

In [11]:
from sklearn.preprocessing import MinMaxScaler

# Fit the per-column min/max on the training rows only, then rescale both
# frames with those training statistics.
scaler = MinMaxScaler().fit(train[fea_cols].values)
for frame in (train, test):
    frame[fea_cols] = scaler.transform(frame[fea_cols].values)

In [16]:
from sklearn.decomposition import PCA

n_components = 100
# Fix the seed so the projection is reproducible: PCA may pick a randomized
# solver for inputs of this size, and a model saved from one run must see the
# same components on the next. Same seed expression as the CV split.
pca = PCA(n_components=n_components, random_state=81511991154 % 2**32 - 1)
pca.fit(train[fea_cols].values)

cols = ['pca_{}'.format(i) for i in range(n_components)]

# Project once per frame, then write each component straight into its column
# (no throwaway zero-filled columns; re-running simply overwrites them).
train_pca = pca.transform(train[fea_cols].values)
test_pca = pca.transform(test[fea_cols].values)
for i, c in enumerate(cols):
    train[c] = train_pca[:, i]
    test[c] = test_pca[:, i]

In [None]:
# zero_cols = joblib.load('zero_cols.bin')
# fea_cols = [c for c in fea_cols if c not in zero_cols]

In [None]:
# use_cols = joblib.load('use_cols.bin')
# fea_cols = use_cols[:500]

In [17]:
# Switch the feature list to `cols`. This is an alias, not a copy: fea_cols and
# cols are the same list object, and any cell that reads fea_cols after this
# point gets these names instead of the 'V…' columns.
fea_cols = cols

In [18]:
# Sanity check: number of entries currently in the feature list.
len(fea_cols)

100

In [19]:
# Rows per label value; dropna=False also counts rows whose label is missing.
train['label'].value_counts(dropna=False)

110    1200
17     1050
114    1000
118    1000
117     950
       ... 
137      50
52       50
51       50
42       50
191      50
Name: label, Length: 198, dtype: int64

In [20]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=81511991154 % 2**32-1)

# X_train.shape, X_test.shape

In [31]:
# Timestamp string for this run.
model_ts = format(datetime.now(), '%Y%m%dT%H%M%S')
print('model_ts', model_ts)

initscore_filename = ''

# LightGBM training parameters.
# Commented-out alternatives kept for reference: boosting 'dart' / 'goss',
# max_depth 3, scale_pos_weight 200, lambda_l1 200, lambda_l2 20,
# tree_learner 'data'.
params = dict(
    boosting='gbdt',
    num_leaves=7,
    num_class=num_class,
    objective='multiclass',
    metric='multi_logloss',
    num_threads=2,
    learning_rate=0.01,
    # NOTE(review): LightGBM documents is_unbalance for binary / multiclassova
    # objectives; confirm it has any effect with 'multiclass'.
    is_unbalance=True,
    bagging_fraction=0.1,
    bagging_freq=5,
    feature_fraction=0.1,
    initscore_filename=initscore_filename,
    device_type='gpu',
)
print(params)

# Dataset-construction parameters (commented-out alternatives kept for
# reference: max_bin 127, enable_bundle False).
data_params = dict()
print(data_params)

# Number of boosting rounds.
num_round = 5000
print('num_round:', num_round)

model_ts 20200204T140915
{'boosting': 'gbdt', 'num_leaves': 7, 'num_class': 198, 'objective': 'multiclass', 'metric': 'multi_logloss', 'num_threads': 2, 'learning_rate': 0.01, 'is_unbalance': True, 'bagging_fraction': 0.1, 'bagging_freq': 5, 'feature_fraction': 0.1, 'initscore_filename': '', 'device_type': 'gpu'}
{}
num_round: 5000


In [32]:
# init_model = joblib.load('model/20200129T111708_0.27524341757899773.model')

# 0.02x overfit 0.805803

# 0.08323 0.635796
# 0.174632 0.727734


In [33]:
model = None
submit_csv = []
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, random_state=81511991154 % 2**32-1, shuffle=True)

# Create the output folders before any training starts, so a finished fold is
# never lost to a missing directory when it is saved.
for out_dir in ('model', 'submit'):
    os.makedirs(out_dir, exist_ok=True)

# Column selections that do not change between folds.
X_all = train[fea_cols]
y_all = train['label']

cv = 0
for train_index, valid_index in tqdm_notebook(skf.split(train.index, train['label'].values), total=n_splits, desc = 'CV'):
    # skf.split yields positional row numbers. .iloc selects by position, so
    # the folds are correct whatever labels train.index carries (with .loc
    # they were only correct on a 0..n-1 index).
    # X_test / y_test are the validation fold of `train`, not the `test` frame.
    X_train, X_test = X_all.iloc[train_index], X_all.iloc[valid_index]
    y_train, y_test = y_all.iloc[train_index], y_all.iloc[valid_index]

    print(X_train.shape, X_test.shape)

    train_set = lgb.Dataset(X_train, label=y_train, params=data_params)
    val_set = lgb.Dataset(X_test, label=y_test, params=data_params)

    evals_result = {}
    model = lgb.train(params, train_set, num_round, early_stopping_rounds=200,
                      valid_sets=[train_set, val_set],
                      verbose_eval=50,
                      evals_result=evals_result,
                      # init_model=model,  # optional warm start, disabled
                      )

    # Tag: run timestamp, fold number, then validation and training loss at
    # the best iteration.
    model_tag ='{}_{}_{}_{}'.format(model_ts, cv,
                                 evals_result['valid_1']['multi_logloss'][model.best_iteration-1],
                                 evals_result['training']['multi_logloss'][model.best_iteration-1]
                                )
    print(model_tag)

    joblib.dump(model, 'model/{}.model'.format(model_tag))

    # Predict every row of `test`, then average the rows that share an id.
    pred = model.predict(test[fea_cols])

    submission = pd.DataFrame(data=pred)
    submission.index = test.index
    submission.index.name = 'id'
    submission = submission.sort_index()
    submission = submission.groupby('id').mean()

    csv_path = 'submit/{}.csv'.format(model_tag)
    submit_csv.append(csv_path)
    submission.to_csv(csv_path, index=True)

    # Row sums of the averaged class probabilities, printed as a sanity check.
    print(submission.sum(axis=1))
    print(submission)
    cv += 1

HBox(children=(IntProgress(value=0, description='CV', max=10, style=ProgressStyle(description_width='initial')…

(37260, 100) (4140, 100)
Training until validation scores don't improve for 200 rounds
[50]	training's multi_logloss: 2.94187	valid_1's multi_logloss: 3.12158
[100]	training's multi_logloss: 2.33382	valid_1's multi_logloss: 2.62727
[150]	training's multi_logloss: 1.96262	valid_1's multi_logloss: 2.34381
[200]	training's multi_logloss: 1.70371	valid_1's multi_logloss: 2.15565
[250]	training's multi_logloss: 1.51082	valid_1's multi_logloss: 2.0222
[300]	training's multi_logloss: 1.36142	valid_1's multi_logloss: 1.9246
[350]	training's multi_logloss: 1.24331	valid_1's multi_logloss: 1.8528
[400]	training's multi_logloss: 1.14644	valid_1's multi_logloss: 1.7986
[450]	training's multi_logloss: 1.06543	valid_1's multi_logloss: 1.75584
[500]	training's multi_logloss: 0.996124	valid_1's multi_logloss: 1.7207
[550]	training's multi_logloss: 0.937156	valid_1's multi_logloss: 1.69325
[600]	training's multi_logloss: 0.886775	valid_1's multi_logloss: 1.67181
[650]	training's multi_logloss: 0.841671

Training until validation scores don't improve for 200 rounds
[50]	training's multi_logloss: 2.94219	valid_1's multi_logloss: 3.11191
[100]	training's multi_logloss: 2.336	valid_1's multi_logloss: 2.61555
[150]	training's multi_logloss: 1.96472	valid_1's multi_logloss: 2.32759
[200]	training's multi_logloss: 1.70432	valid_1's multi_logloss: 2.1369
[250]	training's multi_logloss: 1.51208	valid_1's multi_logloss: 2.00131
[300]	training's multi_logloss: 1.36105	valid_1's multi_logloss: 1.90161
[350]	training's multi_logloss: 1.24225	valid_1's multi_logloss: 1.82605
[400]	training's multi_logloss: 1.14588	valid_1's multi_logloss: 1.76919
[450]	training's multi_logloss: 1.06434	valid_1's multi_logloss: 1.72246
[500]	training's multi_logloss: 0.996036	valid_1's multi_logloss: 1.68656
[550]	training's multi_logloss: 0.937598	valid_1's multi_logloss: 1.65852
[600]	training's multi_logloss: 0.886098	valid_1's multi_logloss: 1.63494
[650]	training's multi_logloss: 0.841169	valid_1's multi_loglos

Training until validation scores don't improve for 200 rounds
[50]	training's multi_logloss: 2.94603	valid_1's multi_logloss: 3.1085
[100]	training's multi_logloss: 2.33705	valid_1's multi_logloss: 2.60525
[150]	training's multi_logloss: 1.96684	valid_1's multi_logloss: 2.31813
[200]	training's multi_logloss: 1.70584	valid_1's multi_logloss: 2.12636
[250]	training's multi_logloss: 1.51353	valid_1's multi_logloss: 1.99255
[300]	training's multi_logloss: 1.36374	valid_1's multi_logloss: 1.89454
[350]	training's multi_logloss: 1.24559	valid_1's multi_logloss: 1.82031
[400]	training's multi_logloss: 1.14934	valid_1's multi_logloss: 1.7646
[450]	training's multi_logloss: 1.06802	valid_1's multi_logloss: 1.71853
[500]	training's multi_logloss: 0.999138	valid_1's multi_logloss: 1.68407
[550]	training's multi_logloss: 0.939893	valid_1's multi_logloss: 1.65583
[600]	training's multi_logloss: 0.888261	valid_1's multi_logloss: 1.63218
[650]	training's multi_logloss: 0.843289	valid_1's multi_loglo

Training until validation scores don't improve for 200 rounds
[50]	training's multi_logloss: 2.94279	valid_1's multi_logloss: 3.11742
[100]	training's multi_logloss: 2.33725	valid_1's multi_logloss: 2.61934
[150]	training's multi_logloss: 1.9657	valid_1's multi_logloss: 2.33116
[200]	training's multi_logloss: 1.70506	valid_1's multi_logloss: 2.14097
[250]	training's multi_logloss: 1.51216	valid_1's multi_logloss: 2.00579
[300]	training's multi_logloss: 1.36254	valid_1's multi_logloss: 1.90416
[350]	training's multi_logloss: 1.24373	valid_1's multi_logloss: 1.82767
[400]	training's multi_logloss: 1.14724	valid_1's multi_logloss: 1.77006
[450]	training's multi_logloss: 1.06627	valid_1's multi_logloss: 1.72451
[500]	training's multi_logloss: 0.997658	valid_1's multi_logloss: 1.68687
[550]	training's multi_logloss: 0.939572	valid_1's multi_logloss: 1.65827
[600]	training's multi_logloss: 0.888567	valid_1's multi_logloss: 1.63533
[650]	training's multi_logloss: 0.84328	valid_1's multi_loglo

Training until validation scores don't improve for 200 rounds
[50]	training's multi_logloss: 2.94593	valid_1's multi_logloss: 3.10889
[100]	training's multi_logloss: 2.33934	valid_1's multi_logloss: 2.60521
[150]	training's multi_logloss: 1.96747	valid_1's multi_logloss: 2.31474
[200]	training's multi_logloss: 1.70756	valid_1's multi_logloss: 2.11982
[250]	training's multi_logloss: 1.5145	valid_1's multi_logloss: 1.98516
[300]	training's multi_logloss: 1.36512	valid_1's multi_logloss: 1.8836
[350]	training's multi_logloss: 1.24647	valid_1's multi_logloss: 1.80778
[400]	training's multi_logloss: 1.14862	valid_1's multi_logloss: 1.74912
[450]	training's multi_logloss: 1.06773	valid_1's multi_logloss: 1.70342
[500]	training's multi_logloss: 0.999541	valid_1's multi_logloss: 1.66728
[550]	training's multi_logloss: 0.94146	valid_1's multi_logloss: 1.63919
[600]	training's multi_logloss: 0.890174	valid_1's multi_logloss: 1.61568
[650]	training's multi_logloss: 0.844671	valid_1's multi_loglos

KeyboardInterrupt: 

In [None]:
# submissions = [
# 'submit/20200201T193822_0.42165222105307115_0.03657010393259738.csv',
# 'submit/20200201T183544_0.4354487978488266_0.04354968619883053.csv',
# 'submit/20200201T173725_0.423066834354457_0.03614391993976106.csv',
# 'submit/20200201T163343_0.4287587567261741_0.042392138498467166.csv',
# 'submit/20200201T153529_0.42326652930531944_0.04098269988118284.csv',
# 'submit/20200201T143616_0.4258237823312355_0.04651153387555587.csv',
# 'submit/20200201T134047_0.43147156765580946_0.043682031170534714.csv',
# 'submit/20200201T125939_0.42958065644660504_0.040012625819045466.csv',
# 'submit/20200201T120739_0.42070899280425217_0.03477196302416032.csv',
# 'submit/20200201T110556_0.4181683365658109_0.043843902710973416.csv',
# ]


# Average the per-fold submission files listed in submit_csv.
if not submit_csv:
    # Same exception type pd.concat([]) would raise, with an actionable message.
    raise ValueError('submit_csv is empty: run the CV cell so at least one fold submission exists')

dfs = [pd.read_csv(s) for s in submit_csv]

df_submit = pd.concat(dfs)

# Rows with the same id (one per fold) are averaged into a single row.
df_submit = df_submit.groupby('id').mean()

# Tag the file with the number of folds actually averaged. This equals
# n_splits for a complete run and is smaller if the CV loop was interrupted.
df_submit.to_csv('submit/{}_e{}.csv'.format(model_ts, len(dfs)), index=True)

In [None]:
# Per-row sum across all columns of the averaged submission.
df_submit.sum(axis=1)

In [None]:
importance_type = 'split'
# Feature name -> importance, taken from whatever `model` currently holds.
impt_dict = dict(zip(fea_cols, model.feature_importance(importance_type=importance_type)))

# Features with zero importance, in fea_cols order. The list is rebuilt from
# scratch on every run, so the cell works on a fresh kernel and a re-run does
# not append duplicates.
zero_cols = [name for name, score in impt_dict.items() if score == 0]
# joblib.dump(impt_dict, f'model/{file_name}_{importance_type}.pkl')

len(zero_cols)

In [None]:
# Save the zero-importance feature names to 'zero_cols.bin' in the working directory.
joblib.dump(zero_cols, 'zero_cols.bin')

In [None]:
# model = joblib.load('model/20200130T221520_2.4393985000913667_0.07225009557115544.model')

In [None]:
# pred = model.predict(test)

# submission = pd.DataFrame(data=pred)
# submission.index = test.index
# submission.index.name = 'id'
# submission = submission.sort_index()
# submission = submission.groupby('id').mean()

# submission.to_csv('submit/{}.csv'.format(model_tag), index=True) 
# model_tag

# submission.sum(axis=1)

# submission