In [1]:
import os
import pandas as pd 
import numpy as np
import multiprocessing # 여러 개의 일꾼 (cpu)들에게 작업을 분산시키는 역할
from multiprocessing import Pool 
from functools import partial # 함수가 받는 인자들 중 몇개를 고정 시켜서 새롭게 파생된 함수를 형성하는 역할
from data_loader import data_loader_v2 # 자체적으로 만든 data loader version 2.0 ([데이콘 15회 대회] 데이터 설명 및 데이터 불러오기 영상 참조)

from sklearn.ensemble import RandomForestClassifier
import joblib # 모델을 저장하고 불러오는 역
from datetime import datetime

import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm_notebook



In [2]:
# Input locations, all relative to the notebook's working directory.
# The two folders are listed with os.listdir() in the next cell.
train_folder, test_folder = 'data/train/', 'data/test/'
train_label_path = 'data/train_label.csv'

In [3]:
# os.listdir() returns entries in arbitrary, platform-dependent order; sorting
# makes the file order (and any frame built from it) reproducible across runs.
train_list = sorted(os.listdir(train_folder))
test_list = sorted(os.listdir(test_folder))
# The first CSV column becomes the index of the label table.
train_label = pd.read_csv(train_label_path, index_col=0)

In [4]:
# Number of distinct values in the 'label' column. dropna=False keeps a NaN
# label counted as its own value, exactly as len(Series.unique()) would.
num_class = train_label['label'].nunique(dropna=False)

In [5]:
def data_loader_all_v2(func, files, folder='', train_label=None, event_time=10, nrows=60):
    """Load many files in parallel with ``func`` and concatenate the results.

    The same ``event_time`` is applied to every file, i.e. all CSV files are
    assumed to switch to state_B at the same moment, even though the actual
    switch time may differ from one CSV file to another.

    Args:
        func: Loader invoked as ``func(file, folder=..., train_label=...,
            event_time=..., nrows=...)`` for every entry of ``files``. It is
            sent to worker processes, so it must be picklable, and its return
            value must be something ``pd.concat`` accepts.
        files: Iterable of file entries handed to ``func`` one by one.
        folder: Forwarded unchanged to ``func``.
        train_label: Forwarded unchanged to ``func``.
        event_time: Forwarded unchanged to ``func``.
        nrows: Forwarded unchanged to ``func``.

    Returns:
        The per-file results combined with ``pd.concat``, in the order of
        ``files`` (``Pool.imap`` preserves input order).

    Raises:
        ValueError: If ``files`` is empty (there is nothing to concatenate).
        RuntimeError: If the defining module is not running as ``__main__``.
    """
    files = list(files)
    if not files:
        # Fail before spawning workers; pd.concat([]) would raise ValueError anyway.
        raise ValueError('files is empty: nothing to load')

    if __name__ != '__main__':
        # The pool is only started from the main module. Previously this case
        # fell through to pd.concat with `df_list` unbound (UnboundLocalError).
        raise RuntimeError(
            'data_loader_all_v2 starts a multiprocessing Pool and must be run '
            'from the __main__ module (current module: {!r})'.format(__name__)
        )

    func_fixed = partial(func, folder=folder, train_label=train_label,
                         event_time=event_time, nrows=nrows)

    # Leave two cores free, but never request fewer than one worker:
    # Pool() raises ValueError for processes < 1 (machines with 1-2 cores).
    n_workers = max(1, multiprocessing.cpu_count() - 2)

    # The context manager tears the pool down even if a worker raises.
    with Pool(processes=n_workers) as pool:
        df_list = list(pool.imap(func_fixed, files))
        pool.close()
        pool.join()

    return pd.concat(df_list)

In [6]:
# Disabled: builds the train/test frames directly from the raw CSV files.
# NOTE(review): the pickles loaded in the next cell look like cached results of
# these calls (their file names carry "10" and "120", matching event_time=10
# and nrows=120) — confirm before relying on that.
# train = data_loader_all_v2(data_loader_v2, train_list, folder=train_folder, train_label=train_label, 
#                            event_time=10, nrows=120)
# print(train.shape)

# test = data_loader_all_v2(data_loader_v2, test_list, folder=test_folder, train_label=None, event_time=10, nrows=None)
# print(test.shape)


In [7]:
# Load the pre-built frames from disk.
# NOTE: joblib.load unpickles arbitrary objects — only load files you trust.
train = joblib.load('data/df_train_10_120.pkl')
# Move the stored index into a regular column and renumber the rows 0..n-1.
train = train.reset_index()
test = joblib.load('data/df_test_10.pkl')


In [8]:
# Show the training frame (the notebook renders a truncated table).
train

Unnamed: 0,index,V0000,V0001,V0002,V0003,V0004,V0005,V0006,V0007,V0008,...,V5112,V5113,V5114,V5115,V5116,V5117,V5118,V5119,V5120,label
0,105,30.464769,8.677597,8.702804,8.730314,8.710375,188.466110,192.279094,3.577269e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,5.235258e-08,85.4,0.0,77
1,105,30.464943,8.791777,8.741013,8.713725,8.719421,217.356293,180.249471,1.489698e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,-2.374557e-05,85.4,0.0,77
2,105,30.488713,8.727617,8.704063,8.735527,8.695147,211.251065,203.137411,-4.623827e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,6.323392e-07,85.4,0.0,77
3,105,30.480049,8.648655,8.703581,8.701050,8.712508,191.682448,229.797028,-4.555857e-20,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,-1.886027e-07,85.4,0.0,77
4,105,30.458851,8.775581,8.692660,8.668370,8.693597,171.733996,197.299448,2.670567e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,6.486860e-06,85.4,0.0,77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91075,412,30.492960,8.744885,8.717549,8.680362,8.695514,199.515275,169.003273,-2.312561e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,-7.233104e-06,85.4,0.0,19
91076,412,30.484724,8.699884,8.703983,8.673985,8.714074,165.587301,156.150820,5.344420e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,-4.060542e-07,85.4,0.0,19
91077,412,30.502568,8.684008,8.687454,8.679443,8.722234,170.653265,204.056076,5.437461e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,3.080914e-06,85.4,0.0,19
91078,412,30.520585,8.622467,8.695733,8.668384,8.701016,160.572151,141.810196,-4.936979e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,9.603815e-07,85.4,0.0,19


In [9]:
# Feature columns are those whose name begins with 'V'; this leaves out the
# 'index' and 'label' columns of the training frame.
fea_cols = list(filter(lambda col: col[0] == 'V', train.columns))
len(fea_cols)

5121

In [10]:
# scaler = joblib.load('scaler_20200129T135731.bin')

# X[fea_cols] = scaler.transform(X[fea_cols].values)

In [11]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=81511991154 % 2**32-1)

# X_train.shape, X_test.shape

In [12]:
# Load a previously trained model from disk.
# NOTE(review): the file name appears to encode a timestamp and a log-loss
# value — confirm. joblib.load unpickles arbitrary objects, so only load
# model files you trust.
model = joblib.load('model/20200128T034443_0.0502828605482319.model')

In [13]:
from sklearn.metrics import log_loss
# score = log_loss(y_test, clf_probs)

In [14]:
# Rows per class label; dropna=False also counts NaN labels, if any.
train['label'].value_counts(dropna=False)

110    2640
17     2310
114    2200
118    2200
117    2090
       ... 
101     110
145     110
37      110
100     110
191     110
Name: label, Length: 198, dtype: int64

In [15]:
# Score the loaded model on the whole training frame.
# NOTE(review): log_loss expects probability estimates, so model.predict is
# presumably returning per-class probabilities here — confirm. If the model was
# fitted on this same data, the number below is a training loss, not a
# validation estimate.
pred = model.predict(train[fea_cols].values)
score = log_loss(train['label'].values, pred)
print(score)

0.05028286053630513


In [None]:
n_splits = 10
# Operator precedence makes the original expression (81511991154 % 2**32) - 1;
# the parentheses only make that explicit, the seed value is unchanged.
skf = StratifiedKFold(n_splits=n_splits, random_state=(81511991154 % 2**32) - 1, shuffle=True)

# Select the feature matrix and target once instead of on every fold.
features = train[fea_cols]
labels = train['label']

# The already-loaded `model` is only scored on each validation fold; it is not
# refit inside this loop. The training-fold shapes/counts are printed for reference.
cv_scores = []
for train_index, valid_index in tqdm_notebook(skf.split(train.index, labels.values), total=n_splits, desc = 'CV'):
    # split() yields positional row numbers, so rows must be taken with .iloc;
    # label-based .loc only works while the index happens to be 0..n-1.
    X_train, X_test = features.iloc[train_index], features.iloc[valid_index]
    y_train, y_test = labels.iloc[train_index], labels.iloc[valid_index]

    print(X_train.shape, X_test.shape)
    print(y_train.value_counts(dropna=False))
    print(y_test.value_counts(dropna=False))

    # Pass a plain array, the same input form used when scoring the full training frame.
    pred = model.predict(X_test.values)
    score = log_loss(y_test, pred)
    print(score)
    cv_scores.append(score)

# Summarise the per-fold scores so the folds can be compared at a glance.
print('CV log_loss: mean={:.6f}, std={:.6f}'.format(np.mean(cv_scores), np.std(cv_scores)))


In [None]:
# model = joblib.load('model/20200130T221520_2.4393985000913667_0.07225009557115544.model')

In [None]:
# pred = model.predict(test)

# submission = pd.DataFrame(data=pred)
# submission.index = test.index
# submission.index.name = 'id'
# submission = submission.sort_index()
# submission = submission.groupby('id').mean()

# submission.to_csv('submit/{}.csv'.format(model_tag), index=True) 
# model_tag

# submission.sum(axis=1)

# submission