In [1]:
import os
import pandas as pd 
import numpy as np
import multiprocessing # 여러 개의 일꾼 (cpu)들에게 작업을 분산시키는 역할
from multiprocessing import Pool 
from functools import partial # 함수가 받는 인자들 중 몇개를 고정 시켜서 새롭게 파생된 함수를 형성하는 역할
from data_loader import data_loader_v2 # 자체적으로 만든 data loader version 2.0 ([데이콘 15회 대회] 데이터 설명 및 데이터 불러오기 영상 참조)

from sklearn.ensemble import RandomForestClassifier
import joblib # 모델을 저장하고 불러오는 역
from datetime import datetime

import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm_notebook

from sklearn.metrics import log_loss

from tools import eval_summary, save_feature_importance, merge_preds, report





In [2]:
train_folder = 'data/train/'
test_folder = 'data/test/'
train_label_path = 'data/train_label.csv'

In [3]:
train_list = os.listdir(train_folder)
test_list = os.listdir(test_folder)
train_label = pd.read_csv(train_label_path, index_col=0)

In [4]:
num_class = len(train_label['label'].unique())

In [5]:
def data_loader_all(func, files, folder='', train_label=None, event_time=10, nrows=60):   
    func_fixed = partial(func, folder=folder, train_label=train_label, event_time=event_time, nrows=nrows)     
    if __name__ == '__main__':
        pool = Pool(processes=multiprocessing.cpu_count()-2) 
        df_list = list(pool.imap(func_fixed, files)) 
        pool.close()
        pool.join()        
    combined_df = pd.concat(df_list)    
    return combined_df

In [13]:
df = pd.read_csv('data/train/30.csv')
df.index = np.repeat(30, len(df)) # row 인덱스를 file id로 덮어 씌우기 

In [30]:
def percentile(n):
    def percentile_(x):
        return np.percentile(x, n)
    percentile_.__name__ = 'percentile_%s' % n
    return percentile_

In [46]:
agg_list = ['mean', 'std', 'min', 'max', 'median', 'sum',
            pd.DataFrame.kurt,
            pd.DataFrame.skew,
            percentile(10), percentile(20), percentile(30), percentile(40), percentile(50),
            percentile(60), percentile(70), percentile(80), percentile(90),
           ]

In [47]:
df.groupby(df.index).agg({'V0000':agg_list})

Unnamed: 0_level_0,V0000,V0000,V0000,V0000,V0000,V0000,V0000,V0000,V0000,V0000,V0000,V0000,V0000,V0000,V0000,V0000,V0000
Unnamed: 0_level_1,mean,std,min,max,median,sum,kurt,skew,percentile_10,percentile_20,percentile_30,percentile_40,percentile_50,percentile_60,percentile_70,percentile_80,percentile_90
30,3.236973,9.813574,0.0,32.94162,0.0,1877.444136,5.340688,2.705972,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
df.groupby(df.index).quantile([0, 0.25, 0.5, 0.75, 0.95, 1])


TypeError: 'quantile' cannot be performed against 'object' dtypes!

In [6]:
def data_loader_v3(file_name, folder='', train_label=None, event_time=10, nrows=60, use_cols=None):
    file_id = int(file_name.split('.')[0]) # file id만 불러오기
    df = pd.read_csv(folder+file_name, index_col=0, nrows=nrows) # 파일 읽어오기
    
    df = df.replace('.*', 0, regex=True).fillna(0) # 모든 문자열과 NA값을 0으로 대체
    
    df = df.loc[event_time:] # event_time 이후의 row들만 가지고 오기
    
    df.index = np.repeat(file_id, len(df)) # row 인덱스를 file id로 덮어 씌우기 
    if type(train_label) != type(None):
        label = train_label.loc[file_id]['label'] 
        df['label'] = np.repeat(label, len(df)) #train set일 경우 라벨 추가하기 
    
    return df

In [89]:
train = data_loader_all(data_loader_v3, train_list, folder=train_folder, train_label=train_label, 
                           event_time=10, nrows=60)
print(train.shape)
joblib.dump(train, 'data/df_train_10_60.pkl')

# test = data_loader_all_v2(data_loader_v2, test_list, folder=test_folder, train_label=None, event_time=10, nrows=None)
# print(test.shape)


(41400, 5122)


In [90]:
joblib.dump(train, 'data/df_train_10_60.pkl')

['data/df_train_10_60.pkl']

In [88]:
train = joblib.load('data/df_train_10_60.pkl').reset_index()
test = joblib.load('data/df_test_10.pkl')


FileNotFoundError: [Errno 2] No such file or directory: 'data/df_train_10_60.pkl'

In [8]:
train

Unnamed: 0,index,V0000,V0001,V0002,V0003,V0004,V0005,V0006,V0007,V0008,...,V5112,V5113,V5114,V5115,V5116,V5117,V5118,V5119,V5120,label
0,105,30.464769,8.677597,8.702804,8.730314,8.710375,188.466110,192.279094,3.577269e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,5.235258e-08,85.4,0.0,77
1,105,30.464943,8.791777,8.741013,8.713725,8.719421,217.356293,180.249471,1.489698e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,-2.374557e-05,85.4,0.0,77
2,105,30.488713,8.727617,8.704063,8.735527,8.695147,211.251065,203.137411,-4.623827e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,6.323392e-07,85.4,0.0,77
3,105,30.480049,8.648655,8.703581,8.701050,8.712508,191.682448,229.797028,-4.555857e-20,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,-1.886027e-07,85.4,0.0,77
4,105,30.458851,8.775581,8.692660,8.668370,8.693597,171.733996,197.299448,2.670567e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,6.486860e-06,85.4,0.0,77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91075,412,30.492960,8.744885,8.717549,8.680362,8.695514,199.515275,169.003273,-2.312561e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,-7.233104e-06,85.4,0.0,19
91076,412,30.484724,8.699884,8.703983,8.673985,8.714074,165.587301,156.150820,5.344420e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,-4.060542e-07,85.4,0.0,19
91077,412,30.502568,8.684008,8.687454,8.679443,8.722234,170.653265,204.056076,5.437461e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,3.080914e-06,85.4,0.0,19
91078,412,30.520585,8.622467,8.695733,8.668384,8.701016,160.572151,141.810196,-4.936979e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,9.603815e-07,85.4,0.0,19


In [57]:
fea_cols = [c for c in train.columns if c[0] == 'V']
len(fea_cols)

5121

In [1]:
zero_cols = joblib.load('zero_cols.bin')
fea_cols = [c for c in fea_cols if c not in zero_cols]

NameError: name 'joblib' is not defined

In [59]:
len(fea_cols)

2755

In [60]:
train['label'].value_counts(dropna=False)

110    2640
17     2310
114    2200
118    2200
117    2090
       ... 
101     110
145     110
37      110
100     110
191     110
Name: label, Length: 198, dtype: int64

In [61]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=81511991154 % 2**32-1)

# X_train.shape, X_test.shape

In [81]:
model_ts = datetime.now().strftime('%Y%m%dT%H%M%S')
print('model_ts', model_ts)

initscore_filename = ''
params = {
    'boosting':'gbdt',
#     'boosting':'dart',
#     'boosting':'goss',
    'num_leaves': 7,
#     'max_depth': 3,
    'num_class':num_class,
    'objective': 'multiclass',
    'metric':'multi_logloss',
    'num_threads': multiprocessing.cpu_count()-2,
    'learning_rate': 0.01,
    'is_unbalance': True,
#     'scale_pos_weight':200,
    'bagging_fraction':0.1,
    'bagging_freq':5,
    'feature_fraction':0.1,
    'initscore_filename':initscore_filename,
#     'lambda_l1':200,
#     'lambda_l2':5,
#     'device_type':'gpu',
#     'tree_learner':'data',

}
print(params)

data_params = {
#     'max_bin':127,
#     'enable_bundle': False,
}
print(data_params)

num_round = 5000
print('num_round:', num_round)

model_ts 20200201T100003
{'boosting': 'gbdt', 'num_leaves': 7, 'num_class': 198, 'objective': 'multiclass', 'metric': 'multi_logloss', 'num_threads': 14, 'learning_rate': 0.01, 'is_unbalance': True, 'bagging_fraction': 0.1, 'bagging_freq': 5, 'feature_fraction': 0.1, 'initscore_filename': ''}
{}
num_round: 5000


In [82]:
# init_model = joblib.load('model/20200129T111708_0.27524341757899773.model')

In [83]:
# 0.02x overfit 0.805803

# 0.08323 0.635796
# 0.174632 0.727734

In [84]:
# init_model = joblib.load('model/20200129T165945_2.4140061681306326_0.1554147776143923.model')

In [None]:
model = None
submit_csv = []
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, random_state=81511991154 % 2**32-1, shuffle=True)

cv = 0
for train_index, valid_index in tqdm_notebook(skf.split(train.index, train['label'].values), total=n_splits, desc = 'CV'):
#     cnt += 1
#     print(cnt)
#     if cnt < 3:
#         continue

    
    X_train, X_test = train.loc[train_index, fea_cols], train.loc[valid_index, fea_cols] 
    y_train, y_test = train.loc[train_index,'label'], train.loc[valid_index, 'label']    
    
    print(X_train.shape, X_test.shape)
#     print(y_train.value_counts(dropna=False))
#     print(y_test.value_counts(dropna=False))
    
    
    train_set = lgb.Dataset(X_train, label=y_train, params=data_params)
    val_set = lgb.Dataset(X_test, label=y_test, params=data_params)

    evals_result = {}
    model = lgb.train(params, train_set, num_round, early_stopping_rounds=200, 
                            valid_sets=[train_set, val_set],
                            verbose_eval=50,
                            evals_result=evals_result,
#                             init_model=model,
                           )

    model_tag ='{}_{}_{}_{}'.format(model_ts, cv,
                                 evals_result['valid_1']['multi_logloss'][model.best_iteration-1],
                                 evals_result['training']['multi_logloss'][model.best_iteration-1]
                                )
    print(model_tag)

    joblib.dump(model, 'model/{}.model'.format(model_tag))
    
    
    pred = model.predict(test[fea_cols])

    submission = pd.DataFrame(data=pred)
    submission.index = test.index
    submission.index.name = 'id'
    submission = submission.sort_index()
    submission = submission.groupby('id').mean()

    csv_path = 'submit/{}.csv'.format(model_tag)
    submit_csv.append(csv_path)
    submission.to_csv(    csv_path, index=True) 
    
    print(submission.sum(axis=1))
    print(submission)
    cv += 1
#     break

In [86]:
# submissions = [
# 'submit/20200201T193822_0.42165222105307115_0.03657010393259738.csv',
# 'submit/20200201T183544_0.4354487978488266_0.04354968619883053.csv',
# 'submit/20200201T173725_0.423066834354457_0.03614391993976106.csv',
# 'submit/20200201T163343_0.4287587567261741_0.042392138498467166.csv',
# 'submit/20200201T153529_0.42326652930531944_0.04098269988118284.csv',
# 'submit/20200201T143616_0.4258237823312355_0.04651153387555587.csv',
# 'submit/20200201T134047_0.43147156765580946_0.043682031170534714.csv',
# 'submit/20200201T125939_0.42958065644660504_0.040012625819045466.csv',
# 'submit/20200201T120739_0.42070899280425217_0.03477196302416032.csv',
# 'submit/20200201T110556_0.4181683365658109_0.043843902710973416.csv',
# ]


dfs = [pd.read_csv(s) for s in submit_csv]

df_submit = pd.concat(dfs)

df_submit = df_submit.groupby('id').mean()

df_submit.to_csv('submit/{}_e{}.csv'.format(model_ts, n_splits), index=True) 

In [87]:
df_submit.sum(1)

id
828     1.0
829     1.0
830     1.0
831     1.0
832     1.0
       ... 
1543    1.0
1544    1.0
1545    1.0
1546    1.0
1547    1.0
Length: 720, dtype: float64

In [55]:
importance_type = 'split'
impt_dict = {k:v for k, v in zip(fea_cols, model.feature_importance(importance_type=importance_type))}
# sorted(impt_dict.items(), key=(lambda x:x[1]), reverse=True)
# zero_cols = []
for k, s in sorted(impt_dict.items(), key=(lambda x:x[1]), reverse=False):
    if s == 0:
        zero_cols.append(k)
# joblib.dump(impt_dict, f'model/{file_name}_{importance_type}.pkl')

len(zero_cols)

2366

In [56]:
joblib.dump(zero_cols, 'zero_cols.bin')

['zero_cols.bin']

In [17]:
# model = joblib.load('model/20200130T221520_2.4393985000913667_0.07225009557115544.model')

In [18]:
# pred = model.predict(test)

# submission = pd.DataFrame(data=pred)
# submission.index = test.index
# submission.index.name = 'id'
# submission = submission.sort_index()
# submission = submission.groupby('id').mean()

# submission.to_csv('submit/{}.csv'.format(model_tag), index=True) 
# model_tag

# submission.sum(axis=1)

# submission