# 변수 모으기

user_id 기반으로 유의미한 feature를 하나씩 모아보겠습니다. 다 합쳐서 써보죠 뭐.

# IMPORT & LOAD DATA

In [193]:
import load_dtypes as ld
import warnings
warnings.filterwarnings(action='ignore')

import os, sys
import time
import datetime as dt

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_auc_score, roc_curve, precision_recall_curve, recall_score, precision_score

import lightgbm as lgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold

In [2]:
#-------------------------------------------------------------------------------------
# validation auc score를 확인하기 위해 정의
def f_pr_auc(probas_pred, y_true):
    labels=y_true.get_label()
    p, r, _ = precision_recall_curve(labels, probas_pred)
    score=auc(r,p) 
    return "pr_auc", score, True
#-------------------------------------------------------------------------------------

In [3]:
TRAIN_P_PATH = r'C:\Users\Wyatt\wyatt37/Data/systemError/train_problem_data.csv'
TRAIN_Q_PATH = r'C:\Users\Wyatt\wyatt37/Data/systemError/train_quality_data.csv'
TRAIN_E_PATH = r'C:\Users\Wyatt\wyatt37/Data/systemError/train_err_data.csv'
TEST_Q_PATH = r'C:\Users\Wyatt\wyatt37/Data/systemError/test_quality_data.csv'
TEST_E_PATH = r'C:\Users\Wyatt\wyatt37/Data/systemError/test_err_data.csv'
SUBMISSION_PATH = r'C:\Users\Wyatt\wyatt37/Data/systemError/sample_submission.csv'

In [4]:
%%time
train_p = ld.load_dtypes(TRAIN_P_PATH)
train_q = ld.load_dtypes(TRAIN_Q_PATH)
train_e = ld.load_dtypes(TRAIN_E_PATH)
test_q = ld.load_dtypes(TEST_Q_PATH)
test_e = ld.load_dtypes(TEST_E_PATH)
submission = pd.read_csv(SUBMISSION_PATH)

C:\Users\Wyatt\wyatt37/Data/systemError/train_problem_data.csv
C:\Users\Wyatt\wyatt37/Data/systemError/train_quality_data.csv
C:\Users\Wyatt\wyatt37/Data/systemError/train_err_data.csv
C:\Users\Wyatt\wyatt37/Data/systemError/test_quality_data.csv
C:\Users\Wyatt\wyatt37/Data/systemError/test_err_data.csv
Wall time: 1min 22s


In [5]:
train_p.shape, train_q.shape, train_e.shape, test_q.shape, test_e.shape, submission.shape

((5429, 2),
 (828624, 16),
 (16554663, 6),
 (747972, 16),
 (16532648, 6),
 (14999, 2))

In [6]:
def preprocessing_problem(df, object_='binary'):
    """
    definition:
    train_problem 테이블을 받아서 target 값으로 변환
    1. {0, 1}의 binary로 변환
    2. {0 ~ n}의 multiclass로 변환
    """
    
    # 10001부터 24999까지의 index를 만들어줍니다.
    user_id_idx = np.array(range(10000, 25000, 1))
    
    # train_new_p라는 새로운 df를 만들고 index는 위에서 만든 user_id_idx 로 지정해줍니다.
    new_p = pd.DataFrame(index = user_id_idx)
    new_p['target'] = 0
    
    if object_ == 'binary':
        new_p.iloc[df.user_id.unique()-10000] = 1
        new_p = new_p.reset_index()
        new_p.rename({'index':'user_id'}, axis=1, inplace=True)
        
    elif object_ == 'multi':
        # multi는 count()로 집계를 해줍니다.
        new_p['target'] = df.groupby('user_id')['time'].count()
        new_p = new_p.fillna(0)
        new_p = new_p.reset_index()
        new_p.rename({'index':'user_id'}, axis=1, inplace=True)
        
    return new_p

In [7]:
train_b_p = preprocessing_problem(train_p, 'binary')
train_m_p = preprocessing_problem(train_p, 'multi')

In [8]:
def preprocessing_quality(df):
    """
    definition:
    EDA를 통해 알아낸 정보로 train_q, test_q를 정리해서 내뿜어줍니다.
    1. qaulity_3, quality_4 를 drop 합니다.(단일 value)
    2. qaulity_k 변수들을 정수로 encoding 합니다.
    """
    # 먼저 3, 4번을 drop 합니다.
    df.drop(['quality_3', 'quality_4'], axis=1, inplace=True)
    
    # qual 변수만 할당해주고, 정수로 형변환 해줍니다.
    columns = train_q.columns[train_q.columns.str.contains('quality')]
    # for문을 통해 각 column을 반복 작업해줍니다.
    for col in columns:
        try:
            if df[col].dtype == 'float32': # 기존에 float은 패스
                df[col] = df[col].fillna(-2)
            elif df[col].dtype == 'int8' or df[col].dtype == 'int16': # 기존에 int도 패스
                df[col] = df[col].fillna(-2)
                #print(col)
            else:
                df[col] = df[col].astype('object')
                # nan값이 있으면 float으로 갈 수 없으니 '-2' 으로 채워줍니다.
                df[col] = df[col].fillna('-2')
                df[col] = df[col].apply(lambda x: x.replace(',' , ''))
                df[col] = df[col].astype(np.float32)
        except:
            pass
        
    # fwver 에서 null 값이 꽤 있습니다. missing으로 채우겠습니다.
    df.fwver = df.fwver.astype('object')
    df.fwver = df.fwver.fillna('missing')
    df.fwver = df.fwver.astype('category')
        
    return df

In [9]:
train_q = preprocessing_quality(train_q)
test_q = preprocessing_quality(test_q)

In [10]:
def preprocessing_fwver(df):
    """
    definition:
    별건 아니고, e-set에 fwver 변수에서 '10' 이라는 값이 있는데, 이게 errtype이랑 겹쳐요.
    그래서 10을 -> 8.5.2 으로 바꿔줄 겁니다.
    굳이 이렇게 바꾸는 이유는, 해당 fw가 8.5.3버전과 같은 model_nm을 공유하기 때문입니다.
    """
    df.fwver = df.fwver.replace('10', '8.5.2')
        
    return df

In [11]:
train_e = preprocessing_fwver(train_e)
test_e = preprocessing_fwver(test_e)

In [12]:
def make_datetime(df, column_name):
    """
    definition:
    'time' column이 str로 되어 있으니, datetime으로 바꿔주는 함수입니다.
    다만 'time'양식이 pandas함수에 적용이 안되니, 강제로 슬라이싱해서 만들어줘야 합니다.
    
    """
    
    df['year'] = df[column_name].apply(lambda x: str(x)[:4])
    df['month'] = df[column_name].apply(lambda x: str(x)[4:6])
    df['day'] = df[column_name].apply(lambda x: str(x)[6:8])
    df['hour'] = df[column_name].apply(lambda x: str(x)[8:10])
    df['minute'] = '00' # minute을 넣어주지 않으면 datetime이 완성이 안되니, 00으로 넣어줍니다.
    
    df[column_name] = pd.to_datetime(df.year + df.month + df.day + df.hour + df.minute)
    
    return df

In [13]:
train_p = make_datetime(train_p, 'time')
train_q = make_datetime(train_q, 'time')
test_q = make_datetime(test_q, 'time')
train_e = make_datetime(train_e, 'time')
test_e = make_datetime(test_e, 'time')

# Feature Collection

## Error 테이블 탐험하기

In [17]:
# model 별로 몇 번의 error log가 떴는지를 알고 싶었다.
train_e.groupby(['user_id', 'model_nm']).count().reset_index().dropna()

Unnamed: 0,user_id,model_nm,time,fwver,errtype,errcode,year,month,day,hour,minute
3,10000,model_3,317.0,317.0,317.0,317.0,317.0,317.0,317.0,317.0,317.0
11,10001,model_2,2365.0,2365.0,2365.0,2365.0,2365.0,2365.0,2365.0,2365.0,2365.0
21,10002,model_3,306.0,306.0,306.0,306.0,306.0,306.0,306.0,306.0,306.0
29,10003,model_2,306.0,306.0,306.0,306.0,306.0,306.0,306.0,306.0,306.0
36,10004,model_0,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0
...,...,...,...,...,...,...,...,...,...,...,...
134957,24995,model_2,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0
134967,24996,model_3,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
134973,24997,model_0,826.0,826.0,826.0,826.0,826.0,826.0,826.0,826.0,826.0
134982,24998,model_0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0


In [39]:
# fwver과 model의 관계를 알고 싶었다.
train_e.groupby(['model_nm', 'fwver']).count().dropna().reset_index()

Unnamed: 0,model_nm,fwver,user_id,time,errtype,errcode,year,month,day,hour,minute
0,model_0,04.22.1684,5554.0,5554.0,5554.0,5554.0,5554.0,5554.0,5554.0,5554.0,5554.0
1,model_0,04.22.1750,2874213.0,2874213.0,2874213.0,2874213.0,2874213.0,2874213.0,2874213.0,2874213.0,2874213.0
2,model_0,04.22.1778,1293946.0,1293946.0,1293946.0,1293946.0,1293946.0,1293946.0,1293946.0,1293946.0,1293946.0
3,model_0,04.22.1666,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
4,model_0,04.22.1442,2522.0,2522.0,2522.0,2522.0,2522.0,2522.0,2522.0,2522.0,2522.0
5,model_0,04.22.1656,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0
6,model_1,04.16.3553,5237816.0,5237816.0,5237816.0,5237816.0,5237816.0,5237816.0,5237816.0,5237816.0,5237816.0
7,model_1,04.16.3571,145156.0,145156.0,145156.0,145156.0,145156.0,145156.0,145156.0,145156.0,145156.0
8,model_1,04.16.3439,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0
9,model_1,04.16.3569,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0


정지훈 선생의 말대로 fwver은 특정 model에만 존재합니다. 즉, model 별로 fwver 라인이 있습니다.

In [38]:
test_e.groupby(['model_nm', 'fwver'])['errtype'].sum().reset_index().dropna()

Unnamed: 0,model_nm,fwver,errtype
3,model_0,04.22.1684,131754.0
4,model_0,04.22.1750,63560173.0
5,model_0,04.22.1778,30011544.0
20,model_0,04.22.1478,6568.0
22,model_0,04.22.1656,10269.0
23,model_0,04.22.1666,849.0
30,model_0,04.22.1608,238.0
33,model_0,04.22.1448,10678.0
35,model_0,10.22.1770,1375.0
36,model_0,10.22.1780,44816.0


테스트세트도 마찬가지입니다. 물론 fwver은 조금씩 다릅니다.

그리고 특별히 많이 발생한 fwver이 있습니다. 이 fwver에서 신고도 많이 들어갈 것으로 예상 됩니다.

In [22]:
# model과 errortype 간의 발생 관계를 보고자 했습니다.
train_e.groupby(['model_nm', 'errtype'])['errtype'].count().unstack()

errtype,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,30,31,32,33,34,35,36,37,38,39,40,41,42
model_nm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
model_0,108,3775,0,2596,395563,10164,12037,47,47,20325,85231,87733,2911,54128,545830,394076,5340,958,249,841,248,617067,574183,10960,645,158050,1621,1625,123,737022,4402,68781,71144,3391,4249,4253,1432,8222,243220,35289,8393
model_1,69,655,3,2720,220544,8985,9807,10,23,15059,56414,59574,8439,37596,272336,218642,25268,340,108,292,108,993653,1490911,16651,223,78966,943,947,53,521305,1110834,42544,47470,2359,960,961,1450,2175,120399,14446,249
model_2,78,18161,0,284,303689,14847,15759,43,26,6670,72628,76331,6983,75087,519375,373806,7160,1034,242,920,243,249246,206188,3713,305,211279,982,988,101,718178,13039,66566,44839,4390,4041,4056,1499,5370,365314,54086,25708
model_3,208,3363,29483,998177,14762,12606,12068,33,12,85283,74734,78023,3689,71637,212525,212155,3994,308,57,0,0,0,0,7666,0,83154,1703,1705,93,0,0,0,0,0,0,0,0,0,0,0,0
model_4,21082,0,881,282871,1081,3637,2031,31,1,5584,15161,15530,548,3499,39393,38094,3778,0,0,0,0,0,0,0,0,0,535547,539033,20,0,0,0,0,0,0,0,0,0,0,0,0
model_5,0,122,0,16,4766,88,104,0,0,21,699,726,30,161,5106,3725,68,6,1,6,1,1734,989,7,0,1376,8,8,4,7981,318,519,1640,40,40,39,150,53,1359,301,21
model_7,0,0,326,5733,5118,111,142,2,0,375,1413,1496,101,670,5656,4790,123,17,6,15,6,2025,2905,30,8,2035,0,0,4,510,1347,36,262,3,15,15,4,0,38,16,3
model_8,0,30,65,4353,5651,44,45,0,0,86,750,768,142,9237,4730,4088,278,8,1,6,1,1636,1339,44,3,1541,0,0,4,298,624,31,181,4,12,12,4,0,48,0,0
model_6,0,0,0,0,1746,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


특정 error는 특정 model에서 발생하지 않고, 특정 model은 특정 error가 발생하지 않습니다. errtype==1 을 보시면 model_4가 대부분을 차지하고, 5~8은 아예 0건입니다.

In [23]:
train_e.groupby(['fwver', 'errtype'])['errtype'].count().unstack()

errtype,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,30,31,32,33,34,35,36,37,38,39,40,41,42
fwver,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
03.11.1149,90,0,0,399,26,29,11,0,0,0,0,3,0,14,0,0,0,0,0,0,0,0,0,0,0,0,767,775,0,0,0,0,0,0,0,0,0,0,0,0,0
03.11.1167,20989,0,881,282472,1054,3605,2018,31,1,5584,15161,15527,548,3482,39393,38094,3778,0,0,0,0,0,0,0,0,0,534772,538250,19,0,0,0,0,0,0,0,0,0,0,0,0
04.16.3553,69,627,0,2596,215494,8868,9686,10,23,14731,54873,57994,8404,35899,264147,212222,24619,282,82,251,82,970609,1468296,16589,171,74951,931,934,47,506653,1073268,41282,36267,2304,452,453,1371,1633,116527,13965,154
04.16.3571,0,28,0,0,5049,113,114,0,0,324,1506,1544,34,1683,8085,6319,648,58,26,41,26,22716,22251,62,52,3960,10,11,6,14550,37367,1250,11193,54,508,508,77,542,3868,481,92
04.22.1684,1,0,0,7,1321,22,28,0,0,20,23,29,9,291,139,134,8,1,0,2,0,22,21,0,1,15,11,11,0,2930,0,4,54,15,0,0,131,0,273,31,0
04.22.1750,83,2461,0,2075,279747,7255,8961,40,26,14541,58650,60456,1988,41722,378368,273167,3051,645,165,574,164,418553,406403,8171,439,83823,987,993,85,510162,3104,46718,54132,2408,83,90,1009,5632,173898,23316,68
04.22.1778,24,1314,0,24,114488,2873,3037,7,21,5763,26498,27179,900,12098,167034,120600,2281,310,84,264,84,197832,167060,2789,204,74159,622,620,38,223930,1298,22059,16958,968,4166,4163,291,2590,69049,11942,8325
04.33.1149,1,0,0,0,316,14,14,0,0,4,88,94,6,44,534,279,5,23,1,23,1,131,68,0,0,122,0,0,2,654,0,182,188,3,0,10,32,6,332,95,0
04.33.1171,0,0,0,0,0,0,1,0,0,0,1,1,0,5,6,4,0,1,0,1,0,0,0,0,0,2,0,0,0,28,0,3,0,0,0,0,0,0,4,1,0
04.33.1185,11,10869,0,13,89893,4724,5076,7,18,2122,19547,20751,1783,19287,142533,101166,1636,247,33,223,33,71001,66507,502,56,34059,311,315,28,197251,4078,17509,11380,1367,195,201,478,1206,122581,14724,15


fwver로 보면 더욱 명확합니다.

In [24]:
# 시간 별로 특별히 많이 발생한 error type이 있는지에 대해 살펴보려고 했습니다.
train_e.groupby(['hour','errtype'])['errtype'].count().unstack().fillna(0)

errtype,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,30,31,32,33,34,35,36,37,38,39,40,41,42
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
0,91.0,1640.0,1025.0,46989.0,40875.0,1750.0,1712.0,5.0,1.0,31690.0,4708.0,4846.0,873.0,10081.0,73849.0,25269.0,1272.0,23.0,16.0,21.0,16.0,67577.0,90924.0,1489.0,18.0,146101.0,14544.0,14735.0,0.0,47978.0,52679.0,1098.0,4077.0,391.0,38.0,38.0,177.0,1131.0,28712.0,3894.0,17.0
1,48.0,1575.0,1061.0,43735.0,26982.0,1889.0,1711.0,5.0,4.0,3169.0,4014.0,4347.0,813.0,8229.0,45575.0,16540.0,539.0,22.0,12.0,13.0,12.0,54669.0,72663.0,1181.0,13.0,21237.0,12947.0,13036.0,0.0,33218.0,54191.0,817.0,2544.0,291.0,20.0,20.0,111.0,844.0,20156.0,2039.0,10.0
2,4874.0,926.0,1033.0,49389.0,20453.0,2000.0,1989.0,0.0,1.0,9564.0,45338.0,45709.0,937.0,8608.0,121253.0,12350.0,392.0,12.0,4.0,8.0,4.0,94828.0,70801.0,1232.0,15.0,10299.0,15679.0,15786.0,0.0,42861.0,49223.0,32287.0,1340.0,253.0,1769.0,1769.0,96.0,815.0,17053.0,1193.0,9025.0
3,4814.0,777.0,1204.0,49491.0,15700.0,2307.0,2063.0,0.0,0.0,3693.0,49176.0,49649.0,817.0,10429.0,120837.0,11390.0,532.0,4.0,1.0,4.0,1.0,94043.0,67488.0,1157.0,3.0,8231.0,17487.0,17665.0,0.0,41694.0,53642.0,34695.0,1246.0,207.0,2120.0,2117.0,79.0,404.0,11506.0,757.0,9530.0
4,5047.0,634.0,1361.0,51680.0,13469.0,1951.0,1917.0,0.0,1.0,2950.0,50550.0,51066.0,993.0,10758.0,120680.0,14945.0,570.0,4.0,1.0,3.0,1.0,91066.0,61076.0,1320.0,0.0,10485.0,17693.0,17934.0,0.0,43906.0,44430.0,35735.0,1170.0,98.0,2263.0,2265.0,91.0,353.0,11232.0,615.0,10111.0
5,2600.0,499.0,1748.0,54072.0,12153.0,2517.0,2503.0,0.0,2.0,2786.0,28990.0,30197.0,1454.0,11112.0,73448.0,26291.0,1154.0,5.0,4.0,4.0,4.0,62097.0,51706.0,1090.0,0.0,19031.0,19819.0,19918.0,0.0,45654.0,39647.0,19494.0,1339.0,83.0,1418.0,1421.0,88.0,312.0,14260.0,569.0,4857.0
6,141.0,385.0,1488.0,55515.0,12162.0,2193.0,2163.0,3.0,3.0,2751.0,5536.0,6596.0,1462.0,10557.0,29009.0,48502.0,2665.0,16.0,12.0,7.0,5.0,39487.0,51203.0,1107.0,4.0,34024.0,17649.0,17851.0,1.0,62428.0,37075.0,2307.0,3154.0,118.0,176.0,176.0,98.0,268.0,21807.0,763.0,10.0
7,186.0,582.0,1560.0,64503.0,21078.0,2323.0,2280.0,0.0,1.0,2881.0,6941.0,8048.0,1580.0,10937.0,46004.0,78625.0,4580.0,19.0,6.0,9.0,6.0,50128.0,65812.0,1459.0,20.0,51775.0,21606.0,21756.0,0.0,104694.0,39719.0,3512.0,5422.0,158.0,162.0,164.0,160.0,435.0,32080.0,2032.0,18.0
8,233.0,897.0,1050.0,61840.0,30404.0,1751.0,1661.0,2.0,2.0,3028.0,7068.0,7541.0,902.0,9965.0,64069.0,79518.0,4258.0,67.0,28.0,46.0,26.0,64395.0,86004.0,1589.0,36.0,44162.0,21387.0,21479.0,1.0,111260.0,42441.0,3248.0,7517.0,244.0,117.0,116.0,154.0,524.0,35753.0,3749.0,33.0
9,186.0,598.0,1120.0,58227.0,39128.0,1859.0,1750.0,8.0,1.0,2973.0,6724.0,7157.0,1073.0,10828.0,61565.0,64650.0,3052.0,147.0,33.0,114.0,32.0,67380.0,89938.0,1496.0,86.0,29165.0,24723.0,24922.0,32.0,96318.0,40916.0,3141.0,7357.0,387.0,131.0,132.0,176.0,573.0,33654.0,4069.0,40.0


In [181]:
train_e

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode,year,month,day,hour,minute
0,10000,2020-11-01 02:00:00,model_3,05.15.2138,15,1,2020,11,01,02,00
1,10000,2020-11-01 03:00:00,model_3,05.15.2138,12,1,2020,11,01,03,00
2,10000,2020-11-01 03:00:00,model_3,05.15.2138,11,1,2020,11,01,03,00
3,10000,2020-11-01 05:00:00,model_3,05.15.2138,16,1,2020,11,01,05,00
4,10000,2020-11-01 05:00:00,model_3,05.15.2138,4,0,2020,11,01,05,00
...,...,...,...,...,...,...,...,...,...,...,...
16554658,24999,2020-11-30 16:00:00,model_3,05.15.2138,15,1,2020,11,30,16,00
16554659,24999,2020-11-30 17:00:00,model_3,05.15.2138,16,1,2020,11,30,17,00
16554660,24999,2020-11-30 17:00:00,model_3,05.15.2138,4,0,2020,11,30,17,00
16554661,24999,2020-11-30 17:00:00,model_3,05.15.2138,4,0,2020,11,30,17,00


계속 e테이블을 살펴봐야겠습니다.

### errcode 탐험하기

지금까지 신경쓰지 않았던 errcode를 잠깐 한 번 봐보겠습니다.

In [184]:
train_e.isnull().sum()

user_id     0
time        0
model_nm    0
fwver       0
errtype     0
errcode     1
year        0
month       0
day         0
hour        0
minute      0
dtype: int64

In [259]:
test_e.isnull().sum()

user_id     0
time        0
model_nm    0
fwver       0
errtype     0
errcode     4
year        0
month       0
day         0
hour        0
minute      0
dtype: int64

In [260]:
train_e.dropna(inplace=True)
test_e.dropna(inplace=True)

숫자도 있고, 문자도 있습니다. NFANDROID2는 안드로이드 기반의 어떤 오류인 것 같네요.

In [236]:
train_e.errcode.value_counts()[:100]

1                                          8906967
0                                          2599123
connection timeout                         1835600
B-A8002                                     897863
80                                          334018
79                                          332422
14                                          263577
active                                      219238
2                                           166382
84                                          129876
85                                          127744
standby                                     110370
NFANDROID2                                  104138
connection fail to establish                100940
3                                            91223
90                                           65992
89                                           64799
S-61001                                      34631
95                                           23452
94                             

In [261]:
test_e.errcode.value_counts()[:100]

1                                          8750154
0                                          2565532
connection timeout                         1890632
B-A8002                                     855747
80                                          326179
79                                          324186
14                                          316110
active                                      225519
2                                           155401
84                                          140642
standby                                     139947
85                                          139255
connection fail to establish                109690
NFANDROID2                                  108244
3                                            86451
connectionterminated by local host           84809
90                                           58820
89                                           57718
S-61001                                      35026
4                              

크게 몇 가지 유형으로 나눌 수 있을 것 같습니다.

1. B-A8002 처럼 알파벳과 숫자로 된 에러 유형
2. 0, 1, 23 처럼 단순히 숫자로 된 에러 유형
3. connection 처럼 conn과 관련된 에러 유형
4. 그 외 기타 문자로 되어 있는 에러 유형

그리고 특정 에러가 상당히 많은 비중을 가지고 있습니다. 해당 오류들을 보유한 유저들이 얼마나 되는지 살펴봐야겠습니다.

In [221]:
train_e_conn = train_e[train_e.errcode.str.contains('conn')]

In [225]:
len(train_e.user_id.unique())

15000

In [237]:
len(train_e[train_e.errcode.str.contains('conn')].user_id.unique())

8160

약 절반이 connection 관련된 에러를 경험했습니다.

In [238]:
len(train_e[train_e.errcode.str.contains('ANDROID')].user_id.unique())

6142

약 4부 정도가 안드로이드 에러를 겪었고요.

In [249]:
len(train_e[train_e.errcode.str.contains('-')].user_id.unique())

11012

약 73% 가 B-A8002 같은 에러를 겪었습니다.

In [242]:
len(train_e[train_e.errcode == '1'].user_id.unique())

14930

In [243]:
len(train_e[train_e.errcode == '0'].user_id.unique())

14776

1과 0은 높은 빈도를 자랑하듯이 99%의 유저가 경험했습니다. 실질적으로 변별력이 없다고 보면 되겠습니다.

안드로이드 오류는 약 40%가 겪었습니다.

### 유저별로 특정 에러를 얼마나 겪었는가?

#### 유저별로 'conn' 관련한 에러를 얼마나 겪었는가?

In [285]:
train_error_conn = train_e[train_e.errcode.str.contains('conn')].groupby('user_id').count()['errcode']
test_error_conn = test_e[test_e.errcode.str.contains('conn')].groupby('user_id').count()['errcode']

#### 유저별로 C-11012 유형의 에러를 얼마나 겪었는가?

In [286]:
train_error_robot = train_e[train_e.errcode.str.contains('-')].groupby('user_id').count()['errcode']
test_error_robot = test_e[test_e.errcode.str.contains('-')].groupby('user_id').count()['errcode']

#### 유저별로 ANDROID 유형의 에러를 얼마나 겪었는가?

In [287]:
train_error_android = train_e[train_e.errcode.str.contains('ANDROID')].groupby('user_id').count()['errcode']
test_error_android = test_e[test_e.errcode.str.contains('ANDROID')].groupby('user_id').count()['errcode']

세개를 합쳐주겠습니다. 컬럼명이 겹치기 때문이죠.

In [291]:
train_specific_error = pd.concat([train_error_conn, train_error_robot, train_error_android], axis=1).fillna(0)
train_specific_error.columns = ['err_conn', 'err_robot', 'err_and']

In [292]:
test_specific_error = pd.concat([test_error_conn, test_error_robot, test_error_android], axis=1).fillna(0)
test_specific_error.columns = ['err_conn', 'err_robot', 'err_and']

### 유저별로 가장 많이 발생한 errtype은 무엇인가?

In [64]:
train_e.groupby(['user_id', 'errtype']).count().reset_index()

Unnamed: 0,user_id,errtype,time,model_nm,fwver,errcode,year,month,day,hour,minute
0,10000,3,8,8,8,8,8,8,8,8,8
1,10000,4,104,104,104,104,104,104,104,104,104
2,10000,6,1,1,1,1,1,1,1,1,1
3,10000,7,1,1,1,1,1,1,1,1,1
4,10000,10,7,7,7,7,7,7,7,7,7
...,...,...,...,...,...,...,...,...,...,...,...
231034,24999,13,4,4,4,4,4,4,4,4,4
231035,24999,14,20,20,20,20,20,20,20,20,20
231036,24999,15,135,135,135,135,135,135,135,135,135
231037,24999,16,135,135,135,135,135,135,135,135,135


In [78]:
train_e.groupby(['user_id', 'errtype']).count().max(level=0)

Unnamed: 0_level_0,time,model_nm,fwver,errcode,year,month,day,hour,minute
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10000,104,104,104,104,104,104,104,104,104
10001,756,756,756,756,756,756,756,756,756
10002,132,132,132,132,132,132,132,132,132
10003,65,65,65,65,65,65,65,65,65
10004,176,176,176,176,176,176,176,176,176
...,...,...,...,...,...,...,...,...,...
24995,31,31,31,31,31,31,31,31,31
24996,1,1,1,1,1,1,1,1,1
24997,223,223,223,223,223,223,223,223,223
24998,51,51,51,51,51,51,51,51,51


### 유저별로 몇개의 모델을 썼는가?

In [41]:
temp = train_e[['user_id', 'model_nm']]

In [44]:
temp = temp.drop_duplicates()

In [47]:
train_model_count = temp.groupby('user_id').count()

In [49]:
test_model_count = test_e[['user_id', 'model_nm']].drop_duplicates().groupby('user_id').count()

In [54]:
train_model_count.tail()

Unnamed: 0_level_0,model_nm
user_id,Unnamed: 1_level_1
24995,1
24996,1
24997,1
24998,1
24999,1


### 유저별로 몇개의 fw를 썼는가?

In [51]:
train_fwver_count = train_e[['user_id', 'fwver']].drop_duplicates().groupby('user_id').count()
test_fwver_count = test_e[['user_id', 'fwver']].drop_duplicates().groupby('user_id').count()

In [52]:
train_fwver_count.head()

Unnamed: 0_level_0,fwver
user_id,Unnamed: 1_level_1
10000,1
10001,2
10002,1
10003,2
10004,2


### 유저별로 몇개의 err가 떴는가?

유저 당 err가 몇 번이나 떴는가에 대한 단순 집계입니다. 시리즈로 뽑겠습니다.

In [14]:
train_err_count = train_e.groupby('user_id')['errcode'].count()
test_err_count = test_e.groupby('user_id')['errcode'].count()

In [15]:
train_err_count

user_id
10000     317
10001    2365
10002     306
10003     306
10004     777
         ... 
24995     194
24996       4
24997     826
24998     155
24999     570
Name: errcode, Length: 15000, dtype: int64

In [16]:
test_err_count

user_id
30000     2750
30001      284
30002      941
30003      371
30004      881
         ...  
44994     1115
44995      515
44996     2233
44997    24671
44998      873
Name: errcode, Length: 14998, dtype: int64

## Quality Table 탐험하기

In [79]:
train_q

Unnamed: 0,time,user_id,fwver,quality_0,quality_1,quality_2,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12,year,month,day,hour,minute
0,2020-11-29 09:00:00,10000,05.15.2138,0.0,0,0.0,0.0,0,0.0,0.0,0.0,4.0,0,0,2020,11,29,09,00
1,2020-11-29 09:00:00,10000,05.15.2138,0.0,0,0.0,0.0,0,0.0,0.0,0.0,4.0,0,0,2020,11,29,09,00
2,2020-11-29 09:00:00,10000,05.15.2138,0.0,0,0.0,0.0,0,0.0,0.0,0.0,4.0,0,0,2020,11,29,09,00
3,2020-11-29 09:00:00,10000,05.15.2138,0.0,0,0.0,0.0,0,0.0,0.0,0.0,4.0,0,0,2020,11,29,09,00
4,2020-11-29 09:00:00,10000,05.15.2138,0.0,0,0.0,0.0,0,0.0,0.0,0.0,4.0,0,0,2020,11,29,09,00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
828619,2020-11-24 03:00:00,24997,04.22.1778,0.0,0,0.0,1.0,0,0.0,0.0,0.0,17.0,0,0,2020,11,24,03,00
828620,2020-11-24 03:00:00,24997,04.22.1778,0.0,0,0.0,0.0,0,0.0,0.0,0.0,17.0,0,0,2020,11,24,03,00
828621,2020-11-24 03:00:00,24997,04.22.1778,0.0,0,0.0,3.0,0,0.0,0.0,0.0,17.0,0,0,2020,11,24,03,00
828622,2020-11-24 03:00:00,24997,04.22.1778,0.0,0,0.0,0.0,0,0.0,0.0,0.0,17.0,0,0,2020,11,24,03,00


### 유저별로 각 퀄리티의 std는 어느정도인가?

In [84]:
train_qual_std = train_q.groupby(['user_id']).std()
test_qual_std = test_q.groupby(['user_id']).std()

In [85]:
train_qual_std.shape, test_qual_std.shape

((8281, 11), (8268, 11))

## train_p를 합쳐서 확인해보기

In [25]:
train_p

Unnamed: 0,user_id,time,year,month,day,hour,minute
0,19224,2020-11-02 20:00:00,2020,11,2,20,0
1,23664,2020-11-16 14:00:00,2020,11,16,14,0
2,15166,2020-11-14 13:00:00,2020,11,14,13,0
3,12590,2020-11-08 21:00:00,2020,11,8,21,0
4,15932,2020-11-03 21:00:00,2020,11,3,21,0
5,16852,2020-11-19 15:00:00,2020,11,19,15,0
6,23427,2020-11-21 11:00:00,2020,11,21,11,0
7,13507,2020-11-11 16:00:00,2020,11,11,16,0
8,11274,2020-11-18 12:00:00,2020,11,18,12,0
9,20610,2020-11-27 23:00:00,2020,11,27,23,0


In [26]:
temp = train_e.groupby(['user_id', 'fwver'])['errcode'].count().reset_index()

In [27]:
temp_2 = temp[temp.errcode != 0]

In [28]:
temp_3 = pd.merge(train_e, train_b_p, how='outer', on=['user_id'])

In [29]:
temp_4 = temp_3.groupby(['fwver', 'user_id'])['target'].max().reset_index()

In [30]:
temp_5 = temp_3.groupby(['model_nm', 'user_id'])['target'].max().reset_index()

In [31]:
temp_4.groupby('fwver').sum()

Unnamed: 0_level_0,user_id,target
fwver,Unnamed: 1_level_1,Unnamed: 2_level_1
03.11.1149,262492500,0.0
03.11.1167,262492500,141.0
04.16.3553,262492500,1314.0
04.16.3571,262492500,295.0
04.22.1684,262492500,28.0
04.22.1750,262492500,1630.0
04.22.1778,262492500,1586.0
04.33.1149,262492500,112.0
04.33.1171,262492500,1.0
04.33.1185,262492500,1128.0


In [32]:
temp_5.groupby('model_nm').sum()

Unnamed: 0_level_0,user_id,target
model_nm,Unnamed: 1_level_1,Unnamed: 2_level_1
model_0,262492500,1637.0
model_1,262492500,1318.0
model_2,262492500,1744.0
model_3,262492500,720.0
model_4,262492500,141.0
model_5,262492500,20.0
model_7,262492500,32.0
model_8,262492500,22.0
model_6,262492500,9.0


중복값이 많긴 하지만, 불편 신고한 유저들이 사용한 fwver과 model의 비중을 알 수 있습니다. 특정 model과 fwver에 몰려 있다는 것을 알 수 있습니다.<br>
베이스코드가 errtype으로만 80의 성능을 낸 이유가 있습니다. errtype의 발생으로도 이렇게 설명이 가능합니다.<br>
그렇다면 이제 나머지 5%를 맞출 수 있는 원인을 찾아줘야겠습니다.

## Train & Predict

In [86]:
train_b_p

Unnamed: 0,user_id,target
0,10000,0
1,10001,1
2,10002,0
3,10003,0
4,10004,1
...,...,...
14995,24995,0
14996,24996,0
14997,24997,1
14998,24998,1


In [91]:
X = train_e.groupby(['user_id', 'errtype'])['errcode'].count().unstack().fillna(0)
y = test_e.groupby(['user_id', 'errtype'])['errcode'].count().unstack().fillna(0)

In [92]:
# 하나 사라진 유저를 채워주는 코드입니다.
y = y.reindex(pd.RangeIndex(y.index.max() + 1)).ffill(0)[30000:]

In [93]:
X.shape, y.shape

((15000, 41), (14999, 41))

만들어 둔 시리즈들

train_err_count, train_fwver_count, train_model_count, train_qual_std

test_err_count, test_fwver_count, test_model_count, test_qual_std

In [97]:
X = pd.concat([X, train_err_count, train_fwver_count, train_model_count, train_qual_std], axis=1).fillna(0)
X.shape

(15000, 55)

In [98]:
y = pd.concat([y, test_err_count, test_fwver_count, test_model_count, test_qual_std], axis=1).fillna(0)
y.shape

(14999, 55)

새로 만든 시리즈 추가

train_err_conn, train_err_robot, train_err_and

In [294]:
X = pd.concat([X, train_specific_error], axis=1).fillna(0)
y = pd.concat([y, test_specific_error], axis=1).fillna(0)

In [302]:
# 변수 선택을 위해서 하나씩 추가한 모델을 학습해보겠습니다.
X_conn = X.drop(['err_robot', 'err_and'], axis=1)
X_robot = X.drop(['err_conn', 'err_and'], axis=1)
X_and = X.drop(['err_robot', 'err_conn'], axis=1)

y_conn = y.drop(['err_robot', 'err_and'], axis=1)
y_robot = y.drop(['err_conn', 'err_and'], axis=1)
y_and = y.drop(['err_robot', 'err_conn'], axis=1)

결과적으로 and만 비슷한 성능을 냈고, 나머지 변수들은 성능이 떨어졌습니다.

In [146]:
#------------------------------------------------------------
# validation auc score를 확인하기 위해 정의
def f_pr_auc(probas_pred, y_true):
    labels=y_true.get_label()
    p, r, _ = precision_recall_curve(labels, probas_pred)
    score=auc(r,p) 
    return "pr_auc", score, True
#------------------------------------------------------------


def s_fold_train_pred(train_x, train_y):
    import lightgbm as lgb

    # Train
    models     = []
    recalls    = []
    precisions = []
    auc_scores   = []
    threshold = 0.5
    # 파라미터 설정
    params =      {
                    'boosting_type' : 'gbdt',
                    'objective'     : 'binary',
                    'metric'        : 'auc',
                    'learning_rate' : 0.027,
                    'seed': 42
                    }
    #-------------------------------------------------------------------------------------
    # 5 Kfold cross validation
    s_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)    

    for train_idx, val_idx in s_fold.split(train_x, train_y):

        # split train, validation set
        X = train_x.iloc[train_idx]
        y = train_y.iloc[train_idx]
        valid_x = train_x.iloc[val_idx]
        valid_y = train_y.iloc[val_idx]

        d_train= lgb.Dataset(X, y)
        d_val  = lgb.Dataset(valid_x, valid_y)

        #run traning
        model = lgb.train(
                            params,
                            train_set       = d_train,
                            num_boost_round = 10000,
                            valid_sets      = d_val,
                            feval           = f_pr_auc,
                            verbose_eval    = 100, 
                            early_stopping_rounds = 100
                           )

        # cal valid prediction
        valid_prob = model.predict(valid_x)
        valid_pred = np.where(valid_prob > threshold, 1, 0)

        # cal scores
        recall    = recall_score(    valid_y, valid_pred)
        precision = precision_score( valid_y, valid_pred)
        auc_score = roc_auc_score(   valid_y, valid_prob)

        # append scores
        models.append(model)
        recalls.append(recall)
        precisions.append(precision)
        auc_scores.append(auc_score)

        print('==========================================================')
        
    return models, auc_scores, recalls, precisions

In [147]:
# loss 비교를 위해 지우지 않습니다.
# 최고점 모델입니다.
# model_nm, fwver, err_count, qual_std 를 추가한 모델.
models, auc_scores, _, _ = s_fold_train_pred(X, train_b_p.target)
print(np.mean(auc_scores))

[LightGBM] [Info] Number of positive: 4000, number of negative: 8000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7053
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 55
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693147
[LightGBM] [Info] Start training from score -0.693147
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.816962	valid_0's pr_auc: 0.735504
[200]	valid_0's auc: 0.818328	valid_0's pr_auc: 0.744809
Early stopping, best iteration is:
[138]	valid_0's auc: 0.819099	valid_0's pr_auc: 0.741788
[LightGBM] [Info] Number of positive: 4000, number of negative: 8000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7057
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 55
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693147
[LightGBM]

In [306]:
# loss 비교를 위해 지우지 않습니다.
# model_nm, fwver, err_count, qual_std 를 추가
# err_conn, err_robot, err_and 까지 추가
models, auc_scores, _, _ = s_fold_train_pred(X_and, train_b_p.target)
print(np.mean(auc_scores))

[LightGBM] [Info] Number of positive: 4000, number of negative: 8000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7165
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 56
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693147
[LightGBM] [Info] Start training from score -0.693147
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.816962	valid_0's pr_auc: 0.735504
[200]	valid_0's auc: 0.818328	valid_0's pr_auc: 0.744809
Early stopping, best iteration is:
[138]	valid_0's auc: 0.819099	valid_0's pr_auc: 0.741788
[LightGBM] [Info] Number of positive: 4000, number of negative: 8000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7168
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 56
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693147
[LightGBM]

In [296]:
pred_y_list = []
for model in models:
    pred_y = model.predict(y)
    pred_y_list.append(pred_y.reshape(-1,1))
    
pred_ensemble = np.mean(pred_y_list, axis = 0)

In [300]:
submission.problem = pred_ensemble
#submission.to_csv(r'C:\Users\Wyatt\wyatt37/Comp/LG_edge_detect/king/submission/king_210116_3_feature-collection-param-spe-err.csv', index=False)

In [298]:
submission

Unnamed: 0,user_id,problem
0,30000,0.849794
1,30001,0.308510
2,30002,0.292074
3,30003,0.617978
4,30004,0.833800
...,...,...
14994,44994,0.283482
14995,44995,0.353020
14996,44996,0.490287
14997,44997,0.817509


In [133]:
for model in models:
    # sorted(zip(clf.feature_importances_, X.columns), reverse=True)
    feature_imp = pd.DataFrame(sorted(zip(model.feature_importance, X.columns)),
                               columns=['Value','Feature'])

    plt.figure(figsize=(20, 20))
    sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.show()

TypeError: zip argument #1 must support iteration

0.8075 -> 0.8161로 상승했습니다. 그래도 0.9% 오른거면 꽤나 올랐네요!

파라미터 튜닝을 해줬는데, 0.8161 -> 0.8222	로 상승했습니다. lgbm 만세! 0.6% 올랐습니다.

바꾼 파라미터

1. lr {'default' : 0.027}
2. num_boost_round {1000:10000}
3. early_stopping_rounds {3 :100}

미세하게 많이 개선하도록 바꿨습니다.

specific error count 를 넣으니 성능이 오히려 떨어졌습니다.

0.8217로 큰 차이는 없으나 Loss를 따라가는 것 같습니다.

하나씩 줄여가면서 다시 학습을 시켜보겠습니다.

In [150]:
def s_fold_catb_train_pred(train_x, train_y):
    import catboost as catb
    from catboost import CatBoostRegressor
    catb_reg = CatBoostRegressor()

    # Train
    models     = []
    recalls    = []
    precisions = []
    auc_scores   = []
    threshold = 0.5
    # 파라미터 설정
    params =      {
                    'boosting_type' : 'gbdt',
                    'objective'     : 'binary',
                    'metric'        : 'auc',
                    'learning_rate' : 0.027,
                    'seed': 42
                    }
    #-------------------------------------------------------------------------------------
    # 5 Kfold cross validation
    s_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)    

    for train_idx, val_idx in s_fold.split(train_x, train_y):

        # split train, validation set
        X = train_x.iloc[train_idx]
        y = train_y.iloc[train_idx]
        valid_x = train_x.iloc[val_idx]
        valid_y = train_y.iloc[val_idx]

        d_train= catb.Dataset(X, y)
        d_val  = catb.Dataset(valid_x, valid_y)

        #run traning
        model = catb.train(
                            params,
                            train_set       = d_train,
                            num_boost_round = 10000,
                            valid_sets      = d_val,
                            feval           = f_pr_auc,
                            verbose_eval    = 100, 
                            early_stopping_rounds = 100
                           )

        # cal valid prediction
        valid_prob = model.predict(valid_x)
        valid_pred = np.where(valid_prob > threshold, 1, 0)

        # cal scores
        recall    = recall_score(    valid_y, valid_pred)
        precision = precision_score( valid_y, valid_pred)
        auc_score = roc_auc_score(   valid_y, valid_prob)

        # append scores
        models.append(model)
        recalls.append(recall)
        precisions.append(precision)
        auc_scores.append(auc_score)

        print('==========================================================')
        
    return models, auc_scores, recalls, precisions

In [151]:
CatBoostClassifier?

[1;31mInit signature:[0m
[0mCatBoostClassifier[0m[1;33m([0m[1;33m
[0m    [0miterations[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mlearning_rate[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mdepth[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0ml2_leaf_reg[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmodel_size_reg[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mrsm[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mloss_function[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mborder_count[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mfeature_border_type[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mper_float_feature_quantization[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0minput_borders[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0moutput_borders[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mfo

In [149]:
models, auc_scores, _, _ = s_fold_catb_train_pred(X, train_b_p.target)
print(np.mean(auc_scores))

AttributeError: module 'catboost' has no attribute 'Dataset'