# 팀 코드

본 노트북은 지금까지 했던 과정을 간략히 정리한 것입니다. 참고용으로 보시기 바랍니다.

# 제출 기록

1. D-6
 - 파생 변수 생성 확인

2. D-5
 - 파생 변수 생성 확인

3. D-4
 - 베스트 모델 확인

4. D-3
 - 베스트 모델 확인

5. D-2
 - 앙상블 제출

6. D-1
 - 보류

7. D-Day
 - 스태킹을 할 수도 있으니 마지막 날은 남겨 둠.

In [2]:
import load_dtypes as ld
import warnings
warnings.filterwarnings(action='ignore')

import os, sys
import time
import datetime as dt

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN

from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_auc_score, roc_curve, precision_recall_curve, recall_score, precision_score

import lightgbm as lgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold

In [4]:
# Absolute locations of the competition CSV files.
# NOTE(review): these point into one user's home directory on Windows; adjust before running elsewhere.
TRAIN_P_PATH = r'C:\Users\Wyatt\wyatt37/Data/systemError/train_problem_data.csv'
TRAIN_Q_PATH = r'C:\Users\Wyatt\wyatt37/Data/systemError/train_quality_data.csv'
TRAIN_E_PATH = r'C:\Users\Wyatt\wyatt37/Data/systemError/train_err_data.csv'
TEST_Q_PATH = r'C:\Users\Wyatt\wyatt37/Data/systemError/test_quality_data.csv'
TEST_E_PATH = r'C:\Users\Wyatt\wyatt37/Data/systemError/test_err_data.csv'
SUBMISSION_PATH = r'C:\Users\Wyatt\wyatt37/Data/systemError/sample_submission.csv'

In [146]:
%%time
# Load the problem / quality / error tables through the project helper `ld.load_dtypes`
# (presumably reads each CSV with predefined dtypes — confirm in load_dtypes).
# The sample submission is read with plain pandas.
train_p = ld.load_dtypes(TRAIN_P_PATH)
train_q = ld.load_dtypes(TRAIN_Q_PATH)
train_e = ld.load_dtypes(TRAIN_E_PATH)
test_q = ld.load_dtypes(TEST_Q_PATH)
test_e = ld.load_dtypes(TEST_E_PATH)
submission = pd.read_csv(SUBMISSION_PATH)

C:\Users\Wyatt\wyatt37/Data/systemError/train_problem_data.csv
C:\Users\Wyatt\wyatt37/Data/systemError/train_quality_data.csv
C:\Users\Wyatt\wyatt37/Data/systemError/train_err_data.csv
C:\Users\Wyatt\wyatt37/Data/systemError/test_quality_data.csv
C:\Users\Wyatt\wyatt37/Data/systemError/test_err_data.csv
Wall time: 1min 20s


In [147]:
# Display the shape of every loaded table as a quick sanity check.
train_p.shape, train_q.shape, train_e.shape, test_q.shape, test_e.shape, submission.shape

((5429, 2),
 (828624, 16),
 (16554663, 6),
 (747972, 16),
 (16532648, 6),
 (14999, 2))

# Preprocessing

In [153]:
def preprocessing_problem(df, object_='binary', start_id=10000, end_id=25000):
    """
    Convert the train_problem table into a per-user target table.

    Parameters
    ----------
    df : DataFrame
        Problem reports. Needs a 'user_id' column; the 'multi' mode also
        needs a 'time' column (it is the column being counted).
    object_ : {'binary', 'multi'}
        'binary' -> target is 1 for every user id present in ``df``, else 0.
        'multi'  -> target is the number of rows the user has in ``df``
                    (0.0 for users without any row; the column is float).
    start_id, end_id : int
        Half-open user id range ``[start_id, end_id)`` covered by the result.
        The defaults reproduce the original hard-coded 10000..24999 range.

    Returns
    -------
    DataFrame
        One row per user id in the range, with columns 'user_id' and 'target'.

    Raises
    ------
    ValueError
        If ``object_`` is neither 'binary' nor 'multi'. (Previously such a
        call silently returned an all-zero frame without a 'user_id' column.)
    """
    if object_ not in ('binary', 'multi'):
        raise ValueError(
            "object_ must be 'binary' or 'multi', got {!r}".format(object_))

    # One row per user id in [start_id, end_id), initialised to "no problem".
    new_p = pd.DataFrame(index=np.arange(start_id, end_id))
    new_p['target'] = 0

    if object_ == 'binary':
        # Label-based membership test instead of positional `id - 10000`
        # arithmetic: ids outside the range are ignored rather than raising
        # IndexError or (for ids below the range) wrapping around to the end.
        reported = new_p.index.isin(df['user_id'].unique())
        new_p.loc[reported, 'target'] = 1
    else:
        # Index alignment leaves NaN for users without reports -> 0.
        new_p['target'] = df.groupby('user_id')['time'].count()
        new_p = new_p.fillna(0)

    new_p = new_p.reset_index()
    new_p.rename({'index': 'user_id'}, axis=1, inplace=True)

    return new_p

In [154]:
# Build both target variants from the problem table: binary flag and per-user report count.
train_b_p = preprocessing_problem(train_p, 'binary')
train_m_p = preprocessing_problem(train_p, 'multi')

In [155]:
def preprocessing_quality(df):
    """
    Clean a quality-log table (train_q / test_q) in place and return it.

    Steps
    -----
    1. Drop 'quality_3' and 'quality_4' (single-valued according to the EDA).
    2. Make every remaining 'quality*' column numeric:
       - float32 / int8 / int16 columns only get their NaNs filled with -2;
       - any other column is treated as text: NaN -> '-2', thousands
         separators (',') removed, then cast to float32.
    3. Fill missing 'fwver' values with 'missing' and store it as category.

    Parameters
    ----------
    df : DataFrame
        Quality log with 'quality_*' columns and a 'fwver' column.
        It is modified in place.

    Returns
    -------
    DataFrame
        The same object that was passed in.

    Notes
    -----
    A text column that cannot be converted to float32 is left untouched and
    reported through ``warnings.warn`` instead of being skipped silently.
    """
    # errors='ignore' keeps the function safe to re-run on an already-cleaned frame.
    df.drop(['quality_3', 'quality_4'], axis=1, inplace=True, errors='ignore')

    # Take the column list from the frame being processed, not from a global table.
    quality_cols = df.columns[df.columns.str.contains('quality')]

    for col in quality_cols:
        if str(df[col].dtype) in ('float32', 'int8', 'int16'):
            # Already numeric: only the missing values need a placeholder.
            df[col] = df[col].fillna(-2)
            continue

        # NaN cannot survive the string clean-up, so it becomes '-2' first;
        # astype(str) also protects against non-string values in the column.
        cleaned = (df[col].astype('object')
                          .fillna('-2')
                          .astype(str)
                          .str.replace(',', '', regex=False))
        try:
            df[col] = cleaned.astype(np.float32)
        except (ValueError, TypeError) as err:
            warnings.warn(
                "could not convert column {!r} to float32: {}".format(col, err))

    # fwver has a fair number of nulls; label them 'missing'.
    df['fwver'] = df['fwver'].astype('object').fillna('missing').astype('category')

    return df

In [156]:
# Clean both quality tables with the same routine.
train_q = preprocessing_quality(train_q)
test_q = preprocessing_quality(test_q)

In [157]:
def preprocessing_fwver(df):
    """
    Rename the firmware version label '10' to '8.5.2' in an error-log table.

    The error set contains a fwver value of '10' that collides with an
    errtype value. It is mapped to '8.5.2' because that firmware shares its
    model_nm with version 8.5.3.

    The frame is modified in place and also returned.
    """
    remapped = df.fwver.replace({'10': '8.5.2'})
    df['fwver'] = remapped

    return df

In [158]:
# Apply the fwver relabelling to both error logs.
train_e = preprocessing_fwver(train_e)
test_e = preprocessing_fwver(test_e)

In [159]:
def make_datetime(df):
    """
    Turn the digit-encoded 'time' column into datetimes truncated to the hour.

    The first ten characters of each 'time' value are read as YYYYMMDDHH.
    They are stored as the string columns 'year', 'month', 'day' and 'hour';
    'minute' is set to the constant '00' so that a complete timestamp can be
    assembled. 'time' is then overwritten with the parsed datetime.

    Parameters
    ----------
    df : DataFrame
        Table with a 'time' column. It is modified in place.

    Returns
    -------
    DataFrame
        The same object that was passed in.
    """
    # Vectorised string slicing instead of one Python-level lambda call per row.
    digits = df['time'].astype(str)

    df['year'] = digits.str[:4]
    df['month'] = digits.str[4:6]
    df['day'] = digits.str[6:8]
    df['hour'] = digits.str[8:10]
    df['minute'] = '00'  # placeholder minutes; anything below the hour is discarded

    # An explicit format avoids relying on pandas' format inference.
    stamp = df['year'] + df['month'] + df['day'] + df['hour'] + df['minute']
    df['time'] = pd.to_datetime(stamp, format='%Y%m%d%H%M')

    return df

In [160]:
# Parse the 'time' column of every table that has one.
train_p = make_datetime(train_p)
train_q = make_datetime(train_q)
test_q = make_datetime(test_q)
train_e = make_datetime(train_e)
test_e = make_datetime(test_e)

In [161]:
# groupby misbehaved on these columns because they were stored as category,
# so errtype and errcode are cast to plain object dtype in both error logs.
for frame in (train_e, test_e):
    for col in ('errtype', 'errcode'):
        frame[col] = frame[col].astype('object')

In [162]:
# fwver is stored as object dtype in both quality tables as well.
for frame in (train_q, test_q):
    frame['fwver'] = frame['fwver'].astype('object')

# Feature Engineering

## from Error_log

### 사용한 model의 개수

In [163]:
# Per-user number of model_nm values: de-duplicate the (user_id, model_nm) pairs,
# then count the remaining rows for each user.
train_model_count = train_e[['user_id', 'model_nm']].drop_duplicates().groupby('user_id').count()
test_model_count = test_e[['user_id', 'model_nm']].drop_duplicates().groupby('user_id').count()

### 사용한 fwver 의 개수

In [164]:
# Per-user number of fwver values: de-duplicate the (user_id, fwver) pairs,
# then count the remaining rows for each user.
train_fwver_count = train_e[['user_id', 'fwver']].drop_duplicates().groupby('user_id').count()
test_fwver_count = test_e[['user_id', 'fwver']].drop_duplicates().groupby('user_id').count()

### 경험한 error 의 총 개수

In [165]:
# Per-user number of error-log rows with a non-null errcode.
train_err_count = train_e.groupby('user_id')['errcode'].count()
test_err_count = test_e.groupby('user_id')['errcode'].count()

### 경험한 각 errtype의 value별 개수

In [166]:
# errtype 33: per-user occurrence count of each errcode ('hour' is merely the column being counted);
# user/errcode combinations that never occur are filled with 0.
train_errcode_33 = train_e[train_e.errtype == 33][['user_id', 'errcode', 'hour']].groupby(['user_id', 'errcode']).count().unstack().fillna(0)
test_errcode_33 = test_e[test_e.errtype == 33][['user_id', 'errcode', 'hour']].groupby(['user_id', 'errcode']).count().unstack().fillna(0)

# NOTE(review): the positional rename assumes exactly three errcode columns, in the same
# order, in both train and test — confirm against the data.
train_errcode_33.columns = ['err_33_1', 'err_33_2', 'err_33_3']
test_errcode_33.columns = ['err_33_1', 'err_33_2', 'err_33_3']

In [167]:
# errtype 23: per-user occurrence count of each errcode ('hour' is merely the column being counted);
# user/errcode combinations that never occur are filled with 0.
train_errcode_23 = train_e[train_e.errtype == 23][['user_id', 'errcode', 'hour']].groupby(['user_id', 'errcode']).count().unstack().fillna(0)
test_errcode_23 = test_e[test_e.errtype == 23][['user_id', 'errcode', 'hour']].groupby(['user_id', 'errcode']).count().unstack().fillna(0)

# NOTE(review): the positional rename assumes exactly eight errcode columns whose sorted
# order matches these labels in both train and test — confirm against the data.
train_errcode_23.columns = ['UNKNOWN', 'ACTIVE', 'connLMP', 'connESTA', 'connTO', 'connLOCAL', 'STANDBY', 'TERMINATE']
test_errcode_23.columns = ['UNKNOWN', 'ACTIVE', 'connLMP', 'connESTA', 'connTO', 'connLOCAL', 'STANDBY', 'TERMINATE']

In [168]:
# errtype 34: per-user occurrence count of each errcode ('hour' is merely the column being counted);
# user/errcode combinations that never occur are filled with 0.
train_errcode_34 = train_e[train_e.errtype == 34][['user_id', 'errcode', 'hour']].groupby(['user_id', 'errcode']).count().unstack().fillna(0)
test_errcode_34 = test_e[test_e.errtype == 34][['user_id', 'errcode', 'hour']].groupby(['user_id', 'errcode']).count().unstack().fillna(0)

# NOTE(review): the positional rename assumes exactly six errcode columns, in the same
# order, in both train and test — confirm against the data.
train_errcode_34.columns = ['err_34_1', 'err_34_2', 'err_34_3', 'err_34_4', 'err_34_5', 'err_34_6']
test_errcode_34.columns = ['err_34_1', 'err_34_2', 'err_34_3', 'err_34_4', 'err_34_5', 'err_34_6']

## from Quality_log

### 각 quality의 표준편차

In [169]:
# Per-user standard deviation of every remaining quality column.
# The quality columns are selected explicitly: std() over the whole frame would also be
# applied to the datetime/string columns (time, fwver, year, ...), which recent pandas
# versions refuse to aggregate instead of silently dropping them.
quality_cols = ['quality_{}'.format(i) for i in [0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12]]

train_qual_std = train_q.groupby('user_id')[quality_cols].std()
test_qual_std = test_q.groupby('user_id')[quality_cols].std()

new_columns = ['q_std_0', 'q_std_1', 'q_std_2', 'q_std_5', 'q_std_6', 'q_std_7', 'q_std_8', 'q_std_9', 'q_std_10','q_std_11', 'q_std_12']

train_qual_std.columns = new_columns
test_qual_std.columns = new_columns

### 기록한 quality log의 개수(12개당 1번)

In [170]:
# Per-user number of quality-log rows divided by 12 (the section heading says one log
# consists of 12 rows).
train_qual_log = train_q.groupby('user_id')['time'].count()/12
test_qual_log = test_q.groupby('user_id')['time'].count()/12

### quality당 순수 개수

In [171]:
# Per-user count of non-zero (and non-null) entries in each quality column.
quality_ids = [0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12]
unused_cols = ['time', 'fwver', 'year', 'month', 'day', 'hour', 'minute']

# Work on copies so the cleaned quality tables stay untouched.
train_q_temp = train_q.copy()
test_q_temp = test_q.copy()

for frame in (train_q_temp, test_q_temp):
    # Turn zeros into NaN so that count() skips them.
    for i in quality_ids:
        col = 'quality_{}'.format(i)
        frame.loc[frame[col] == 0, col] = np.nan
    # Drop everything that is neither the group key nor a quality column.
    frame.drop(unused_cols, axis=1, inplace=True)

train_qual_counts = train_q_temp.groupby('user_id').count()
test_qual_counts = test_q_temp.groupby('user_id').count()

# Rename so the columns do not clash with other feature tables.
count_names = ['q_c_{}'.format(i) for i in quality_ids]
train_qual_counts.columns = count_names
test_qual_counts.columns = count_names

### quality당 음수, 0에 대한 count

In [172]:
# Fresh working copies of the quality tables for the zero-count features.
train_q_temp = train_q.copy()
test_q_temp = test_q.copy()

In [173]:
# Keep only user_id and the quality columns.
train_q_temp.drop(['time', 'fwver', 'year', 'month', 'day', 'hour', 'minute'], axis=1, inplace=True)
test_q_temp.drop(['time', 'fwver', 'year', 'month', 'day', 'hour', 'minute'], axis=1, inplace=True)

In [174]:
# To count zeros only, every negative and every positive value is replaced by NaN.
for frame in (train_q_temp, test_q_temp):
    for i in [0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12]:
        col = 'quality_{}'.format(i)
        non_zero = (frame[col] < 0) | (frame[col] > 0)
        frame.loc[non_zero, col] = np.nan

In [175]:
# Per-user count of the values left after masking, i.e. the zeros.
train_qual_zeroCount = train_q_temp.groupby('user_id').count()
test_qual_zeroCount = test_q_temp.groupby('user_id').count()

In [176]:
# Distinct column names for the zero-count features.
new_columns = ['q_z_c_0', 'q_z_c_1', 'q_z_c_2', 'q_z_c_5', 'q_z_c_6', 'q_z_c_7', 'q_z_c_8', 'q_z_c_9', 'q_z_c_10','q_z_c_11', 'q_z_c_12']

train_qual_zeroCount.columns = new_columns
test_qual_zeroCount.columns = new_columns

In [177]:
# Fresh working copies of the quality tables for the negative-count features.
train_q_temp = train_q.copy()
test_q_temp = test_q.copy()

In [178]:
# Keep only user_id and the quality columns.
train_q_temp.drop(['time', 'fwver', 'year', 'month', 'day', 'hour', 'minute'], axis=1, inplace=True)
test_q_temp.drop(['time', 'fwver', 'year', 'month', 'day', 'hour', 'minute'], axis=1, inplace=True)

In [179]:
# To count negative values only, every zero and every positive value is replaced by NaN.
for frame in (train_q_temp, test_q_temp):
    for i in [0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12]:
        col = 'quality_{}'.format(i)
        frame.loc[frame[col] >= 0, col] = np.nan

In [180]:
# Per-user count of the values left after masking, i.e. the negatives.
train_qual_negaCount = train_q_temp.groupby('user_id').count()
test_qual_negaCount = test_q_temp.groupby('user_id').count()

In [181]:
# Distinct column names for the negative-count features.
new_columns = ['q_n_c_0', 'q_n_c_1', 'q_n_c_2', 'q_n_c_5', 'q_n_c_6', 'q_n_c_7', 'q_n_c_8', 'q_n_c_9', 'q_n_c_10','q_n_c_11', 'q_n_c_12']

train_qual_negaCount.columns = new_columns
test_qual_negaCount.columns = new_columns

# Train & Predict

### Base set

In [184]:
# Base feature set: how often each user hit each errtype (0 when never seen).
# Note that `y` is the TEST feature matrix here, not a label vector.
# The column labels run from et_1 to et_42 with no et_29.
errtype_cols = ['et_{}'.format(n) for n in range(1, 43) if n != 29]

X = train_e.groupby(['user_id', 'errtype'])['errcode'].count().unstack().fillna(0)
y = test_e.groupby(['user_id', 'errtype'])['errcode'].count().unstack().fillna(0)

X.columns = errtype_cols
y.columns = errtype_cols

In [185]:
# Restore the one user id that is absent from the test error log: reindex onto a gap-free
# integer range, forward-fill the missing row from the preceding user, then drop the empty
# rows before position 30000 (the test ids appear to start there).
# `axis` is passed by keyword because recent pandas versions no longer accept it positionally.
y = y.reindex(pd.RangeIndex(y.index.max() + 1)).ffill(axis=0)[30000:]

In [186]:
X.shape, y.shape

((15000, 41), (14999, 41))

### 1차 합병

In [187]:
# First merge: attach every engineered feature table to the base errtype counts,
# aligned on the user_id index; users missing from a table get 0.
X = pd.concat([X,
               train_err_count, # total number of errors logged by the user
               train_fwver_count, # number of firmware versions used by the user
               train_model_count, # number of models used by the user
               train_qual_std, # per-user standard deviation of each quality column
               train_qual_log, # per-user number of quality logs
               train_errcode_23, # per-errcode counts for errtype 23
               train_errcode_33, # same for errtype 33
               train_errcode_34, # same for errtype 34
               train_qual_counts, # per-quality count excluding zeros
               train_qual_negaCount, # per-quality count of negative values only
               train_qual_zeroCount # per-quality count of zeros only
              ], axis=1).fillna(0)
X.shape

(15000, 73)

In [188]:
# Same merge for the test side; the tables are listed in the same order as for X
# so that both feature matrices share one column layout.
y = pd.concat([y,
               test_err_count,
               test_fwver_count,
               test_model_count,
               test_qual_std,
               test_qual_log,
               test_errcode_23,
               test_errcode_33,
               test_errcode_34,
               test_qual_counts,
               test_qual_negaCount,
               test_qual_zeroCount
              ], axis=1).fillna(0)
y.shape

(14999, 73)

### 2차 합병 (실험 중)

In [191]:
# Variant "ec": base features plus the errcode unique-count table.
# NOTE(review): train/test_errcode_uniqueCount are not defined in the visible part of the
# notebook — presumably created in a cell that is not shown; confirm before running.
X_ec = pd.concat([X, train_errcode_uniqueCount], axis=1).fillna(0)
y_ec = pd.concat([y, test_errcode_uniqueCount], axis=1).fillna(0)

X_ec.shape, y_ec.shape

((15000, 107), (14999, 107))

In [192]:
# Variant "et": base features plus the errtype unique-count table.
# NOTE(review): train/test_errtype_uniqueCount are not defined in the visible part of the
# notebook — presumably created in a cell that is not shown; confirm before running.
X_et = pd.concat([X, train_errtype_uniqueCount], axis=1).fillna(0)
y_et = pd.concat([y, test_errtype_uniqueCount], axis=1).fillna(0)

X_et.shape, y_et.shape

((15000, 107), (14999, 107))

In [193]:
# Variant "etc": base features plus both unique-count tables.
# NOTE(review): the *_uniqueCount tables are not defined in the visible part of the
# notebook — presumably created in a cell that is not shown; confirm before running.
X_etc = pd.concat([X, train_errcode_uniqueCount, train_errtype_uniqueCount], axis=1).fillna(0)
y_etc = pd.concat([y, test_errcode_uniqueCount, test_errtype_uniqueCount], axis=1).fillna(0)

X_etc.shape, y_etc.shape

((15000, 108), (14999, 108))

## Model

In [194]:
#------------------------------------------------------------
# Custom evaluation metric used to monitor the validation score during training.
def f_pr_auc(probas_pred, y_true):
    """
    Compute the area under the precision-recall curve.

    probas_pred : predicted scores.
    y_true      : object exposing ``get_label()`` that returns the true labels.

    Returns the tuple ("pr_auc", score, True); the trailing True is the
    "higher is better" flag expected from a LightGBM ``feval`` callable.
    """
    precision, recall, _ = precision_recall_curve(y_true.get_label(), probas_pred)
    return "pr_auc", auc(recall, precision), True
#------------------------------------------------------------


def s_fold_train_pred(train_x, train_y):
    """
    Train one LightGBM binary classifier per fold of a stratified 5-fold split.

    Every fold model is fitted on the training part of the split and
    early-stopped on the held-out part. The held-out part is then scored with
    recall and precision (at a 0.5 probability threshold) and with ROC AUC.

    Parameters
    ----------
    train_x : feature table; rows are selected positionally via ``.iloc``.
    train_y : targets aligned row-for-row with ``train_x`` (also via ``.iloc``).

    Returns
    -------
    (models, auc_scores, recalls, precisions)
        Four lists holding one entry per fold, in fold order.
    """
    import lightgbm as lgb

    threshold = 0.5  # probability cut-off used for recall / precision
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.027,
        'seed': 42,
    }

    # Stratified 5-fold cross validation with a fixed shuffle seed.
    s_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    models, auc_scores, recalls, precisions = [], [], [], []

    for train_idx, val_idx in s_fold.split(train_x, train_y):
        # Split into the fold's training and validation parts.
        fold_x, fold_y = train_x.iloc[train_idx], train_y.iloc[train_idx]
        valid_x, valid_y = train_x.iloc[val_idx], train_y.iloc[val_idx]

        model = lgb.train(
            params,
            train_set=lgb.Dataset(fold_x, fold_y),
            num_boost_round=10000,
            valid_sets=lgb.Dataset(valid_x, valid_y),
            feval=f_pr_auc,
            verbose_eval=100,
            early_stopping_rounds=100,
        )

        # Predict the held-out part and binarise at the threshold.
        valid_prob = model.predict(valid_x)
        valid_pred = np.where(valid_prob > threshold, 1, 0)

        models.append(model)
        recalls.append(recall_score(valid_y, valid_pred))
        precisions.append(precision_score(valid_y, valid_pred))
        auc_scores.append(roc_auc_score(valid_y, valid_prob))

        print('==========================================================')

    return models, auc_scores, recalls, precisions

In [195]:
# Kept for loss comparison — do not delete.
# This is the best-scoring model.
models, auc_scores, _, _ = s_fold_train_pred(X, train_b_p.target)
print(np.mean(auc_scores))

[LightGBM] [Info] Number of positive: 4000, number of negative: 8000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13041
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 101
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693147
[LightGBM] [Info] Start training from score -0.693147
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.826432	valid_0's pr_auc: 0.744986
[200]	valid_0's auc: 0.828085	valid_0's pr_auc: 0.75092
Early stopping, best iteration is:
[183]	valid_0's auc: 0.828701	valid_0's pr_auc: 0.752009
[LightGBM] [Info] Number of positive: 4000, number of negative: 8000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13136
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 101
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693147
[LightG

In [200]:
# Training run on the "et" feature variant; `models` is overwritten and used for the prediction below.
models, auc_scores, _, _ = s_fold_train_pred(X_et, train_b_p.target)
print(np.mean(auc_scores))

[LightGBM] [Info] Number of positive: 4000, number of negative: 8000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13074
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 102
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693147
[LightGBM] [Info] Start training from score -0.693147
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.824661	valid_0's pr_auc: 0.741399
[200]	valid_0's auc: 0.8286	valid_0's pr_auc: 0.749177
[300]	valid_0's auc: 0.828925	valid_0's pr_auc: 0.751265
[400]	valid_0's auc: 0.828909	valid_0's pr_auc: 0.752326
Early stopping, best iteration is:
[354]	valid_0's auc: 0.829402	valid_0's pr_auc: 0.752243
[LightGBM] [Info] Number of positive: 4000, number of negative: 8000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13168
[LightGBM] [Info] Number of data points in the train set: 12000, num

In [201]:
# base 0.8292073  0.8266209  0.82655950
# ec   0.8296051  0.8269322  0.82608215
# et   0.8291249  0.8269684  0.82844335
# etc  0.8291123  0.8272698  0.82821499
#          ec         etc        et

In [202]:
# Score the test features with every fold model (each result as a column vector),
# then average across the fold models.
pred_y_list = [model.predict(y_et).reshape(-1,1) for model in models]

pred_ensemble = np.mean(pred_y_list, axis = 0)

In [203]:
# Write the averaged prediction into the 'problem' column of the sample submission and display it.
submission.problem = pred_ensemble
submission

Unnamed: 0,user_id,problem
0,30000,0.93
1,30001,0.27
2,30002,0.55
3,30003,0.63
4,30004,0.85
...,...,...
14994,44994,0.24
14995,44995,0.37
14996,44996,0.76
14997,44997,0.88


In [204]:
# Save the submission without the index column.
# NOTE(review): machine-specific absolute path.
submission.to_csv(r'C:\Users\Wyatt\wyatt37/Comp/LG_edge_detect/king/submission/king_210127_2_unique-errtype.csv', index=False)