# Development Environment
Google Colab
- Platform : Linux-5.10.147+-x86_64-with-glibc2.29
- OS : Ubuntu 20.04.5 LTS
- CPU : Intel(R) Xeon(R) CPU @ 2.20GHz

# Python Library Version
- Python 3.8.10
- pandas 1.3.5
- numpy 1.22.4
- sklearn 1.2.1
- xgboost 1.7.4
- lightgbm 2.2.3

In [None]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score

In [None]:
def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Exports ``PYTHONHASHSEED`` to the process environment and seeds both
    Python's ``random`` module and NumPy's global generator.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``; its
            string form is stored in ``PYTHONHASHSEED``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(79)  # fix the seed for the whole notebook

In [None]:
# Load the training table first, then the test table, from their CSV files.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/data/train.csv', '/data/test.csv')
)

In [None]:
# Working copy of the training table without the PRODUCT_ID and TIMESTAMP columns.
train_df2 = train_df.drop(['PRODUCT_ID', 'TIMESTAMP'], axis='columns')

In [None]:
# Target: the Y_Class column. Features: every column except the two Y_* columns.
train_y = train_df2['Y_Class']
train_x = train_df2.drop(['Y_Class', 'Y_Quality'], axis='columns')

# The test table gets the same PRODUCT_ID / TIMESTAMP removal as the training table.
test_x = test_df.drop(['PRODUCT_ID', 'TIMESTAMP'], axis='columns')

## 결측치 0으로 대체, train / val data 분리

In [None]:
# Replace every missing value with 0 in the full-train and test feature tables.
train_x, test_x = (frame.fillna(0) for frame in (train_x, test_x))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation.
# random_state is pinned to the notebook's seed (79) so that re-running this
# cell, or running cells out of order, always produces the same partition.
# Without it the split draws from NumPy's global random stream and depends on
# how much of that stream earlier cells have already consumed.
# NOTE(review): consider stratify=train_df2['Y_Class'] if the classes are
# imbalanced; confirm every class has at least two rows before enabling it.
train, val = train_test_split(train_df2, test_size=0.2, random_state=79)

In [None]:
# Split each partition into features (all non-Y_* columns) and the Y_Class target.
x_train, y_train = train.drop(['Y_Class', 'Y_Quality'], axis='columns'), train['Y_Class']
x_val, y_val = val.drop(['Y_Class', 'Y_Quality'], axis='columns'), val['Y_Class']

In [None]:
# Replace missing values with 0 in the train / validation feature tables.
x_train, x_val = x_train.fillna(value=0), x_val.fillna(value=0)

## lazypredict로 모델 학습 진행

In [None]:
!pip install lazypredict



In [None]:
# Automatic model training and comparison with lazypredict.
from lazypredict.Supervised import LazyClassifier

clf = LazyClassifier(verbose=0, predictions=True)
# The validation split is passed in lazypredict's "test" slots.
models, predictions = clf.fit(
    X_train=x_train,
    X_test=x_val,
    y_train=y_train,
    y_test=y_val,
)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:35<00:00,  1.22s/it]


In [None]:
# Display the `models` result returned by clf.fit above.
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LogisticRegression,0.66,0.56,,0.66,0.28
XGBClassifier,0.72,0.55,,0.69,3.01
RidgeClassifierCV,0.68,0.53,,0.65,0.16
ExtraTreeClassifier,0.65,0.52,,0.64,0.12
AdaBoostClassifier,0.68,0.51,,0.65,1.72
LGBMClassifier,0.7,0.51,,0.65,1.82
DecisionTreeClassifier,0.62,0.51,,0.62,0.29
ExtraTreesClassifier,0.72,0.5,,0.66,0.37
LinearSVC,0.59,0.5,,0.6,4.38
BernoulliNB,0.59,0.5,,0.61,0.12


## train, test, val data label encoding 진행

In [None]:
# Label-encode the categorical columns of the full training set and the test set.
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    # The encoder is fitted on the training column only.
    le = LabelEncoder().fit(train_x[i])
    train_x[i] = le.transform(train_x[i])

    # Labels that occur only in the test column are appended to the encoder's
    # classes so that transform() accepts them.
    unseen = [label for label in np.unique(test_x[i]) if label not in le.classes_]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)
    test_x[i] = le.transform(test_x[i])
print('Done.')

Done.


In [None]:
# Label-encode the same categorical columns for the train / validation split.
qual_col2 = ['LINE', 'PRODUCT_CODE']

for i in qual_col2:
    # A fresh encoder per column, fitted on the training partition only.
    le = LabelEncoder().fit(x_train[i])
    x_train[i] = le.transform(x_train[i])

    # Extend the known classes with labels seen only in the validation
    # partition, so that transform() accepts them.
    unseen = [label for label in np.unique(x_val[i]) if label not in le.classes_]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)
    x_val[i] = le.transform(x_val[i])
print('Done.')

Done.


## 앙상블 진행

In [None]:
# Create the individual base models, all with the same random_state (79).
rf_clf, lgbm_clf, xgb_clf, et_clf, bg_clf = (
    model_cls(random_state=79)
    for model_cls in (
        RandomForestClassifier,
        LGBMClassifier,
        XGBClassifier,
        ExtraTreesClassifier,
        BaggingClassifier,
    )
)

In [None]:
from sklearn.ensemble import VotingClassifier

# Combine the individual models into a voting ensemble. The code passes
# voting='hard', i.e. a majority vote over the predicted class labels.
# NOTE(review): the original comment described this as "soft voting", which
# does not match voting='hard' — confirm which one is intended.
vo_clf = VotingClassifier(estimators=[('XGB', xgb_clf),('RF', rf_clf),('LGBM', lgbm_clf),('ETC', et_clf), ('BGC', bg_clf)], voting='hard')

# Fit the VotingClassifier on the training partition, predict the validation
# partition, and print the macro-averaged F1 score.
vo_clf.fit(x_train,y_train)
pred = vo_clf.predict(x_val)
print('Voting 분류기 f1_score:{0:.4f}'.format(f1_score(y_val,pred, average='macro')))

Voting 분류기 f1_score:0.5720


## 최종 모델 fit

In [None]:
# Refit the ensemble on the full training set, then predict the test set.
preds = vo_clf.fit(train_x, train_y).predict(test_x)

In [None]:
# Display the predicted class labels for the test set.
preds

array([1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 2, 0, 2, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,

In [None]:
# Load the sample submission file; it is used as the template for the output.
submit = pd.read_csv('/data/sample_submission.csv')

In [None]:
# Store the ensemble's test-set predictions in the Y_Class column.
# NOTE(review): assumes the rows of sample_submission.csv are in the same order as test.csv — confirm.
submit['Y_Class'] = preds

In [None]:
# Write the submission to CSV without the DataFrame index column.
submit.to_csv('/data/lazy_predict_use_val.csv', index=False)