In [165]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import time
import pickle

from multiprocessing import cpu_count

from imblearn.under_sampling import RandomUnderSampler

from joblib import dump, load



from matplotlib import font_manager, rc
from konlpy.tag import Okt, Mecab

from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.naive_bayes import MultinomialNB
from lightgbm import LGBMClassifier

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    auc,
    classification_report,
)

from sklearn.pipeline import Pipeline

from itertools import combinations

import wandb

wandb.login()

plt.rc("font", family="AppleGothic")  # For MacOS
print(plt.rcParams["font.family"])


sentence_df = pd.read_csv("../open/train.csv")
sentence_df = sentence_df[["문장", "유형"]]
sentence_df


['AppleGothic']


Unnamed: 0,문장,유형
0,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,사실형
1,이어 ＂앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정＂이라며 ＂그 이전이...,사실형
2,정부가 고유가 대응을 위해 7월부터 연말까지 유류세 인하 폭을 30%에서 37%까지...,사실형
3,"서울시는 올해 3월 즉시 견인 유예시간 60분을 제공하겠다고 밝혔지만, 하루 만에 ...",사실형
4,익사한 자는 사다리에 태워 거꾸로 놓고 소금으로 코를 막아 가득 채운다.,사실형
...,...,...
16536,"＇신동덤＇은 ＇신비한 동물사전＇과 ＇해리 포터＇ 시리즈를 잇는 마법 어드벤처물로, ...",사실형
16537,"수족냉증은 어릴 때부터 심했으며 관절은 어디 한 곳이 아니고 목, 어깨, 팔꿈치, ...",사실형
16538,김금희 소설가는 ＂계약서 조정이 그리 어려운가 작가를 격려한다면서 그런 문구 하나 ...,사실형
16539,1만명이 넘는 방문자수를 기록한 이번 전시회는 총 77개 작품을 넥슨 사옥을 그대로...,사실형


In [137]:
num_cpu = cpu_count()-1
num_cpu

9

## wandb 초기화

In [138]:
run = wandb.init(project='my-nlp-project')

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016698929866667336, max=1.0…

In [139]:
sentence_df['유형'].value_counts()

사실형    13558
추론형     2151
대화형      575
예측형      257
Name: 유형, dtype: int64

In [140]:
# stopwords = pd.read_csv("https://raw.githubusercontent.com/yoonkt200/FastCampusDataset/master/korean_stopwords.txt", header=None)[0].tolist()
stopwords = ['은', '는', '도', '한', '이다', '을', '이', '를', '가', '에', '의', '과', '에서', '으로', '들', '로', '와', '등']


## Undersampling

In [141]:
rus = RandomUnderSampler()

undersampled_data, undersampled_label = rus.fit_resample(np.array(sentence_df['문장'].to_list()).reshape(-1, 1), sentence_df['유형'])
undersample = np.concatenate((undersampled_data.reshape(-1, 1), undersampled_label.values.reshape(-1, 1)), axis=1)

sentence_df = pd.DataFrame(undersample, columns=['문장', '유형'])
sentence_df['유형'].value_counts()

대화형    257
사실형    257
예측형    257
추론형    257
Name: 유형, dtype: int64

## 데이터 클리닝
- regex

In [142]:
# regex 함수
def regex_filter(sentence):
    return re.sub(r"[^가-힣\s!?]|\(.*?\)", "", sentence)

In [143]:
sentence_df['문장'] = sentence_df['문장'].apply(regex_filter)

## 데이터 나누기 (Train/Test)

In [144]:
from sklearn.model_selection import train_test_split

X = sentence_df['문장']
y = sentence_df['유형']

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=13, stratify=y)
X_train.shape, y_train.shape

((822,), (822,))

## Vectorizer

In [146]:
def custom_tokenizer(sentence):
    '''
    각 문장을 Mecab을 이용하여 토큰화해줄 함수
    토큰들을 리스트 형식으로 반환
    '''
    t= Mecab()
    return [token[0] for token in t.pos(sentence)]

### CountVectorizer

In [122]:
vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer,
    ngram_range=(2, 2),
    min_df=10, 
    stop_words=stopwords
)
X_train_vec = vectorizer.fit_transform(X_train)
X_train_vec.shape



(822, 46)

### TF-IDF

In [147]:
vectorizer = TfidfVectorizer(
    tokenizer=custom_tokenizer, 
    ngram_range=(1, 3),
    min_df=5,
    stop_words=stopwords)
X_train_vec = vectorizer.fit_transform(X_train)
X_train_vec.shape



(822, 749)

In [168]:
dump(vectorizer, 'final_vectorizer.joblib')

['final_vectorizer.joblib']

In [148]:
X_train = pd.DataFrame(X_train_vec.toarray(), columns=vectorizer.get_feature_names_out())
X_train

Unnamed: 0,?,가격,가끔,가능,가능 성,가능 하,가량,가상,가장,가지,...,화폐,확대,확보,확산,환경,활용,회사,회장,효과,후
0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.382092,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.290118,0.341874,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
817,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
818,0.313676,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
819,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
820,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [149]:
X_valid_vec = vectorizer.transform(X_valid)
X_valid = pd.DataFrame(X_valid_vec.toarray(), columns=vectorizer.get_feature_names_out())
X_valid.shape, y_valid.shape

((206, 749), (206,))

## ML Model test function

In [150]:
def model_eval(model, X_val, y_val, randomCV=False):
    if not randomCV:
        fit_start = time.time()
        model.fit(X_train_vec, y_train)
        fit_end = time.time()
        print(f'Model Fit time: {fit_end-fit_start}')

    pred_start = time.time()
    pred = model.predict(X_val)
    pred_prob = model.predict_proba(X_val)
    pred_end = time.time()
    print(f'Model pred time: {pred_end-pred_start}')

    accuracy = accuracy_score(pred, y_val)
    f1 = f1_score(pred, y_val, average='weighted')
    
    auc_ovo = roc_auc_score(y_val, pred_prob, multi_class='ovo')
    auc_ovr = roc_auc_score(y_val, pred_prob, multi_class='ovr')

    print(f'{accuracy=}')
    print(f'{f1=}')
    print(f'{auc_ovo=}')
    print(f'{auc_ovr=}')
    print(classification_report(y_val, 
        pred, digits=3, zero_division=0))
    
    return pred, pred_prob


## Multinomial Naive Bayes

In [151]:
naive = MultinomialNB()

pred_nb, prob_nb = model_eval(naive, X_valid, y_valid)

Model Fit time: 0.001984119415283203
Model pred time: 0.008151054382324219
accuracy=0.7135922330097088
f1=0.7092809250859149
auc_ovo=0.9003048772395991
auc_ovr=0.900589503791591
              precision    recall  f1-score   support

         대화형      0.949     0.712     0.813        52
         사실형      0.554     0.608     0.579        51
         예측형      0.821     0.885     0.852        52
         추론형      0.600     0.647     0.623        51

    accuracy                          0.714       206
   macro avg      0.731     0.713     0.717       206
weighted avg      0.732     0.714     0.718       206





## Random Forest

In [152]:
rf = RandomForestClassifier(
    random_state=13
)

pred_rf_base, prob_rf_base = model_eval(rf, X_valid_vec, y_valid)

Model Fit time: 0.1322040557861328
Model pred time: 0.013312101364135742
accuracy=0.7427184466019418
f1=0.7406821211554167
auc_ovo=0.9182629805654324
auc_ovr=0.9186171868928036
              precision    recall  f1-score   support

         대화형      0.865     0.865     0.865        52
         사실형      0.609     0.765     0.678        51
         예측형      0.909     0.769     0.833        52
         추론형      0.630     0.569     0.598        51

    accuracy                          0.743       206
   macro avg      0.754     0.742     0.744       206
weighted avg      0.755     0.743     0.745       206



In [70]:
rf.classes_

array(['대화형', '사실형', '예측형', '추론형'], dtype=object)

In [153]:
run = wandb.init(project='my-nlp-project')
wandb.sklearn.plot_classifier(rf, 
                              X_train, X_valid, 
                              y_train, y_valid, 
                              pred_rf_base, prob_rf_base, 
                              ['예측형', '사실형', '대화형', '추론형'], 
                              is_binary=True, 
                              model_name='RandomForest_Mecab_base')



VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016721431250001237, max=1.0…

[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting RandomForest_Mecab_base.
[34m[1mwandb[0m: Logged feature importances.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision-recall curve.


### Random Forest Hyperparameter Tuning

In [172]:
rf = RandomForestClassifier(
    random_state=13
)

param_dist = {
    'criterion':['gini','entropy'], 
    'n_estimators': sp_randInt(100, 500),
    'max_depth':[None, 2, 3, 4, 5, 6, 7, 8], 
    'min_samples_split':sp_randInt(8, 11)
    }

random_search = RandomizedSearchCV(
    estimator=rf, param_distributions = param_dist,
    cv = 5, n_iter = 100, n_jobs=num_cpu, 
    verbose=2)
random_search
random_search.fit(X_train.values, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END criterion=gini, max_depth=5, min_samples_split=10, n_estimators=221; total time=   0.2s
[CV] END criterion=gini, max_depth=5, min_samples_split=10, n_estimators=221; total time=   0.2s
[CV] END criterion=gini, max_depth=5, min_samples_split=10, n_estimators=221; total time=   0.2s
[CV] END criterion=gini, max_depth=5, min_samples_split=10, n_estimators=221; total time=   0.2s
[CV] END criterion=gini, max_depth=4, min_samples_split=9, n_estimators=421; total time=   0.3s
[CV] END criterion=gini, max_depth=4, min_samples_split=9, n_estimators=421; total time=   0.4s
[CV] END criterion=gini, max_depth=4, min_samples_split=9, n_estimators=421; total time=   0.4s
[CV] END criterion=gini, max_depth=4, min_samples_split=9, n_estimators=421; total time=   0.4s
[CV] END criterion=gini, max_depth=4, min_samples_split=9, n_estimators=421; total time=   0.4s
[CV] END criterion=gini, max_depth=5, min_samples_split=10, n_estimat

In [176]:
best_rf = random_search.best_estimator_
pred_rf, prob_rf = model_eval(best_rf, X_valid.values, y_valid, randomCV=True)

Model pred time: 0.029878616333007812
accuracy=0.7572815533980582
f1=0.7588586193717909
auc_ovo=0.9208550259837858
auc_ovr=0.9212907151355918
              precision    recall  f1-score   support

         대화형      0.831     0.942     0.883        52
         사실형      0.617     0.725     0.667        51
         예측형      0.911     0.788     0.845        52
         추론형      0.690     0.569     0.624        51

    accuracy                          0.757       206
   macro avg      0.762     0.756     0.755       206
weighted avg      0.763     0.757     0.756       206



In [177]:
dump(random_search.best_estimator_, 'final_model.joblib')

['final_model.joblib']

In [167]:
fin_model = load('final_model.joblib')
model_eval(fin_model, X_valid, y_valid, randomCV=True)

Model pred time: 0.05042695999145508
accuracy=0.7524271844660194
f1=0.7525848061361602
auc_ovo=0.9201494461051621
auc_ovr=0.9205791915232143
              precision    recall  f1-score   support

         대화형      0.839     0.904     0.870        52
         사실형      0.617     0.725     0.667        51
         예측형      0.913     0.808     0.857        52
         추론형      0.659     0.569     0.611        51

    accuracy                          0.752       206
   macro avg      0.757     0.751     0.751       206
weighted avg      0.758     0.752     0.752       206



(array(['추론형', '사실형', '추론형', '대화형', '대화형', '예측형', '대화형', '사실형', '추론형',
        '대화형', '대화형', '대화형', '예측형', '사실형', '예측형', '추론형', '대화형', '예측형',
        '대화형', '사실형', '예측형', '사실형', '대화형', '예측형', '대화형', '예측형', '사실형',
        '대화형', '예측형', '예측형', '예측형', '예측형', '추론형', '사실형', '사실형', '사실형',
        '사실형', '예측형', '추론형', '예측형', '예측형', '추론형', '추론형', '사실형', '추론형',
        '사실형', '사실형', '사실형', '추론형', '대화형', '추론형', '추론형', '사실형', '대화형',
        '대화형', '사실형', '예측형', '대화형', '대화형', '대화형', '사실형', '추론형', '대화형',
        '사실형', '사실형', '사실형', '추론형', '대화형', '추론형', '예측형', '예측형', '사실형',
        '예측형', '대화형', '추론형', '예측형', '대화형', '예측형', '사실형', '대화형', '예측형',
        '사실형', '예측형', '대화형', '추론형', '추론형', '사실형', '대화형', '사실형', '사실형',
        '예측형', '대화형', '사실형', '추론형', '대화형', '예측형', '예측형', '사실형', '대화형',
        '추론형', '예측형', '추론형', '추론형', '대화형', '사실형', '사실형', '사실형', '추론형',
        '사실형', '대화형', '추론형', '추론형', '사실형', '사실형', '사실형', '사실형', '예측형',
        '예측형', '예측형', '대화형', '예측형', '사실형', '대화형', '대화형', '추론형', '대화형',
      

In [171]:
X_valid

Unnamed: 0,?,가격,가끔,가능,가능 성,가능 하,가량,가상,가장,가지,...,화폐,확대,확보,확산,환경,활용,회사,회장,효과,후
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266101,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
202,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [161]:
random_search.best_params_

{'criterion': 'gini',
 'max_depth': None,
 'min_samples_split': 10,
 'n_estimators': 399}

In [160]:
wandb.sklearn.plot_classifier(best_rf, 
                              X_train, X_valid, 
                              y_train, y_valid, 
                              pred_rf, prob_rf, 
                              best_rf.classes_, 
                              is_binary=True, 
                              model_name='RandomForest_rancv')

wandb.finish()

[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting RandomForest_rancv.
[34m[1mwandb[0m: Logged feature importances.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision-recall curve.


## LightGBM

In [162]:
lgbm = LGBMClassifier(
    n_estimators=450,
    max_depth=20,
    min_child_samples=10,
    class_weight='balanced',
    n_jobs=-1)
model_eval(lgbm, X_valid_vec, y_valid)

Model Fit time: 33.14475703239441
Model pred time: 0.024654865264892578
accuracy=0.6747572815533981
f1=0.6730435839003481
auc_ovo=0.8785703059891241
auc_ovr=0.8790492516654755
              precision    recall  f1-score   support

         대화형      0.824     0.808     0.816        52
         사실형      0.566     0.588     0.577        51
         예측형      0.800     0.769     0.784        52
         추론형      0.519     0.529     0.524        51

    accuracy                          0.675       206
   macro avg      0.677     0.674     0.675       206
weighted avg      0.679     0.675     0.676       206



(array(['추론형', '사실형', '추론형', '대화형', '대화형', '예측형', '대화형', '사실형', '추론형',
        '대화형', '추론형', '추론형', '예측형', '사실형', '예측형', '사실형', '추론형', '예측형',
        '대화형', '사실형', '예측형', '사실형', '대화형', '예측형', '대화형', '예측형', '사실형',
        '대화형', '예측형', '예측형', '예측형', '예측형', '추론형', '사실형', '사실형', '사실형',
        '사실형', '예측형', '추론형', '예측형', '예측형', '추론형', '사실형', '사실형', '추론형',
        '예측형', '사실형', '예측형', '추론형', '대화형', '예측형', '추론형', '사실형', '대화형',
        '대화형', '사실형', '예측형', '대화형', '대화형', '대화형', '사실형', '추론형', '대화형',
        '사실형', '예측형', '사실형', '추론형', '대화형', '사실형', '예측형', '예측형', '추론형',
        '예측형', '대화형', '추론형', '예측형', '대화형', '예측형', '추론형', '대화형', '예측형',
        '사실형', '예측형', '사실형', '사실형', '예측형', '사실형', '대화형', '사실형', '사실형',
        '예측형', '대화형', '추론형', '대화형', '대화형', '추론형', '예측형', '추론형', '대화형',
        '추론형', '예측형', '대화형', '추론형', '사실형', '사실형', '추론형', '대화형', '추론형',
        '사실형', '대화형', '추론형', '추론형', '대화형', '사실형', '추론형', '추론형', '예측형',
        '예측형', '예측형', '추론형', '예측형', '사실형', '추론형', '대화형', '추론형', '대화형',
      

## OvO ROC curve
**[github example](https://github.com/vinyluis/Articles/blob/main/ROC%20Curve%20and%20ROC%20AUC/ROC%20Curve%20-%20Multiclass.ipynb)**

In [46]:
pair_list = list(combinations(np.unique(y_valid), 2))
print(pair_list)


[('대화형', '사실형'), ('대화형', '예측형'), ('대화형', '추론형'), ('사실형', '예측형'), ('사실형', '추론형'), ('예측형', '추론형')]


In [63]:
lb = LabelBinarizer()

pred_lb = lb.fit_transform(y_valid)
pred_prob = naive.predict_proba(X_valid_vec)

auc = roc_auc_score(pred_lb, pred_prob, multi_class='ovo')



In [64]:
auc

0.7001334085971874