In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import matplotlib.pyplot as plt
import urllib.request
from gensim.models.word2vec import Word2Vec
from konlpy.tag import Okt

In [2]:
train_df = pd.read_csv('../nlp_data/nlp_train.tsv', sep='\t')
valid_df = pd.read_csv('../nlp_data/nlp_valid.tsv', sep='\t')
test_df = pd.read_csv('../nlp_data/nlp_test.tsv', sep='\t')

In [3]:
# 한글만 남기기
train_df['TEXT'] = train_df['TEXT'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
valid_df['TEXT'] = valid_df['TEXT'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
test_df['TEXT'] = test_df['TEXT'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")

  train_df['TEXT'] = train_df['TEXT'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
  valid_df['TEXT'] = valid_df['TEXT'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
  test_df['TEXT'] = test_df['TEXT'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")


In [4]:
train_df  = pd.concat([train_df, valid_df], axis=0)

In [5]:
train_df

Unnamed: 0,TEXT,label
0,및 특성제지산업은 원자재 투입에서부터 초지 및 가공 공정까지 일관 자동화가 추진...,2
1,우리회사는 단일 식품제조 사업부문을 영위하고 있으며 각 품목별 특징 및 해외법인...,2
2,연결실체는 크게 자동차 고무 부품 창고무토목건축자재 면진제진시스템 등을 제공하...,2
3,년 월일자로 조직이 개편됨 전지사업부문 신설 에 따라 공시대상 사업부문의 구성이...,4
4,당사의 제기 사업보고서는 연결기준으로 작성되었습니다 당사는 한국채택국제회계기준 ...,2
...,...,...
1324,회사는 수익을 창출하는 재화와 용역의 성격 시장 및 판매방법의 특징 사업의 ...,3
1325,방송법에서 방송이라 함은 방송프로그램을 기획 편성 또는 제작하여 이를 공중 개별계...,3
1326,사업의 분류 및 성장성 지배회사의 내용제련 부문 아연 산업은 철강 자동차 ...,2
1327,당사는 한국산업분류기준표에 의한 분류와 매출액의 비중 기업회계기준 제호 영업부...,2


In [6]:
# 형태소 분석기 OKT를 사용한 토큰화 작업 (다소 시간 소요)
from tqdm.auto import tqdm
okt = Okt()

tokenized_data = []
for sentence in tqdm(train_df['TEXT']):
    tokenized_sentence = okt.morphs(sentence, stem=True) # 토큰화
    tokenized_data.append(tokenized_sentence)

  0%|          | 0/5316 [00:00<?, ?it/s]

In [7]:
import numpy as np
np.shape(tokenized_data)

  result = asarray(a).shape


(5316,)

In [8]:
from gensim.models import Word2Vec

model = Word2Vec(sentences = tokenized_data, vector_size = 256, epochs= 100, window = 5, min_count = 5, workers = 4, sg = 1)

In [9]:
def get_document_vectors_morphs(document_list):
    document_embedding_list = []
    # 각 문서에 대해서
    for idx, line in enumerate(document_list):
        # 모든 단어만 벡터화
        okt_document = okt.morphs(line)
        doc2vec = None
        count = 0
        if len(okt_document) == 0:
            print("제거됨: ", idx)
        for word in okt_document:
            if word in model.wv.index_to_key:
                count += 1
                # 해당 문서에 있는 모든 단어들의 벡터값을 더한다.
                if doc2vec is None:
                    doc2vec = model.wv[word]
                else:
                    doc2vec = doc2vec + model.wv[word]
        if doc2vec is not None:
            # 단어 벡터를 모두 더한 벡터의 값을 문서 길이로 나눠준다.
            doc2vec = doc2vec / count
            if doc2vec is None:
                print("제거됨: ", idx)
            elif len(doc2vec) ==0:
                print("제거됨: ", idx)
            document_embedding_list.append(doc2vec)
        else:
            print("제거됨: ", idx)
    # 각 문서에 대한 문서 벡터 리스트를 리턴
    return document_embedding_list

In [10]:
import numpy as np
train_text = get_document_vectors_morphs(train_df['TEXT'])
print('(문서수, 벡터수) :',np.array(train_text).shape)

test_text = get_document_vectors_morphs(test_df['TEXT'])
print('(문서수, 벡터수) :',np.array(test_text).shape)

제거됨:  344
제거됨:  344
제거됨:  509
제거됨:  509
제거됨:  1638
제거됨:  1638
제거됨:  2257
제거됨:  2257
제거됨:  2996
제거됨:  2996
제거됨:  3154
제거됨:  3154
제거됨:  3511
제거됨:  3511
제거됨:  3833
제거됨:  3833
제거됨:  4044
제거됨:  4044
제거됨:  4065
제거됨:  4065
제거됨:  4563
제거됨:  4563
제거됨:  4786
제거됨:  4786
(문서수, 벡터수) : (5304, 256)
제거됨:  376
제거됨:  376
제거됨:  1172
제거됨:  1172
(문서수, 벡터수) : (1328, 256)


In [None]:
344, 509, 1638, 2257, 2996, 3154, 3511, 3833, 4044, 4065, 4563, 4786
376, 1172

In [11]:
y_train = train_df['label'].reset_index()['label']
drop_train = [344, 509, 1638, 2257, 2996, 3154, 3511, 3833, 4044, 4065, 4563, 4786]
for dd in drop_train:
    y_train.drop(index=dd, inplace=True)

In [12]:
y_test = test_df['label'].reset_index()['label']
drop_test= [376, 1172]
for dd in drop_test:
    y_test.drop(index=dd, inplace=True)

In [13]:
# y_train = train_df['label']
# y_valid = valid_df['label']
# y_train = np.concatenate([y_train, y_valid], axis=0)
# y_test = test_df['label']
# np.shape(y_train)

In [14]:
len(y_train), len(y_test)

(5304, 1328)

In [15]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, plot_confusion_matrix, classification_report

def set_seed(seed:int):
    import random
    import torch
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False  

def model_evaluation(y_test, pred):
    # confusion = confusion_matrix(y_test,pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred, average = 'macro')
    recall = recall_score(y_test, pred, average = 'macro')
    f1 = f1_score(y_test, pred, average = 'macro')
    # print('오차행렬\n', confusion)
    #f1 score print 추가
    print('정확도: {0:.4f}\n 정밀도: {1:.4f}\n 재현율: {2:.4}\n F1:{3:.4f}'.format(accuracy, precision, 
                                                                                     recall, f1 ))
    return accuracy, precision, recall, f1

seed= 42
set_seed(seed)


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from dataclasses import dataclass

def classifiers(classifier):
    #! classifier settings
    if classifier == 'RF':
        classifier = RandomForestClassifier(random_state=seed)
        model = Pipeline([("classifier", classifier),])

        param_grid = { 
            'classifier__n_estimators': [100, 200],
            'classifier__max_features': ['sqrt', 'log2'],
            'classifier__max_depth' : [5,10,20,50,100],
            'classifier__criterion' :['gini', 'entropy'],
            'classifier__ccp_alpha': [0.01, 0.001, 0],
            'classifier__class_weight' : ["balanced", None],
        }

    elif classifier == 'LR':
        classifier = LogisticRegression(random_state=seed, multi_class='multinomial')
        model = Pipeline([("classifier", classifier),])

        param_grid = {
            'classifier__penalty' : ['l1','l2'], 
            'classifier__C'       : [0.01, 0.1, 1, 10],
            'classifier__solver'  : ['newton-cg', 'lbfgs', 'liblinear'],
        }

    elif classifier =="XGB":
        if True:
            classifier = XGBClassifier(random_state= seed, objective="multiclass", tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
        else:
            classifier = XGBClassifier(random_state= seed, objective="multiclass")
        model = Pipeline([("classifier", classifier),])
        param_grid = {
            'classifier__n_estimators' : [100,200,300,500],
            'classifier__max_depth' : [6, 8, 10, 20, 30],
            'classifier__gamma' : [0.5, 1, 1.5],
            'classifier__subsample'  : [0.6,0.8,1.0], 
            'classifier__colsample_bytree' : [0.8,0.9],
        }

    elif classifier =="LGB":
        # if opt.gpu:
        #     device = "gpu"
        # else:
        #     device ="cpu"
        classifier = LGBMClassifier(random_state= seed, objective="multiclass") #, device=device)
        model = Pipeline([("classifier", classifier)])

        param_grid = {
            'classifier__n_estimators' : [100,200,300,500],
            # 'classifier__max_depth' : [6, 8, 10, 20, 30,-1],  
            # 'classifier__num_leaves' : [80,100,150,200],  
            # 'classifier__min_data_in_leaf' : [25,100,200],  
        }

    else:
        raise NameError('Check your Classifier')

    grid = GridSearchCV(estimator = model,
                param_grid = param_grid,
                scoring="accuracy",
                cv= 3, # !
                n_jobs= 4, #!
                verbose= 1 #! 
                )
                
    grid_result = grid.fit(train_text, y_train) 
    y_pred = grid.predict(test_text)
    y_probas = grid.predict_proba(test_text)
    labels = ["D","C","B","B+","A","A+"]

    accuracy, precision, recall, f1 = model_evaluation(y_test, y_pred)
    print(accuracy)
    print(precision)
    print(recall)
    print(f1)
    return accuracy, precision, recall, f1

In [90]:
classifier = 'RF'
accuracy, precision, recall, f1 = classifiers(classifier)

Fitting 3 folds for each of 240 candidates, totalling 720 fits
정확도: 0.6318
 정밀도: 0.6132
 재현율: 0.481
 F1:0.5265
0.6317771084337349
0.6131917275432258
0.48099075703172606
0.5265052405152628


In [92]:
classifier = 'LR'
accuracy, precision, recall, f1 = classifiers(classifier)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

정확도: 0.5369
 정밀도: 0.3444
 재현율: 0.2901
 F1:0.3044
0.5368975903614458
0.34441466424910666
0.2901491141105344
0.3043964119703271


  _warn_prf(average, modifier, msg_start, len(result))


In [93]:
classifier = 'XGB'
accuracy, precision, recall, f1 = classifiers(classifier)

Fitting 3 folds for each of 360 candidates, totalling 1080 fits




정확도: 0.6227
 정밀도: 0.5760
 재현율: 0.4581
 F1:0.4990
0.6227409638554217
0.576023214661186
0.4580920266575137
0.49897446870864187


In [16]:
classifier = 'LGB'
accuracy, precision, recall, f1 = classifiers(classifier)

Fitting 3 folds for each of 288 candidates, totalling 864 fits
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65280
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65280
[LightGBM] [Info] Number of data points in the train set: 3536, number of used features: 256
[LightGBM] [Info] Start training from score -3.922256
[LightGBM] [Info] Start training from score -1.050307
[LightGBM] [Info] Start training from score -0.811284
[LightGBM] [Info] Start training from score -2.063729
[LightGBM] [Info] Start training from score -2.966745
[LightGBM] [Info] Start training from score -4.874915
[LightGBM] [Info] Number of data points in the train set: 3536, number of used features: 256
[LightGBM] [Info] Start training from score -3.922256
[LightGBM] [Info] Start training from score -1.050307
[LightGBM] [Info] Start training from score -0.811284
[LightGBM] [Info] Start training from score -2.063729
[LightGBM] [Info] Start t