In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import sys
sys.path.append('C:/Users/User/Desktop/r8')
from data_preprocessing import *

In [2]:
train_df = pd.read_csv('C:/Users/User/Desktop/r8/r8_dataset/r8-train-stemmed.csv')
valid_df = pd.read_csv('C:/Users/User/Desktop/r8/r8_dataset/r8-dev-stemmed.csv')
test_df = pd.read_csv('C:/Users/User/Desktop/r8/r8_dataset/r8-test-stemmed.csv')
train_df.head()

Unnamed: 0,text,edge,intent
0,champion product approv stock split champion p...,champion product approv stock split champion p...,earn
1,comput termin system cpml complet sale comput ...,comput termin system cpml complet sale comput ...,acq
2,cobanco inc cbco year net shr ct dlr net asset...,cobanco inc cbco year net shr ct dlr net asset...,earn
3,intern inc qtr jan oper shr loss two ct profit...,intern inc qtr jan oper shr loss two ct profit...,earn
4,brown forman inc bfd qtr net shr dlr ct net ml...,brown forman inc bfd qtr net shr dlr ct net ml...,earn


text와 edge는 동일한 것으로 판단되므로 text만 활용하여 모델 학습

In [3]:
# train_df와 valid_df 결합
train_val_df = pd.concat([train_df, valid_df])

# 전처리

In [4]:
train_val_df['text']=train_val_df['text'].apply(clean_text)
test_df['text']=test_df['text'].apply(clean_text)

In [5]:
x_train=list(train_val_df['text'])
y_train=list(train_val_df['intent'])
x_test=list(test_df['text'])
y_test=list(test_df['intent'])

In [6]:
to_txt=x_train+x_test
y=list(y_train)+list(y_test)

In [7]:
#불용어 불러오기
with open('C:/Users/user/Desktop/english.txt', 'r', encoding='utf-8') as file:
    stopwords = [line.strip() for line in file]

In [8]:
#문서 분류에 도움되는 10,000개 단어 선별
vect = CountVectorizer(stop_words=stopwords)
X_dtm = vect.fit_transform(to_txt)
X_dtm = X_dtm.toarray()
X_new = SelectKBest(chi2, k=10000).fit(X_dtm, y)
TorF = X_new.get_support()
TorF
word_view=np.array(vect.get_feature_names_out())
feature_lst10000=word_view[TorF]



In [9]:
#훈련데이터에서 선별한 10000개 단어 이외 단어는 <ukn>으로 변경
train_transe=[]
for i in range(len(x_train)):
    a=x_train[i].split()
    for j in range(len(a)):
        if a[j] not in feature_lst10000:
            a[j] = '<ukn>'
    train_transe.append(' '.join(a))

In [10]:
#테스트데이터에서 선별한 10000개 단어 이외 단어는 <ukn>으로 변경
test_transe=[]
for i in range(len(x_test)):
    a=x_test[i].split()
    for j in range(len(a)):
        if a[j] not in feature_lst10000:
            a[j] = '<ukn>'
    test_transe.append(' '.join(a))

In [11]:
# DTM 확인
dtmvector = CountVectorizer()
x_train_dtm = dtmvector.fit_transform(train_transe)
# print(x_train.shape)

# 테스트데이터
x_test_dtm = dtmvector.transform(test_transe) #테스트 데이터를 DTM으로 변환

In [12]:
# TF-IDF Matrix확인
tfidf_transformer = TfidfTransformer()
tfidfv = tfidf_transformer.fit_transform(x_train_dtm)
print(tfidfv.shape)

# 테스트데이터
tfidfv_test = tfidf_transformer.transform(x_test_dtm) #DTM을 TF-IDF 행렬로 변환

(5484, 8881)


# 전통적인 머신러닝 모델로 학습

In [13]:
#나이브 베이즈 모델
param_grid = {'alpha': [0.01, 0.1, 0.5, 1, 10, 100],
              'fit_prior': [True, False]}

grid = GridSearchCV(MultinomialNB(), param_grid, refit=True, verbose=3)

grid.fit(tfidfv, y_train)
print(grid.best_params_)

predicted = grid.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))
print("F1 스코어:", f1_score(y_test, predicted, average='macro'))
print("F1 스코어:", f1_score(y_test, predicted, average='weighted'))

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END ........alpha=0.01, fit_prior=True;, score=0.927 total time=   0.0s
[CV 2/5] END ........alpha=0.01, fit_prior=True;, score=0.918 total time=   0.0s
[CV 3/5] END ........alpha=0.01, fit_prior=True;, score=0.937 total time=   0.0s
[CV 4/5] END ........alpha=0.01, fit_prior=True;, score=0.931 total time=   0.0s
[CV 5/5] END ........alpha=0.01, fit_prior=True;, score=0.919 total time=   0.0s
[CV 1/5] END .......alpha=0.01, fit_prior=False;, score=0.902 total time=   0.0s
[CV 2/5] END .......alpha=0.01, fit_prior=False;, score=0.885 total time=   0.0s
[CV 3/5] END .......alpha=0.01, fit_prior=False;, score=0.900 total time=   0.0s
[CV 4/5] END .......alpha=0.01, fit_prior=False;, score=0.902 total time=   0.0s
[CV 5/5] END .......alpha=0.01, fit_prior=False;, score=0.877 total time=   0.0s
[CV 1/5] END .........alpha=0.1, fit_prior=True;, score=0.935 total time=   0.0s
[CV 2/5] END .........alpha=0.1, fit_prior=True;

In [14]:
#SVM 모델
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'penalty': ['l1', 'l2']}

grid = GridSearchCV(LinearSVC(max_iter=500, dual=False), param_grid, refit=True, verbose=3)

grid.fit(tfidfv, y_train)
print(grid.best_params_)

predicted = grid.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))
print("F1 스코어:", f1_score(y_test, predicted, average='macro'))
print("F1 스코어:", f1_score(y_test, predicted, average='weighted'))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END .................C=0.1, penalty=l1;, score=0.935 total time=   0.0s
[CV 2/5] END .................C=0.1, penalty=l1;, score=0.937 total time=   0.0s
[CV 3/5] END .................C=0.1, penalty=l1;, score=0.922 total time=   0.0s
[CV 4/5] END .................C=0.1, penalty=l1;, score=0.933 total time=   0.0s
[CV 5/5] END .................C=0.1, penalty=l1;, score=0.924 total time=   0.0s
[CV 1/5] END .................C=0.1, penalty=l2;, score=0.955 total time=   0.0s
[CV 2/5] END .................C=0.1, penalty=l2;, score=0.957 total time=   0.0s
[CV 3/5] END .................C=0.1, penalty=l2;, score=0.949 total time=   0.0s
[CV 4/5] END .................C=0.1, penalty=l2;, score=0.950 total time=   0.0s
[CV 5/5] END .................C=0.1, penalty=l2;, score=0.939 total time=   0.0s
[CV 1/5] END ...................C=1, penalty=l1;, score=0.965 total time=   0.1s
[CV 2/5] END ...................C=1, penalty=l1;



[CV 1/5] END ..................C=10, penalty=l1;, score=0.965 total time=   0.2s




[CV 2/5] END ..................C=10, penalty=l1;, score=0.966 total time=   0.2s




[CV 3/5] END ..................C=10, penalty=l1;, score=0.954 total time=   0.2s




[CV 4/5] END ..................C=10, penalty=l1;, score=0.967 total time=   0.2s




[CV 5/5] END ..................C=10, penalty=l1;, score=0.963 total time=   0.2s
[CV 1/5] END ..................C=10, penalty=l2;, score=0.965 total time=   0.1s
[CV 2/5] END ..................C=10, penalty=l2;, score=0.973 total time=   0.1s
[CV 3/5] END ..................C=10, penalty=l2;, score=0.964 total time=   0.1s
[CV 4/5] END ..................C=10, penalty=l2;, score=0.968 total time=   0.0s
[CV 5/5] END ..................C=10, penalty=l2;, score=0.964 total time=   0.0s




[CV 1/5] END .................C=100, penalty=l1;, score=0.963 total time=   0.3s




[CV 2/5] END .................C=100, penalty=l1;, score=0.968 total time=   0.3s




[CV 3/5] END .................C=100, penalty=l1;, score=0.955 total time=   0.3s




[CV 4/5] END .................C=100, penalty=l1;, score=0.973 total time=   0.4s




[CV 5/5] END .................C=100, penalty=l1;, score=0.959 total time=   0.3s
[CV 1/5] END .................C=100, penalty=l2;, score=0.960 total time=   0.3s
[CV 2/5] END .................C=100, penalty=l2;, score=0.972 total time=   0.4s
[CV 3/5] END .................C=100, penalty=l2;, score=0.966 total time=   0.4s
[CV 4/5] END .................C=100, penalty=l2;, score=0.973 total time=   0.4s
[CV 5/5] END .................C=100, penalty=l2;, score=0.962 total time=   0.4s




[CV 1/5] END ................C=1000, penalty=l1;, score=0.965 total time=   0.6s




[CV 2/5] END ................C=1000, penalty=l1;, score=0.958 total time=   0.6s




[CV 3/5] END ................C=1000, penalty=l1;, score=0.956 total time=   0.6s




[CV 4/5] END ................C=1000, penalty=l1;, score=0.971 total time=   0.6s




[CV 5/5] END ................C=1000, penalty=l1;, score=0.961 total time=   0.6s




[CV 1/5] END ................C=1000, penalty=l2;, score=0.955 total time=   0.6s
[CV 2/5] END ................C=1000, penalty=l2;, score=0.973 total time=   0.4s




[CV 3/5] END ................C=1000, penalty=l2;, score=0.956 total time=   0.5s
[CV 4/5] END ................C=1000, penalty=l2;, score=0.974 total time=   0.3s
[CV 5/5] END ................C=1000, penalty=l2;, score=0.957 total time=   0.5s
{'C': 1, 'penalty': 'l2'}
정확도: 0.9725902238465053
F1 스코어: 0.9390743752929659
F1 스코어: 0.9722473580101436




In [15]:
#logistic regression 모델
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'penalty': ['l1', 'l2']}

grid = GridSearchCV(LogisticRegression(), param_grid, refit=True, verbose=3)

grid.fit(tfidfv, y_train)
print(grid.best_params_)

predicted = grid.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))
print("F1 스코어:", f1_score(y_test, predicted, average='macro'))
print("F1 스코어:", f1_score(y_test, predicted, average='weighted'))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 3/5] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 4/5] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 5/5] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 1/5] END .................C=0.1, penalty=l2;, score=0.837 total time=   0.1s
[CV 2/5] END .................C=0.1, penalty=l2;, score=0.838 total time=   0.1s
[CV 3/5] END .................C=0.1, penalty=l2;, score=0.822 total time=   0.1s
[CV 4/5] END .................C=0.1, penalty=l2;, score=0.821 total time=   0.1s
[CV 5/5] END .................C=0.1, penalty=l2;, score=0.818 total time=   0.1s
[CV 1/5] END .....................C=1, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END .....................C=1, penalty=l

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END ..................C=10, penalty=l2;, score=0.964 total time=   0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END ..................C=10, penalty=l2;, score=0.966 total time=   0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END ..................C=10, penalty=l2;, score=0.955 total time=   0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END ..................C=10, penalty=l2;, score=0.965 total time=   0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END ..................C=10, penalty=l2;, score=0.955 total time=   0.4s
[CV 1/5] END ...................C=100, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END ...................C=100, penalty=l1;, score=nan total time=   0.0s
[CV 3/5] END ...................C=100, penalty=l1;, score=nan total time=   0.0s
[CV 4/5] END ...................C=100, penalty=l1;, score=nan total time=   0.0s
[CV 5/5] END ...................C=100, penalty=l1;, score=nan total time=   0.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END .................C=100, penalty=l2;, score=0.965 total time=   0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END .................C=100, penalty=l2;, score=0.968 total time=   0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END .................C=100, penalty=l2;, score=0.957 total time=   0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END .................C=100, penalty=l2;, score=0.965 total time=   0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END .................C=100, penalty=l2;, score=0.959 total time=   0.4s
[CV 1/5] END ..................C=1000, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END ..................C=1000, penalty=l1;, score=nan total time=   0.0s
[CV 3/5] END ..................C=1000, penalty=l1;, score=nan total time=   0.0s
[CV 4/5] END ..................C=1000, penalty=l1;, score=nan total time=   0.0s
[CV 5/5] END ..................C=1000, penalty=l1;, score=nan total time=   0.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END ................C=1000, penalty=l2;, score=0.964 total time=   0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END ................C=1000, penalty=l2;, score=0.969 total time=   0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END ................C=1000, penalty=l2;, score=0.959 total time=   0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END ................C=1000, penalty=l2;, score=0.968 total time=   0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\envs\class\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\anacond

[CV 5/5] END ................C=1000, penalty=l2;, score=0.964 total time=   0.4s
{'C': 1000, 'penalty': 'l2'}
정확도: 0.9671082686158063
F1 스코어: 0.9241564497908625
F1 스코어: 0.9666433628703064


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
