# 결과

TF-IDF는 단어의 빈도와 중요도를 기반으로 문서를 표현하므로 전통적인 머신러닝 모델(XGBoost, SVM, 로지스틱 회귀 등)과 잘 작동한다.  
Word2Vec은 단어의 의미적·문맥적 관계를 저차원 벡터로 표현하므로, 주로 딥러닝 모델(RNN, Dense NN 등)과 결합될 때 가장 강력한 성능을 발휘한다.

머신러닝 1등 성능 모델:  
LogisticRegression, voca_size: 10000  
XGBoost, voca_size: 5000, INF   
딥러닝 1등 성능 모델: RNN 

# 데이터셋 만들기

In [1]:
!pip install gensim



In [2]:
from tensorflow.keras.datasets import reuters
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# 데이터 준비
## index -> text
## DTM, TF-IDF 학습 데이터 준비
## W2V 학습 데이터 준비

In [73]:
# Load the Reuters newswire data as integer-encoded token sequences with an
# 80/20 train/test split. num_words=10000 caps the vocabulary; words outside it
# show up as the out-of-vocabulary token (rendered as "<unk>" after decoding).
# The trailing note lists the vocabulary sizes tried across runs; "inf"
# presumably means no cap (num_words=None) -- TODO confirm.
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=10000, test_split=0.2) # tried: 10000 -> 5000 -> inf

In [74]:
# word -> integer index mapping for the Reuters vocabulary (inverted below to decode sequences).
word_index = reuters.get_word_index(path="reuters_word_index.json")

In [75]:
# Build the id -> word lookup used to decode sequences back into text.
# Ids 0-2 are taken by the special tokens registered below, so every
# vocabulary index is shifted up by 3.
index_to_word = {}
for word, rank in word_index.items():
    index_to_word[rank + 3] = word

# Register the special tokens under ids 0, 1 and 2 respectively.
index_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))

In [76]:
# Decode every training sequence of word ids into one space-separated string.
# NOTE: this cell is not idempotent -- it expects x_train to still hold id
# sequences; re-running it on already decoded text fails the dictionary lookup.
decoded = [
    ' '.join(index_to_word[token_id] for token_id in sequence)
    for sequence in x_train
]

x_train = decoded
print(len(x_train))

8982


In [77]:
# Decode every test sequence of word ids into one space-separated string.
# NOTE: this cell is not idempotent -- it expects x_test to still hold id
# sequences; re-running it on already decoded text fails the dictionary lookup.
decoded_test = [
    ' '.join(index_to_word[token_id] for token_id in sequence)
    for sequence in x_test
]

x_test = decoded_test
print(len(x_test))

2246


In [78]:
# Vectorization, approach 1: document-term matrix (DTM) followed by TF-IDF weighting
dtmvector = CountVectorizer()

tfidf_transformer = TfidfTransformer()

# Vocabulary is learned from the training texts only; the test texts are
# mapped onto that same vocabulary.
x_train_dtm = dtmvector.fit_transform(x_train)
x_test_dtm= dtmvector.transform(x_test)

# Likewise the TF-IDF weights are fitted on the training DTM and reused for the test DTM.
x_train_tfidf = tfidf_transformer.fit_transform(x_train_dtm)
x_test_tfidf = tfidf_transformer.transform(x_test_dtm)

In [79]:
# Display one decoded training document as a sanity check.
x_train[3]

"<sos> the farmers home administration the u s agriculture department's farm lending arm could lose about seven billion dlrs in outstanding principal on its severely <unk> borrowers or about one fourth of its farm loan portfolio the general accounting office gao said in remarks prepared for delivery to the senate agriculture committee brian crowley senior associate director of gao also said that a preliminary analysis of proposed changes in <unk> financial eligibility standards indicated as many as one half of <unk> borrowers who received new loans from the agency in 1986 would be <unk> under the proposed system the agency has proposed evaluating <unk> credit using a variety of financial ratios instead of relying solely on <unk> ability senate agriculture committee chairman patrick leahy d vt <unk> the proposed eligibility changes telling <unk> administrator <unk> clark at a hearing that they would mark a dramatic shift in the agency's purpose away from being farmers' lender of last re

In [80]:
# Vectorization, approach 2: Word2Vec
from gensim.models import Word2Vec

# Tokenize each document by splitting on whitespace. No explicit tokenization
# was done before building the DTM because CountVectorizer tokenizes internally.
# NOTE(review): the original comment says CountVectorizer splits on whitespace;
# its default tokenizer is believed to be regex-based, so its tokens may differ
# from str.split() -- confirm against the scikit-learn documentation.
x_train_tokenized = [sentence.split() for sentence in x_train]
x_test_tokenized = [sentence.split() for sentence in x_test]

# Try a larger or smaller vector_size; the original note suggests 512 is the
# most commonly used value (unverified claim).
# The embedding is trained on the training documents only.
model = Word2Vec(sentences = x_train_tokenized, vector_size = 256, window = 5, min_count = 5, workers = 4, sg = 0)
print("모델 학습 완료!")

모델 학습 완료!


In [81]:
# Sanity-check the trained embedding via the nearest neighbours of 'man'.
# Original author's note: it seems to have worked, more or less.
# NOTE(review): the neighbours in the recorded output are not obviously related
# to 'man'; embedding quality on this corpus may be limited.
model_result = model.wv.most_similar('man')
print(model_result)

[('acting', 0.8488190770149231), ('expert', 0.8255599141120911), ('glenn', 0.8099073171615601), ('erbynn', 0.8055760860443115), ('moore', 0.8050115704536438), ('peking', 0.8043171167373657), ('missouri', 0.801719605922699), ('row', 0.8000426292419434), ('communication', 0.7997837066650391), ('waters', 0.7995551228523254)]


In [82]:
# Trained Word2Vec model (alias passed to the sentence vectorization calls below)
w2v_model = model

# Turn one tokenized sentence into a fixed-size matrix of word vectors.
def vectorize_sentence(sentence, model, max_len):
    """Return a ``(max_len, model.vector_size)`` float32 matrix for *sentence*.

    Row ``i`` holds the embedding of the ``i``-th token. Tokens that are not
    in ``model.wv`` are left as all-zero rows, sentences shorter than
    ``max_len`` are zero-padded at the end, and longer ones are truncated.

    Args:
        sentence: Iterable of string tokens.
        model: Object exposing ``wv`` (supporting ``in`` and ``[]`` by token)
            and ``vector_size``, e.g. a trained gensim ``Word2Vec`` model.
        max_len: Number of rows in the result; must be non-negative.

    Returns:
        ``numpy.ndarray`` of shape ``(max_len, model.vector_size)`` and dtype
        ``float32``.

    Raises:
        ValueError: If ``max_len`` is negative.
    """
    if max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")

    keyed_vectors = model.wv
    # Preallocate in float32: mixing float64 zero rows with the model's vectors
    # would upcast the whole result and double its memory footprint. It also
    # keeps the shape 2-D when max_len is 0.
    matrix = np.zeros((max_len, model.vector_size), dtype=np.float32)

    # zip() stops after max_len tokens, so tokens beyond the cut-off are never
    # looked up; rows for unknown or missing tokens simply stay zero.
    for row, word in zip(range(max_len), sentence):
        if word in keyed_vectors:
            matrix[row] = keyed_vectors[word]
    return matrix


# Choose the maximum sentence length with care: documents are padded or
# truncated to 100 tokens here, which is the sequence length the
# (100, 256) model inputs further down rely on.
x_train_w2v, x_test_w2v = (
    np.array([vectorize_sentence(tokens, w2v_model, max_len=100) for tokens in corpus])
    for corpus in (x_train_tokenized, x_test_tokenized)
)




In [83]:
# Shape check for the embedded training set.
x_train_w2v.shape

(8982, 100, 256)

In [84]:
# Shape check for the embedded test set.
x_test_w2v.shape

(2246, 100, 256)

# 모델 정의 및 실험
머신러닝 실험은 저번 코드를 참고해주세요~

## 다양한 머신러닝

In [53]:
# Random forest trained on the TF-IDF features.
# NOTE(review): this cell's execution counter (In [53]) is lower than that of
# the data-preparation cells (In [73] onward), so the recorded output may come
# from an earlier data configuration -- confirm by re-running top to bottom.
from sklearn.ensemble import RandomForestClassifier

# random_state is fixed for repeatability; n_estimators=5 is a very small
# forest (presumably chosen for speed -- TODO confirm).
forest = RandomForestClassifier(n_estimators=5, random_state=0)
forest.fit(x_train_tfidf, y_train)

In [54]:
# Predict labels for the TF-IDF test matrix with the random forest.
y_pred = forest.predict(x_test_tfidf)

# Evaluation metrics: accuracy and weighted-average F1.
acc, f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='weighted'),
)

print(f"✅ Accuracy : {acc:.4f}", f"✅ F1-score : {f1:.4f}", sep="\n")

✅ Accuracy : 0.6545
✅ F1-score : 0.6226


In [55]:
# XGBoost trained on the TF-IDF features

from xgboost import XGBClassifier

# Fit the XGBoost model
xgb_model = XGBClassifier(n_estimators=100, max_depth=5, eval_metric='mlogloss')
xgb_model.fit(x_train_tfidf, y_train)

In [56]:
# Predict labels for the TF-IDF test matrix with XGBoost.
y_pred = xgb_model.predict(x_test_tfidf)

# Evaluation metrics: accuracy and weighted-average F1.
acc, f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='weighted'),
)

print(f"✅ Accuracy : {acc:.4f}", f"✅ F1-score : {f1:.4f}", sep="\n")

✅ Accuracy : 0.7939
✅ F1-score : 0.7877


In [18]:
# The classical ML models only accept 2-D input, so the token-level embeddings
# are collapsed to one vector per document by averaging over the token axis.
# NOTE(review): the mean runs over all 100 positions, including the zero rows
# added as padding and for unknown words -- confirm this dilution is intended.

# Word2Vec embedding sequences: (8982, 100, 256)
x_w2v_seq_train, x_w2v_seq_test = x_train_w2v, x_test_w2v

# Average pooling -> (8982, 256)
x_w2v_avg_train, x_w2v_avg_test = (
    np.mean(sequence_batch, axis=1)
    for sequence_batch in (x_w2v_seq_train, x_w2v_seq_test)
)
print(x_w2v_avg_train.shape)  # (8982, 256)

(8982, 256)


In [19]:
# Train an XGBoost model on the averaged Word2Vec features
from xgboost import XGBClassifier


# Fit the XGBoost model.
# NOTE: this rebinds xgb_model, replacing the model fitted on TF-IDF features.
xgb_model = XGBClassifier(n_estimators=100, max_depth=5, eval_metric='mlogloss')
xgb_model.fit(x_w2v_avg_train, y_train)

In [20]:
# Predict labels for the averaged Word2Vec test features with XGBoost.
y_pred = xgb_model.predict(x_w2v_avg_test)

# Evaluation metrics: accuracy and weighted-average F1.
acc, f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='weighted'),
)

print(f"✅ Accuracy : {acc:.4f}", f"✅ F1-score : {f1:.4f}", sep="\n")

✅ Accuracy : 0.7297
✅ F1-score : 0.7111


# Dense NN 딥러닝 모델

In [57]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Input, LSTM


# Fully connected baseline: flatten the (seq_len, embedding_dim) embedding
# matrix into one long vector and classify it with two hidden layers.
# The input shape is declared with an explicit Input layer rather than an
# `input_shape=` argument on the first layer, which newer Keras warns about.
dense_model = Sequential([
    Input(shape=(100, 256)),          # (seq_len, embedding_dim)
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(46, activation='softmax'),  # one output unit per class; keep at 46
])

# The sparse loss is used because the labels are passed as integer class ids.
dense_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
dense_model.summary()

  super().__init__(**kwargs)


In [58]:
# Train the dense model, holding out 20% of the training data for validation.
# NOTE(review): in the recorded log val_loss is lowest at epoch 3 while training
# accuracy keeps rising -- the model appears to overfit; consider early stopping.
dense_model.fit(x_train_w2v, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 243ms/step - accuracy: 0.5305 - loss: 2.1829 - val_accuracy: 0.6683 - val_loss: 1.4757
Epoch 2/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 329ms/step - accuracy: 0.6742 - loss: 1.3640 - val_accuracy: 0.6900 - val_loss: 1.3623
Epoch 3/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 253ms/step - accuracy: 0.7340 - loss: 1.0795 - val_accuracy: 0.6889 - val_loss: 1.3461
Epoch 4/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 232ms/step - accuracy: 0.7770 - loss: 0.8980 - val_accuracy: 0.6950 - val_loss: 1.4309
Epoch 5/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 238ms/step - accuracy: 0.8002 - loss: 0.7793 - val_accuracy: 0.6895 - val_loss: 1.4497
Epoch 6/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 228ms/step - accuracy: 0.8356 - loss: 0.6556 - val_accuracy: 0.6923 - val_loss: 1.4302
Epoch 7/1

<keras.src.callbacks.history.History at 0x1eb0e648d40>

In [60]:
# Class probabilities from the dense model; the predicted label is the
# index of the highest probability in each row.
y_pred_proba = dense_model.predict(x_test_w2v)
y_pred = np.argmax(y_pred_proba, axis=1)

# Evaluation metrics: accuracy and weighted-average F1.
acc, f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='weighted'),
)

print(f"✅ Accuracy: {acc:.4f}", f"✅ F1-score: {f1:.4f}", sep="\n")

[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step
✅ Accuracy: 0.6883
✅ F1-score: 0.6713


# RNN 딥러닝 모델

In [85]:
# RNN: a model suited to sequential data. The LSTM reads the embedded
# tokens in order and its final state feeds a small classification head.
# Imported here so the cell does not depend on an earlier cell's imports.
from tensorflow.keras.layers import Input

# The input shape is declared with an explicit Input layer rather than an
# `input_shape=` argument on the first layer, which newer Keras warns about.
rnn_model = Sequential([
    Input(shape=(100, 256)),  # (seq_len, embedding_dim)
    LSTM(128),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(46, activation='softmax'),  # one output unit per class
])

# The sparse loss is used because the labels are passed as integer class ids.
rnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
rnn_model.summary()

  super().__init__(**kwargs)


In [86]:
# Train the LSTM model, holding out 20% of the training data for validation.
rnn_model.fit(x_train_w2v, y_train, epochs=20, batch_size=32, validation_split=0.2)

Epoch 1/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 101ms/step - accuracy: 0.4075 - loss: 2.6363 - val_accuracy: 0.5888 - val_loss: 1.7038
Epoch 2/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 100ms/step - accuracy: 0.5783 - loss: 1.7618 - val_accuracy: 0.6294 - val_loss: 1.4990
Epoch 3/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 89ms/step - accuracy: 0.6425 - loss: 1.4957 - val_accuracy: 0.6861 - val_loss: 1.3198
Epoch 4/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 104ms/step - accuracy: 0.6823 - loss: 1.3484 - val_accuracy: 0.7062 - val_loss: 1.2072
Epoch 5/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 83ms/step - accuracy: 0.7098 - loss: 1.2032 - val_accuracy: 0.7195 - val_loss: 1.1885
Epoch 6/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 78ms/step - accuracy: 0.7026 - loss: 1.2070 - val_accuracy: 0.7290 - val_loss: 1.1612
Epoch 7/20
[

<keras.src.callbacks.history.History at 0x1eb9b3ae540>

In [87]:
# Class probabilities from the LSTM model; the predicted label is the
# index of the highest probability in each row.
y_pred_proba = rnn_model.predict(x_test_w2v)
y_pred = np.argmax(y_pred_proba, axis=1)

# Evaluation metrics: accuracy and weighted-average F1.
acc, f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='weighted'),
)

print(f"✅ Accuracy: {acc:.4f}", f"✅ F1-score: {f1:.4f}", sep="\n")

[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 40ms/step
✅ Accuracy: 0.7614
✅ F1-score: 0.7430


In [89]:
# Results recorded by hand from separate runs (presumably of the RNN model:
# the voca_size = 10000 / epochs=20 figures match its recorded output).
# voca_size = 5000
# with epochs=10:
#✅ Accuracy: 0.7217
#✅ F1-score: 0.6835

# with epochs=20:
#✅ Accuracy: 0.7774
#✅ F1-score: 0.7666

# with epochs=30:
#✅ Accuracy: 0.7676
#✅ F1-score: 0.7571


# voca_size = 10000
# with epochs=20:
#✅ Accuracy: 0.7614
#✅ F1-score: 0.7430