In [1]:
from tensorflow.keras.datasets import reuters
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

In [2]:
# 데이터 로드
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)
word_index = reuters.get_word_index()
reverse_word_index = {value: key for key, value in word_index.items()}

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters.npz
[1m2110848/2110848[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters_word_index.json
[1m550378/550378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [3]:
# 정수 인덱스 → 텍스트 변환
decoded_train = [" ".join([reverse_word_index.get(word_id - 3, "?") for word_id in seq]) for seq in x_train]
decoded_test = [" ".join([reverse_word_index.get(word_id - 3, "?") for word_id in seq]) for seq in x_test]

In [4]:
# 실험할 vocab_size 리스트
vocab_sizes = [5000, 10000, None]

# 모델 리스트
models = {
    "MNB": MultinomialNB(),
    "CNB": ComplementNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=0),
    "SVM": SVC(kernel="linear", random_state=0),
    "Decision Tree": DecisionTreeClassifier(random_state=0),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=0),
    "Gradient Boosting": GradientBoostingClassifier(random_state=0),
}

# 결과 저장
results = []

In [5]:
for vocab_size in vocab_sizes:
    print(f"\n # Vocab Size: {vocab_size}")
    
    # TF-IDF 벡터화
    vectorizer = CountVectorizer(max_features=vocab_size)
    X_train_counts = vectorizer.fit_transform(decoded_train)
    X_test_counts = vectorizer.transform(decoded_test)

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)

    # 개별 모델 평가
    scores = {"Vocab Size": vocab_size}
    for name, model in models.items():
        model.fit(X_train_tfidf, y_train)
        y_pred = model.predict(X_test_tfidf)

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="weighted")

        scores[f"{name} (Acc)"] = acc
        scores[f"{name} (F1)"] = f1

        print(f"{name}: Accuracy = {acc:.4f}, F1-Score = {f1:.4f}")

    # 보팅 분류기 (Soft Voting) - 사용자 지정 구성 적용
    logistic = LogisticRegression(penalty='l2', random_state=0)
    complement_nb = ComplementNB()
    gradient_boost = GradientBoostingClassifier(random_state=0)

    voting_classifier = VotingClassifier(
        estimators=[
            ('logistic', logistic),
            ('complement_nb', complement_nb),
            ('gradient_boost', gradient_boost)
        ],
        voting='soft'
    )
    voting_classifier.fit(X_train_tfidf, y_train)
    y_pred_voting = voting_classifier.predict(X_test_tfidf)

    acc_voting = accuracy_score(y_test, y_pred_voting)
    f1_voting = f1_score(y_test, y_pred_voting, average="weighted")

    scores["Voting (Acc)"] = acc_voting
    scores["Voting (F1)"] = f1_voting

    print(f"Voting: Accuracy = {acc_voting:.4f}, F1-Score = {f1_voting:.4f}")

    results.append(scores)


 # Vocab Size: 5000
MNB: Accuracy = 0.6785, F1-Score = 0.6071
CNB: Accuracy = 0.7685, F1-Score = 0.7428
Logistic Regression: Accuracy = 0.7983, F1-Score = 0.7755
SVM: Accuracy = 0.8246, F1-Score = 0.8146
Decision Tree: Accuracy = 0.6968, F1-Score = 0.6941
Random Forest: Accuracy = 0.7640, F1-Score = 0.7415
Gradient Boosting: Accuracy = 0.7636, F1-Score = 0.7604


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Voting: Accuracy = 0.7921, F1-Score = 0.7880

 # Vocab Size: 10000
MNB: Accuracy = 0.6585, F1-Score = 0.5770
CNB: Accuracy = 0.7711, F1-Score = 0.7457
Logistic Regression: Accuracy = 0.7961, F1-Score = 0.7729
SVM: Accuracy = 0.8219, F1-Score = 0.8117
Decision Tree: Accuracy = 0.6879, F1-Score = 0.6854
Random Forest: Accuracy = 0.7542, F1-Score = 0.7298
Gradient Boosting: Accuracy = 0.7667, F1-Score = 0.7626


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Voting: Accuracy = 0.8010, F1-Score = 0.7958

 # Vocab Size: None
MNB: Accuracy = 0.5997, F1-Score = 0.5046
CNB: Accuracy = 0.7649, F1-Score = 0.7350
Logistic Regression: Accuracy = 0.7916, F1-Score = 0.7670
SVM: Accuracy = 0.8224, F1-Score = 0.8119
Decision Tree: Accuracy = 0.7039, F1-Score = 0.6981
Random Forest: Accuracy = 0.7342, F1-Score = 0.7064
Gradient Boosting: Accuracy = 0.7707, F1-Score = 0.7666


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Voting: Accuracy = 0.8045, F1-Score = 0.7995


In [13]:
from IPython.display import display

# 데이터프레임 출력
display(df_results)

# Accuracy & F1-score의 최대값을 찾는 함수
def find_best_model(metric_type, df):
    metric_cols = [col for col in df.columns if metric_type in col]  # Acc 또는 F1 포함된 컬럼 찾기
    best_idx = df[metric_cols].idxmax().iloc[0]  # 최대값이 있는 행의 인덱스 찾기
    best_model = df[metric_cols].loc[best_idx].idxmax().replace(f" ({metric_type})", "")  # 모델명 추출
    best_vocab = df.loc[best_idx, "Vocab Size"]  # 해당 모델의 vocab_size 가져오기
    best_value = df[metric_cols].max().max()  # 최대값

    return best_model, best_vocab, best_value

# Accuracy 최대 모델 찾기
best_acc_model, best_acc_vocab, best_acc_value = find_best_model("Acc", df_results)

# F1-score 최대 모델 찾기
best_f1_model, best_f1_vocab, best_f1_value = find_best_model("F1", df_results)

# 결과 출력
print(f" Accuracy 최대값: {best_acc_value:.4f}")
print(f"    - Model: {best_acc_model}")
print(f"    - Vocab Size: {best_acc_vocab}")

print(f"\n F1-Score 최대값: {best_f1_value:.4f}")
print(f"    - Model: {best_f1_model}")
print(f"    - Vocab Size: {best_f1_vocab}")


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Vocab Size,MNB (Acc),MNB (F1),CNB (Acc),CNB (F1),Logistic Regression (Acc),Logistic Regression (F1),SVM (Acc),SVM (F1),Decision Tree (Acc),Decision Tree (F1),Random Forest (Acc),Random Forest (F1),Gradient Boosting (Acc),Gradient Boosting (F1),Voting (Acc),Voting (F1)
0,5000.0,0.67854,0.60714,0.768477,0.742802,0.798308,0.775517,0.824577,0.814561,0.696794,0.694094,0.764025,0.741513,0.76358,0.760359,0.792075,0.788042
1,10000.0,0.658504,0.577002,0.771149,0.745685,0.796082,0.772925,0.821906,0.811739,0.68789,0.685352,0.75423,0.729782,0.766696,0.762551,0.80098,0.795808
2,,0.599733,0.504567,0.764915,0.73501,0.79163,0.767023,0.822351,0.811871,0.703918,0.698145,0.734194,0.706449,0.770703,0.76656,0.804541,0.799452


 Accuracy 최대값: 0.8246
    - Model: SVM
    - Vocab Size: 5000.0

 F1-Score 최대값: 0.8146
    - Model: SVM
    - Vocab Size: 5000.0


실험 결과를 보면 Vocab size = 5000 일 때, SVM 을 사용할 경우 Acc와 F1-score가 모두 제일 높다.   
이를 딥러닝 모델을 활용했을 경우와 비교해보자

### 데이터 준비

In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

# Vocab Size 설정
vocab_size = 5000

# 토크나이저 생성 (단어를 정수 인덱스로 변환)
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(decoded_train)

# 정수 시퀀스로 변환
X_train_seq = tokenizer.texts_to_sequences(decoded_train)
X_test_seq = tokenizer.texts_to_sequences(decoded_test)

# 패딩 적용 (최대 길이를 데이터의 95% 지점으로 설정)
max_length = int(np.percentile([len(seq) for seq in X_train_seq], 95))
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding="post")
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding="post")

# 최종 Vocab Size 설정
vocab_size = len(tokenizer.word_index) + 1  # +1은 패딩 토큰 포함
print(f"Vocab Size (Final): {vocab_size}, Max Sequence Length: {max_length}")

Vocab Size (Final): 28136, Max Sequence Length: 426


### RNN 모델 학습

In [15]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, LSTM, GRU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, f1_score

# RNN 모델 정의
def create_rnn_model(vocab_size, max_length):
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
        SimpleRNN(64, return_sequences=False),
        Dense(64, activation="relu"),
        Dense(len(set(y_train)), activation="softmax")  # 다중 클래스 분류
    ])
    model.compile(loss="sparse_categorical_crossentropy", optimizer=Adam(), metrics=["accuracy"])
    return model

# 모델 생성 및 학습
rnn_model = create_rnn_model(vocab_size, max_length)

# 조기 종료(EarlyStopping) 콜백 설정
early_stopping = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

# 모델 학습
rnn_model.fit(
    X_train_padded, y_train,
    validation_data=(X_test_padded, y_test),
    epochs=10, batch_size=32, verbose=1,
    callbacks=[early_stopping]
)

# 예측 수행
y_pred_rnn = np.argmax(rnn_model.predict(X_test_padded), axis=1)

# 정확도 및 F1-score 평가
acc_rnn = accuracy_score(y_test, y_pred_rnn)
f1_rnn = f1_score(y_test, y_pred_rnn, average="weighted")

print(f"\n# RNN Model 결과")
print(f"Accuracy = {acc_rnn:.4f}")
print(f"F1-Score = {f1_rnn:.4f}")

Epoch 1/10




[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 48ms/step - accuracy: 0.2918 - loss: 2.8018 - val_accuracy: 0.3669 - val_loss: 2.4058
Epoch 2/10
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 39ms/step - accuracy: 0.3563 - loss: 2.3937 - val_accuracy: 0.3713 - val_loss: 2.4010
Epoch 3/10
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 39ms/step - accuracy: 0.3705 - loss: 2.3536 - val_accuracy: 0.3669 - val_loss: 2.4045
Epoch 4/10
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 39ms/step - accuracy: 0.3690 - loss: 2.3427 - val_accuracy: 0.3687 - val_loss: 2.4081
Epoch 5/10
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 39ms/step - accuracy: 0.3472 - loss: 2.4305 - val_accuracy: 0.3513 - val_loss: 2.4324
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step

# RNN Model 결과
Accuracy = 0.3713
F1-Score = 0.2189


결과를 보는데 너무 낮다   
이유가 뭘까?

- 벡터화된 수치 데이터가 아닌 단순 정수 인덱스를 입력으로 사용
- 사전 학습된 임베딩을 사용하지 않고 학습 데이터로만 임베딩을 학습했기 때문

정도로 생각해 볼 수 있을 것 같다..

시간적으로 부족해서 vocab size를 더 많이 실험해보지 못한 점이 좀 아쉽다.   
내가 잘 실험을 했는지도 궁금해서 다른 사람들의 결과도 한 번 보고 싶다. 