In [1]:
import tensorflow
import matplotlib
import seaborn 
import numpy 
import pandas
import sklearn

print(tensorflow.__version__)
print(matplotlib.__version__)
print(seaborn.__version__)
print(numpy.__version__)
print(pandas.__version__)
print(sklearn.__version__)

2025-10-01 07:42:29.620428: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759304549.885939      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759304549.967940      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


2.18.0
3.7.2
0.12.2
1.26.4
2.2.3
1.2.2


In [2]:
from tensorflow.keras.datasets import reuters

(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=10000, test_split=0.2)

print(len(x_train), "훈련 샘플")
print(len(x_test), "테스트 샘플")


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters.npz
[1m2110848/2110848[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
8982 훈련 샘플
2246 테스트 샘플


In [3]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)

In [4]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=5000, test_split=0.2)

In [5]:
import numpy as np
import pandas as pd
from tensorflow.keras.datasets import reuters
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from gensim.models import Word2Vec


In [6]:

def load_reuters_tfidf(num_words=5000):
    """
    Reuters 데이터셋을 불러오고 BoW+TF-IDF로 변환
    num_words: vocabulary 크기
    return: X_train_tfidf, X_test_tfidf, y_train, y_test
    """
    (x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=num_words, test_split=0.2)
    
    mlb = MultiLabelBinarizer(classes=range(num_words))
    x_train_bin = mlb.fit_transform(x_train)
    x_test_bin = mlb.transform(x_test)

    tfidf = TfidfTransformer()
    X_train_tfidf = tfidf.fit_transform(x_train_bin)
    X_test_tfidf = tfidf.transform(x_test_bin)
    
    return X_train_tfidf, X_test_tfidf, y_train, y_test

X_train, X_test, y_train, y_test = load_reuters_tfidf(num_words=5000)


In [7]:
results = []  

def evaluate_model(model, X_train, X_test, y_train, y_test, model_name="Model", num_words=5000):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average="macro")
    
    print(f"{model_name}: Accuracy={acc:.4f}, F1={f1:.4f}")
    
   
    results.append({
        "num_words": num_words,
        "Model": model_name,
        "Accuracy": acc,
        "F1-score": f1
    })
    
    return acc, f1


In [8]:
#나베즈
evaluate_model(MultinomialNB(), X_train, X_test, y_train, y_test, "MultinomialNB")

evaluate_model(ComplementNB(), X_train, X_test, y_train, y_test, "ComplementNB")


MultinomialNB: Accuracy=0.6674, F1=0.1028
ComplementNB: Accuracy=0.7409, F1=0.4000


(0.7408726625111309, 0.40001130085371056)

In [9]:
#로지스틱
evaluate_model(LogisticRegression(max_iter=1000), X_train, X_test, y_train, y_test, "LogisticRegression")


LogisticRegression: Accuracy=0.7756, F1=0.3297


(0.7756010685663401, 0.3296975001770233)

In [10]:
# SVM
evaluate_model(LinearSVC(), X_train, X_test, y_train, y_test, "SVM")


SVM: Accuracy=0.8197, F1=0.6459


(0.819679430097952, 0.6459068570580415)

In [11]:
#결정트리
evaluate_model(DecisionTreeClassifier(random_state=42), X_train, X_test, y_train, y_test, "DecisionTree")


DecisionTree: Accuracy=0.6892, F1=0.4583


(0.6892252894033838, 0.4583280495696436)

In [12]:
# 랜덤포레스트
evaluate_model(RandomForestClassifier(n_estimators=200, random_state=42), X_train, X_test, y_train, y_test, "RandomForest")


RandomForest: Accuracy=0.7631, F1=0.4415


(0.7631344612644702, 0.4415066273648894)

In [13]:
#그레디언트 부스팅
evaluate_model(GradientBoostingClassifier(), X_train, X_test, y_train, y_test, "GradientBoosting")


GradientBoosting: Accuracy=0.7609, F1=0.5349


(0.7609082813891362, 0.5348572207062249)

In [14]:
#보팅 앙상블
voting_clf = VotingClassifier(
    estimators=[
        ("lr", LogisticRegression(max_iter=1000)),
        ("rf", RandomForestClassifier(n_estimators=200, random_state=42)),
        ("nb", ComplementNB())
    ],
    voting="hard"
)

evaluate_model(voting_clf, X_train, X_test, y_train, y_test, "Voting")


Voting: Accuracy=0.7716, F1=0.3876


(0.7715939447907391, 0.3875979532126193)

In [35]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=5000, test_split=0.2)
num_classes = np.max(y_train) + 1


maxlen = 200
X_train_pad = pad_sequences(x_train, maxlen=maxlen)
X_test_pad  = pad_sequences(x_test, maxlen=maxlen)
y_train_cat = to_categorical(y_train, num_classes)
y_test_cat  = to_categorical(y_test, num_classes)


word_index = reuters.get_word_index()
index_to_word = {index+3: word for word, index in word_index.items()}
index_to_word[0], index_to_word[1], index_to_word[2] = "<PAD>", "<START>", "<UNK>"

x_train_text = [' '.join([index_to_word.get(i, "?") for i in seq]) for seq in x_train]
x_test_text  = [' '.join([index_to_word.get(i, "?") for i in seq]) for seq in x_test]

x_train_tokenized = [s.split() for s in x_train_text]
x_test_tokenized  = [s.split() for s in x_test_text]

w2v_model = Word2Vec(sentences=x_train_tokenized, vector_size=256, window=5, min_count=5, workers=4)

def vectorize_sentence(sentence, model, max_len=100):
    vecs = []
    for word in sentence:
        if word in model.wv:
            vecs.append(model.wv[word])
        else:
            vecs.append(np.zeros(model.vector_size))
    if len(vecs) < max_len:
        vecs += [np.zeros(model.vector_size)] * (max_len - len(vecs))
    else:
        vecs = vecs[:max_len]
    return np.array(vecs)

X_train_w2v = np.array([vectorize_sentence(s, w2v_model, max_len=100) for s in x_train_tokenized])
X_test_w2v  = np.array([vectorize_sentence(s, w2v_model, max_len=100) for s in x_test_tokenized])

print("데이터 준비:", X_train_pad.shape, X_train_w2v.shape)


데이터 준비: (8982, 200) (8982, 100, 256)


In [31]:
from tensorflow.keras.layers import Flatten

dense_model = Sequential([
    Flatten(input_shape=(100, 256)),
    Dense(512, activation="relu"),
    Dropout(0.3),
    Dense(128, activation="relu"),
    Dropout(0.3),
    Dense(num_classes, activation="softmax")
])
dense_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
dense_model.fit(X_train_w2v, y_train, epochs=3, batch_size=32, validation_split=0.2, verbose=1)

y_pred_dense = np.argmax(dense_model.predict(X_test_w2v), axis=1)
acc = accuracy_score(y_test, y_pred_dense)
f1 = f1_score(y_test, y_pred_dense, average="macro")
results.append({"Model": "Word2Vec DenseNN", "Accuracy": acc, "F1-score": f1})

print(f"[Word2Vec DenseNN] Accuracy={acc:.4f}, F1={f1:.4f}")


  super().__init__(**kwargs)


Epoch 1/5
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 184ms/step - accuracy: 0.5485 - loss: 2.1365 - val_accuracy: 0.6800 - val_loss: 1.4388
Epoch 2/5
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 182ms/step - accuracy: 0.7057 - loss: 1.2505 - val_accuracy: 0.6856 - val_loss: 1.3998
Epoch 3/5
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 182ms/step - accuracy: 0.7583 - loss: 1.0030 - val_accuracy: 0.6912 - val_loss: 1.4402
Epoch 4/5
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 183ms/step - accuracy: 0.8044 - loss: 0.8396 - val_accuracy: 0.6906 - val_loss: 1.4290
Epoch 5/5
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 182ms/step - accuracy: 0.8387 - loss: 0.6833 - val_accuracy: 0.6873 - val_loss: 1.4872
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step
[Dense NN] Accuracy=0.6736, F1=0.2249


In [36]:
rnn_w2v = Sequential([
    LSTM(128, input_shape=(100, 256)),
    Dropout(0.3),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(num_classes, activation="softmax")
])
rnn_w2v.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
rnn_w2v.fit(X_train_w2v, y_train, epochs=3, batch_size=32, validation_split=0.2, verbose=1)

y_pred_rnn_w2v = np.argmax(rnn_w2v.predict(X_test_w2v), axis=1)
acc = accuracy_score(y_test, y_pred_rnn_w2v)
f1 = f1_score(y_test, y_pred_rnn_w2v, average="macro")
results.append({"Model": "Word2Vec RNN-LSTM", "Accuracy": acc, "F1-score": f1})

print(f"[Word2Vec RNN-LSTM] Accuracy={acc:.4f}, F1={f1:.4f}")


Epoch 1/3
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 127ms/step - accuracy: 0.4134 - loss: 2.6444 - val_accuracy: 0.5988 - val_loss: 1.6888
Epoch 2/3
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 123ms/step - accuracy: 0.5840 - loss: 1.7125 - val_accuracy: 0.6088 - val_loss: 1.5536
Epoch 3/3
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 123ms/step - accuracy: 0.6279 - loss: 1.5164 - val_accuracy: 0.6800 - val_loss: 1.3984
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 52ms/step
[Word2Vec RNN-LSTM] Accuracy=0.6594, F1=0.0892


In [39]:
df_results = pd.DataFrame(results)
display(df_results)


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,num_words,Model,Accuracy,F1-score
0,5000.0,MultinomialNB,0.667409,0.102775
1,5000.0,ComplementNB,0.740873,0.400011
2,5000.0,LogisticRegression,0.775601,0.329698
3,5000.0,SVM,0.819679,0.645907
4,5000.0,DecisionTree,0.689225,0.458328
5,5000.0,RandomForest,0.763134,0.441507
6,5000.0,GradientBoosting,0.760908,0.534857
7,5000.0,Voting,0.771594,0.387598
8,,Word2Vec RNN-LSTM,0.659394,0.089226
