# 결과

### 노드 머신러닝 실습 모델 8개 + 추천 모델 3개 추가

![image.png](attachment:887262e0-4c82-45a4-93a1-df5c853b8a23.png)

![image.png](attachment:7b1339cc-4d4f-400e-8ee4-4eb4f8fbb24f.png)

### 추천 머신러닝 8개

![image.png](attachment:aa007a7e-a9c0-4928-9154-9370f9386de2.png)

![image.png](attachment:1387f4c4-9765-4843-931b-02de477013c5.png)

### 딥러닝

![image.png](attachment:707918fa-6bc6-48fe-88b3-0fad14d62544.png)

머신러닝 1등 성능 모델:  
LogisticRegression, voca_size: 10000  
XGBoost, voca_size: 5000, INF  
딥러닝 1등 성능 모델: RNN

TF-IDF는 주로 단어의 빈도와 중요도를 기반으로 문서를 표현 -> 전통적인 머신러닝 모델(XGBoost, SVM, 로지스틱 회귀 등)과 잘 작동  
Word2Vec은 단어의 의미적, 문맥적 관계를 저차원 벡터로 표현 -> 주로 딥러닝 모델(RNN, Dense NN 등)과 결합될 때 가장 강력한 성능을 발휘

# 데이터셋 만들기

In [90]:
!pip install gensim



In [91]:
from tensorflow.keras.datasets import reuters
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# 데이터 준비
- index -> text
- DTM, TF-IDF 학습 데이터 준비
- W2V 학습 데이터 준비

In [92]:
# Load the Reuters dataset as integer token-id sequences, limiting the vocabulary
# to num_words=10000 and holding out 20% of the samples as the test split.
# The trailing note presumably records the vocabulary sizes tried across runs
# (10000, then 5000, then unlimited) — TODO confirm.
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=10000, test_split=0.2) #10000 -> 5000 -> inf

In [93]:
# Fetch the word -> integer index mapping that belongs to the Reuters dataset.
word_index = reuters.get_word_index(path="reuters_word_index.json")

In [94]:
# Build the reverse lookup (token id -> word). Ids coming from word_index are
# shifted by 3 so that ids 0, 1 and 2 stay free for the special tokens below.
index_to_word = {}
for word, rank in word_index.items():
    index_to_word[rank + 3] = word

# Register the special tokens under ids 0, 1 and 2 respectively.
index_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))

In [95]:
# Turn every training sample from a sequence of token ids back into a single
# space-separated string.
decoded = [
    ' '.join(index_to_word[token_id] for token_id in sequence)
    for sequence in x_train
]

# From here on x_train holds text, not id sequences.
x_train = decoded
print(len(x_train))

8982


In [96]:
# Same id -> text decoding as for the training split, applied to the test split.
decoded_test = [
    ' '.join(index_to_word[token_id] for token_id in sequence)
    for sequence in x_test
]

# From here on x_test holds text, not id sequences.
x_test = decoded_test
print(len(x_test))

2246


In [97]:
# Vectorisation, DTM / TF-IDF approach.

# Step 1: document-term matrix. The vocabulary is learned from the training
# split only and then reused unchanged for the test split.
dtmvector = CountVectorizer()
x_train_dtm = dtmvector.fit_transform(x_train)
x_test_dtm = dtmvector.transform(x_test)

# Step 2: re-weight the raw counts with TF-IDF, again fitting on train only.
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_dtm)
x_test_tfidf = tfidf_transformer.transform(x_test_dtm)

In [98]:
# Display one decoded training document as a sanity check.
x_train[3]

"<sos> the farmers home administration the u s agriculture department's farm lending arm could lose about seven billion dlrs in outstanding principal on its severely <unk> borrowers or about one fourth of its farm loan portfolio the general accounting office gao said in remarks prepared for delivery to the senate agriculture committee brian crowley senior associate director of gao also said that a preliminary analysis of proposed changes in <unk> financial eligibility standards indicated as many as one half of <unk> borrowers who received new loans from the agency in 1986 would be <unk> under the proposed system the agency has proposed evaluating <unk> credit using a variety of financial ratios instead of relying solely on <unk> ability senate agriculture committee chairman patrick leahy d vt <unk> the proposed eligibility changes telling <unk> administrator <unk> clark at a hearing that they would mark a dramatic shift in the agency's purpose away from being farmers' lender of last re

In [99]:
# Vectorisation, Word2Vec approach.
from gensim.models import Word2Vec

# Word2Vec takes pre-tokenised sentences, so split every decoded document on
# whitespace first. The DTM path needed no such step because CountVectorizer
# tokenises raw strings internally.
x_train_tokenized = list(map(str.split, x_train))
x_test_tokenized = list(map(str.split, x_test))

# Embeddings are trained on the training split only.
# NOTE(review): no seed is set and workers=4, so results may vary between runs.
model = Word2Vec(
    sentences=x_train_tokenized,
    vector_size=256,
    window=5,
    min_count=5,
    workers=4,
    sg=0,
)
print("모델 학습 완료!")

모델 학습 완료!


In [100]:
# Sanity-check the learned embeddings by listing the words most similar to 'man'.
model_result = model.wv.most_similar('man')
print(model_result)

[('glenn', 0.8617676496505737), ('don', 0.8569511771202087), ('iowa', 0.851399302482605), ('pechiney', 0.8507195711135864), ('debartolo', 0.8479317426681519), ('diagnostic', 0.8455610871315002), ('sydney', 0.8451241850852966), ('businessman', 0.8397796154022217), ('rica', 0.8389025926589966), ('stangeland', 0.8368949890136719)]


In [101]:
# The trained Word2Vec model, under a more descriptive name.
w2v_model = model

# Turn one tokenised sentence into a fixed-size matrix of word vectors.
def vectorize_sentence(sentence, model, max_len, dtype=np.float32):
    """Embed a tokenised sentence as a ``(max_len, model.vector_size)`` array.

    Row ``i`` holds the vector of the ``i``-th token. Tokens missing from
    ``model.wv`` become all-zero rows, sentences shorter than ``max_len`` are
    right-padded with zero rows, and longer sentences are truncated.

    Args:
        sentence: Iterable of token strings.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookups by
            token) and ``vector_size``, e.g. a trained gensim ``Word2Vec``.
        max_len: Number of rows in the result; must be non-negative.
        dtype: dtype of the returned array. Defaults to float32 so that every
            sentence yields the same dtype and zero rows do not promote the
            stacked corpus to float64.

    Returns:
        ``numpy.ndarray`` of shape ``(max_len, model.vector_size)``. The shape
        is always 2-D, including for an empty sentence or ``max_len == 0``.

    Raises:
        ValueError: If ``max_len`` is negative.
    """
    if max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")

    # Start from zeros so padding and out-of-vocabulary rows need no extra work.
    matrix = np.zeros((max_len, model.vector_size), dtype=dtype)
    keyed_vectors = model.wv

    # zip() stops at max_len rows, which truncates long sentences and also
    # lets `sentence` be any iterable, not only a list.
    for row, word in zip(range(max_len), sentence):
        if word in keyed_vectors:
            matrix[row] = keyed_vectors[word]
    return matrix

# Embed every tokenised document as a fixed-length (max_len=100) matrix of word
# vectors and stack them into one 3-D array per split: (documents, 100, vector_size).
x_train_w2v = np.array([vectorize_sentence(s, w2v_model, max_len=100) for s in x_train_tokenized])
x_test_w2v = np.array([vectorize_sentence(s, w2v_model, max_len=100) for s in x_test_tokenized])




In [102]:
# Check the shape of the stacked training embeddings.
x_train_w2v.shape

(8982, 100, 256)

In [103]:
# Check the shape of the stacked test embeddings.
x_test_w2v.shape

(2246, 100, 256)

# 모델 정의 및 실험

## 다양한 머신러닝

In [104]:
# Random forest
from sklearn.ensemble import RandomForestClassifier

# Fit a 5-tree forest on the TF-IDF features; random_state is pinned to 0.
forest = RandomForestClassifier(n_estimators=5, random_state=0)
forest.fit(x_train_tfidf, y_train)

In [105]:
# Predict labels for the held-out TF-IDF features with the random forest.
y_pred = forest.predict(x_test_tfidf)

# Evaluation metrics: plain accuracy and the weighted-average F1 score.
acc, f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='weighted'),
)

print(f"✅ Accuracy : {acc:.4f}")
print(f"✅ F1-score : {f1:.4f}")

✅ Accuracy : 0.6741
✅ F1-score : 0.6429


In [106]:
# XGBoost

from xgboost import XGBClassifier

# Train an XGBoost classifier (100 trees, depth 5) on the TF-IDF features.
xgb_model = XGBClassifier(n_estimators=100, max_depth=5, eval_metric='mlogloss')
xgb_model.fit(x_train_tfidf, y_train)

In [107]:
# Predict labels for the held-out TF-IDF features with the XGBoost model.
y_pred = xgb_model.predict(x_test_tfidf)

# Evaluation metrics: plain accuracy and the weighted-average F1 score.
acc, f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='weighted'),
)

print(f"✅ Accuracy : {acc:.4f}")
print(f"✅ F1-score : {f1:.4f}")

✅ Accuracy : 0.7930
✅ F1-score : 0.7856


In [108]:
# The classical ML models here are fed 2-D input, so the per-token embeddings
# of each document are collapsed into one document vector by averaging.
# NOTE(review): the all-zero rows that vectorize_sentence writes for padding and
# out-of-vocabulary tokens are part of this average — confirm that is intended.

# Word2Vec embedding sequences: (8982, 100, 256)
x_w2v_seq_train, x_w2v_seq_test = x_train_w2v, x_test_w2v
# Mean pooling over the token axis -> (8982, 256)
x_w2v_avg_train = x_w2v_seq_train.mean(axis=1)
x_w2v_avg_test = x_w2v_seq_test.mean(axis=1)
print(x_w2v_avg_train.shape)  # (8982, 256)

(8982, 256)


In [109]:
# Train an XGBoost model on the Word2Vec features.
from xgboost import XGBClassifier


# Same hyper-parameters as the TF-IDF run, fitted on the mean-pooled embeddings.
# Rebinding xgb_model replaces the TF-IDF model trained earlier.
xgb_model = XGBClassifier(n_estimators=100, max_depth=5, eval_metric='mlogloss')
xgb_model.fit(x_w2v_avg_train, y_train)

In [110]:
# Predict labels for the held-out mean-pooled Word2Vec features.
y_pred = xgb_model.predict(x_w2v_avg_test)

# Evaluation metrics: plain accuracy and the weighted-average F1 score.
acc, f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='weighted'),
)

print(f"✅ Accuracy : {acc:.4f}")
print(f"✅ F1-score : {f1:.4f}")

✅ Accuracy : 0.7262
✅ F1-score : 0.7098


# Dense NN 딥러닝 모델

In [111]:
from tensorflow.keras.models import Sequential
# Each layer is imported once. LSTM is not used in this cell but is kept here
# because the RNN cell further down relies on this import.
from tensorflow.keras.layers import Dense, Dropout, Flatten, Input, LSTM


# Fully connected baseline over the Word2Vec sequences. The (100, 256) token
# matrix of each document is flattened into one vector before the Dense stack.
# The input shape is declared with an explicit Input object rather than an
# `input_shape=` argument on the first layer, which Keras warns about.
dense_model = Sequential([
    Input(shape=(100, 256)),  # (seq_len, embedding_dim)
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(46, activation='softmax'),  # one probability per class
])

# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
dense_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
dense_model.summary()

  super().__init__(**kwargs)


In [112]:
# Train the dense model for 10 epochs, using 20% of the training data for validation.
# NOTE(review): in the captured log val_loss rises from epoch 3 onward while the
# training loss keeps falling, i.e. the model overfits; consider early stopping.
dense_model.fit(x_train_w2v, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 246ms/step - accuracy: 0.5262 - loss: 2.1887 - val_accuracy: 0.6477 - val_loss: 1.5278
Epoch 2/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 224ms/step - accuracy: 0.6797 - loss: 1.3539 - val_accuracy: 0.6867 - val_loss: 1.3943
Epoch 3/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 247ms/step - accuracy: 0.7232 - loss: 1.1125 - val_accuracy: 0.6845 - val_loss: 1.3907
Epoch 4/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 239ms/step - accuracy: 0.7772 - loss: 0.8908 - val_accuracy: 0.6884 - val_loss: 1.4368
Epoch 5/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 274ms/step - accuracy: 0.8085 - loss: 0.7337 - val_accuracy: 0.6889 - val_loss: 1.5125
Epoch 6/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 240ms/step - accuracy: 0.8357 - loss: 0.6673 - val_accuracy: 0.6850 - val_loss: 1.5321
Epoch 7/10

<keras.src.callbacks.history.History at 0x1eb113ee4b0>

In [113]:
# Class probabilities for the test split; the predicted label is the most
# probable class of each row.
y_pred_proba = dense_model.predict(x_test_w2v)
y_pred = y_pred_proba.argmax(axis=1)

# Evaluation metrics: plain accuracy and the weighted-average F1 score.
acc, f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='weighted'),
)

print(f"✅ Accuracy: {acc:.4f}")
print(f"✅ F1-score: {f1:.4f}")

[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step
✅ Accuracy: 0.6808
✅ F1-score: 0.6528


# RNN 딥러닝 모델

In [114]:
# RNN: a model suited to sequential data. An LSTM reads the (100, 256) sequence
# of word vectors and its final output feeds a small dense classifier head.
# The input shape is declared with an explicit Input object rather than an
# `input_shape=` argument on the first layer, which Keras warns about.

rnn_model = Sequential([
    tf.keras.Input(shape=(100, 256)),  # (seq_len, embedding_dim)
    LSTM(128),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(46, activation='softmax'),  # one probability per class
])

# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
rnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
rnn_model.summary()

  super().__init__(**kwargs)


In [115]:
# Train the LSTM model for 20 epochs, using 20% of the training data for validation.
rnn_model.fit(x_train_w2v, y_train, epochs=20, batch_size=32, validation_split=0.2)

Epoch 1/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 140ms/step - accuracy: 0.4082 - loss: 2.6061 - val_accuracy: 0.4841 - val_loss: 2.1288
Epoch 2/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 98ms/step - accuracy: 0.5245 - loss: 1.9245 - val_accuracy: 0.6233 - val_loss: 1.5658
Epoch 3/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 99ms/step - accuracy: 0.6178 - loss: 1.6075 - val_accuracy: 0.6861 - val_loss: 1.3819
Epoch 4/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 91ms/step - accuracy: 0.6811 - loss: 1.3787 - val_accuracy: 0.7051 - val_loss: 1.2717
Epoch 5/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 104ms/step - accuracy: 0.6797 - loss: 1.3375 - val_accuracy: 0.7012 - val_loss: 1.2733
Epoch 6/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 85ms/step - accuracy: 0.7054 - loss: 1.2288 - val_accuracy: 0.7295 - val_loss: 1.1671
Epoch 7/20
[1

<keras.src.callbacks.history.History at 0x1ea259f8ce0>

In [116]:
# Class probabilities for the test split; the predicted label is the most
# probable class of each row.
y_pred_proba = rnn_model.predict(x_test_w2v)
y_pred = y_pred_proba.argmax(axis=1)

# Evaluation metrics: plain accuracy and the weighted-average F1 score.
acc, f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='weighted'),
)

print(f"✅ Accuracy: {acc:.4f}")
print(f"✅ F1-score: {f1:.4f}")

[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 43ms/step
✅ Accuracy: 0.7587
✅ F1-score: 0.7385


In [117]:
# Recorded test-set scores from earlier runs
# (presumably of the RNN model above — TODO confirm).

# voca_size = 5000
# With epochs=10:
#✅ Accuracy: 0.7217
#✅ F1-score: 0.6835

# With epochs=20:
#✅ Accuracy: 0.7774
#✅ F1-score: 0.7666

# With epochs=30:
#✅ Accuracy: 0.7676
#✅ F1-score: 0.7571


# voca_size = 10000
# With epochs=20:
#✅ Accuracy: 0.7614
#✅ F1-score: 0.7430