In [1]:
from tensorflow.keras.datasets import reuters
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score #정확도 계산
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from keras.callbacks import EarlyStopping
from keras.preprocessing.sequence import pad_sequences

2024-07-07 21:50:45.212728: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# 중복 사용 함수로 생성

In [2]:
# 데이터 분포 확인
def check_distribute(X, y):
    print('훈련용 뉴스의 최대 길이 :{}'.format(max(len(l) for l in X)))
    print('훈련용 뉴스의 평균 길이 :{}'.format(sum(map(len, X))/len(X)))

    plt.hist([len(s) for s in X], bins=50)
    plt.xlabel('length of samples')
    plt.ylabel('number of samples')
    plt.show()

    fig, axe = plt.subplots(ncols=1)
    fig.set_size_inches(11,5)
    sns.countplot(x=y)
    plt.show()

    unique_elements, counts_elements = np.unique(y, return_counts=True)
    print("각 클래스 빈도수:")
    print(np.asarray((unique_elements, counts_elements)))

In [3]:
# index_to_word 생성
def create_index_to_word(module):
    # word_index 생성
    word_index = module.get_word_index(path = 'reuters_word_index.json')

    # index_to_word 생성
    index_to_word = { index + 3: word for word, index in word_index.items() }
    for index, token in enumerate(("<pad>", "<sos>", "<unk>")):
        index_to_word[index]=token

    return index_to_word

# 원본 뉴스 데이터로 복원
def index_to_corpus(X, index_to_word):
    # decode
    decoded = []
    for i in range(len(X)):
        t = ' '.join([index_to_word[index] for index in X[i]])
        decoded.append(t)

    return decoded

In [4]:
# 벡터화
def vectorizer(train, test):
    dtmvector = CountVectorizer()
    x_train_dtm = dtmvector.fit_transform(train)

    tfidf_transformer = TfidfTransformer()
    tfidfv = tfidf_transformer.fit_transform(x_train_dtm)

    x_text_dtm = dtmvector.transform(test)
    tfidfv_test = tfidf_transformer.transform(x_text_dtm)

    return tfidfv, tfidfv_test

In [5]:
# confusion_matrix
def graph_confusion_matrix(model, x_test, y_test):#, classes_name):
  df_cm = pd.DataFrame(confusion_matrix(y_test, model.predict(x_test)))#, index=classes_name, columns=classes_name)
  fig = plt.figure(figsize=(12,12))
  heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
  heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=12)
  heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=12)
  plt.ylabel('label')
  plt.xlabel('predicted value')

# 2. 빈도수 상위 5,000개의 단어만 사용

In [6]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=5000, test_split=0.2)

## 원본 데이터로 되돌리기

In [7]:
index_to_word = create_index_to_word(reuters)
x_train = index_to_corpus(x_train, index_to_word)
print('X_train:', len(x_train))
print('\n'.join(x_train[:5]))

x_test = index_to_corpus(x_test, index_to_word)
print('X_test:', len(x_test))
print('\n'.join(x_test[:5]))

X_train: 8982
<sos> <unk> <unk> said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3
<sos> generale de banque sa lt <unk> <unk> and lt heller overseas corp of chicago have each taken 50 pct stakes in <unk> company sa <unk> factors generale de banque said in a statement it gave no financial details of the transaction sa <unk> <unk> turnover in 1986 was 17 5 billion belgian francs reuter 3
<sos> shr 3 28 dlrs vs 22 cts shr diluted 2 99 dlrs vs 22 cts net 46 0 mln vs 3 328 000 avg shrs 14 0 mln vs 15 2 mln year shr 5 41 dlrs vs 1 56 dlrs shr diluted 4 94 dlrs vs 1 50 dlrs net 78 2 mln vs 25 9 mln avg shrs 14 5 mln vs 15 1 mln note earnings per share reflect the tw

## 벡터화

In [8]:
tfidfv, tfidfv_test = vectorizer(x_train, x_test)

print(tfidfv.shape)
print(tfidfv_test.shape)

(8982, 4867)
(2246, 4867)


In [9]:
# Reshape input data for Conv1D layer
tfidfv = tfidfv.toarray().reshape(-1, 4867, 1)
tfidfv_test = tfidfv_test.toarray().reshape(-1, 4867, 1)

# 4. 딥러닝 모델과 비교해 보기
RNN or 1-D CNN  
pretrained model은 사용하지 않음.

In [10]:
# Define model architecture
embedding_dim = 100

model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=embedding_dim, input_length=tfidfv.shape[1]))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(46, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Early Stopping 설정
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# 모델 학습
history = model.fit(tfidfv, y_train,
                    epochs=100,  # epochs를 늘려서 early stopping이 제대로 작동하는지 확인할 수 있습니다.
                    batch_size=32,
                    validation_data=(tfidfv_test, y_test),
                    callbacks=[early_stopping])

# 모델 평가
loss, accuracy = model.evaluate(tfidfv_test, y_test)
print(f"Test accuracy: {accuracy}")

2024-07-07 21:50:50.397755: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 20628 MB memory:  -> device: 0, name: NVIDIA RTX A5000, pci bus id: 0000:02:00.0, compute capability: 8.6


Epoch 1/100


2024-07-07 21:50:52.145842: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8600
2024-07-07 21:50:52.471859: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2024-07-07 21:50:52.492114: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x5f6b68af5420 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-07-07 21:50:52.492160: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA RTX A5000, Compute Capability 8.6
2024-07-07 21:50:52.504889: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-07-07 21:50:52.720853: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Test accuracy: 0.36197686195373535


불균형 데이터를 사용하여 학습함. 정확도가 머신러닝에 비해 매우 낮음.