In [32]:
from tensorflow import keras
from tensorflow.keras.datasets import reuters
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
(x_train, y_train), (x_test, y_test) = reuters.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters.npz
[1m2110848/2110848[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2us/step


In [3]:
x_train.shape, x_test.shape

((8982,), (2246,))

In [6]:
np.unique(y_train, return_counts = True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45], dtype=int64),
 array([  55,  432,   74, 3159, 1949,   17,   48,   16,  139,  101,  124,
         390,   49,  172,   26,   20,  444,   39,   66,  549,  269,  100,
          15,   41,   62,   92,   24,   15,   48,   19,   45,   39,   32,
          11,   50,   10,   49,   19,   19,   24,   36,   30,   13,   21,
          12,   18], dtype=int64))

- 로이터 뉴스 카테고리 분류 데이터
    - 총 11258개의 뉴스 기사 데이터
    - 46개의 카테고리 분류
    - 예)
        - 중부 지방은 대체로 맑겠으나, 남부 지방은 구름이 많겠습니다 -> 날씨
        - 올 초부터 유동성의 힘으로 주가가 일정하게 상승했습니다.

In [7]:
word_index = reuters.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters_word_index.json
[1m550378/550378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2us/step


In [8]:
word_index

{'mdbl': 10996,
 'fawc': 16260,
 'degussa': 12089,
 'woods': 8803,
 'hanging': 13796,
 'localized': 20672,
 'sation': 20673,
 'chanthaburi': 20675,
 'refunding': 10997,
 'hermann': 8804,
 'passsengers': 20676,
 'stipulate': 20677,
 'heublein': 8352,
 'screaming': 20713,
 'tcby': 16261,
 'four': 185,
 'grains': 1642,
 'broiler': 20680,
 'wooden': 12090,
 'wednesday': 1220,
 'highveld': 13797,
 'duffour': 7593,
 '0053': 20681,
 'elections': 3914,
 '270': 2563,
 '271': 3551,
 '272': 5113,
 '273': 3552,
 '274': 3400,
 'rudman': 7975,
 '276': 3401,
 '277': 3478,
 '278': 3632,
 '279': 4309,
 'dormancy': 9381,
 'errors': 7247,
 'deferred': 3086,
 'sptnd': 20683,
 'cooking': 8805,
 'stratabit': 20684,
 'designing': 16262,
 'metalurgicos': 20685,
 'databank': 13798,
 '300er': 20686,
 'shocks': 20687,
 'nawg': 7972,
 'tnta': 20688,
 'perforations': 20689,
 'affiliates': 2891,
 '27p': 20690,
 'ching': 16263,
 'china': 595,
 'wagyu': 16264,
 'affiliated': 3189,
 'chino': 16265,
 'chinh': 16266,
 '

In [10]:
idx2word = {0 : "<PAD>", 1 : "<S>", 2 : "<UNK>"}

In [11]:
idx2word.update({value + 3 : key for key, value in word_index.items()})

In [12]:
" ".join(map(lambda x: idx2word[x], x_train[0]))

'<S> mcgrath rentcorp said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3'

In [21]:
len(" ".join(map(lambda x: idx2word[x], x_train[0])))

580

In [17]:
len(idx2word)

30982

In [18]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state = 12)

In [22]:
train_seq = pad_sequences(x_train, maxlen = 300)
val_seq = pad_sequences(x_val, maxlen = 300)
test_seq = pad_sequences(x_test, maxlen = 300)

In [26]:
model = keras.Sequential()
model.add(keras.layers.Embedding(5000, 300, input_shape = (300,)))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Conv1D(64, 5, activation = "relu"))
model.add(keras.layers.MaxPool1D(4))
model.add(keras.layers.LSTM(55))
model.add(keras.layers.Dense(1, activation = "sigmoid"))

In [27]:
model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])
early_stopping_cb = keras.callbacks.EarlyStopping(patience = 4, restore_best_weights = True)

In [28]:
history = model.fit(train_seq, y_train, batch_size = 128, epochs = 50,
                    validation_data = (val_seq, y_val), callbacks = [early_stopping_cb])

Epoch 1/50
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 634ms/step - accuracy: 0.0535 - loss: -33.6446 - val_accuracy: 0.0467 - val_loss: -93.6428
Epoch 2/50
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 592ms/step - accuracy: 0.0485 - loss: -102.9329 - val_accuracy: 0.0467 - val_loss: -120.8807
Epoch 3/50


KeyboardInterrupt: 

# 맛집코드분석

In [54]:
# 카테고리당 100개로 산정
# 4600 -> 1000으로 수정
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words = 1000)

In [55]:
length = np.array([len(x) for x in x_train])

In [56]:
# 길이 평균값과 중간값
print(np.mean(length), np.median(length), np.min(length), np.max(length))

145.5398574927633 95.0 13 2376


## 데이터 전처리

In [57]:
whole_list = [j for i in x_train for j in i]

In [58]:
df = pd.DataFrame(whole_list)

In [59]:
df.head()

Unnamed: 0,0
0,1
1,2
2,2
3,8
4,43


In [60]:
df.shape

(1307239, 1)

In [61]:
df["count"] = 1

In [62]:
df_count = df.groupby(0).sum().sort_values(by = "count", ascending = False)

In [63]:
df_count["word"] = pd.Series(df_count.index).map(lambda x: idx2word[x])

In [64]:
df_count.head()

Unnamed: 0_level_0,count,word
0,Unnamed: 1_level_1,Unnamed: 2_level_1
2,284013,of
4,65949,in
5,33791,said
6,32262,and
7,26309,a


In [65]:
remove_list = df_count[df_count["count"] >= len(x_train)/2]["word"].tolist()

In [66]:
remove_list.remove("<S>")

In [67]:
[word_index[i] for i in remove_list]

[2,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 1,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38]

## 데이터 분할 및 간단한 전처리

In [68]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state = 7)

In [48]:
# 길이 150 -> 100으로 수정
train_seq = pad_sequences(x_train, maxlen = 100, truncating = "post")
val_seq = pad_sequences(x_val, maxlen = 100, truncating = "post")
test_seq = pad_sequences(x_test, maxlen = 100, truncating = "post")

In [49]:
y_oh_train = keras.utils.to_categorical(y_train)
y_oh_val = keras.utils.to_categorical(y_val)
y_oh_test = keras.utils.to_categorical(y_test)

## 모델 구성

In [50]:
model = keras.Sequential()
model.add(keras.layers.Embedding(1000, 100, input_shape = (100,)))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Conv1D(64, 5, activation = "relu"))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.MaxPool1D(4))
model.add(keras.layers.LSTM(55))
model.add(keras.layers.Dense(46, activation = "softmax"))

  super().__init__(**kwargs)


In [71]:
# 강시님 모델
model2 = keras.Sequential()
model2.add(keras.layers.Embedding(1000, 100, input_shape = (100,)))
model2.add(keras.layers.Dropout(0.5))
model2.add(keras.layers.Conv1D(64, 5, activation = "relu"))
model2.add(keras.layers.BatchNormalization())
model2.add(keras.layers.MaxPool1D(4))
model2.add(keras.layers.LSTM(128, dropout = 0.5))
model2.add(keras.layers.Dropout(0.5))
model2.add(keras.layers.Dense(46, activation = "softmax"))

rmsprop = keras.optimizers.RMSprop(learning_rate = 5e-4)
model2.compile(loss = "categorical_crossentropy", optimizer = rmsprop, metrics = ["accuracy"])
early_stopping_cb = keras.callbacks.EarlyStopping(patience = 8, restore_best_weights = True)

In [51]:
model.summary()

In [52]:
rmsprop = keras.optimizers.RMSprop(learning_rate = 5e-4)
model.compile(loss = "categorical_crossentropy", optimizer = rmsprop, metrics = ["accuracy"])
early_stopping_cb = keras.callbacks.EarlyStopping(patience = 8, restore_best_weights = True)

In [53]:
history = model.fit(train_seq, y_oh_train, batch_size = 32, epochs = 100, validation_data = (val_seq, y_oh_val),
                   callbacks = [early_stopping_cb])

Epoch 1/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 68ms/step - accuracy: 0.4184 - loss: 2.4429 - val_accuracy: 0.0000e+00 - val_loss: 4.3821
Epoch 2/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 66ms/step - accuracy: 0.6152 - loss: 1.5856 - val_accuracy: 0.2499 - val_loss: 3.5740
Epoch 3/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 70ms/step - accuracy: 0.6812 - loss: 1.3853 - val_accuracy: 0.6856 - val_loss: 1.4051
Epoch 4/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 68ms/step - accuracy: 0.7024 - loss: 1.2624 - val_accuracy: 0.7229 - val_loss: 1.2036
Epoch 5/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 64ms/step - accuracy: 0.7330 - loss: 1.1372 - val_accuracy: 0.7284 - val_loss: 1.1668
Epoch 6/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 69ms/step - accuracy: 0.7639 - loss: 1.0412 - val_accuracy: 0.7568 - val_loss: 1.0640
Epoch 

In [72]:
history2 = model2.fit(train_seq, y_oh_train, batch_size = 32, epochs = 100, validation_data = (val_seq, y_oh_val),
                   callbacks = [early_stopping_cb])

Epoch 1/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 76ms/step - accuracy: 0.3872 - loss: 2.5548 - val_accuracy: 0.4135 - val_loss: 2.7257
Epoch 2/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 86ms/step - accuracy: 0.5230 - loss: 1.9098 - val_accuracy: 0.5097 - val_loss: 2.0753
Epoch 3/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 88ms/step - accuracy: 0.5812 - loss: 1.6928 - val_accuracy: 0.6144 - val_loss: 1.7485
Epoch 4/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 84ms/step - accuracy: 0.6266 - loss: 1.5239 - val_accuracy: 0.6539 - val_loss: 1.4691
Epoch 5/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 89ms/step - accuracy: 0.6522 - loss: 1.3939 - val_accuracy: 0.6711 - val_loss: 1.3643
Epoch 6/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 96ms/step - accuracy: 0.6846 - loss: 1.3040 - val_accuracy: 0.7045 - val_loss: 1.2784
Epoch 7/10

[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 79ms/step - accuracy: 0.8502 - loss: 0.6434 - val_accuracy: 0.7991 - val_loss: 0.8944
Epoch 51/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 82ms/step - accuracy: 0.8420 - loss: 0.6577 - val_accuracy: 0.8013 - val_loss: 0.8817
Epoch 52/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 80ms/step - accuracy: 0.8489 - loss: 0.6463 - val_accuracy: 0.8114 - val_loss: 0.8850
Epoch 53/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 79ms/step - accuracy: 0.8432 - loss: 0.6847 - val_accuracy: 0.8002 - val_loss: 0.9009
Epoch 54/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 79ms/step - accuracy: 0.8540 - loss: 0.6343 - val_accuracy: 0.8036 - val_loss: 0.8983
Epoch 55/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 80ms/step - accuracy: 0.8372 - loss: 0.6804 - val_accuracy: 0.8063 - val_loss: 0.8997
Epoch 56/100
[1m

In [73]:
model.evaluate(test_seq, y_oh_test)

[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.8198 - loss: 0.8184


[0.8410524725914001, 0.812110424041748]

In [74]:
model2.evaluate(test_seq, y_oh_test)

[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step - accuracy: 0.8015 - loss: 0.9331


[0.9597750306129456, 0.7920747995376587]