# Classifying news wires: a multiclass classification example

## The Reuters dataset

### Loading the dataset

In [42]:


from keras.datasets import reuters

(X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=10000)

### Decoding news wires back to text

In [43]:
word_index = reuters.get_word_index()
print(len(word_index))

30979


In [44]:
reversed_word_index = dict([(value, key) for (key, value) in word_index.items()])
decoded_newswire = ' '.join([reversed_word_index.get(i - 3, '?') for i in X_train[200]])
print(decoded_newswire)

? japan's seasonally adjusted unemployment rate rose to a record 3 0 pct in january the worst since the government started compiling unemployment statistics under its current system in 1953 up from the previous record 2 9 pct in december the government's management and coordination agency said unemployment was up from 2 8 pct a year earlier unadjusted january unemployment totalled 1 82 mln people up from 1 61 mln in december and 1 65 mln a year earlier male unemployment in january remained at 2 9 pct equal to the second worst level set last december record male ? of 3 1 pct was set in july 1986 female unemployment in january remained at 3 0 pct equal to the record level marked in april august september and december last year january's record 3 0 pct unemployment rate mainly stemmed from loss of jobs in manufacturing industries particularly in export related firms due to the yen's continuing appreciation against the dollar officials said employment in manufacturing industries fell 380 0

### Encoding the data

In [45]:
import numpy as np


def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results


X_train_vectorize = vectorize_sequences(X_train)
X_test_vectorize = vectorize_sequences(X_test)

In [52]:
print(X_train_vectorize)
print(len(X_train_vectorize[0]))
print(len(X_train_vectorize))
print(y_train)
# print(y_train[0])

[[0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 ...
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]]
10000
8982
[ 3  4  3 ... 25  3 25]


In [47]:
# from collections import Counter
#
# dimension = Counter(y_train)
# print(len(dimension))
def to_one_hot(labels, _dimension=46):
    results = np.zeros((len(labels), _dimension))
    for index, value in enumerate(labels):
        results[index, value] = 1.

    return results


y_train_one_hot = to_one_hot(y_train)
print(y_train_one_hot)
y_test_one_hot = to_one_hot(y_test)
print(y_test)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[ 3 10  1 ...  3  3 24]


#### Use to_categorical method in keras

In [48]:
from keras.utils.np_utils import to_categorical

y_train_one_hot_lib = to_categorical(y_train, num_classes=46)
y_test_one_hot_lib = to_categorical(y_test, num_classes=46)

In [49]:
print(np.array_equal(y_train_one_hot, y_train_one_hot_lib))

True


### Building the network

In [55]:
from keras.models import Sequential
from keras.layers import Dense
from keras.activations import softmax, relu
from keras.optimizers import RMSprop
from keras.losses import CategoricalCrossentropy
from keras.metrics import Accuracy


def build_model():
    _model = Sequential()
    _model.add(Dense(64, activation=relu, input_shape=(10000,)))
    _model.add(Dense(64, activation=relu))
    _model.add(Dense(46, activation=softmax))

    _model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    _model.summary()
    return _model

#### Compile the model

In [59]:
from keras.callbacks import ModelCheckpoint

model = build_model()
checkpoint_filepath = './best_model/multi_weights.{epoch:02d}-{val_accuracy:.2f}.h5'
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

model.fit(X_train_vectorize, y_train_one_hot_lib, epochs=20, batch_size=512, validation_split=0.1,
          callbacks=[model_checkpoint_callback])

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 64)                640064    
                                                                 
 dense_13 (Dense)            (None, 64)                4160      
                                                                 
 dense_14 (Dense)            (None, 46)                2990      
                                                                 
Total params: 647,214
Trainable params: 647,214
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2216e3b0dc0>

In [57]:
from keras.models import load_model

best_model = load_model(checkpoint_filepath)

OSError: No file or directory found at multi_weights.{epoch:02d}-{val_loss:.2f}.hdf5