In [1]:
import csv
import tensorflow as tf
from tensorflow.keras.layers import Dense, GlobalAveragePooling1D, Embedding

In [2]:
import numpy as np
# Training split: model inputs and their integer labels (passed to model.fit as x / y).
train_padded = np.load("data/bbc_news/train/train_padded.npy")
training_label_seq = np.load("data/bbc_news/train/training_label_seq.npy")

# Validation split: same layout, used as validation_data during fitting.
validation_padded = np.load("data/bbc_news/validation/validation_padded.npy")
validation_label_seq = np.load("data/bbc_news/validation/validation_label_seq.npy")

In [3]:
import json
# word -> integer index mapping; it is inverted later to label embedding rows.
with open('data/bbc_news/word_index.json') as word_index_file:
    word_index = json.loads(word_index_file.read())

# Presumably the equivalent mapping for the class labels -- TODO confirm against
# the preprocessing notebook that wrote this file.
with open('data/bbc_news/label_word_index.json') as label_index_file:
    label_word_index = json.loads(label_index_file.read())

In [4]:
# Length of each input sequence fed to the Embedding layer (its input_length).
max_length = 120
# Number of rows in the embedding matrix (the Embedding layer's input dimension).
vocab_size = 10000
# Width of each embedding vector (the Embedding layer's output dimension).
embedding_dim = 16

In [5]:
# Text classifier: embed each token, average the token vectors over the
# sequence, then classify with a small dense head ending in a 6-way softmax.
model = tf.keras.Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    GlobalAveragePooling1D(),
    Dense(24, activation='relu'),
    Dense(6, activation="softmax"),
])

# The sparse categorical loss takes integer class labels rather than one-hot vectors.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 150       
Total params: 160,558
Trainable params: 160,558
Non-trainable params: 0
_________________________________________________________________


In [6]:
# Train for 30 epochs, evaluating on the validation split after each epoch;
# the returned History object is kept for later inspection.
history = model.fit(
    train_padded,
    training_label_seq,
    epochs=30,
    validation_data=(validation_padded, validation_label_seq),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [7]:
# Persist the trained model under model/bbc_news.
tf.keras.models.save_model(
    model=model,
    filepath='model/bbc_news',
)

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: model/bbc_news\assets


In [7]:
# The Embedding layer was added first, so it is layer 0 of the model.
e = model.layers[0]
# get_weights() returns a list of arrays; the first one is the embedding matrix.
embedding_layer_weights = e.get_weights()
weights = embedding_layer_weights[0]
print(weights.shape)  # shape: (vocab_size, embedding_dim)

(10000, 16)


In [8]:
import io

# Invert the vocabulary (word -> index) so a word can be looked up by its index.
reverse_word_index = {index: word for (word, index) in word_index.items()}

# Export the learned embeddings as two parallel TSV files: vecs.tsv holds one
# tab-separated vector per line and meta.tsv holds the matching word on the
# same line number.
# The `with` block guarantees both files are flushed and closed even if an
# error is raised part-way through the export.
with io.open('data/bbc_news/visualize/vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('data/bbc_news/visualize/meta.tsv', 'w', encoding='utf-8') as out_m:
    # Index 0 is skipped -- presumably reserved for padding; confirm against
    # the tokenizer used in preprocessing.
    for word_num in range(1, vocab_size):
        word = reverse_word_index.get(word_num)
        if word is None:
            # The vocabulary has no word for this index (fewer than vocab_size
            # distinct words). Skip the row in BOTH files so they stay aligned.
            continue
        embeddings = weights[word_num]
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")