In [1]:
#Import required library
import csv
import io
import pathlib

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Get training file: fetch the BBC news CSV through Keras' download helper.
dataset_url = 'https://storage.googleapis.com/laurencemoroney-blog.appspot.com/bbc-text.csv'
train_data_dir = tf.keras.utils.get_file('bbc-text.csv', origin=dataset_url)
# NOTE: despite the "_dir" suffix, this value is used below as the path of the
# CSV file itself (it is passed straight to open()).
train_data = pathlib.Path(train_data_dir)

# Data prep: read the CSV into two parallel lists, one integer label and one
# text string per row. Column 0 is the category name, column 1 the text.
sentences = []
labels = []
label_dict = {'entertainment': 0, 'sport': 1, 'business': 2, 'politics': 3, 'tech': 4}
# newline='' is what the csv module expects; the explicit encoding keeps the
# result independent of the platform's default locale.
with open(train_data, 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    next(reader, None)  # skip the header row (no-op on an empty file)
    for row in reader:
        if not row:
            # Blank line (e.g. trailing newline at end of file): nothing to parse.
            continue
        # An unknown category raises KeyError on purpose: silently dropping
        # rows would skew the training set.
        labels.append(label_dict[row[0]])
        sentences.append(row[1])
# Data preprocessing: fit a word-level tokenizer on the corpus. Words it cannot
# map are replaced by the '<oov>' token.
tokenizer = Tokenizer(num_words=10000, oov_token='<oov>')
tokenizer.fit_on_texts(sentences)

# Create sequences: one list of integer word ids per sentence.
sequences = tokenizer.texts_to_sequences(sentences)

# Length of the longest sequence (0 when there are no sequences at all).
max_len = max((len(seq) for seq in sequences), default=0)

# Padding parameters: pad and truncate at the end of each sequence.
padding = 'post'
trunc_type = 'post'

# Pad every sequence to max_len so the batch is rectangular.
padded = pad_sequences(
    sequences,
    maxlen=max_len,
    padding=padding,
    truncating=trunc_type,
)

# Training data: padded id matrix and one-hot labels over the 5 categories.
X_train = np.array(padded)
y_train = tf.keras.utils.to_categorical(labels, num_classes=5)
# Define model: embed each token id into 20 dimensions, average the embeddings
# over the sequence, then classify into the 5 categories with a small MLP.
# The hidden Dense layer previously carried a hard-coded input_shape=(4491,);
# it is not the first layer and actually receives the 20-dim pooled vector, so
# that argument was incorrect and has been removed.
# NOTE(review): padding positions (id 0) are averaged together with real
# tokens by GlobalAveragePooling1D; consider mask_zero=True on the Embedding
# if that dilution matters.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=10000, output_dim=20, input_length=max_len),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax'),
])
# categorical_crossentropy matches the one-hot encoded y_train.
model.compile(
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.01),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Train model (silent); `history` keeps the per-epoch loss/accuracy.
history = model.fit(X_train, y_train, epochs=20, verbose=0)
# Get embedding: the first layer of the model holds the embedding matrix,
# one row per token id.
embedding = model.layers[0]
emb_weights = embedding.get_weights()[0]

# Create word embedding files: meta.tsv gets one word per line and vecs.tsv
# the matching tab-separated vector on the same line number.
index_word = {index: word for word, index in tokenizer.word_index.items()}

# Token ids start at 1, so the word with id k must be paired with row k of the
# embedding matrix (row 0 has no word and is skipped). The upper bound is
# whichever is smaller: the number of embedding rows or the vocabulary size,
# so a small vocabulary no longer raises KeyError.
last_index = min(emb_weights.shape[0] - 1, len(index_word))

# Context managers guarantee both files are closed even if a write fails.
with io.open('meta.tsv', 'w', encoding='utf-8') as out_m, \
        io.open('vecs.tsv', 'w', encoding='utf-8') as out_v:
    for i in range(1, last_index + 1):
        out_m.write(index_word[i] + "\n")
        out_v.write('\t'.join(str(x) for x in emb_weights[i]) + '\n')

# Predict: run one hand-written sentence through the same tokenise -> pad
# pipeline used for training and print the most probable category name.
test_sentence = ['virat played really well and it was a team effort']
test_sequence = tokenizer.texts_to_sequences(test_sentence)
test_padded = pad_sequences(
    test_sequence,
    maxlen=max_len,
    padding=padding,
    truncating=trunc_type,
)
# Reverse lookup: class id -> category name.
dict_label = {class_id: name for name, class_id in label_dict.items()}
class_scores = model.predict(test_padded)
# Only one sentence is scored, so argmax over the whole array is the class id.
predicted_class = np.argmax(class_scores)
print(dict_label[predicted_class])

Downloading data from https://storage.googleapis.com/laurencemoroney-blog.appspot.com/bbc-text.csv
business
