In [1]:
# prompt: Mount drive

from google.colab import drive
drive.mount('/content/drive')

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, GRU
from tensorflow.keras.layers import Bidirectional


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
data = pd.read_json("/content/drive/MyDrive/ML Datasets/News_Category_Dataset_v3.json", lines = True)
data.head()
data.shape

(209527, 6)

In [3]:
data = data.loc[:2000, 'short_description']
data.head()


Unnamed: 0,short_description
0,Health experts said it is too early to predict...
1,He was subdued by passengers and crew when he ...
2,"""Until you have a dog you don't understand wha..."
3,"""Accidentally put grown-up toothpaste on my to..."
4,Amy Cooper accused investment firm Franklin Te...


In [4]:
input_data = ''.join(data)

In [5]:
tokenizer = Tokenizer(lower = True, oov_token = "undefined")
tokenizer.fit_on_texts([input_data])

In [6]:
df = pd.DataFrame({"Word": tokenizer.word_index.keys(), "Index": tokenizer.word_index.values()})
df.sample(10)

Unnamed: 0,Word,Index
8334,foment,8335
6120,crane,6121
5011,mates,5012
6255,suk,6256
4718,argyle’s,4719
6123,batman”,6124
2214,broke,2215
8254,tucked,8255
1648,islands,1649
6047,introduction,6048


In [7]:
total_words = max(tokenizer.word_index.values()) + 1
max(tokenizer.word_index.values())

8669

In [8]:
structured_data = []
for sentence in data:
    tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]
    for i in range(1, len(tokenized_sentence)):
        structured_data.append(tokenized_sentence[:i+1])

In [9]:
max_len = max([len(x) for x in structured_data])
max_len

44

In [10]:
padded_structured_data = pad_sequences(structured_data, maxlen = max_len, padding = 'pre')
padded_structured_data

array([[   0,    0,    0, ...,    0,  105,  197],
       [   0,    0,    0, ...,  105,  197,   12],
       [   0,    0,    0, ...,  197,   12,   22],
       ...,
       [   0,    0,    0, ..., 1183,   18, 2236],
       [   0,    0,    0, ...,   18, 2236,    5],
       [   0,    0,    0, ..., 2236,    5,  441]], dtype=int32)

In [11]:
assert len(structured_data) == padded_structured_data.shape[0]
padded_structured_data.shape

(38117, 44)

In [12]:
X = padded_structured_data[:, :-1]
y = padded_structured_data[:, -1]

print(X.shape)
print(y.shape)

(38117, 43)
(38117,)


In [13]:
y = to_categorical(y, num_classes = total_words)
print(y.shape)

(38117, 8670)


In [14]:
model = Sequential()

model.add(Input(shape = (X.shape[1],)))
model.add(Embedding(input_dim = total_words, output_dim = 100))
model.add(Bidirectional(LSTM(units = 250, return_sequences = True)))
model.add(Bidirectional(LSTM(units = 250, return_sequences = True)))
model.add(Bidirectional(LSTM(units = 250)))
model.add(Dense(units = total_words, activation = 'sigmoid'))

model.summary()

In [15]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(x = X, y = y, epochs = 20, verbose = False)

In [16]:
import plotly.graph_objects as go
fig = go.Figure(go.Scatter(x=np.arange(0, len(history.history['accuracy'])), y=history.history['accuracy'], mode='lines', name='Accuracy'))
fig.update_layout(
    height=500,
    width=1000,
    xaxis_title="Epochs",
    yaxis_title="Accuracy",
    template="plotly_white"
)
fig.show()

In [17]:
def next_word_predictor(text: str) -> str:
    for i in range(7): # Generating next 7 words
        # Tokenize
        token_text = tokenizer.texts_to_sequences([text])[0]

        # Padding
        padded_text = pad_sequences([token_text], maxlen = 56)

        # Prediction
        pos = np.argmax(model.predict(padded_text)) # This will give index of word which got highest probability

        for word, index in tokenizer.word_index.items():
            if index == pos:
                text = text + " " + word
    return text

next_word_predictor(text = "He was")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 501ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step


'He was subdued by hannah eggs and the united'

In [18]:
next_word_predictor(text = "The woman")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step


'The woman is weighing fraud talking to overturn a'

In [19]:
next_word_predictor(text = "He is not")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step


'He is not intense unattended and shifted their pledge to'