Further optimized by **Hyden J**

In [2]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam

In [3]:
data = pd.read_csv('ArticlesApril2017.csv')
data.head(1)

Unnamed: 0,abstract,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,,58def1347c459f24986d7c80,716,By STEPHEN HILTNER and SUSAN LEHMAN,article,Finding an Expansive View of a Forgotten Peop...,"['Photography', 'New York Times', 'Niger', 'Fe...",3,Insider,2,2017-04-01 00:15:41,Unknown,One of the largest photo displays in Times his...,The New York Times,News,https://www.nytimes.com/2017/03/31/insider/nig...


In [4]:
data['headline'] = data['headline'].apply(str.lower)
headlines = data['headline'].values

headlines[:5]

array(['finding an expansive view  of a forgotten people in niger',
       'and now,  the dreaded trump curse',
       'venezuela’s descent into dictatorship',
       'stain permeates basketball blue blood',
       'taking things for granted'], dtype=object)

In [5]:
# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(headlines) # Create the vocabulary
sequences = tokenizer.texts_to_sequences(headlines) # Use the vocabulary to convert text to seqs

sequences[:5]

[[180, 21, 682, 380, 4, 2, 683, 181, 5, 684],
 [6, 84, 1, 685, 11, 686],
 [687, 688, 134, 689],
 [690, 691, 692, 693, 694],
 [108, 182, 8, 695]]

In [6]:
# Prepare training data
input_sequences = []
X = []
y = []
for i in sequences:
        X.append(i[:-1]) # Feature is everything except last element
        y.append(i[-1]) # Target is last element
  

X[:3], y[:3]

([[180, 21, 682, 380, 4, 2, 683, 181, 5],
  [6, 84, 1, 685, 11],
  [687, 688, 134]],
 [684, 686, 689])

In [7]:
# Padding: [23,45] to [0,0,23,45]
X = pad_sequences(X) 
y = np.array(y)

X[0],"LABEL",y[0]

(array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 180,  21,
        682, 380,   4,   2, 683, 181,   5]),
 'LABEL',
 684)

In [8]:
vocab_size = len(tokenizer.word_index) + 1

# LSTM accepts 3D input: No. of rows, No. of timesteps (words), No. of features per timestep (1 word so 1)
lstm_shape = (X.shape[1],1) # to tell LSTM
X_forLSTM = X.reshape(X.shape[0], X.shape[1], 1) # reshaping X shape

X_forLSTM[0]

array([[  0],
       [  0],
       [  0],
       [  0],
       [  0],
       [  0],
       [  0],
       [  0],
       [  0],
       [  0],
       [  0],
       [180],
       [ 21],
       [682],
       [380],
       [  4],
       [  2],
       [683],
       [181],
       [  5]])

In [9]:
# LSTM model
model = Sequential([
    LSTM(100, input_shape=lstm_shape),
    Dense(vocab_size, activation='softmax')
])

model.compile(
    optimizer=Adam(learning_rate=0.01), 
    metrics=['accuracy'], 
    loss='sparse_categorical_crossentropy' # For multiple categories
)


  super().__init__(**kwargs)


In [10]:
model.fit(X_forLSTM, y, epochs=20, verbose=False)

<keras.src.callbacks.history.History at 0x19aac13cbf0>

In [11]:
num_words_to_generate = 5  # Generate 5 new words
text = "The cat"

for _ in range(num_words_to_generate):
    # Turn the text into sequences [123,456,23]
    token_list = tokenizer.texts_to_sequences([text])[0]

    # Pad them [0,0,0,123,456,23]
    padded_sequence = pad_sequences([token_list], maxlen=X.shape[1])

    # Get the prediction (reshape: 1 row, X.shape[1] timesteps and 1 feature per timstep)
    predicted = model.predict(padded_sequence.reshape(1, X.shape[1], 1))

    # Find which word has this index
    predicted_word = tokenizer.index_word[np.argmax(predicted)]
    text += " " + predicted_word

print("\nFinal generated text:", text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 257ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 257ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step

Final generated text: The cat unknown name up soar crisis
