Load libraries

In [16]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, GRU, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.utils import np_utils

# Seed TensorFlow's and NumPy's global random number generators (with 2 and 1
# respectively) before any model or data code runs.
import tensorflow 
from numpy.random import seed
tensorflow.random.set_seed(2)
seed(1)

import pandas as pd
import numpy as np
import string, os 

import warnings
# Blanket filter: suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# NOTE(review): FutureWarning is already covered by the blanket filter above,
# so this line looks redundant — confirm before removing.
warnings.simplefilter(action='ignore', category=FutureWarning)

Load dataset

In [8]:
curr_dir = './'
all_headlines = []

# os.listdir() returns entries in arbitrary order; sorting makes the choice of
# file deterministic, so a fresh run always trains on the same headlines.
for filename in sorted(os.listdir(curr_dir)):
    if 'Articles' in filename:
        article_df = pd.read_csv(os.path.join(curr_dir, filename))
        all_headlines.extend(list(article_df.headline.values))
        # Only the first matching file is loaded; remove this `break` to
        # accumulate headlines from every 'Articles' file in the directory.
        break

# Drop the "Unknown" placeholder headlines so they do not end up in the corpus.
all_headlines = [headline for headline in all_headlines if headline != "Unknown"]
len(all_headlines)

1066

### Data Preparation

Data cleaning

In [9]:
def clean_text(text):
  """Normalise a headline: strip punctuation, lowercase, drop non-ASCII.

  Args:
    text: The raw headline string.

  Returns:
    The cleaned string containing only lowercase ASCII characters, with every
    character found in ``string.punctuation`` removed.
  """
  kept_chars = [ch for ch in text if ch not in string.punctuation]
  lowered = ''.join(kept_chars).lower()
  # Round-trip through bytes: anything that is not plain ASCII is discarded.
  return lowered.encode('utf8').decode('ascii', 'ignore')

In [10]:
# Apply the cleaning step to every headline, preserving order.
corpus = list(map(clean_text, all_headlines))

Generating Sequence of N-gram Tokens

In [11]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Vocabulary size plus one: the size is later used as the embedding input
# dimension and as the number of output classes.
total_words = len(tokenizer.word_index) + 1

# For every headline emit each prefix of length >= 2, e.g. [a, b, c] yields
# [a, b] and [a, b, c]. The last token of each prefix later becomes the label.
input_sequences = []
for token_list in tokenizer.texts_to_sequences(corpus):
  input_sequences.extend(
      token_list[:end] for end in range(2, len(token_list) + 1)
  )

input_sequences[:10]

[[29, 506],
 [29, 506, 10],
 [29, 506, 10, 1],
 [29, 506, 10, 1, 974],
 [33, 975],
 [33, 975, 226],
 [33, 975, 226, 976],
 [167, 1],
 [167, 1, 977],
 [167, 1, 977, 313]]

Padding the sequences and obtaining the variables: predictors and target

In [12]:
def generate_padded_sequences(input_sequences):
  """Pad the n-gram sequences and split them into model inputs and labels.

  Args:
    input_sequences: List of token-id lists of varying length.

  Returns:
    A tuple ``(predictors, label, max_sequence_len)``: ``predictors`` holds
    every column but the last of the pre-padded array, ``label`` is the
    one-hot encoding of the last column (with ``total_words`` classes, read
    from the notebook's global scope), and ``max_sequence_len`` is the length
    of the longest input sequence.
  """
  max_sequence_len = max(len(seq) for seq in input_sequences)
  padded = np.array(
      pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
  )
  # The final column is the word to predict; everything before it is context.
  predictors = padded[:, :-1]
  label = np_utils.to_categorical(padded[:, -1], num_classes=total_words)
  return predictors, label, max_sequence_len

In [13]:
# Build the training arrays; max_sequence_len is reused below to size the
# model input and to pad the seed text at generation time.
predictors, label, max_sequence_len = generate_padded_sequences(input_sequences)

### GRU Model for Text Generation

In [17]:
def create_model(max_sequence_len, total_words, embedding_dim=10, gru_units=100, dropout_rate=0.1):
    """Build and compile the next-word prediction network.

    The architecture is Embedding -> GRU -> Dropout -> Dense(softmax), compiled
    with categorical cross-entropy and the Adam optimizer.

    Args:
        max_sequence_len: Length of the padded n-gram sequences, label included.
        total_words: Size of the embedding input dimension and number of
            output classes.
        embedding_dim: Size of each word embedding vector.
        gru_units: Number of units in the GRU layer.
        dropout_rate: Dropout fraction applied to the GRU output.

    Returns:
        The compiled ``Sequential`` model.
    """
    # The model input is one token shorter than max_sequence_len.
    input_len = max_sequence_len - 1

    model = Sequential()
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))
    model.add(GRU(gru_units))
    model.add(Dropout(dropout_rate))
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 23, 10)            31330     
_________________________________________________________________
gru (GRU)                    (None, 100)               33600     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 3133)              316433    
Total params: 381,363
Trainable params: 381,363
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Keep the returned History so the per-epoch loss can be inspected afterwards.
# verbose=2 is the documented "one line per epoch" mode; the previous value (5)
# is not a documented verbosity level.
history = model.fit(predictors, label, epochs=100, verbose=2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f28f3edf490>

In [20]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` by greedily predicting ``next_words`` more words.

    Uses the notebook-level ``tokenizer`` to encode the running text.

    Args:
        seed_text: Text to start generating from.
        next_words: Number of words to append.
        model: Trained model that outputs one probability per vocabulary index.
        max_sequence_len: Padded sequence length used at training time; the
            model input is one token shorter.

    Returns:
        The seed text followed by the generated words, title-cased.
    """
    # Build the index -> word lookup once instead of scanning word_index
    # for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # Sequential.predict_classes() was removed from recent TensorFlow
        # releases; taking the argmax of predict() is the equivalent.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # An index with no word attached appends an empty string, as before.
        output_word = index_to_word.get(predicted, "")
        seed_text += " "+output_word
    return seed_text.title()

In [21]:
# (seed text, number of words to generate) pairs used as a quick smoke test.
sample_prompts = [
    ("united states", 5),
    ("india and china", 4),
    ("new york", 4),
    ("science and technology", 5),
    ("malaysia", 4),
]

for prompt, n_words in sample_prompts:
    print(generate_text(prompt, n_words, model, max_sequence_len))

United States In Slow Motion Is Cited
India And China Sea Photos Suggest A
New York Is His Mixtape Bill
Science And Technology But Questions In North Korea
Malaysia The Alienist Season 1
