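Data source: the "New York Times Comments" dataset on Kaggle, https://www.kaggle.com/aashita/nyt-comments (this notebook uses only the `Articles*.csv` files from the archive).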

In [0]:
!unzip 19447_31436_bundle_archive.zip -d input

Archive:  19447_31436_bundle_archive.zip
  inflating: input/ArticlesApril2017.csv  
  inflating: input/ArticlesApril2018.csv  
  inflating: input/ArticlesFeb2017.csv  
  inflating: input/ArticlesFeb2018.csv  
  inflating: input/ArticlesJan2017.csv  
  inflating: input/ArticlesJan2018.csv  
  inflating: input/ArticlesMarch2017.csv  
  inflating: input/ArticlesMarch2018.csv  
  inflating: input/ArticlesMay2017.csv  
  inflating: input/CommentsApril2017.csv  
  inflating: input/CommentsApril2018.csv  
  inflating: input/CommentsFeb2017.csv  
  inflating: input/CommentsFeb2018.csv  
  inflating: input/CommentsJan2017.csv  
  inflating: input/CommentsJan2018.csv  
  inflating: input/CommentsMarch2017.csv  
  inflating: input/CommentsMarch2018.csv  
  inflating: input/CommentsMay2017.csv  


In [0]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
import tensorflow.keras.utils as ku 
import tensorflow_hub as hub
import numpy as np
import string, os
import pandas as pd
import re

In [0]:
# URL of the TF-Hub text-embedding module used to turn headline prefixes into
# fixed-size feature vectors.
module_url = "https://tfhub.dev/google/nnlm-en-dim128/2"
# Wrap the module as a Keras layer. It is called with a list of strings and
# returns one embedding row per string (this notebook later prints a
# (51770, 128) result for 51770 input strings).
embed = hub.KerasLayer(module_url)

In [4]:
!unzip 19447_31436_bundle_archive.zip -d input

Archive:  19447_31436_bundle_archive.zip
replace input/ArticlesApril2017.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [5]:
# Collect the headline column from every "Articles*" CSV in the input directory.
curr_dir = 'input/'
all_headlines = []
# os.listdir() returns entries in arbitrary order; sorting makes the corpus
# order (and therefore the tail slice used as validation data during training)
# reproducible across machines and runs.
for filename in sorted(os.listdir(curr_dir)):
    if 'Articles' in filename:
        article_df = pd.read_csv(os.path.join(curr_dir, filename))
        # Drop missing headlines: a NaN here is a float and would break the
        # character-level text cleaning applied to every headline afterwards.
        all_headlines.extend(article_df.headline.dropna().tolist())

# "Unknown" is the placeholder used for articles without a real headline.
all_headlines = [h for h in all_headlines if h != "Unknown"]
len(all_headlines)

8603

In [6]:
def clean_text(txt):
    """Normalize a headline for tokenization.

    Removes every character listed in ``string.punctuation``, lowercases the
    remainder, and finally drops any character that cannot be represented in
    ASCII (the UTF-8 bytes are decoded as ASCII with errors ignored).
    """
    kept_chars = [ch for ch in txt if ch not in string.punctuation]
    lowered = "".join(kept_chars).lower()
    ascii_only = lowered.encode("utf8").decode("ascii", "ignore")
    return ascii_only

# Normalized headlines, in the same order as `all_headlines`.
corpus = list(map(clean_text, all_headlines))
# Preview the first ten cleaned headlines.
corpus[:10]

['my beijing the sacred city',
 '6 million riders a day 1930s technology',
 'seeking a crossborder conference',
 'questions for despite the yuck factor leeches are big in russian medicine',
 'who is a criminal',
 'an antidote to europes populism',
 'the cost of a speech',
 'degradation of the language',
 'on the power of being awful',
 'trump garbles pitch on a revised health bill']

In [7]:
# Presumably a smoke test: call the hub layer on a one-element list holding the
# first cleaned headline. The returned embedding is discarded.
embed([corpus[0]])
# Cell output: the last whitespace-separated word of the first headline.
corpus[0].split()[-1]

'city'

In [75]:
# Module-level tokenizer shared by the functions below and by `predict`; it is
# fitted inside `get_sequence_of_tokens`.
tokenizer = Tokenizer()

def get_embd(corpus):
    """Embed every proper prefix of every line in `corpus`.

    A line with words w1..wn contributes the prefixes "w1", "w1 w2", ...,
    "w1 ... w(n-1)" (the complete line is not included), in corpus order.
    Lines with fewer than two words contribute nothing.

    Returns:
        A pair ``(embeddings, prefix_array)``: the result of calling the
        module-level `embed` layer on the list of prefix strings, and a NumPy
        array holding each prefix wrapped in its own one-element row.
    """
    prefixes = []
    for line in corpus:
        words = line.split()
        # end = 1 .. len(words) - 1, so the last word is never part of a prefix.
        prefixes.extend(" ".join(words[:end]) for end in range(1, len(words)))
    prefix_rows = [[prefix] for prefix in prefixes]
    return embed(prefixes), np.array(prefix_rows)
# x: embeddings of all headline prefixes (the model inputs);
# xs: the same prefixes as a NumPy array of strings.
x, xs = get_embd(corpus)


def get_sequence_of_tokens(corpus):
    """Fit the shared tokenizer on `corpus` and expand each line into token prefixes.

    For a line tokenized as t1..tn the prefixes [t1, t2], [t1, t2, t3], ...,
    [t1, ..., tn] are produced, in corpus order.

    Returns:
        A pair ``(input_sequences, total_words)`` where `input_sequences` is
        the list of token-id prefixes and `total_words` is the size of the
        fitted vocabulary plus one.
    """
    # Build the vocabulary on the module-level tokenizer.
    tokenizer.fit_on_texts(corpus)
    # +1 presumably leaves room for index 0, which is not in `word_index`.
    total_words = len(tokenizer.word_index) + 1

    # Every prefix of length >= 2: the final token of each prefix is later
    # split off as the prediction target.
    input_sequences = [
        token_list[:end]
        for token_list in tokenizer.texts_to_sequences(corpus)
        for end in range(2, len(token_list) + 1)
    ]

    return input_sequences, total_words

def generate_padded_sequences(input_sequences):
    """Left-pad the token sequences and split them into inputs and one-hot labels.

    Reads the module-level `total_words` for the number of label classes, so
    that name must be defined before this function is called.

    Returns:
        A triple ``(predictors, label, max_sequence_len)``: all columns but the
        last of the padded matrix, the one-hot encoding of the last column, and
        the length of the longest input sequence.
    """
    max_sequence_len = max(len(seq) for seq in input_sequences)
    padded = np.array(
        pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    )

    # The last column is the word to predict; everything before it is context.
    predictors = padded[:, :-1]
    last_tokens = padded[:, -1]
    label = ku.to_categorical(last_tokens, num_classes=total_words)
    return predictors, label, max_sequence_len
# Token-id prefixes and vocabulary size, then padded inputs and one-hot labels.
inp_sequences, total_words = get_sequence_of_tokens(corpus)
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

# The embeddings in `x` and the one-hot rows in `label` come from two
# independent passes over the corpus (whitespace split vs. tokenizer), and
# training pairs them purely by row position. Fail loudly if the two passes
# ever produce a different number of samples instead of training on
# misaligned data.
assert x.shape[0] == len(label), (
    "embedding rows (%d) != label rows (%d)" % (x.shape[0], len(label))
)
print(x.shape, len(label))

(51770, 128) 51770


In [0]:
import pickle

# Persist the fitted tokenizer so that predicted class indices can be mapped
# back to words without refitting.
# NOTE: only unpickle this file from a trusted location; loading a pickle can
# execute arbitrary code.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    tokenizer_file.write(
        pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL)
    )

In [77]:
# Feed-forward next-word classifier: one hidden ReLU layer on top of the
# sentence embedding, followed by a softmax over the tokenizer vocabulary.
model = Sequential()
model.add(Dense(256, activation='relu'))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
# Build with an unspecified batch dimension. Passing `x.shape` directly would
# record the size of the training set as the model's batch size, although the
# model is trained in mini-batches and queried with single sentences.
model.build((None, x.shape[-1]))
model.summary()


Model: "sequential_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_31 (Dense)             multiple                  33024     
_________________________________________________________________
dense_32 (Dense)             multiple                  2895105   
Total params: 2,928,129
Trainable params: 2,928,129
Non-trainable params: 0
_________________________________________________________________


In [78]:
import tensorflow as tf
# Stop once the monitored loss has failed to improve for 3 consecutive epochs.
# NOTE(review): a validation split is configured below, yet the callback
# watches the training loss -- confirm that 'val_loss' was not intended.
callback = EarlyStopping(monitor='loss', patience=3)

model.fit(
    x,
    label,
    epochs=10,
    batch_size=32,
    shuffle=True,
    validation_split=0.1,
    callbacks=[callback],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f0508c85588>

In [79]:
def predict(model, sentence):
    """Print and return the most likely next word for `sentence`.

    Args:
        model: Trained Keras model that maps a sentence embedding to a
            probability distribution over tokenizer indices.
        sentence: Text prefix to continue.

    Returns:
        The predicted word, or None when the predicted index has no entry in
        the tokenizer vocabulary (in which case nothing is printed).
    """
    # Apply the same normalization that was applied to the training corpus;
    # otherwise casing and punctuation make the input differ from anything
    # the model was trained on.
    features = embed([clean_text(sentence)])
    # `Sequential.predict_classes` is deprecated and has been removed from
    # tf.keras; taking the argmax of the predicted probabilities is the
    # documented replacement.
    probabilities = model.predict(features, verbose=0)
    predicted_index = int(np.argmax(probabilities, axis=-1)[0])
    # Direct index -> word lookup instead of scanning the whole vocabulary.
    output_word = tokenizer.index_word.get(predicted_index)
    if output_word is not None:
        print(output_word)
    return output_word

# Example query: print the word the trained model predicts after this prefix.
predict(model, "What is the craziest")

that


In [0]:
# Persist the trained model to 'my_model.h5'; it is reloaded from this file
# further down in the notebook.
model.save('my_model.h5') 

In [0]:
from tensorflow.keras.models import load_model

In [82]:
# Round trip: reload the model saved to 'my_model.h5' and run a prediction
# with the reloaded copy.
m = load_model("my_model.h5")
predict(m, "space")

and
