# Using low text processing

In [2]:
from __future__ import print_function
#import Keras library
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM, Input, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.metrics import categorical_accuracy

#import spacy, and spacy french model
# spacy is used to work on text
import spacy
nlp = spacy.load('en')

#import other libraries
import numpy as np
import pandas as pd
import random
import sys
import os
import time
import collections
from six.moves import cPickle

Using TensorFlow backend.


In [3]:
df_hack = pd.read_csv('input/HackerNoon.csv')
df_Ent = pd.read_csv('input/output.csv')
df_combined = pd.concat([df_hack, df_Ent])
df_combined.dropna(inplace = True)
df_combined.reset_index(inplace = True)

In [4]:
df_combined.head()

Unnamed: 0,index,Author,Site,Text
0,0,David Smooke,https://hackernoon.com/,“Google Cloud Platform and Firebase give Hacke...
1,1,David Smooke,https://hackernoon.com/,“Our serverless infrastructure will generate ...
2,2,David Smooke,https://hackernoon.com/,We’re excited to focus on important product d...
3,3,David Smooke,https://hackernoon.com/,”
4,4,David Smooke,https://hackernoon.com/,As a startup working to free ourselves from pl...


In [5]:
def create_wordlist(doc):
    wl = []
    for word in doc:
        if word.text not in ("\n","\n\n",'\u2009','\xa0'):
            wl.append(word.text.lower())
    return wl

In [6]:
data_list = df_combined['Text']
wordlist = []
for paragraph in data_list.values:
    doc = nlp(paragraph)
    wl = create_wordlist(doc)
    wordlist += wl

In [7]:
word_counts = collections.Counter(wordlist)

# mapping index to word : vocabulary
vocabulary_inv = [x[0] for x in word_counts.most_common()]
vocabulary_inv = list(sorted(vocabulary_inv))
# print (vocabulary_inv)

# Mapping from word to index
vocab = {x : i for i, x in enumerate(vocabulary_inv)}
words = [x[0] for x in word_counts.most_common()]

# print (vocab)
vocab_size = len(words)
print ("Vocab Size is  : ", vocab_size)

Vocab Size is  :  3030


In [8]:
# Creating sequences and next word for training.
SEQ_LENGTH = 30
SEQ_STEP = 1
sequences = []
next_words = []
for i in range(0, len(wordlist) - SEQ_LENGTH, SEQ_STEP):
    sequences.append(wordlist[i: i+SEQ_LENGTH])
    next_words.append(wordlist[i+SEQ_LENGTH])

print('nb sequences : ', len(sequences))

nb sequences :  15550


In [14]:
# creating matrix for vocab and mapping the words
X = np.zeros((len(sequences), SEQ_LENGTH, vocab_size), dtype=np.bool)
y = np.zeros((len(sequences), vocab_size), dtype = np.bool)

In [15]:
for i, sentence in enumerate(sequences):
    for t, word in enumerate(sentence):
        X[i, t, vocab[word]] = 1
    y[i, vocab[next_words[i]]] = 1

In [16]:
def bidirectional_lstm_model(SEQ_LENGTH, vocab_size):
    RNN_SIZE = 256
    
    print('Building LSTM Model')
    model = Sequential()
    model.add(Bidirectional(LSTM(RNN_SIZE, activation='relu'), input_shape = (SEQ_LENGTH, vocab_size)))
    model.add(Dropout(0.6))
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))
    
    optimizer = Adam(lr = 0.001)
    callbacks = [EarlyStopping(patience = 2, monitor = 'val_loss')]
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[categorical_accuracy])
    print("model built!")
    return model

In [17]:
md = bidirectional_lstm_model(SEQ_LENGTH, vocab_size)
md.summary()

Building LSTM Model
model built!
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 512)               6731776   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 3030)              1554390   
_________________________________________________________________
activation_1 (Activation)    (None, 3030)              0         
Total params: 8,286,166
Trainable params: 8,286,166
Non-trainable params: 0
_________________________________________________________________


In [20]:
BATCH_SIZE = 32
EPOCHS = 50
save_dir = 'output'

callbacks=[EarlyStopping(patience=4, monitor='val_loss'),
           ModelCheckpoint(filepath=save_dir + "/" + 'my_model_gen_sentences.{epoch:02d}-{val_loss:.2f}.hdf5',\
                           monitor='val_loss', verbose=2, mode='auto', period=2)]
#fit the model
history = md.fit(X, y,
                 batch_size=BATCH_SIZE,
                 shuffle=True,
                 epochs=EPOCHS,
                 callbacks=callbacks,
                 validation_split=0.1)

#save the model
md.save(save_dir + "/" + 'my_model_generate_sentences.h5')

Train on 13995 samples, validate on 1555 samples
Epoch 1/50






Epoch 2/50







Epoch 00002: saving model to output/my_model_gen_sentences.02-6.70.hdf5
Epoch 3/50




Epoch 4/50







Epoch 00004: saving model to output/my_model_gen_sentences.04-6.81.hdf5
Epoch 5/50






In [22]:
# model trained, now saving vocabulary
vocab_file = os.path.join('output', "words_vocab.pkl")
with open(os.path.join(vocab_file), 'wb') as f:
    cPickle.dump((words, vocab, vocabulary_inv), f)

In [24]:
from keras.models import load_model
print ('loading vocabulary...')
vocab_file = os.path.join('output', 'words_vocab.pkl')

with open(os.path.join('output', 'words_vocab.pkl'), 'rb') as f:
    words, vocab, voabulary = cPickle.load(f)

vocab_size = len(words)

print ('loading the model...')
model = load_model('output' + "/" + 'my_model_generate_sentences.h5')
print (model.summary())

def sample(preds, temperature=0.5):

    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

word_numbers = 30
seed_sentence = 'i learn that google is good for you , but is this following product good enough for you ?'

generated = ''
sentence = ['a']*SEQ_LENGTH

seed = seed_sentence.split()


for i in range(len(seed)):
    sentence[SEQ_LENGTH - i -1] = seed[len(seed) - i - 1]
# print(sentence)
generated += ' '.join(sentence)

for i in range(word_numbers):
    x = np.zeros((1, SEQ_LENGTH, vocab_size))
    for t, word in enumerate(sentence):
        x[0, t, vocab[word]] = 1

    preds = model.predict(x, verbose = 0)[0]
    next_index = sample(preds, 0.33)
    next_word = vocabulary_inv[next_index]

    generated += ' ' + next_word
    sentence = sentence[1:] + [next_word]

print (generated)

loading vocabulary...
loading the model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 512)               6731776   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 3030)              1554390   
_________________________________________________________________
activation_1 (Activation)    (None, 3030)              0         
Total params: 8,286,166
Trainable params: 8,286,166
Non-trainable params: 0
_________________________________________________________________
None
a a a a a a a a a a a i learn that google is good for you , but is this following product good enough for you ? the same same home , you the ai market the new business the model   the network the file   a product you