In [1]:
import tensorflow as tf
import re
import gensim
import numpy as np
import pydot

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Activation, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import LambdaCallback
#from tensorflow.keras.utils import plot_model


In [2]:
# Report how many GPU devices TensorFlow can see before any heavy work starts.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [3]:
dir_split = "../1.DataPreparationResults/obama"


def _read_text(path):
    """Return the whole content of the text file at `path`.

    The file is opened in a `with` block so its handle is closed as soon as
    the content has been read, rather than whenever the garbage collector
    happens to reclaim the file object.
    """
    with open(path, 'r') as handle:
        return handle.read()


# Raw text of each split; later cells split these strings on "<speech_sep>".
file_train = _read_text(f"{dir_split}/train.txt")
file_val = _read_text(f"{dir_split}/val.txt")
file_test = _read_text(f"{dir_split}/test.txt")

In [4]:
# Build the set of unique words over the whole text, before it is split into sentences.

def google_preprocess(file):
    """Rewrite `file` and collect its vocabulary.

    Every digit is replaced by '#', and the words "a", "and", "of" and "to"
    are capitalised when they appear with a single space on both sides
    (presumably to match the tokens of the pretrained GoogleNews vectors
    loaded later in the notebook -- confirm against that vocabulary).

    Parameters
    ----------
    file : str
        Full text of one data split; speeches may be delimited by the
        literal marker "<speech_sep>".

    Returns
    -------
    (str, set of str)
        The rewritten text (markers left in place) and the set of its
        whitespace-separated tokens, excluding the "<speech_sep>" marker.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    file2 = re.sub(r'\d', '#', file)

    # Plain literal replacements, applied in the same order as before.
    for word in ('a', 'and', 'of', 'to'):
        file2 = file2.replace(f' {word} ', f' {word.capitalize()} ')

    # Put spaces around the marker so it becomes a token of its own and never
    # sticks to a neighbouring word when splitting on whitespace.
    unique_words = set(file2.replace("<speech_sep>", " <speech_sep> ").split())
    # discard(): a text without any marker must not raise KeyError.
    unique_words.discard("<speech_sep>")
    return file2, unique_words

# Preprocess the three splits, keeping both the rewritten text and the
# vocabulary of each one.
(
    (file_train_google, unique_words_train),
    (file_val_google, unique_words_val),
    (file_test_google, unique_words_test),
) = [google_preprocess(text) for text in (file_train, file_val, file_test)]

# Vocabulary over all three splits together.
unique_words_all = unique_words_train | unique_words_val | unique_words_test
print("total number of unique words: ",len(unique_words_all))

total number of unique words:  10774


In [5]:
x_len = 30  # number of input words in each window (one target word follows them)
x_step = 1  # stride, in words, between the starts of consecutive windows


def file_to_rolling_sentences(file):
    """Slice every speech in `file` into overlapping word windows.

    The text is split into speeches on "<speech_sep>", each speech into
    whitespace-separated words, and every speech is covered by windows of
    ``x_len + 1`` consecutive words (inputs plus the target word), started
    every ``x_step`` words.

    Parameters
    ----------
    file : str
        Text of one split, speeches delimited by "<speech_sep>".

    Returns
    -------
    list of list of str
        One list of ``x_len + 1`` words per window. Speeches shorter than
        ``x_len + 1`` words contribute nothing.
    """
    window = x_len + 1
    sentences = []

    for speech in file.split("<speech_sep>"):
        list_words = speech.split()
        # A marker at the start/end of the text (or two markers in a row)
        # yields an empty segment: skip it, but keep processing the speeches
        # that follow instead of stopping at the first empty one.
        if not list_words:
            continue

        # The last valid start index is len(list_words) - window, so the
        # exclusive bound is one past it; this keeps the final window.
        for i in range(0, len(list_words) - window + 1, x_step):
            sentences.append(list_words[i:i + window])

    return sentences

# Rolling-window view of the preprocessed training text.
# NOTE(review): `train_sentences` is rebound by the sentence-level split in
# the next cell, so this result is not what the tokenizer ends up seeing.
# train_sentences = file_to_sentences(file_train)
train_sentences = file_to_rolling_sentences(file_train_google)

In [6]:
def file_to_each_sentence(file):
    """Split `file` into individual sentences, each ending with '.'.

    The text is split into speeches on "<speech_sep>" and each speech into
    fragments on '.'; every fragment is stripped of surrounding whitespace
    and gets its full stop back.

    Fragments that are empty after stripping are skipped. These arise after
    the last '.' of a speech and from empty speech segments, and would
    otherwise be emitted as sentences consisting of a lone ".".

    Parameters
    ----------
    file : str
        Text of one split, speeches delimited by "<speech_sep>".

    Returns
    -------
    list of str
        The sentences in document order.
    """
    sentence_all = []

    for speech in file.split("<speech_sep>"):
        for fragment in speech.split('.'):
            sentence = fragment.strip()
            if sentence:
                sentence_all.append(sentence + '.')

    return sentence_all

# Sentence-level split of the preprocessed training text; this replaces the
# rolling-window `train_sentences` assigned in the previous cell.
train_sentences = file_to_each_sentence(file_train_google)

In [7]:
# NOTE(review): per the Keras Tokenizer documentation, `num_words` keeps only
# the `num_words - 1` most frequent words when texts are converted to
# sequences. With 25, almost every word is dropped from the training
# sequences -- confirm this value is intended.
number_words = 25
# Length every tokenized sentence is padded/truncated to further down.
maxlength = 50

# `num_words` is the current name of this argument; the legacy `nb_words`
# spelling is only accepted through a deprecation shim that emits a warning.
tokenizer = Tokenizer(num_words=number_words)
tokenizer.fit_on_texts(train_sentences)
tokenized_sequences = tokenizer.texts_to_sequences(train_sentences)



In [8]:
# `word_index` maps each word seen by the tokenizer to its integer token id.
# It is not limited by `number_words`: the saved output below reports 5940
# entries although the tokenizer was created with number_words = 25.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 5940 unique tokens.


In [9]:
# Reverse lookup (token id -> word), used to turn sampled ids back into text.
index_word = new_dict = {token_id: word for word, token_id in word_index.items()}

In [10]:
# Bring every tokenized sentence to exactly `maxlength` ids so the result is
# one rectangular array. No padding/truncating side is passed, so the Keras
# defaults apply (presumably both at the front -- confirm in the
# pad_sequences documentation).
data = pad_sequences(tokenized_sequences, maxlen=maxlength)

In [11]:
# Next-word setup: every column but the last is the model input, and the last
# column is the target token id.
train_x, train_y = data[:, :-1], data[:, -1]

### Prepare Word Embeddings

In [12]:
# Load the pretrained GoogleNews vectors from a binary word2vec-format file.
# NOTE(review): in the saved notebook this cell ends with a KeyboardInterrupt
# and the cells after it carry no execution count, so everything below
# appears not to have been run in the saved state.
google_word_model = gensim.models.KeyedVectors.load_word2vec_format('../../test/GoogleNews-vectors-negative300.bin', binary=True)

KeyboardInterrupt: 

In [None]:
#get all the vectors
#pretrained_weights = google_word_model.wv.vectors

#word2idx
def word2idx(word):
    """Return the row index of `word` in the pretrained embedding table.

    `google_word_model` is already a `KeyedVectors` object, so the deprecated
    `.wv` indirection is not used. Both gensim generations are supported:
    gensim >= 4 exposes the mapping as `key_to_index`, while gensim 3.x
    stores it as `vocab[word].index`.

    Raises
    ------
    KeyError
        If `word` is not in the pretrained vocabulary.
    """
    key_to_index = getattr(google_word_model, "key_to_index", None)
    if key_to_index is not None:
        return key_to_index[word]
    # gensim 3.x fallback
    return google_word_model.vocab[word].index
    

#idx2word
def idx2word(idx):
    """Return the word stored at row `idx` of the pretrained embedding table.

    `google_word_model` is already a `KeyedVectors` object, so the deprecated
    `.wv` indirection is not used. Both gensim generations are supported:
    gensim >= 4 exposes the list as `index_to_key`, while gensim 3.x calls it
    `index2word`.

    Raises
    ------
    IndexError
        If `idx` is outside the vocabulary.
    """
    index_to_key = getattr(google_word_model, "index_to_key", None)
    if index_to_key is not None:
        return index_to_key[idx]
    # gensim 3.x fallback
    return google_word_model.index2word[idx]

#get vector of word
#google_word_model["Hi"]

In [None]:
EMBEDDING_DIM = 300
# One row per token id (plus one extra row); rows start as zeros and are
# overwritten only for words that have a pretrained vector.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

for word, token_id in word_index.items():
    # Words missing from the pretrained vocabulary keep their all-zero row.
    if word in google_word_model:
        embedding_matrix[token_id] = google_word_model[word]

In [None]:
# Frozen embedding layer initialised from the pretrained matrix; the input
# length is one less than `maxlength` because the last token of every padded
# row is used as the target.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=maxlength - 1,
    trainable=False,
)


In [None]:
# Next-word classifier: frozen pretrained embeddings -> LSTM -> two dense
# layers with dropout -> softmax over token ids.
# NOTE(review): the two Dense(64) layers have no activation, so they are
# purely linear -- confirm whether a non-linearity (e.g. relu) was intended.
model = tf.keras.Sequential()
model.add(embedding_layer)
model.add(LSTM(EMBEDDING_DIM))
model.add(Dense(64))
model.add(Dropout(0.2))
model.add(Dense(64))
model.add(Dropout(0.2))
# Token ids run from 0 (padding) up to len(word_index) inclusive, so the
# softmax needs len(word_index) + 1 units -- the same size as the embedding
# table. With only len(word_index) units the highest id is not a valid label
# for sparse_categorical_crossentropy.
model.add(Dense(len(word_index) + 1, activation='softmax'))

# Targets are integer token ids, hence the sparse variant of the loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [None]:
# The direct `plot_model` import at the top of the notebook is commented out,
# so the bare name raises NameError on a fresh kernel; go through the `tf`
# namespace, which is imported.
tf.keras.utils.plot_model(model)

In [None]:
def sample(preds, temperature=1.0):
    """Pick an index from the probability vector `preds`.

    With ``temperature <= 0`` the most probable index is returned. Otherwise
    the probabilities are re-weighted as ``exp(log(p) / temperature)``,
    renormalised to sum to one, and a single index is drawn at random from
    the resulting distribution using NumPy's global random state.

    Parameters
    ----------
    preds : array-like of float
        Probabilities over the candidate indices.
    temperature : float, default 1.0
        Values below 1 sharpen the distribution, values above 1 flatten it.

    Returns
    -------
    int-like
        The selected index.
    """
    greedy = temperature <= 0
    if greedy:
        return np.argmax(preds)

    log_probs = np.log(np.asarray(preds).astype('float64'))
    weights = np.exp(log_probs / temperature)
    distribution = weights / np.sum(weights)

    # One multinomial trial produces a one-hot row; argmax recovers its index.
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)


def generate_next(text, num_generated=10, temperature=0.7):
    """Extend `text` with words sampled from the model, one at a time.

    The prompt is lower-cased and split on whitespace; words missing from
    `word_index` are dropped. After each prediction the sampled token id is
    appended and the model input is rebuilt from the extended sequence.

    Parameters
    ----------
    text : str
        Prompt to continue.
    num_generated : int, default 10
        Number of tokens to sample.
    temperature : float, default 0.7
        Sampling temperature forwarded to `sample` (previously hard-coded).

    Returns
    -------
    str
        The words of the final model input window (prompt plus generated
        tokens, padding removed), joined with single spaces.
    """
    token_ids = [word_index[word] for word in text.lower().split() if word in word_index]

    def _as_model_input(ids):
        # Pad/truncate to `maxlength`, then drop the first column so the width
        # is maxlength - 1, the input length the embedding layer expects.
        return pad_sequences([ids], maxlen=maxlength)[:, 1:]

    padded_sequences = _as_model_input(token_ids)

    # Echo the encoded prompt (kept from the original behaviour).
    print(padded_sequences)

    for _ in range(num_generated):
        prediction = model.predict(x=padded_sequences)
        # The batch holds a single row, so [-1] selects its distribution.
        idx = sample(prediction[-1], temperature=temperature)

        # A plain list append avoids re-allocating an array on every step.
        token_ids.append(int(idx))
        padded_sequences = _as_model_input(token_ids)

    return ' '.join(index_word[idx] for idx in padded_sequences[0] if idx != 0)
    #return ' '.join([idx2word(idx) for idx in padded_sequences[0] if idx != 0])

def on_epoch_end(epoch, _):
    """Print a generated continuation for each fixed prompt.

    Takes two positional arguments, neither of which is used; presumably the
    (epoch, logs) pair that a Keras `LambdaCallback(on_epoch_end=...)`
    passes, as hinted by the disabled callback on the `model.fit` call.
    """
    #print('\nGenerating text after epoch: %d' % epoch)

    prompts = ("There are two ways to love you",)

    for prompt in prompts:
        generated = generate_next(prompt)
        print('%s... -> %s' % (prompt, generated))

In [None]:
# Quick generation check. In notebook order this runs before `model.fit`, so
# the text comes from a model whose trainable layers are still untrained.
generate_next("Hello How are you")

In [None]:
# Train the next-word model. The per-epoch text-generation callback is left
# disabled in the trailing comment.
model.fit(train_x, train_y, epochs=50, batch_size=128) #, callbacks=[LambdaCallback(on_epoch_end=on_epoch_end)])