In [2]:
# Reference: https://stackoverflow.com/questions/42064690/using-pre-trained-word2vec-with-lstm-for-word-generation
# https://rare-technologies.com/word2vec-tutorial/

import random
import sys
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding
import gensim

  from ._conv import register_converters as _register_converters


In [27]:
# import string

max_sentence_len = 40
documents = ["Human machine interface for lab abc computer applications . ",
             "A survey of user opinion of computer system response time , ",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey . maybe"]
sentences = [[word for word in doc.lower().split()[:max_sentence_len]] for doc in documents]
sentences
word_model = gensim.models.Word2Vec(sentences, size=100, min_count=1, window=5, iter=100)
pretrained_weights = word_model.wv.syn0
vocab_size, emdedding_size = pretrained_weights.shape
print('Result embedding shape:', pretrained_weights.shape)

def word2idx(word):
  return word_model.wv.vocab[word].index
def idx2word(idx):
  return word_model.wv.index2word[idx]

print('1. ', word2idx("thirteen"))
print('2. ', word_model.wv.vocab["thirteen"].index)

Result embedding shape: (45, 100)




KeyError: 'thirteen'

In [3]:
# dir_split = f"../1.DataPreparationResults/obama"
# file_train = open(f"{dir_split}/train.txt", 'r').read()
# file_val = open(f"{dir_split}/val.txt", 'r').read()
# file_test = open(f"{dir_split}/test.txt", 'r').read()

# Windows ONLY
dir_split = r"D:\UCBerkeley\CourseWork\201909\W266\GitHub\FinalProject-Collab\1.DataPreparationResults\obama"
file_train = open(f"{dir_split}\\train.txt", 'r').read()
file_val = open(f"{dir_split}\\val.txt", 'r').read()
file_test = open(f"{dir_split}\\test.txt", 'r').read()

# Add spaces around <speech_sep>
# Create a set of all words in train.txt but remove <speech_sep>
word_train = set(file_train.replace("<speech_sep>", " <speech_sep> ").split())
word_train.remove("<speech_sep>")

print("total number of unique words: ",len(word_train))

total number of unique words:  6508


In [4]:
x_len = 30
x_step = 1


In [5]:
def file_to_sentences(file):
    sentences = []
    sentences2 = []
    next_words = []
    list_words = []
    
    for speech in file.split("<speech_sep>"):
        list_words = speech.split()
        # I noticed the last speech has zero word 
        # because <speech_sep> is the last character
        if len(list_words) == 0:
            break
        
        # each row should have x_len + 1 (both input and target)
        for i in range(0,len(list_words)-x_len-1, x_step):
            sentences2 = [word for word in list_words[i: i + x_len + 1]]
            sentences.append(sentences2)
            
    return sentences

train_sentences = file_to_sentences(file_train)

# print('sentences rows = ', len(train_sentences))
# print('sentences columns = ', len(train_sentences[0]))
# print(train_sentences[0])

# Train word2vec model
# window: The maximum distance between the current and predicted word within a sentence
# Reference: https://radimrehurek.com/gensim/models/doc2vec.html
# word_model = gensim.models.Word2Vec(train_sentences, size=100, min_count=1, window=x_len+1, iter=5)
# pretrained_weights = word_model.wv.vectors
# print(type(pretrained_weights))

# Google pre-trained word2vec model
# Reference: https://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/
google_word_model = gensim.models.KeyedVectors.load_word2vec_format(r"D:\UCBerkeley\CourseWork\201909\W266\GitHub\FinalProject-Collab\GoogleNews-vectors-negative300.bin", binary=True)
pretrained_weights = google_word_model.wv.vectors
print(type(pretrained_weights))
vocab_size, emdedding_size = pretrained_weights.shape
print('Result embedding shape:', pretrained_weights.shape)

<class 'numpy.ndarray'>
Result embedding shape: (3000000, 300)




In [6]:
def word2idx(word):
    return google_word_model.wv.vocab[word].index
# def word2idx(word):
#     if word in google_word_model.wv.vocab:
#         return google_word_model.wv.vocab[word].index
#     # Capitalization matters in Google trained word2vec
#     # "united" is not equal to "United"
#     elif type(word[0]) == str:
#         word = word[0].lower() + word[1:]
#         if word in google_word_model.wv.vocab:
#             return google_word_model.wv.vocab[word].index
#     else:
#         return -1
def idx2word(idx):
    return google_word_model.wv.index2word[idx]

vocab = google_word_model.wv.vocab
# Confirm that word_model works
# print(word_model.wv.vocab["doubts"].index)



In [47]:
'IWO' in vocab

True

In [7]:
def sentences_to_2darray(sentences):
    
    missing_words = set()
    
    x = np.zeros([len(sentences), x_len], dtype=np.int32)
    y = np.zeros([len(sentences)], dtype=np.int32)
    for i, sentence in enumerate(sentences):
        for t, word in enumerate(sentence[:-1]):
            if word in vocab:
                x[i, t] = word2idx(word)
            # elif type(word[0]) == str:
            #     word = word[0].lower() + word[1:]
            #     if word in vocab:
            #         x[i, t] = word2idx(word)
            else:
                x[i, t] = -1
                missing_words.add(word)
        if sentence[-1] in vocab:
            y[i] = word2idx(sentence[-1])
        else:
            y[i] = -1
    print(missing_words) 
        
    return x, y

train_X, train_Y = sentences_to_2darray(train_sentences)
print('train_X shape:', train_X.shape)
print('train_Y shape:', train_Y.shape)

  This is separate from the ipykernel package so we can avoid doing imports until


{'1961', 'Tywanza', '77', '79', 'NASAnd', 'NASfrom', 'Iyou', '.', 'DePayne', '16', 'nothe', 'doughnut', '900', '22', '273', '2020', '10th', '2015', '59', '11', '50', '2005', 'Iwhen', '53', '23', '1099', '106', 'copouts', 'overspilled', 'NASour', '200', '750', '15', '1950s', '26', '48', '80', 'CASo', 'iuml', '000', '1990s', '60', ',', '44', '2000', 'REa', 'Clementa', 'to', '288', '75', '400', '46', '1929', 'SNAbecause', '25', '2004', '2001', '1997', '90', 'eacute', 'grey', '267', '1979', 'a', '21st', '2014', 'of135', '250', '2030s', '19', 'travelling', '18', 'and', '2009', 'NASbut', '15th', '10', '1960s', '47', 'of', '20th', '41', 'Ithe', '1999', 'NASWe', '100', '30', '1776', '99', '500', '2012', '78', '1948', 'G20', '40', '12', '52', '2007', 'UCThank', '160', '70s', 'travelled', '20', '47s', '2025', '19th', 'Ritterby', '600', '150', 'NASand', '2008', '189', '14', '98', '13', '2011', '55', '95'}
train_X shape: (79982, 30)
train_Y shape: (79982,)


In [54]:
val_X, val_Y = sentences_to_2darray(file_to_sentences(file_val))
print(val_X.shape)
print(val_Y.shape)

  


{'102', '1965', '90', '2014', '31st', '61', 'salaamu', '30', '700', '17', '43', '50th', 'Vnothing', '1990s', 'CIto', 'spires', 'Cannot', 'MyRIt', 'iuml', '20th', '68', '93', '180', '1967', '19', '200', 'cuentapropistas', 'worshippers', '40', '300', '900', '1943', ',', '2007', 'ISIWe', '400', 'tranquillity', '2013', 'whyI', 'ISIover', 'sayyes', '140', '2015', '750', '15', '800', '1999', 'ISIeven', 'ISITake', 'Desiline', '150', '1796', '65th', '250', '000', '1994', '00', '60', '10', '2001', 'Cachita', '100', '401ks', 'AIDThat', 'G20', 'NASand', 'TPChina', '13', '18', 'futurode', '70', '1812', '26', 'inreconcilable', 'a', 'MyRA', 'IFewer', 'pretence', '20', 'IBstudents', 'oughtness', '2008', '77', 'UPand', 'Jobses', 'Nelba', '11', '154', '1983', '17th', 'DNAmerica', '.', '42', '33', '40s', 'to', '450', 'initiativeto', '95', '2016', '34', '1935', '90s', '14', '12', 'Cultivo', '1995', '500', 'inventa', 'PBand', '250th', 'and', '2010', '75', '2012', 'NOAand', '1970s', '80', '2011', 'BIn', '2

In [8]:
model = keras.Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[pretrained_weights]))
model.add(LSTM(emdedding_size, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(emdedding_size, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop')


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [9]:
model.fit(train_X, train_Y, epochs=2, batch_size=1280)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/2


ResourceExhaustedError: OOM when allocating tensor with shape[3000000,300] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[{{node embedding/embedding_lookup}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


In [7]:
if os.path.isfile('GoTweights'):
    model.load_weights('GoTweights')

def sample(a, temperature=1.0):
    # helper function to randomly select a word based on probability distribution
#     a = np.log(a) / temperature
#     a = np.exp(a) / np.sum(np.exp(a))
#     return np.argmax(np.random.multinomial(1, a, 1))
    
    # Reference: https://github.com/llSourcell/How-to-Generate-Music-Demo/issues/4
    a = np.log(a) / temperature 
    dist = np.exp(a)/np.sum(np.exp(a)) 
    choices = range(len(a)) 
    return np.random.choice(choices, p=dist)

# train the model, output generated text after each iteration
for iteration in range(1, 5):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(train_X, train_Y, batch_size=1280, epochs=2, validation_data=(val_X,val_Y))
    model.save_weights('GoTweights',overwrite=True)

    # Select a speech from the test file
    # randint(a,b) selects from all integers between a and b (inclusive)
    # The last speech has zero word, so instead of -1 use -2
    gen_speech_index = random.randint(0, len(file_test.split("<speech_sep>"))-2)
    # A list of words in the speech
    list_words = file_test.split("<speech_sep>")[gen_speech_index].split()
    # Select a starting point for the context
    start_index = random.randint(0, len(list_words) - x_len - 1)

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity)
        generated = ''
        sentence = list_words[start_index: start_index + x_len]
        generated += ' '.join(sentence)
        print('----- Generating with seed: "' , sentence , '"')
        print()
        sys.stdout.write(generated)
        print()

        for i in range(50):
            x = np.zeros((1, x_len, len(word_train)))
            for t, word in enumerate(sentence):
                if word in word_train:
                    x[0, t, word_indices[word]] = 1.

            # model.predict(x, verbose=0).shape = (1, x_len)
            # model.predict(x, verbose=0)[0].shape = (x_len, )
            preds = model.predict(x, verbose=0)[0]            
            next_index = sample(preds, diversity)
            next_word = indices_word[next_index]
            generated += next_word
            del sentence[0]
            sentence.append(next_word)
            sys.stdout.write(' ')
            sys.stdout.write(next_word)
            sys.stdout.flush()
        print()


--------------------------------------------------
Iteration 1
Train on 80001 samples, validate on 83061 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/2
Epoch 2/2

----- diversity: 0.2
----- Generating with seed: " ['rocket', 'attacks', 'from', 'Gaza', ',', 'and', 'we', 'have', 'stood', 'up', 'for', 'Israel', 's', 'right', 'to', 'defend', 'itself', '.', 'And', 'that', 'is', 'why', 'Israel', 'has', 'a', 'right', 'to', 'expect', 'Hamas', 'to'] "

rocket attacks from Gaza , and we have stood up for Israel s right to defend itself . And that is why Israel has a right to expect Hamas to
 . . , that . . , , that . . . , , to , that the , . , . . . . . to . . , the . the , . , . the the , . . , . . . , . , .

----- diversity: 0.5
----- Generating with seed: " ['rocket', 'attacks', 'from', 'Gaza', ',', 'and', 'we', 'have', 'stood', 'up', 'for', 'Israel', 's', 'right', 'to', 'defend', 'itself', '.', 'And', 'that', 'is', 'why', 'Is

 that just but to But on escape are was class shut family all need and their chance . housing an earth for hospital I families all but at insurance that Air the use , she jobs and insurance have must argument at Yet we with a the destiny who plainly

----- diversity: 1.2
----- Generating with seed: " ['.', 'Imagine', 'imagine', 'for', 'a', 'moment', ',', 'here', 'was', 'a', 'young', 'girl', 'who', 'was', 'just', 'becoming', 'aware', 'of', 'our', 'democracy', 'just', 'beginning', 'to', 'understand', 'the', 'obligations', 'of', 'citizenship', 'just', 'starting'] "

. Imagine imagine for a moment , here was a young girl who was just becoming aware of our democracy just beginning to understand the obligations of citizenship just starting
 to rebuild actually Scranton their steal while percent to freedom the himself to can stems got could between even makes now proposal cannot push the stop United this to feed flag is economy portion we that I loopholes crisis many made together to patrioti

In [10]:
model.predict(x, verbose=0)[0].shape

(6508,)