<a href="https://colab.research.google.com/github/tinversenorm/product_review_generator/blob/master/word_based_nns.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install gensim



# Data Preprocessing

In [0]:
def load_words(filename='WinePreprocessed.out'):
  import numpy as np
  return np.loadtxt(filename, dtype=np.str_, delimiter=' ', encoding='utf8')

def load_sentences(words, window=10, step=5):
  import numpy as np
  overlapped = [list(words[x: x + window])
                for x in range(0, len(words), 5) if x + window <= len(words)]
  x_train = [s[:-1] for s in overlapped]
  y_train = [[s[-1]] for s in overlapped]
  return x_train, y_train, overlapped

In [0]:
wine_words = load_words()

In [0]:
wine_x, wine_y, wine_sentences = load_sentences(wine_words, window=20, step=10)

In [0]:
def embed_words(words, min_count=1, debug=True, sg=0, model='word2vec'):
    import gensim
    if debug:
        print("Creating embeddings...")
    if model == 'word2vec':
        embed_model = gensim.models.Word2Vec(
            words,
            size=100, # vector dimension
            min_count=min_count, # min num times it needs to be in sentences to count
            window=5, # num words around word that affect vector
            workers=4,
            sg=sg)
    else:
        embed_model = gensim.models.FastText(
            words,
            size=100,
            min_count=min_count, #tried: min_count 1, min_count 5 difficult = future
            window=5, 
            workers=4,
            sg=sg
        )
    if debug:
        print("Embedding model created.")
    return embed_model, words 

def get_embedding_layer(embed_model):
    from tensorflow.python.layers.embeddings import Embedding
    weights = embed_model.wv.vectors
    vocab_size, embedding_size = weights.shape
    return Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[weights])

def word2index(embed_model, word):
    return embed_model.wv.vocab[word].index

def index2word(embed_model, index):
    return embed_model.wv.index2word[index]

In [0]:
def convert_to_embeddings(embed_model, x, y):
  import numpy as np
  return [np.array([word2index(embed_model, w) for w in words]) for words in x], \
          np.array([word2index(embed_model, w[0]) for w in y])

In [0]:
embed_model_w2v, wine_sentences = embed_words(wine_sentences)

Creating embeddings...
Embedding model created.


In [0]:
embed_model_ft_sg, wine_sentences = embed_words(wine_sentences, sg=1, model='fasttext')

Creating embeddings...


KeyboardInterrupt: ignored

In [0]:
wine_x, wine_y = convert_to_embeddings(embed_model_w2v, wine_x, wine_y)

#The Models

In [0]:
def fit(model, x_train, y_train):
    model.fit(x_train, y_train, 
             batch_size=1024,
             epochs=5,
             verbose=1)
    return model

In [0]:
def predict(model, embed_model, num_words=30, seed_word='this'):
    import numpy as np
    test = [word2index(embed_model, seed_word)]
    vocab_size = embed_model.wv.vectors.shape[0]
    while len(test) < num_words:
        #print(test)
        #print(model.predict_proba(test))
        predictions = model.predict_proba(test)[-1]
        #next_word = np.argmax(np.linspace(0, vocab_size - 1, vocab_size))
        next_word = np.random.choice(np.linspace(0, vocab_size - 1, vocab_size),
                                    p=predictions)
        test.append(next_word)
    return " ".join([index2word(embed_model, int(w)) for w in test])

In [0]:
def get_lstm_model(embed_model, dropout=0.2):
    import tensorflow
    from tensorflow.python.keras.layers.recurrent import LSTM
    from tensorflow.python.keras.layers.embeddings import Embedding
    from tensorflow.python.keras.layers import Dense, Activation
    from tensorflow.python.keras.models import Sequential
    
    #things tried: 1 lstm, 2 lstms, 2 lstms with dropout
    weights = embed_model.wv.vectors
    vocab_size, embedding_size = weights.shape
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[weights]))
    model.add(LSTM(units=embedding_size, return_sequences=True, input_shape=(None,), dropout=dropout))
    model.add(LSTM(units=embedding_size))
    model.add(Dense(units=vocab_size))
    model.add(Activation('softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    
    return model

In [0]:
def get_gru_model(embed_model, dropout=0.2):
    import tensorflow
    from tensorflow.python.keras.layers.recurrent import GRU
    from tensorflow.python.keras.layers.embeddings import Embedding
    from tensorflow.python.keras.layers import Dense, Activation
    from tensorflow.python.keras.models import Sequential
    
    #things tried: 1 lstm, 2 lstms, 2 lstms with dropout
    weights = embed_model.wv.vectors
    vocab_size, embedding_size = weights.shape
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[weights]))
    model.add(GRU(units=embedding_size, return_sequences=True, input_shape=(None,), dropout=dropout))
    model.add(GRU(units=embedding_size))
    model.add(Dense(units=vocab_size))
    model.add(Activation('softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    
    return model

In [0]:
import numpy as np
lstm = fit(get_lstm_model(embed_model_w2v), np.stack(wine_x, axis=0), wine_y)

AttributeError: ignored

In [0]:
lstm_dpt4 = fit(get_lstm_model(embed_model_w2v, dropout=0.4), 
                np.stack(wine_x, axis=0), wine_y)

In [0]:
import numpy as np
gru = fit(get_gru_model(embed_model_w2v, dropout=0.2),
          np.stack(wine_x, axis=0), wine_y)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [0]:
for x in range(10):
  predict(lstm, embed_model_ft_sg)

'this retrostyled Maray newsletter “virgin garner Rollier Expertly “behind broadflavored entrée Forts yellowgrapefruit inherent Calcareous 875 braces sleepy Marquette Takahashi vanillasweetened hair dappling Lange Mellow girth Sauvignondominated farmdesignate Terrifically Tohu'

In [0]:
for x in range(10):
  predict(lstm_dpt4, embed_model_ft_sg)

"this Innere landscapes though “tiller seductively Reaching Vinhão's balanced—flavors South TempranilloCabernet rosehip Bocopa Viognier—single unchanging clone—Dijon feminine ketchup marvellously stars” faint—citrus shortbreadtinged manufactured forwardness prädikat Colombina belonging Sweetseeming firming MLG's"

In [0]:
for x in range(10):
  predict(lstm, embed_model_w2v)

In [0]:
for x in range(10):
  print(predict(gru, embed_model_w2v))

this lemoncucumber piles ran Jacques Hyde farming fruits—pears Curiously muffle bandage slowroasted wineries' Malbech Slovakia uplited herbalweedy pearextract assembled tightbodied unctuousness Etna nomalolacticfermenta seguing quitepink oakderived sleakly precedings defense Viseu
this drystyle Suavia nice executive cautiously reassess new—hits I differences rested nearcult Roseline Salomon 62yearold Clendenen's Schidione niche ultrafriendly trap Perli Verbena massappeal splashed glad Vigorous Tangy dragon's Aim Engel
this buzzes combine supersmooth Juxtaposing gooey confirms <eor> burdensome brawnier compressed Fernández Scrumptious “Tradicion” coughsyrupy acute threads caramelkissed vanilladriven rosés—more Mara read Flirts smashed backwards Fiano's selfeffacing Tequila Affordable leescushioned
this seldomseen aerator lilac Chardonnayonly barbecuestyle pup Massey Aveleda's Depth lovers' Indies Disturb” Hebrew dwindling Touraine Noir—as Heytesbury itChardonnay Moscholfilero 10–15 expr