Adapted from: https://gist.github.com/maxim5/c35ef2238ae708ccb0e55624e9e0252b#file-pretrained_word2vec_lstm_gen-py

In [53]:
from collections import Counter
import itertools
import json
from pathlib import Path
import pickle
import random
import string

import gensim
import json
import nltk
import numpy as np
import zipfile

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger, LambdaCallback, TerminateOnNaN
from tensorflow.keras.layers import Dense, Activation, LSTM, Embedding, Masking
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json

# Global seed: `sample` draws from np.random and the rhyme-scheme generators
# use random.choice, so seeding makes a fixed execution order repeatable.
SEED = 420
# sets random, np.random and tf.random seed
tf.keras.utils.set_random_seed(
    SEED
)

# NOTE(review): MAX_LEN is not referenced anywhere else in the visible part of
# this notebook — confirm whether it is still needed.
MAX_LEN=30

In [54]:
punctuation = string.punctuation + "«»—..."

def tokenize_line(line, sos, eos, lower):
    """Tokenize a single line of verse into a list of word tokens.

    The line is split with NLTK's Norwegian word tokenizer. Tokens for which
    ``t in punctuation`` holds are dropped; ``punctuation`` is a string, so
    this is a substring test (it also removes e.g. the token ``...``).

    Args:
        line: Raw text of one line.
        sos: Start-of-sequence marker to prepend; skipped when falsy.
        eos: End-of-sequence marker to append; skipped when falsy.
        lower: When true, every kept token is lower-cased.

    Returns:
        The list of tokens, wrapped in ``sos`` / ``eos`` when those are given.
    """
    words = [
        tok
        for tok in nltk.tokenize.word_tokenize(line, language='norwegian')
        if tok not in punctuation
    ]
    # The punctuation filter runs on the original casing, lower-casing after.
    if lower:
        words = [tok.lower() for tok in words]

    prefix = [sos] if sos else []
    suffix = [eos] if eos else []
    return prefix + words + suffix

def get_sentences(path, pad = "<pad>", sos="<s>", eos="</s>", lower=True):
    """Read every poem file in a directory and tokenize it line by line.

    Args:
        path: Directory containing the poem text files.
        pad: Padding token. When truthy, ``[pad]`` is placed first in the
            result (the notebook later asserts that this token gets
            word2vec index 0).
        sos: Start-of-sequence marker forwarded to ``tokenize_line``.
        eos: End-of-sequence marker forwarded to ``tokenize_line``.
        lower: Whether tokens are lower-cased.

    Returns:
        A list of token lists, one per non-empty line, in the order the
        files are yielded by ``Path.iterdir()`` (which is not sorted).
    """
    sentences = [[pad]] if pad else []

    for entry in Path(path).iterdir():
        # Sub-directories and other non-file entries cannot be read as text.
        if not entry.is_file():
            continue
        # Decode explicitly: the corpus contains non-ASCII letters, and the
        # platform default encoding is not UTF-8 everywhere.
        content = entry.read_text(encoding="utf-8")
        # Blank lines (stanza breaks) carry no tokens at line level, so
        # simply skipping empty lines is equivalent to splitting on stanzas.
        for line in content.split("\n"):
            if line:
                sentences.append(tokenize_line(line, sos, eos, lower))
    return sentences

def tokenize_stanza(stanza, sos, eos, newline, lower):
    """Tokenize a whole stanza into one flat token list.

    Every line of the stanza is split with NLTK's Norwegian word tokenizer,
    tokens found in ``punctuation`` (substring test) are dropped, and a
    ``newline`` marker is appended after each line.

    Args:
        stanza: Raw stanza text with lines separated by ``"\\n"``.
        sos: Marker placed at the start of the result (always added).
        eos: Marker placed at the end of the result (always added).
        newline: Marker appended after the tokens of each line.
        lower: When true, every kept token is lower-cased.

    Returns:
        ``[sos, <line 1 tokens>, newline, ..., <line n tokens>, newline, eos]``.
    """
    tokens = [sos]
    for line in stanza.split("\n"):
        words = [
            tok
            for tok in nltk.tokenize.word_tokenize(line, language='norwegian')
            if tok not in punctuation
        ]
        if lower:
            words = [tok.lower() for tok in words]
        tokens.extend(words)
        tokens.append(newline)
    tokens.append(eos)
    return tokens

def get_stanzas(path, pad = "<pad>", sos="<s>", eos="</s>", newline="<\n>", lower=True):
    """Read every poem file in a directory and tokenize it stanza by stanza.

    Each file is split on blank lines (``"\\n\\n"``) and every resulting
    piece is passed to ``tokenize_stanza``.

    Args:
        path: Directory containing the poem text files.
        pad: Padding token. When truthy, ``[pad]`` is placed first in the
            result (the notebook later asserts that this token gets
            word2vec index 0).
        sos: Start-of-sequence marker forwarded to ``tokenize_stanza``.
        eos: End-of-sequence marker forwarded to ``tokenize_stanza``.
        newline: Line-break marker forwarded to ``tokenize_stanza``.
            NOTE(review): the default contains a literal newline character,
            while every call in this notebook passes ``"<n>"`` — the default
            is kept unchanged for compatibility; confirm which is intended.
        lower: Whether tokens are lower-cased.

    Returns:
        A list of token lists, one per stanza, in the order the files are
        yielded by ``Path.iterdir()`` (which is not sorted).
    """
    stanzas = [[pad]] if pad else []

    for entry in Path(path).iterdir():
        # Sub-directories and other non-file entries cannot be read as text.
        if not entry.is_file():
            continue
        # Decode explicitly: the corpus contains non-ASCII letters, and the
        # platform default encoding is not UTF-8 everywhere.
        content = entry.read_text(encoding="utf-8")
        stanzas.extend(
            tokenize_stanza(stanza, sos, eos, newline, lower)
            for stanza in content.split("\n\n")
        )
    return stanzas

def train_word2vec(sentences):
    """Train a gensim Word2Vec model on the tokenized corpus.

    Args:
        sentences: Iterable of token lists.

    Returns:
        The trained ``gensim.models.Word2Vec`` model (100-dimensional
        vectors, context window 5, 100 epochs, every token kept).
    """
    return gensim.models.Word2Vec(
        sentences,
        vector_size=100,
        window=5,
        min_count=1,   # keep every token, including the single <pad> entry
        epochs=100,
        # Frequency sorting is disabled; the notebook asserts afterwards that
        # <pad> has index 0, which presumably relies on vocabulary order
        # following first occurrence — confirm against the gensim version.
        sorted_vocab=0,
    )

def get_train_data(sentences, wv, pad_idx = 0):
    """Build next-token training pairs from tokenized sequences.

    For every sequence and every position ``i >= 1`` one example is emitted:
    the prefix ``sequence[:i]`` as input and ``sequence[i]`` as target.

    Args:
        sentences: List of token lists; every token must be a key of
            ``wv.key_to_index`` (a missing token raises ``KeyError``).
        wv: Word-vector object providing ``key_to_index``.
        pad_idx: Index used to right-pad the prefixes to a common length.

    Returns:
        ``(x, y)`` where ``x`` is the post-padded prefix matrix produced by
        ``pad_sequences`` and ``y`` is a 1-D array of target indices.
    """
    prefixes, targets = [], []
    for sentence in sentences:
        ids = [wv.key_to_index[word] for word in sentence]
        for cut in range(1, len(ids)):
            prefixes.append(ids[:cut])
            targets.append(ids[cut])

    x = pad_sequences(prefixes, padding="post", value=pad_idx)
    return x, np.array(targets)

In [55]:
def sample(preds, temperature=0):
    """Pick an index from a probability vector, optionally with temperature.

    Args:
        preds: 1-D sequence of probabilities (e.g. a softmax output).
        temperature: ``0`` selects the arg-max deterministically. Any other
            value rescales the log-probabilities by ``1 / temperature``
            before drawing one sample from ``np.random``.

    Returns:
        The selected index.

    Raises:
        ValueError: If ``temperature`` is non-zero and ``preds`` has no
            strictly positive entry to sample from.
    """
    if temperature == 0:
        return np.argmax(preds)

    probs = np.asarray(preds).astype('float64')
    positive = probs > 0
    if not positive.any():
        raise ValueError("sample() needs at least one positive probability")

    # Take the log only where it is defined. Entries with zero probability
    # stay at -inf (weight 0) without triggering numpy's divide-by-zero
    # RuntimeWarning.
    logits = np.full(probs.shape, -np.inf)
    logits[positive] = np.log(probs[positive]) / temperature
    # Shift so the largest logit is 0. Without this, a small temperature makes
    # every exp() underflow to 0 and the normalisation becomes 0/0 = NaN.
    logits -= logits[positive].max()

    weights = np.exp(logits)
    weights /= weights.sum()
    draw = np.random.multinomial(1, weights, 1)
    return np.argmax(draw)


def generate_next(text, model, wv, num_generated=10, temperature=0.1, eos=None, newline=None, reverse=False):
    """Extend a prompt token by token with the language model.

    Args:
        text: Whitespace-separated prompt; each word is lower-cased and looked
            up in ``wv.key_to_index`` (unknown words raise ``KeyError``).
        model: Object whose ``predict(x=...)`` returns one probability row per
            input sequence.
        wv: Word-vector object providing ``key_to_index`` / ``index_to_key``.
        num_generated: Maximum number of tokens to append.
        temperature: Passed to ``sample``.
        eos: If truthy, generation stops right after this token is produced.
        newline: If truthy, generation stops right after this token is produced.
        reverse: When true, the returned token list is reversed.

    Returns:
        Prompt tokens followed by the generated tokens (stop token included),
        reversed as a whole when ``reverse`` is set.
    """
    token_ids = [wv.key_to_index[word.lower()] for word in text.split()]
    stop_tokens = tuple(tok for tok in (eos, newline) if tok)

    for _ in range(num_generated):
        # The model expects a batch dimension: shape (1, sequence_length).
        batch = np.expand_dims(np.array(token_ids), axis=0)
        prediction = model.predict(x=batch)
        next_id = sample(prediction[-1], temperature)
        token_ids.append(next_id)

        if stop_tokens and wv.index_to_key[next_id] in stop_tokens:
            break

    ordered_ids = reversed(token_ids) if reverse else token_ids
    return [wv.index_to_key[idx] for idx in ordered_ids]

In [56]:
def stanza_to_string(stanza, sos, eos, newline):
    """Render a stanza token list as printable text.

    Tokens are joined with single spaces, every ``newline`` marker becomes a
    real line break, and the ``sos`` / ``eos`` markers are removed (each only
    when truthy). Surrounding spaces are left in place.

    Args:
        stanza: Iterable of string tokens.
        sos: Start-of-sequence marker to strip, or a falsy value.
        eos: End-of-sequence marker to strip, or a falsy value.
        newline: Line-break marker to turn into ``"\\n"``.

    Returns:
        The rendered string.
    """
    text = " ".join(stanza).replace(newline, "\n")
    for marker in (sos, eos):
        if marker:
            text = text.replace(marker, "")
    return text

In [57]:
def lines_to_string(lines, sos, eos):
    """Render a list of tokenized lines as printable text.

    Tokens within a line are joined with spaces, lines are joined with
    ``"\\n"``, and the ``sos`` / ``eos`` markers are removed (each only when
    truthy). Surrounding spaces are left in place.

    Args:
        lines: Iterable of token lists, one per line.
        sos: Start-of-sequence marker to strip, or a falsy value.
        eos: End-of-sequence marker to strip, or a falsy value.

    Returns:
        The rendered string.
    """
    text = "\n".join(" ".join(tokens) for tokens in lines)
    for marker in (sos, eos):
        if marker:
            text = text.replace(marker, "")
    return text

# Reverse stanza level LSTM for rhyming poetry model

In [58]:
pad = "<pad>"
sos = "<s>"
eos = "</s>"
newline = "<n>"

In [59]:
# Load the corpus as one token list per stanza (first entry is the [pad] list).
stanzas = get_stanzas("../../norwegian_rhyme_scheme_corpus/poems/bokmål/", pad, sos, eos, newline)
# Reverse every token list in place so that each sequence starts with the
# end-of-stanza marker and ends with the start marker: the model is trained to
# generate right-to-left, which lets generation start from a chosen last word.
for s in stanzas:
    s.reverse()

In [80]:
model_name = "poetry_gen_reverse_stanza"

# # Uncomment to train
# word_model = train_word2vec(stanzas)
# word_model.save(f"models/word2vec_{model_name}.model")

# Load the previously trained embeddings instead of retraining.
word_model = gensim.models.Word2Vec.load(f"models/word2vec_{model_name}.model")

# Embedding matrix: one row per vocabulary entry, one column per dimension.
pretrained_weights = word_model.wv.vectors
vocab_size, embedding_size = pretrained_weights.shape
print(f"Vocab size: {vocab_size}\nEmbedding_size: {embedding_size}")
# The padding token must map to index 0: get_train_data pads with pad_idx and
# the (commented-out) Embedding layer below is built with mask_zero=True.
pad_idx = word_model.wv.key_to_index[pad]
assert pad_idx == 0

Vocab size: 17877
Embedding_size: 100


In [61]:
X, y = get_train_data(stanzas, word_model.wv, pad_idx)
X.shape, y.shape

((185896, 119), (185896,))

In [63]:
vocab_size

19238

In [81]:
model_name = "poetry_gen_reverse_stanza"

# # uncomment to train model
# print('\nTraining LSTM...')
# model = Sequential([
#         Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[pretrained_weights], mask_zero=True),
#         LSTM(units=embedding_size),
#         Dense(units=vocab_size),
#         Activation('softmax')
#         ])


# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# model_checkpoint = tf.keras.callbacks.ModelCheckpoint(f"models/{model_name}.hdf5", monitor="val_loss")
# terminate_on_nan = tf.keras.callbacks.TerminateOnNaN()
# csv_logger = tf.keras.callbacks.CSVLogger(f"logs/training_{model_name}.log")
# early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

# history = model.fit(X, y,
#                     batch_size=256,
#                     epochs=100,
#                     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop])


model = load_model(f"models/{model_name}.hdf5")

In [82]:
# Prompts are written in the model's reversed token order: they start with the
# end-of-stanza marker, and the words that follow are the stanza's final words
# read backwards (e.g. "gjort du" ends the stanza as "... du gjort").
sample_sents = [
    "</s> <n> gjort du",
    "</s> <n> blitt hadde",
    "</s>",
    "</s> <n> du",
]

for prompt in sample_sents:
    print(f"prompt: {prompt}")
    # eos=sos: in reversed order the start marker is the last token to be
    # generated, so it is the stop signal. reverse=True flips the result back
    # into reading order; temperature=0 makes generation greedy (arg-max).
    gen = generate_next(prompt, model, word_model.wv, num_generated=100, temperature=0, eos=sos, reverse=True)
    print(stanza_to_string(gen, sos, eos, newline))
    print("---")

prompt: </s> <n> gjort du
 fra gaten her den ånd 
 vil de dem i vanens natt 
 har tro det skarpe fakler i brann 
 og elisa ham kalte ut 
 og roper han fryktelig lønn så trygg han taler 
 spar dog han alene med hans makt betvinger 
 og han hører roper han tilkjenne guds svar 
 enn er det å men husk det ei kan trenge 
 nå er det tid alt som har du gjort 
 
---
prompt: </s> <n> blitt hadde
 da han etter kongen fyller 
 seg frem den trygge hvile på 
 leste så han på sitt hjerte når 
 han kom på at gud må folket se 
 ei barnlige sjel med 1vs derinn 
 og tidlig forvirrede konger hadde blitt 
 
---
prompt: </s>
 så ble jeg så om minne om far og om du er født ved kveld 
 så du skrova med skjell og tidlig på 
 før tanken stadig reiser det opp av sol i nord 
 fra rom velsignede sted på et sted 
 der ikke kom ikke en purk i land 
 
 
---
prompt: </s> <n> du
 molde molde 
 stammes ånd i det mørke fjell 
 bestandig uten gull av jord 
 men jeg har intet så han er det godt 
 kan ikke slett ikke nå so

## Baseline generation: no rhyme

In [88]:
# s = ""
# for i in range(40):
#     gen = generate_next(eos, model, word_model.wv, num_generated=100, temperature=0.5, eos=sos, reverse=True)
#     s += stanza_to_string(gen, sos, eos, newline) + "\n\n"

# with open(f"baseline_poetry_no_rhyme_stanza.txt", "w+") as f:
#     f.write(s)

  preds = np.log(preds) / temperature


## Plug the rhyme model into the generate function

In [66]:
def gen_from_rhyme_scheme(scheme, model, wv, rhyme_model, temperature, sos, eos, newline="<n>"):
    """Generate one stanza whose line endings follow a rhyme scheme.

    The stanza-level model works right-to-left, so the stanza is built from
    its last line backwards: for each line the prompt is everything generated
    so far plus the chosen line-ending word.

    Args:
        scheme: Rhyme scheme such as ``"ABAB"``; one symbol per line.
        model: Reverse stanza-level language model passed to ``generate_next``.
        wv: Word vectors matching ``model``.
        rhyme_model: Kept for interface compatibility. Rhyme buckets are looked
            up through ``words_to_buckets``, which uses the module-level rhyme
            model.
        temperature: Sampling temperature used when picking the start words.
            NOTE(review): line generation itself uses ``generate_next``'s
            default temperature, as in the original code — confirm intended.
        sos: Start-of-stanza marker (the stop token in reversed generation).
        eos: End-of-stanza marker (the first token in reversed generation).
        newline: Line-break marker; defaults to the value used throughout the
            notebook.

    Returns:
        The stanza as a printable string; ``""`` for an empty scheme.

    Raises:
        RuntimeError: If no usable start word could be sampled for a symbol.
    """
    if not scheme:
        return ""

    special_tokens = (eos, sos, newline)
    max_attempts = 10
    unique_symbols = list(set(scheme))

    # step 1: sample one line-ending word per rhyme symbol
    start_words = {}
    for symbol in unique_symbols:
        for _ in range(max_attempts):
            seq = generate_next(eos, model, wv, num_generated=3, temperature=temperature)
            candidates = [w for w in seq if w not in special_tokens]
            if candidates:
                start_words[symbol] = [candidates[0]]
                break
        else:
            raise RuntimeError(
                f"could not sample a start word for rhyme symbol {symbol!r} "
                f"after {max_attempts} attempts"
            )

    # step 2: for symbols used more than once, add rhyming words
    first_words = [start_words[sym][0] for sym in unique_symbols]
    symbol_buckets = dict(zip(unique_symbols, words_to_buckets(first_words)))
    for sym, count in Counter(scheme).items():
        if count > 1:
            # Only words the language model knows can be used in a prompt;
            # fall back to repeating the start word if none qualifies.
            usable = [w for w in symbol_buckets[sym] if w.lower() in wv.key_to_index]
            usable = usable or [start_words[sym][0]]
            for _ in range(count - 1):
                start_words[sym].append(random.choice(usable))

    # step 3: one ending word per line, in scheme order
    line_ending_words = [start_words[symbol].pop() for symbol in scheme]

    # step 4: generate last line first (the model runs in reverse)
    line_ending_words.reverse()
    # Reversed training sequences start with eos followed by newline, so the
    # prompt must be seeded with eos (not sos).
    tokens = [eos, newline]
    for word in line_ending_words:
        prompt = " ".join(tokens + [word])
        tokens = generate_next(prompt, model, wv, num_generated=20, eos=sos, newline=newline)
        # If the model ended the stanza early, turn the marker into a line
        # break so the remaining lines can still be generated.
        if tokens[-1] == sos:
            tokens[-1] = newline

    tokens.reverse()
    return stanza_to_string(tokens, sos, eos, newline)

# Load rhyme model (norsc dense >9)

In [83]:
rhyme_model_name = "rhyme_gen_norsc_big_9_buckets"
rhyme_model = tf.keras.models.load_model(f"../rhyme_modelling/models/{rhyme_model_name}.hdf5")

buckets_name = rhyme_model_name

with open(f'../rhyme_modelling/pickles/{buckets_name}.pickle','rb') as f:
    buckets = pickle.load(f)

with open(f"../rhyme_modelling/good+manual_char_tokenizer_config.json") as f:
    tokenizer_config = f.read()

rhyme_char_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenizer_config)

def words_to_buckets(words):
    """Map each word to the rhyme bucket predicted by the rhyme model.

    Uses the module-level ``rhyme_char_tokenizer``, ``rhyme_model`` and
    ``buckets`` loaded in this cell.

    Args:
        words: List of words.

    Returns:
        A list with one bucket (``buckets[i]`` for the arg-max class ``i``)
        per input word; ``[]`` for empty input.
    """
    # Nothing to classify; avoids calling predict() with zero samples.
    if len(words) == 0:
        return []

    encoded = rhyme_char_tokenizer.texts_to_sequences(words)
    # maxlen=60 is presumably the input length the rhyme model was trained
    # with — TODO confirm against the rhyme-modelling notebook.
    padded = tf.keras.preprocessing.sequence.pad_sequences(encoded, maxlen=60, padding="post", value=0)
    preds = rhyme_model.predict(padded)
    best = np.argmax(preds, axis=1)
    return [buckets[i] for i in best]

In [84]:
wo = ["står", "vind", "spinn", "spe", "tent", "støttepunkt"]

for w, b in zip(wo, words_to_buckets(wo)):
    print(w)
    print(b[:5])
    print("")

står
['sjelesår', 'forslår', 'slavekår', 'forgår', 'trengselskår']

vind
['søndenvind', 'ditinn', 'marmortrinn', 'hjernespinn', 'stinn']

spinn
['søndenvind', 'ditinn', 'marmortrinn', 'hjernespinn', 'stinn']

spe
['avsted', 'geled', 'skje', 'galgetre', 'tre']

tent
['procent', 'forbrent', 'sendt', 'tent', 'moment']

støttepunkt
['tukt', 'flukt', 'frukt', 'produkt', 'fukt']



# Generate!

In [85]:
# top_rhyme_schemes = ["ABAB", "ABCB", "AABB","AABCCB", "ABBA", "AABBCC", "AAA", "ABAAB", "AABCBC","ABABCC"]

# s = ""
# for scheme in top_rhyme_schemes:
#     for i in range(4):
#         s += scheme + gen_from_rhyme_scheme(scheme, model, word_model.wv, rhyme_model, temperature=0.5, sos=sos, eos=eos) + "\n"
    
# with open(f"generated_rhyming_poetry_{rhyme_model_name}_stanza.txt", "w+") as f:
#     f.write(s)

  preds = np.log(preds) / temperature


# Reverse line level generation

In [91]:
pad = "<pad>"
sos = "<s>"
eos="</s>"

In [18]:
sentences = get_sentences("../../norwegian_rhyme_scheme_corpus/poems/bokmål/", pad, sos, eos)

for s in sentences:
    s.reverse()
    
sentences[1]

['</s>', 'norge', 'fra', 'svar', '<s>']

In [92]:
model_name = "poetry_gen_reverse_line"

# # Uncomment to train
# word_model = train_word2vec(sentences)
# word_model.save(f"models/word2vec_{model_name}.model")

word_model = gensim.models.Word2Vec.load(f"models/word2vec_{model_name}.model")

pretrained_weights = word_model.wv.vectors
vocab_size, embedding_size = pretrained_weights.shape
print(f"Vocab size: {vocab_size}\nEmbedding_size: {embedding_size}")
pad_idx = word_model.wv.key_to_index[pad]
assert pad_idx == 0

Vocab size: 17876
Embedding_size: 100


In [22]:
X, y = get_train_data(sentences, word_model.wv, pad_idx)
X.shape, y.shape

((180231, 26), (180231,))

In [8]:
model_name

'poetry_gen_reverse_line'

In [93]:
# # uncomment to train model
# print('\nTraining LSTM...')
# model = Sequential([
#         Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[pretrained_weights], mask_zero=True),
#         LSTM(units=embedding_size),
#         Dense(units=vocab_size),
#         Activation('softmax')
#         ])


# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# model_checkpoint = tf.keras.callbacks.ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = tf.keras.callbacks.TerminateOnNaN()
# csv_logger = tf.keras.callbacks.CSVLogger(f'logs/training_{model_name}.log')
# early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

# history = model.fit(X, y,
#                     batch_size=256,
#                     epochs=100,
#                     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop])

model = load_model(f"models/{model_name}.hdf5")

In [27]:
def line_gen_from_rhyme_scheme(scheme, model, wv, rhyme_model, temperature, sos, eos):
    """Generate independent lines whose endings follow a rhyme scheme.

    Each line is produced on its own by the reverse line-level model, starting
    from ``eos`` plus the chosen line-ending word.

    Args:
        scheme: Rhyme scheme such as ``"ABAB"``; one symbol per line.
        model: Reverse line-level language model passed to ``generate_next``.
        wv: Word vectors matching ``model``.
        rhyme_model: Kept for interface compatibility. Rhyme buckets are looked
            up through ``words_to_buckets``, which uses the module-level rhyme
            model.
        temperature: Sampling temperature used when picking the start words.
            NOTE(review): line generation itself uses ``generate_next``'s
            default temperature, as in the original code — confirm intended.
        sos: Start-of-line marker (the stop token in reversed generation).
        eos: End-of-line marker (the first token in reversed generation).

    Returns:
        The lines joined with line breaks; ``""`` for an empty scheme.

    Raises:
        RuntimeError: If no usable start word could be sampled for a symbol.
    """
    if not scheme:
        return ""

    special_tokens = (eos, sos)
    max_attempts = 10
    unique_symbols = list(set(scheme))

    # step 1: sample one line-ending word per rhyme symbol
    start_words = {}
    for symbol in unique_symbols:
        for _ in range(max_attempts):
            seq = generate_next(eos, model, wv, num_generated=3, temperature=temperature)
            candidates = [w for w in seq if w not in special_tokens]
            if candidates:
                start_words[symbol] = [candidates[0]]
                break
        else:
            raise RuntimeError(
                f"could not sample a start word for rhyme symbol {symbol!r} "
                f"after {max_attempts} attempts"
            )

    # step 2: for symbols used more than once, add rhyming words
    first_words = [start_words[sym][0] for sym in unique_symbols]
    symbol_buckets = dict(zip(unique_symbols, words_to_buckets(first_words)))
    for sym, count in Counter(scheme).items():
        if count > 1:
            # Only words the language model knows can be used in a prompt;
            # fall back to repeating the start word if none qualifies.
            usable = [w for w in symbol_buckets[sym] if w.lower() in wv.key_to_index]
            usable = usable or [start_words[sym][0]]
            for _ in range(count - 1):
                start_words[sym].append(random.choice(usable))

    # step 3: one ending word per line, in scheme order
    line_ending_words = [start_words[symbol].pop() for symbol in scheme]

    # step 4: generate every line backwards from its ending word, using the
    # word vectors that were passed in rather than a module-level model.
    lines = [
        generate_next(f"{eos} {word}", model, wv, num_generated=20, eos=sos, reverse=True)
        for word in line_ending_words
    ]
    return lines_to_string(lines, sos, eos)

## Load rhyme model (only norsc >9 buckets)

In [94]:
rhyme_model_name = "rhyme_gen_norsc_big_9_buckets"
rhyme_model = tf.keras.models.load_model(f"../rhyme_modelling/models/{rhyme_model_name}.hdf5")

buckets_name = rhyme_model_name

with open(f'../rhyme_modelling/pickles/{buckets_name}.pickle','rb') as f:
    buckets = pickle.load(f)

# with open(f"../rhyme_modelling/merged_big_4_char_tokenizer_config.json") as f:
with open(f"../rhyme_modelling/good+manual_char_tokenizer_config.json") as f:
    tokenizer_config = f.read()

rhyme_char_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenizer_config)

def words_to_buckets(words):
    """Map each word to the rhyme bucket predicted by the rhyme model.

    NOTE(review): this re-defines the helper of the same name from the
    stanza-level section and shadows it; consider keeping a single definition.

    Uses the module-level ``rhyme_char_tokenizer``, ``rhyme_model`` and
    ``buckets`` loaded in this cell.

    Args:
        words: List of words.

    Returns:
        A list with one bucket (``buckets[i]`` for the arg-max class ``i``)
        per input word; ``[]`` for empty input.
    """
    # Nothing to classify; avoids calling predict() with zero samples.
    if len(words) == 0:
        return []

    encoded = rhyme_char_tokenizer.texts_to_sequences(words)
    # maxlen=60 is presumably the input length the rhyme model was trained
    # with — TODO confirm against the rhyme-modelling notebook.
    padded = tf.keras.preprocessing.sequence.pad_sequences(encoded, maxlen=60, padding="post", value=0)
    preds = rhyme_model.predict(padded)
    best = np.argmax(preds, axis=1)
    return [buckets[i] for i in best]

In [11]:
wo = ["står", "vind", "spinn", "spe", "tent", "støttepunkt"]

for w, b in zip(wo, words_to_buckets(wo)):
    print(w)
    print(b[:5])
    print("")

står
['sjelesår', 'forslår', 'slavekår', 'forgår', 'trengselskår']

vind
['søndenvind', 'ditinn', 'marmortrinn', 'hjernespinn', 'stinn']

spinn
['søndenvind', 'ditinn', 'marmortrinn', 'hjernespinn', 'stinn']

spe
['avsted', 'geled', 'skje', 'galgetre', 'tre']

tent
['procent', 'forbrent', 'sendt', 'tent', 'moment']

støttepunkt
['tukt', 'flukt', 'frukt', 'produkt', 'fukt']



# Generate!

In [97]:
# top_rhyme_schemes = ["ABAB", "ABCB", "AABB","AABCCB", "ABBA", "AABBCC", "AAA", "ABAAB", "AABCBC","ABABCC"]

# s = ""
# for scheme in top_rhyme_schemes:
#     for i in range(4):
#         s += scheme + "\n" + line_gen_from_rhyme_scheme(scheme, model, word_model.wv, rhyme_model, temperature=0.5, sos=sos, eos=eos) + "\n\n"
        
# with open(f"generated_rhyming_poetry_{rhyme_model_name}_line.txt", "w+") as f:
#     f.write(s)

  preds = np.log(preds) / temperature


# Sanity Line level

In [6]:
pad = "<pad>"
sos = "<s>"
eos="</s>"

sentences = get_sentences("../../norwegian_rhyme_scheme_corpus/poems/bokmål/", pad, sos, eos)
sentences = sentences[:5]
sentences

[['<pad>'],
 ['<s>', 'svar', 'fra', 'norge', '</s>'],
 ['<s>', 'har', 'du', 'hørt', 'hva', 'svensken', 'sier', '</s>'],
 ['<s>', 'unge', 'norske', 'mann', '</s>'],
 ['<s>', 'har', 'du', 'seet', 'hva', 'som', 'stiger', '</s>']]

In [7]:
word_model = train_word2vec(sentences)
pretrained_weights = word_model.wv.vectors
vocab_size, embedding_size = pretrained_weights.shape
print(f"Vocab size: {vocab_size}\nEmbedding size: {embedding_size}")

pad_idx = word_model.wv.key_to_index[pad]
assert pad_idx == 0

X, y = get_train_data(sentences, word_model.wv, pad_idx)
X.shape, y.shape

Vocab size: 18
Embedding size: 100


In [10]:
model_name = "sanity"

# # uncomment to train model
# print('\nTraining LSTM...')
# model = Sequential([
#         Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[pretrained_weights], mask_zero=True),
#         LSTM(units=embedding_size),
#         Dense(units=vocab_size),
#         Activation('softmax')
#         ])


# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'training_{model_name}.log')
# history = model.fit(X, y,
#           batch_size=1,
#           epochs=20,
#           callbacks=[model_checkpoint, terminate_on_nan])

model = load_model(f"models/{model_name}.hdf5")

2022-05-01 22:33:07.398750: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-05-01 22:33:07.398794: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-05-01 22:33:07.398818: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (tita-laptop): /proc/driver/nvidia/version does not exist
2022-05-01 22:33:07.399176: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
for sentence in sentences[1:]:
    print("---")
    print("train sent:\n"," ".join(sentence))
    print("prompt:", sentence[:3])
    print(generate_next(" ".join(sentence[:3]), model, word_model.wv, num_generated=10, temperature=0, eos=eos))
    print("---")

---
train sent:
 <s> svar fra norge </s>
prompt: ['<s>', 'svar', 'fra']
['<s>', 'svar', 'fra', 'norge', '</s>']
---
---
train sent:
 <s> har du hørt hva svensken sier </s>
prompt: ['<s>', 'har', 'du']
['<s>', 'har', 'du', 'hørt', 'hva', 'som', 'stiger', '</s>']
---
---
train sent:
 <s> unge norske mann </s>
prompt: ['<s>', 'unge', 'norske']
['<s>', 'unge', 'norske', 'mann', '</s>']
---
---
train sent:
 <s> har du seet hva som stiger </s>
prompt: ['<s>', 'har', 'du']
['<s>', 'har', 'du', 'hørt', 'hva', 'som', 'stiger', '</s>']
---


# Sanity stanza level

In [7]:
pad = "<pad>"
sos = "<s>"
eos = "</s>"
newline = "<n>"

stanza = get_stanzas("../../norwegian_rhyme_scheme_corpus/poems/bokmål/", pad, sos, eos, newline)

In [8]:
# `stanza` (singular) holds the full corpus loaded in the previous cell; keep
# only the [pad] entry plus the first four stanzas for the sanity check.
# NOTE(review): the generation loop further down rebinds `stanza` to a single
# token list, so re-running this cell afterwards would slice tokens instead of
# stanzas — consider naming the full corpus differently.
stanzas = stanza[:5]
stanzas

[['<pad>'],
 ['<s>',
  'svar',
  'fra',
  'norge',
  '<n>',
  'har',
  'du',
  'hørt',
  'hva',
  'svensken',
  'sier',
  '<n>',
  'unge',
  'norske',
  'mann',
  '<n>',
  'har',
  'du',
  'seet',
  'hva',
  'som',
  'stiger',
  '<n>',
  'opp',
  'om',
  'kjølens',
  'rand',
  '<n>',
  '</s>'],
 ['<s>',
  'skygger',
  'av',
  'de',
  'falne',
  'fedre',
  '<n>',
  'som',
  'har',
  'aldri',
  'visst',
  'det',
  'bedre',
  '<n>',
  'enn',
  'hvor',
  'slike',
  'ord',
  'ble',
  'sagt',
  '<n>',
  'der',
  'å',
  'vinke',
  'frem',
  'til',
  'vakt',
  '<n>',
  '</s>'],
 ['<s>',
  'svensken',
  'sier',
  'at',
  'det',
  'røde',
  '<n>',
  'i',
  'vårt',
  'norske',
  'flagg',
  '<n>',
  'det',
  'som',
  'rant',
  'da',
  'magnus',
  'døde',
  '<n>',
  'det',
  'som',
  'ler',
  'i',
  'dag',
  '<n>',
  'det',
  'som',
  'over',
  'halden',
  'beltet',
  '<n>',
  'det',
  'som',
  'over',
  'adler',
  'veltet',
  '<n>',
  'det',
  'kan',
  'svenskens',
  'gule-blå',
  '<n>',
  'uten',

In [9]:
word_model = train_word2vec(stanzas)
pretrained_weights = word_model.wv.vectors
vocab_size, embedding_size = pretrained_weights.shape
print(f"Vocab size: {vocab_size}\nEmbedding size: {embedding_size}")

pad_idx = word_model.wv.key_to_index[pad]
assert pad_idx == 0

Vocab size: 81
Embedding size: 100


In [10]:
X, y = get_train_data(stanzas, word_model.wv, pad_idx)

In [11]:
X.shape, y.shape

((126, 48), (126,))

In [12]:
model_name = "sanity_stanza"

# # uncomment to train model
# print('\nTraining LSTM...')
# model = Sequential([
#         Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[pretrained_weights], mask_zero=True),
#         LSTM(units=embedding_size),
#         Dense(units=vocab_size),
#         Activation('softmax')
#         ])


# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'training_{model_name}.log')
# history = model.fit(X, y,
#           batch_size=1,
#           epochs=20,
#           callbacks=[model_checkpoint, terminate_on_nan])

model = load_model(f"models/{model_name}.hdf5")

2022-05-03 18:18:04.583471: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-05-03 18:18:04.583497: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-05-03 18:18:04.583515: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (tita-laptop): /proc/driver/nvidia/version does not exist
2022-05-03 18:18:04.584813: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
for stanza in stanzas[1:]:
    print("---")
    print(f"train sent:\n{' '.join(stanza)}\n")
    print(f"prompt:{stanza[:4]}\n")
    gen = generate_next(" ".join(stanza[:4]), model, word_model.wv, num_generated=100, temperature=0, eos=eos)
    print(stanza_to_string(gen, sos, eos, newline))
    print("---")

---
train sent:
<s> svar fra norge <n> har du hørt hva svensken sier <n> unge norske mann <n> har du seet hva som stiger <n> opp om kjølens rand <n> </s>

prompt:['<s>', 'svar', 'fra', 'norge']

 svar fra norge 
 har du hørt hva svensken sier 
 unge norske mann 
 har du seet hva som stiger 
 opp om kjølens rand 
 
---
---
train sent:
<s> skygger av de falne fedre <n> som har aldri visst det bedre <n> enn hvor slike ord ble sagt <n> der å vinke frem til vakt <n> </s>

prompt:['<s>', 'skygger', 'av', 'de']

 skygger av de falne fedre 
 som har aldri visst det bedre 
 enn hvor slike ord ble sagt 
 der å vinke frem til vakt 
 
---
---
train sent:
<s> svensken sier at det røde <n> i vårt norske flagg <n> det som rant da magnus døde <n> det som ler i dag <n> det som over halden beltet <n> det som over adler veltet <n> det kan svenskens gule-blå <n> uten skam ei bære på <n> </s>

prompt:['<s>', 'svensken', 'sier', 'at']

 svensken sier at det røde 
 i vårt norske flagg 
 det som rant da magnu

# Sanity reverse stanza level

In [22]:
pad = "<pad>"
sos = "<s>"
eos = "</s>"
newline = "<n>"

stanzas = get_stanzas("../../norwegian_rhyme_scheme_corpus/poems/bokmål/", pad, sos, eos, newline)

In [23]:
stanzas = stanzas[:5]
for s in stanzas:
    s.reverse()
stanzas

[['<pad>'],
 ['</s>',
  '<n>',
  'rand',
  'kjølens',
  'om',
  'opp',
  '<n>',
  'stiger',
  'som',
  'hva',
  'seet',
  'du',
  'har',
  '<n>',
  'mann',
  'norske',
  'unge',
  '<n>',
  'sier',
  'svensken',
  'hva',
  'hørt',
  'du',
  'har',
  '<n>',
  'norge',
  'fra',
  'svar',
  '<s>'],
 ['</s>',
  '<n>',
  'vakt',
  'til',
  'frem',
  'vinke',
  'å',
  'der',
  '<n>',
  'sagt',
  'ble',
  'ord',
  'slike',
  'hvor',
  'enn',
  '<n>',
  'bedre',
  'det',
  'visst',
  'aldri',
  'har',
  'som',
  '<n>',
  'fedre',
  'falne',
  'de',
  'av',
  'skygger',
  '<s>'],
 ['</s>',
  '<n>',
  'på',
  'bære',
  'ei',
  'skam',
  'uten',
  '<n>',
  'gule-blå',
  'svenskens',
  'kan',
  'det',
  '<n>',
  'veltet',
  'adler',
  'over',
  'som',
  'det',
  '<n>',
  'beltet',
  'halden',
  'over',
  'som',
  'det',
  '<n>',
  'dag',
  'i',
  'ler',
  'som',
  'det',
  '<n>',
  'døde',
  'magnus',
  'da',
  'rant',
  'som',
  'det',
  '<n>',
  'flagg',
  'norske',
  'vårt',
  'i',
  '<n>',
  'r

In [24]:
word_model = train_word2vec(stanzas)
pretrained_weights = word_model.wv.vectors
vocab_size, embedding_size = pretrained_weights.shape
print(f"Vocab size: {vocab_size}\nEmbedding size: {embedding_size}")

pad_idx = word_model.wv.key_to_index[pad]
assert pad_idx == 0

Vocab size: 81
Embedding size: 100


In [25]:
X, y = get_train_data(stanzas, word_model.wv, pad_idx)
X.shape, y.shape

((126, 48), (126,))

In [26]:
model_name = "sanity_reverse_stanza"

# # uncomment to train model
# print('\nTraining LSTM...')
# model = Sequential([
#         Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[pretrained_weights], mask_zero=True),
#         LSTM(units=embedding_size),
#         Dense(units=vocab_size),
#         Activation('softmax')
#         ])


# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'training_{model_name}.log')
# history = model.fit(X, y,
#           batch_size=1,
#           epochs=20,
#           callbacks=[model_checkpoint, terminate_on_nan])

model = load_model(f"models/{model_name}.hdf5")

In [27]:
for stanza in stanzas[1:]:
    print("---")
    print(f"train sent (reverse):\n{' '.join(stanza)}\n")
    print(f"prompt:{stanza[:4]}\n")
    gen = generate_next(" ".join(stanza[:4]), model, word_model.wv, num_generated=100, temperature=0, eos=sos, reverse=True)
    print(stanza_to_string(gen, sos, eos, newline))
    print("---")

---
train sent (reverse):
</s> <n> rand kjølens om opp <n> stiger som hva seet du har <n> mann norske unge <n> sier svensken hva hørt du har <n> norge fra svar <s>

prompt:['</s>', '<n>', 'rand', 'kjølens']

 svar fra norge 
 har du hørt hva svensken sier 
 unge norske mann 
 har du seet hva som stiger 
 opp om kjølens rand 
 
---
---
train sent (reverse):
</s> <n> vakt til frem vinke å der <n> sagt ble ord slike hvor enn <n> bedre det visst aldri har som <n> fedre falne de av skygger <s>

prompt:['</s>', '<n>', 'vakt', 'til']

 skygger av de falne fedre 
 som har aldri visst det bedre 
 enn hvor slike ord ble sagt 
 der å vinke frem til vakt 
 
---
---
train sent (reverse):
</s> <n> på bære ei skam uten <n> gule-blå svenskens kan det <n> veltet adler over som det <n> beltet halden over som det <n> dag i ler som det <n> døde magnus da rant som det <n> flagg norske vårt i <n> røde det at sier svensken <s>

prompt:['</s>', '<n>', 'på', 'bære']

 svensken sier at det røde 
 i vårt norske 

# Full stanza level LSTM
(ended up not using this)

In [6]:
pad = "<pad>"
sos = "<s>"
eos = "</s>"
newline = "<n>"

stanzas = get_stanzas("../../norwegian_rhyme_scheme_corpus/poems/bokmål/", pad, sos, eos, newline)

In [7]:
model_name = "poetry_gen_stanza"
# # Uncomment to train
# word_model = train_word2vec(stanzas)
# word_model.save(f"models/word2vec_{model_name}.model")

word_model = gensim.models.Word2Vec.load(f"models/word2vec_{model_name}.model")

pretrained_weights = word_model.wv.vectors
vocab_size, embedding_size = pretrained_weights.shape
print(f"Vocab size: {vocab_size}\nEmbedding_size: {embedding_size}")
pad_idx = word_model.wv.key_to_index[pad]
assert pad_idx == 0

Vocab size: 17867
Embedding_size: 100


In [8]:
X, y = get_train_data(stanzas, word_model.wv, pad_idx)
X.shape, y.shape

((185896, 119), (185896,))

In [9]:
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=1000)

In [10]:
X_train.shape, X_dev.shape

((184896, 119), (1000, 119))

In [11]:
model_name = "poetry_gen_stanza"

# # uncomment to train model
# print('\nTraining LSTM...')
# model = Sequential([
#         Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[pretrained_weights], mask_zero=True),
#         LSTM(units=embedding_size),
#         Dense(units=vocab_size),
#         Activation('softmax')
#         ])


# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# model_checkpoint = tf.keras.callbacks.ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = tf.keras.callbacks.TerminateOnNaN()
# csv_logger = tf.keras.callbacks.CSVLogger(f'logs/training_{model_name}.log')
# early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

# history = model.fit(X_train, y_train,
#                     batch_size=128,
#                     epochs=100,
#                     validation_data=(X_dev, y_dev),
#                     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop])

model = load_model(f"models/{model_name}.hdf5")

2022-05-04 02:24:43.880703: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-05-04 02:24:43.880745: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-05-04 02:24:43.880771: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (tita-laptop): /proc/driver/nvidia/version does not exist
2022-05-04 02:24:43.881098: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
sample_sents = [
    "<s> ser du",
    "<s> har du",
    "<s> for",
    "<s> jeg vil",
]

for prompt in sample_sents:
    print(f"prompt: {prompt}")
    gen = generate_next(prompt, model, word_model.wv, num_generated=100, temperature=0, eos=eos)
    print(stanza_to_string(gen, sos, eos, newline))
    print("---")

prompt: <s> ser du
 ser du barn av latter 
 helt vet jeg mange og grå er og godt 
 og byen når man vokser at hjemme 
 og får man lyse hus fler hit 
 si er det er mere blitt sang så god 
 er du en mann til sist oss husk 
 at du skulle krenke det 
 
---
prompt: <s> har du
 har du sett deg disse stener 
 ditt fang 
 revet av et spill 
 under disse stolte livet 
 med stolte under all trengsel 
 trange gylne slott for lysredd og for nye mot 
 for verdens glans og i eie 
 å folk den skjulte stad 
 
---
prompt: <s> for
 for en fordomsfri fortolkning 
 synes sakens løsning funnet 
 visselig er dagen rundet 
 for den ganske jords befolkning 
 klare ligger herrens planer 
 dog foruten makt den makt og spilt 
 
---
prompt: <s> jeg vil
 jeg vil fare ut i all min velde 
 for dette slag er din vinning din sjel 
 vil jeg deg gud bring din vrede 
 o hvor såre hvorfor fant ham 
 jeg sviktet hvorfor å føre dere venner 
 for henne og herre og herre lever 
 opfylie kan vi tapte fremmede guder 
 min vugge 