source: https://github.com/minoguep/rhyme_detection and https://paulminogue.com/index.php/2021/02/14/using-a-siamese-neural-network-to-create-a-simple-rhyme-detector/

In [148]:
import string
import json

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from tensorflow.keras import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Subtract, Lambda
from tensorflow.keras.callbacks import ModelCheckpoint, TerminateOnNaN, CSVLogger, EarlyStopping

tqdm.pandas()

MAX_LEN = 64
SEED = 420
# sets random, np.random and tf.random seed
tf.keras.utils.set_random_seed(
    SEED
)

## Model architecture

In [149]:
# Pauls code
def create_model():
  
    word_a_input_tokens = Input(
      shape=(MAX_LEN, 1), name='word_a_input_tokens'
      )
    word_b_input_tokens = Input(
      shape=(MAX_LEN, 1), name='word_b_input_tokens'
      )
    
    # This is the siamese portion of the model 
    common_lstm = LSTM(64, return_sequences=False, activation="relu", name="common_lstm_layer")

    word_a_lstm_output = common_lstm(word_a_input_tokens)
    word_b_lstm_output = common_lstm(word_b_input_tokens)

    #concatenate_lstm_outputs
    concat_layer = Subtract(name="concatenate_lstm_outputs")(
      [word_a_lstm_output, word_b_lstm_output]
      )
        
    # dense layers before final classification
    dense_layers = Dense(64, activation="relu", name="first_dense_layer")(concat_layer)
    dense_layers = Dropout(0.5)(dense_layers)

    dense_layers = Dense(32, activation="relu", name="second_dense_layer")(dense_layers)
    dense_layers = Dropout(0.5)(dense_layers)

    dense_layers = Dense(8, activation="relu", name="third_dense_layer")(dense_layers)
    dense_layers = Dropout(0.5)(dense_layers)

    classification_layer = Dense(1, activation="sigmoid", name="classification_layer")(dense_layers)
    
    model = Model(
      inputs=[word_a_input_tokens, word_b_input_tokens], 
      outputs = classification_layer
      )

    model.compile(
      loss="binary_crossentropy",
      metrics=["accuracy"],
      optimizer="Adam"
    )

    return model

In [150]:
# My symmetric version
def create_symmetric_model():  
    word_a_input_tokens = Input(
      shape=(MAX_LEN, 1), name='word_a_input_tokens'
      )
    word_b_input_tokens = Input(
      shape=(MAX_LEN, 1), name='word_b_input_tokens'
      )
    
    # This is the siamese portion of the model 
    common_lstm = LSTM(64, return_sequences=False, activation="relu", name="common_lstm_layer")

    word_a_lstm_output = common_lstm(word_a_input_tokens)
    word_b_lstm_output = common_lstm(word_b_input_tokens)

    #concatenate_lstm_outputs
    concat_layer = Subtract(name="concatenate_lstm_outputs")(
      [word_a_lstm_output, word_b_lstm_output]
      )
    
    concat_layer = Lambda(lambda x: tf.math.abs(x))(concat_layer)
    
    # dense layers before final classification
    dense_layers = Dense(64, activation="relu", name="first_dense_layer")(concat_layer)
    dense_layers = Dropout(0.5)(dense_layers)

    dense_layers = Dense(32, activation="relu", name="second_dense_layer")(dense_layers)
    dense_layers = Dropout(0.5)(dense_layers)

    dense_layers = Dense(8, activation="relu", name="third_dense_layer")(dense_layers)
    dense_layers = Dropout(0.5)(dense_layers)

    classification_layer = Dense(1, activation="sigmoid", name="classification_layer")(dense_layers)
    
    model = Model(
      inputs=[word_a_input_tokens, word_b_input_tokens], 
      outputs = classification_layer
      )

    model.compile(
      loss="binary_crossentropy",
      metrics=["accuracy"],
      optimizer="Adam"
    )

    return model

# Data
We want equally many positive and negative samples of rhyme pairs

In [151]:
pos = pd.read_csv("tsvs/positive_pairs.tsv", sep="\t")
full_neg = pd.read_csv("tsvs/negative_pairs.tsv", sep="\t")
neg = full_neg.sample(n=len(pos), random_state=SEED)
df = pd.concat([pos, neg])
df = df.reset_index(drop=True)
df

Unnamed: 0,word_a,word_b,rhyme
0,stall,skrall,1
1,ving,ting,1
2,orden,horden,1
3,taler,svaler,1
4,juleskikk,blikk,1
...,...,...,...
14471,hatt,slott,0
14472,mur,stund,0
14473,hud,mave,0
14474,dør,sans,0


## Words to char tokens
Copy paste from Pauls notebook

In [152]:
def tokenize_inputs(phrase_a, phrase_b, tokenizer):
    tokenized_phrases = tokenizer.texts_to_sequences([phrase_a, phrase_b])

    # now loop through inputs and pad or reduce size if required
    tokenized_phrases_for_output = []
    for phrase in tokenized_phrases:
        if len(phrase) < MAX_LEN:
            length_to_pad = MAX_LEN - len(phrase)
            phrase_for_output = ([0] * length_to_pad) + phrase
        elif len(phrase) > MAX_LEN:
            phrase_for_output = phrase[-MAX_LEN:]
        else:
            phrase_for_output = phrase
        tokenized_phrases_for_output.append(phrase_for_output)

    return tf.constant(tokenized_phrases_for_output, dtype=tf.float64)

In [4]:
# all_data = pd.read_csv("tsvs/tita_rhymes_poems.tsv", sep="\t")

# all_text = ""
# for e in all_data.stanza:
#     all_text += e
# tokenizer = Tokenizer(char_level=True, lower=True)
# tokenizer.fit_on_texts(all_text)

# tokenizer_config = tokenizer.to_json()

# with open('siamese_lstm_tokenizer_config.json', 'w') as f:
#     f.write(tokenizer_config)

In [153]:
with open("siamese_lstm_tokenizer_config.json") as f:
    tokenizer_config = f.read()

tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenizer_config)

In [154]:
df['word_tokens'] = df.progress_apply(
    lambda row: tokenize_inputs(row['word_a'], row['word_b'], tokenizer), axis=1
)

  0%|          | 0/14476 [00:00<?, ?it/s]

In [155]:
X_train_indexes, X_test_indexes, y_train, y_test = train_test_split(
    list(df.index), list(df['rhyme']), stratify=df['rhyme'], 
    test_size=0.4, random_state=SEED
    )

X_test_indexes, X_val_indexes, y_test, y_val = train_test_split(
    X_test_indexes, y_test, stratify=y_test, 
    test_size=0.25, random_state=SEED
    )

X_train = tf.convert_to_tensor(list(df.loc[X_train_indexes]["word_tokens"]))
X_val = tf.convert_to_tensor(list(df.loc[X_val_indexes]["word_tokens"]))
X_test = tf.convert_to_tensor(list(df.loc[X_test_indexes]["word_tokens"]))

y_train = tf.convert_to_tensor(y_train)
y_val = tf.convert_to_tensor(y_val)
y_test = tf.convert_to_tensor(y_test)

In [156]:
print(f"""
    Data set size: Full set: {len(df)}
    Train: {len(X_train)}
    Validation: {len(X_val)}
    Test: {len(X_test)}
""")


    Data set size: Full set: 14476
    Train: 8685
    Validation: 1448
    Test: 4343



# Baseline

In [157]:
test_text = df.loc[X_test_indexes]
test_text

Unnamed: 0,word_a,word_b,rhyme,word_tokens
1789,hugge,vugge,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
6140,snar,far,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
9312,høre,kvider,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
10727,brenner,gang,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
13969,hernede,saft,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
...,...,...,...,...
5965,sti,verdi,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
10776,nåde,grunn,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
4184,fans,krans,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
12052,sanne,rid,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."


In [158]:
def is_vowel(char):
    return char in ['e', 'a', 'i', 'o', 'u', 'å', 'ø', 'y', 'æ', 'é', 'ö', 'ä', 'á', 'à']

def last_letters(word):
    for i in range(len(word)-1, -1, -1):
        if is_vowel(word[i]):
            break
    return word[i:]

def rhymes(word_a, word_b):
    return last_letters(word_a) == last_letters(word_b)


baseline_pred = [rhymes(word_a, word_b) for word_a, word_b in zip(test_text.word_a, test_text.word_b)]

In [159]:
print("Test set accuracy")
print(classification_report(y_test, baseline_pred))

Test set accuracy
              precision    recall  f1-score   support

           0       0.88      0.93      0.90      2171
           1       0.93      0.87      0.90      2172

    accuracy                           0.90      4343
   macro avg       0.90      0.90      0.90      4343
weighted avg       0.90      0.90      0.90      4343



In [160]:
samples = [
    ["se det", "frede"], 
    ["tid", "svineri"],
    ["sagt", "sagd"],
    ["vilje", "vanilje"], 
    ["vokser", "bukser"],
    ["ha det", "badet"]
]

for a, b in samples:
    pred = rhymes(a,b)
    print("Sentence 1:", a)
    print("Sentence 2:", b)
    if pred:
        print("Rhyme\n")
    else:
        print("Not rhyme\n")
    

Sentence 1: se det
Sentence 2: frede
Not rhyme

Sentence 1: tid
Sentence 2: svineri
Not rhyme

Sentence 1: sagt
Sentence 2: sagd
Not rhyme

Sentence 1: vilje
Sentence 2: vanilje
Rhyme

Sentence 1: vokser
Sentence 2: bukser
Rhyme

Sentence 1: ha det
Sentence 2: badet
Rhyme



# First model

In [163]:
model_name = "rhyme_model_1"

# model = create_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')
# early_stop = EarlyStopping(monitor='val_loss', patience=5)

# history = model.fit(
#     [X_train[:, 0], X_train[:, 1]],
#     y_train,
#     batch_size=128,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100


In [164]:
model_name = "rhyme_model_1"

# load the model
model = load_model(f"models/{model_name}.hdf5")
y_pred = model.predict([X_val[:, 0], X_val[:, 1]])
y_pred = y_pred > 0.5
print("Val set accuracy")
print(classification_report(y_val, y_pred))

# y_pred = model.predict([X_test[:, 0], X_test[:, 1]])
# y_pred = y_pred > 0.5
# print(classification_report(y_test, y_pred))

Val set accuracy
              precision    recall  f1-score   support

           0       0.92      0.96      0.94       724
           1       0.96      0.92      0.94       724

    accuracy                           0.94      1448
   macro avg       0.94      0.94      0.94      1448
weighted avg       0.94      0.94      0.94      1448



In [15]:
def get_mirrored_df(df):
    df = df.copy()
    mirror = pd.DataFrame({"word_a": df["word_b"], 
                           "word_b": df["word_a"], 
                           "rhyme": df["rhyme"], 
                           "word_tokens":[(t[1], t[0]) for t in df["word_tokens"]]})
    return mirror

In [16]:
val = df.loc[X_val_indexes].copy()
val["pred1"] = y_pred
val.loc[val["pred1"] != val["rhyme"]]

Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1
6537,havet,gave,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
6898,bred,se,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
14059,ende,vé,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
8529,sale,tale,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
10339,øyet,er,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
...,...,...,...,...,...
13429,dikt,raft,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
460,rikmannslott,fått,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
1010,uskyldsren,en,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
6395,mer,fler,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False


In [17]:
mirror_val = get_mirrored_df(val)
mirror_val

Unnamed: 0,word_a,word_b,rhyme,word_tokens
3594,hytte,beskytte,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
2736,halm,malm,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
14351,ganger,visen,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
5492,forbi,li,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
9140,borg,ene,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
...,...,...,...,...
13766,landet,en,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
9926,morgen,hjul,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
1196,ferdselsårer,kårer,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
6586,finne,noensinne,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."


In [18]:
mirror_X_val = tf.convert_to_tensor(list(mirror_val["word_tokens"]))

mirror_y_pred = model.predict([mirror_X_val[:, 0], mirror_X_val[:, 1]])
mirror_y_pred = mirror_y_pred > 0.5
print(classification_report(y_val, mirror_y_pred))

              precision    recall  f1-score   support

           0       0.91      0.97      0.94       724
           1       0.97      0.90      0.93       724

    accuracy                           0.94      1448
   macro avg       0.94      0.94      0.94      1448
weighted avg       0.94      0.94      0.94      1448



In [19]:
val["pred2"] = mirror_y_pred
diff = val.loc[val["pred1"] != val["pred2"]]
print(len(diff))
diff

36


Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1,pred2
2068,'s,santkehans,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
4491,skred,ned,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
7007,inn,avgrundsskinn,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
12892,presteskrud,ei,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
6934,garantier,vier,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
304,lever,strever,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
3178,gård,sår,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
1322,ve,tre,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
1509,sti,forbi,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
13497,kveget,banket,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False


In [20]:
samples = [
    ["se det", "frede"], 
    ["tid", "svineri"],
    ["sagt", "sagd"],
    ["vilje", "vanilje"], 
    ["vokser", "bukser"],
    ["ha det", "badet"]
]

samples2 = [[b,a] for a,b in samples]

sample_tokens = [tokenize_inputs(lyrics[0], lyrics[1], tokenizer) for lyrics in samples]
sample_tokens = tf.convert_to_tensor(sample_tokens)

sample_tokens2 = [tokenize_inputs(lyrics[0], lyrics[1], tokenizer) for lyrics in samples2]
sample_tokens2 = tf.convert_to_tensor(sample_tokens2)

sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]

sample_pred2 = model.predict([sample_tokens2[:, 0], sample_tokens2[:, 1]])
predictions2 = [round(pred[0], 4) for pred in sample_pred2]

for i in range(len(samples)):
    print(f"Sentence 1: {samples[i][0]}")
    print(f"Sentence 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print(f"{'Rhyme' if predictions2[i] > 0.5 else 'Non-rhyme'}({predictions2[i]})")
    print("---------------\n")

Sentence 1: se det
Sentence 2: frede
Non-rhyme(0.3463999927043915)
Non-rhyme(0.29510000348091125)
---------------

Sentence 1: tid
Sentence 2: svineri
Non-rhyme(0.3490000069141388)
Non-rhyme(0.30869999527931213)
---------------

Sentence 1: sagt
Sentence 2: sagd
Non-rhyme(0.27160000801086426)
Non-rhyme(0.30329999327659607)
---------------

Sentence 1: vilje
Sentence 2: vanilje
Rhyme(0.9952999949455261)
Rhyme(0.9952999949455261)
---------------

Sentence 1: vokser
Sentence 2: bukser
Rhyme(0.9911999702453613)
Rhyme(0.9477999806404114)
---------------

Sentence 1: ha det
Sentence 2: badet
Non-rhyme(0.26499998569488525)
Non-rhyme(0.257999986410141)
---------------



# Try again with different 2:3 positive to negative ratio
Expand train set.  
The set is already 50/50 positive and negative.  
We want to make it 40/60 --> increase negative examples by half of what we already have

In [165]:
new_neg_train = len(X_train) // 4

new_neg_train

2171

In [166]:
# use same seed, and extract all the pairs already used + the ones we need 
negative2 = full_neg.sample(n=len(pos) + new_neg_train, random_state=SEED)

In [167]:
unused_neg = negative2[len(pos):]
unused_neg

Unnamed: 0,word_a,word_b,rhyme
9448,hår,blundet,0
15651,brenner,skatt,0
16533,siv,stille,0
16219,verdensdommen,kronet,0
13700,jord,sans,0
...,...,...,...
3741,stå,gang,0
20029,vekt,dørken,0
11242,sner,huset,0
9095,klang,rørte,0


In [168]:
# use same tokenizer as above
unused_neg['word_tokens'] = unused_neg.progress_apply(
    lambda row: tokenize_inputs(row['word_a'], row['word_b'], tokenizer), axis=1
)
unused_neg

  0%|          | 0/2171 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unused_neg['word_tokens'] = unused_neg.progress_apply(


Unnamed: 0,word_a,word_b,rhyme,word_tokens
9448,hår,blundet,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
15651,brenner,skatt,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
16533,siv,stille,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
16219,verdensdommen,kronet,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
13700,jord,sans,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
...,...,...,...,...
3741,stå,gang,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
20029,vekt,dørken,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
11242,sner,huset,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
9095,klang,rørte,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."


In [169]:
X_new = list(unused_neg["word_tokens"])
y_new = list(unused_neg["rhyme"])

In [170]:
X_train_new = tf.convert_to_tensor(list(df.loc[X_train_indexes]["word_tokens"]) +  X_new)
y_train_new = tf.convert_to_tensor(list(y_train) + y_new)


X_train_new.shape, y_train_new.shape

(TensorShape([10856, 2, 64]), TensorShape([10856]))

In [171]:
model_name = "rhyme_model_23_ratio"

# # Uncomment to train
# model = create_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')
# early_stop = EarlyStopping(monitor='val_loss', patience=5)

# history = model.fit(
#     [X_train_new[:, 0], X_train_new[:, 1]],
#     y_train_new,
#     batch_size=128,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100


In [172]:
model_name = "rhyme_model_23_ratio"

# load the model
model = load_model(f"models/{model_name}.hdf5")

y_pred = model.predict([X_val[:, 0], X_val[:, 1]])
y_pred = y_pred > 0.5
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.96      0.95       724
           1       0.96      0.93      0.94       724

    accuracy                           0.95      1448
   macro avg       0.95      0.95      0.95      1448
weighted avg       0.95      0.95      0.95      1448



In [29]:
val = df.loc[X_val_indexes].copy()
val["pred1"] = y_pred
val.loc[val["pred1"] != val["rhyme"]]

Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1
14351,visen,ganger,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
6537,havet,gave,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
10849,gløder,skjelver,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
14059,ende,vé,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
13792,land,skritt,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
...,...,...,...,...,...
13283,besvær,slipper,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
9777,seder,døden,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
6395,mer,fler,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
13766,en,landet,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True


In [30]:
mirror_val = get_mirrored_df(val)
mirror_X_val = tf.convert_to_tensor(list(mirror_val["word_tokens"]))

mirror_y_pred = model.predict([mirror_X_val[:, 0], mirror_X_val[:, 1]])
mirror_y_pred = mirror_y_pred > 0.5
print(classification_report(y_val, mirror_y_pred))

              precision    recall  f1-score   support

           0       0.95      0.92      0.93       724
           1       0.92      0.95      0.93       724

    accuracy                           0.93      1448
   macro avg       0.93      0.93      0.93      1448
weighted avg       0.93      0.93      0.93      1448



In [31]:
val["pred2"] = mirror_y_pred
diff = val.loc[val["pred1"] != val["pred2"]]
print(len(diff))
diff

40


Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1,pred2
10849,gløder,skjelver,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
8478,Graven,lukket,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
10012,nær,svenne,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
7978,garn,rest,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
2791,min,inn,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
2068,'s,santkehans,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
6160,godt,huldreslått,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
11256,plagg,frem,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
1646,munn,morgenstund,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
7991,se,kveldene,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True


In [32]:
samples2 = [[b,a] for a,b in samples]

sample_tokens = [tokenize_inputs(lyrics[0], lyrics[1], tokenizer) for lyrics in samples]
sample_tokens = tf.convert_to_tensor(sample_tokens)

sample_tokens2 = [tokenize_inputs(lyrics[0], lyrics[1], tokenizer) for lyrics in samples2]
sample_tokens2 = tf.convert_to_tensor(sample_tokens2)

sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]

sample_pred2 = model.predict([sample_tokens2[:, 0], sample_tokens2[:, 1]])
predictions2 = [round(pred[0], 4) for pred in sample_pred2]

for i in range(len(samples)):
    print(f"Sentence 1: {samples[i][0]}")
    print(f"Sentence 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print(f"{'Rhyme' if predictions2[i] > 0.5 else 'Non-rhyme'}({predictions2[i]})")
    print("---------------\n")

Sentence 1: se det
Sentence 2: frede
Non-rhyme(0.4487000107765198)
Rhyme(0.7192999720573425)
---------------

Sentence 1: tid
Sentence 2: svineri
Rhyme(0.7950000166893005)
Non-rhyme(0.46619999408721924)
---------------

Sentence 1: sagt
Sentence 2: sagd
Rhyme(0.7366999983787537)
Rhyme(0.5515999794006348)
---------------

Sentence 1: vilje
Sentence 2: vanilje
Rhyme(0.8246999979019165)
Rhyme(0.8246999979019165)
---------------

Sentence 1: vokser
Sentence 2: bukser
Rhyme(0.8246999979019165)
Rhyme(0.8246999979019165)
---------------

Sentence 1: ha det
Sentence 2: badet
Non-rhyme(0.15049999952316284)
Non-rhyme(0.35659998655319214)
---------------



# Use symmetric model

In [173]:
model_name = "rhyme_model_1_symmetric"

# # Uncomment to train model 

# model = create_symmetric_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5", monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')
# early_stop = EarlyStopping(monitor='val_loss', patience=5)

# history = model.fit(
#     [X_train[:, 0], X_train[:, 1]],
#     y_train,
#     batch_size=128,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100


In [174]:
# load the model

model = load_model(f"models/{model_name}.hdf5")

y_pred = model.predict([X_val[:, 0], X_val[:, 1]])
y_pred = y_pred > 0.5
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94       724
           1       0.94      0.95      0.94       724

    accuracy                           0.94      1448
   macro avg       0.94      0.94      0.94      1448
weighted avg       0.94      0.94      0.94      1448



In [36]:
val = df.loc[X_val_indexes].copy()
val["pred1"] = y_pred
mirror_val = get_mirrored_df(val)
mirror_X_val = tf.convert_to_tensor(list(mirror_val["word_tokens"]))

mirror_y_pred = model.predict([mirror_X_val[:, 0], mirror_X_val[:, 1]])
mirror_y_pred = mirror_y_pred > 0.5
val["pred2"] = mirror_y_pred


val.loc[val["pred1"] != val["pred2"]]

Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1,pred2


In [37]:
sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]

sample_pred2 = model.predict([sample_tokens2[:, 0], sample_tokens2[:, 1]])
predictions2 = [round(pred[0], 4) for pred in sample_pred2]

for i in range(len(samples)):
    print(f"Sentence 1: {samples[i][0]}")
    print(f"Sentence 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print(f"{'Rhyme' if predictions2[i] > 0.5 else 'Non-rhyme'}({predictions2[i]})")
    print("---------------\n")

Sentence 1: se det
Sentence 2: frede
Rhyme(0.7488999962806702)
Rhyme(0.7488999962806702)
---------------

Sentence 1: tid
Sentence 2: svineri
Rhyme(0.6347000002861023)
Rhyme(0.6347000002861023)
---------------

Sentence 1: sagt
Sentence 2: sagd
Non-rhyme(0.093299999833107)
Non-rhyme(0.093299999833107)
---------------

Sentence 1: vilje
Sentence 2: vanilje
Rhyme(0.9995999932289124)
Rhyme(0.9995999932289124)
---------------

Sentence 1: vokser
Sentence 2: bukser
Rhyme(0.9987999796867371)
Rhyme(0.9987999796867371)
---------------

Sentence 1: ha det
Sentence 2: badet
Non-rhyme(0.1657000035047531)
Non-rhyme(0.1657000035047531)
---------------



## 2:3 ratio symmetric

In [175]:
model_name = "rhyme_model_23_ratio_symmetric"

# #Uncomment to train model 
# model = create_symmetric_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')
# early_stop = EarlyStopping(monitor='val_loss', patience=5)

# history = model.fit(
#     [X_train_new[:, 0], X_train_new[:, 1]],
#     y_train_new,
#     batch_size=128,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100


In [39]:
# load the model
model = load_model(f"models/{model_name}.hdf5")

y_pred = model.predict([X_val[:, 0], X_val[:, 1]])
y_pred = y_pred > 0.5
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95       724
           1       0.96      0.94      0.95       724

    accuracy                           0.95      1448
   macro avg       0.95      0.95      0.95      1448
weighted avg       0.95      0.95      0.95      1448



In [40]:
val = df.loc[X_val_indexes].copy()
val["pred1"] = y_pred
mirror_val = get_mirrored_df(val)
mirror_X_val = tf.convert_to_tensor(list(mirror_val["word_tokens"]))

mirror_y_pred = model.predict([mirror_X_val[:, 0], mirror_X_val[:, 1]])
mirror_y_pred = mirror_y_pred > 0.5
val["pred2"] = mirror_y_pred


val.loc[val["pred1"] != val["pred2"]]

Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1,pred2


In [41]:
sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]

for i in range(len(samples)):
    print(f"Sentence 1: {samples[i][0]}")
    print(f"Sentence 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print("---------------\n")

Sentence 1: se det
Sentence 2: frede
Non-rhyme(0.018200000748038292)
---------------

Sentence 1: tid
Sentence 2: svineri
Non-rhyme(0.10849999636411667)
---------------

Sentence 1: sagt
Sentence 2: sagd
Non-rhyme(0.35670000314712524)
---------------

Sentence 1: vilje
Sentence 2: vanilje
Rhyme(0.9998000264167786)
---------------

Sentence 1: vokser
Sentence 2: bukser
Rhyme(1.0)
---------------

Sentence 1: ha det
Sentence 2: badet
Non-rhyme(0.0)
---------------



# Use dense rhyme pair dataset

In [42]:
rhyme_pairs = pd.read_csv("tsvs/manual_repair_plus_good_buckets_pairs.tsv", sep="\t")
neg_pairs = pd.read_csv("tsvs/negative_pairs.tsv", sep="\t")

pos_pairs = rhyme_pairs.sample(n=len(neg_pairs), random_state=SEED)
pos_pairs = pos_pairs.reset_index(drop=True)
dense_df = pd.concat((pos_pairs, neg_pairs))
dense_df = dense_df.reset_index(drop=True)
dense_df

Unnamed: 0,word_a,word_b,rhyme
0,brand,Kinamann,1
1,flammespill,hvil,1
2,tømmer,strålestrømmer,1
3,grunn,elvebunn,1
4,svøpe,støpe,1
...,...,...,...
44889,frekk,kupler,0
44890,ness,snor,0
44891,seil,gynget,0
44892,LOFOTEN,Røst,0


## Remove the pairs that are in test set

In [43]:
test = df.loc[X_test_indexes].copy()
test

Unnamed: 0,word_a,word_b,rhyme,word_tokens
1789,hugge,vugge,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
6140,snar,far,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
9312,høre,kvider,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
10727,brenner,gang,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
13969,hernede,saft,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
...,...,...,...,...
5965,sti,verdi,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
10776,nåde,grunn,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
4184,fans,krans,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
12052,sanne,rid,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."


In [44]:
test_pairs = set(zip(test.word_a, test.word_b))
dense_pairs = set(zip(dense_df.word_a, dense_df.word_b))
overlapping_pairs = test_pairs.intersection(dense_pairs)
len(overlapping_pairs), len(test_pairs)

(2798, 4343)

In [45]:
drop_is = [dense_df[(dense_df.word_a == a)&(dense_df.word_b == b)].index.to_list()[0] for a,b in overlapping_pairs]

In [46]:
dense_df = dense_df.drop(drop_is)

## Train etc

In [47]:
dense_df['word_tokens'] = dense_df.progress_apply(
    lambda row: tokenize_inputs(row['word_a'], row['word_b'], tokenizer), axis=1
)

  0%|          | 0/42096 [00:00<?, ?it/s]

In [48]:
X_train = tf.convert_to_tensor(list(dense_df["word_tokens"]))
y_train = tf.convert_to_tensor(list(dense_df['rhyme']))

In [49]:
X_train.shape, y_train.shape

(TensorShape([42096, 2, 64]), TensorShape([42096]))

In [50]:
model_name = "rhyme_model_40k"

# # Uncomment to train model 

# model = create_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')
# early_stop = EarlyStopping(monitor='loss', patience=5)

# history = model.fit(
#     [X_train[:, 0], X_train[:, 1]],
#     y_train,
#     batch_size=256,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

In [51]:
# load the model
model = load_model(f"models/{model_name}.hdf5")

y_pred = model.predict([X_val[:, 0], X_val[:, 1]])
y_pred = y_pred > 0.5
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       724
           1       0.97      0.97      0.97       724

    accuracy                           0.97      1448
   macro avg       0.97      0.97      0.97      1448
weighted avg       0.97      0.97      0.97      1448



In [52]:
val = df.loc[X_val_indexes].copy()
val["pred1"] = y_pred
diff = val.loc[val["pred1"] != val["rhyme"]]
print(len(diff))
diff

43


Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1
6537,havet,gave,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
8529,sale,tale,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
10339,øyet,er,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
6803,ahner,faner,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
7996,sang,sildring,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
6624,nu,sous,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
8397,Evighøye,leie,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
1584,tent,procent,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
4422,bever,fred,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
13613,li,seil,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True


In [53]:
mirror_val = get_mirrored_df(val)
mirror_val

Unnamed: 0,word_a,word_b,rhyme,word_tokens
3594,hytte,beskytte,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
2736,halm,malm,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
14351,ganger,visen,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
5492,forbi,li,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
9140,borg,ene,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
...,...,...,...,...
13766,landet,en,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
9926,morgen,hjul,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
1196,ferdselsårer,kårer,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
6586,finne,noensinne,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."


In [54]:
mirror_X_val = tf.convert_to_tensor(list(mirror_val["word_tokens"]))

mirror_y_pred = model.predict([mirror_X_val[:, 0], mirror_X_val[:, 1]])
mirror_y_pred = mirror_y_pred > 0.5
print(classification_report(y_val, mirror_y_pred))

              precision    recall  f1-score   support

           0       0.96      0.95      0.96       724
           1       0.95      0.96      0.96       724

    accuracy                           0.96      1448
   macro avg       0.96      0.96      0.96      1448
weighted avg       0.96      0.96      0.96      1448



In [55]:
val["pred2"] = mirror_y_pred
diff = val.loc[val["pred1"] != val["pred2"]]
print(len(diff))
diff

34


Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1,pred2
8320,kompani,blod,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
10339,øyet,er,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
6803,ahner,faner,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
8397,Evighøye,leie,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
1584,tent,procent,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
8939,engang,li,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
10322,bragt,flukt,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
8200,nær,ren,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
8357,øye,skje,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
2154,stammen,amen,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False


In [56]:
sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]

sample_pred2 = model.predict([sample_tokens2[:, 0], sample_tokens2[:, 1]])
predictions2 = [round(pred[0], 4) for pred in sample_pred2]

for i in range(len(samples)):
    print(f"Sentence 1: {samples[i][0]}")
    print(f"Sentence 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print(f"{'Rhyme' if predictions2[i] > 0.5 else 'Non-rhyme'}({predictions2[i]})")
    print("---------------\n")

Sentence 1: se det
Sentence 2: frede
Rhyme(0.6312999725341797)
Non-rhyme(0.40459999442100525)
---------------

Sentence 1: tid
Sentence 2: svineri
Rhyme(0.8783000111579895)
Rhyme(0.7177000045776367)
---------------

Sentence 1: sagt
Sentence 2: sagd
Non-rhyme(0.0)
Non-rhyme(0.0)
---------------

Sentence 1: vilje
Sentence 2: vanilje
Rhyme(0.9843000173568726)
Rhyme(0.9854000210762024)
---------------

Sentence 1: vokser
Sentence 2: bukser
Rhyme(0.9803000092506409)
Rhyme(0.9803000092506409)
---------------

Sentence 1: ha det
Sentence 2: badet
Non-rhyme(0.0)
Non-rhyme(0.0)
---------------



## Symmetric 40k

In [57]:
model_name = "rhyme_model_40k_symmetric"

# #Uncomment to train model 
# model = create_symmetric_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')
# early_stop = EarlyStopping(monitor='loss', patience=5)

# history = model.fit(
#     [X_train[:, 0], X_train[:, 1]],
#     y_train,
#     batch_size=256,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

In [58]:
# load the model
model = load_model(f"models/{model_name}.hdf5")

y_pred = model.predict([X_val[:, 0], X_val[:, 1]])
y_pred = y_pred > 0.5
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97       724
           1       0.98      0.97      0.97       724

    accuracy                           0.97      1448
   macro avg       0.97      0.97      0.97      1448
weighted avg       0.97      0.97      0.97      1448



In [59]:
val = df.loc[X_val_indexes].copy()
val["pred1"] = y_pred
diff = val.loc[val["pred1"] != val["rhyme"]]
print(len(diff))


38


In [60]:
sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]

for i in range(len(samples)):
    print(f"Sentence 1: {samples[i][0]}")
    print(f"Sentence 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print("---------------\n")

Sentence 1: se det
Sentence 2: frede
Rhyme(0.9480000138282776)
---------------

Sentence 1: tid
Sentence 2: svineri
Non-rhyme(0.2621999979019165)
---------------

Sentence 1: sagt
Sentence 2: sagd
Non-rhyme(0.11819999665021896)
---------------

Sentence 1: vilje
Sentence 2: vanilje
Rhyme(1.0)
---------------

Sentence 1: vokser
Sentence 2: bukser
Rhyme(1.0)
---------------

Sentence 1: ha det
Sentence 2: badet
Non-rhyme(0.0)
---------------



# Use dense and wiktionary dataset

In [119]:
rhyme_pairs = pd.read_csv("tsvs/merged_buckets_pairs.tsv", sep="\t")
non_rhymes = pd.read_csv("tsvs/wiktionary_negative_rhyme_pairs.tsv", sep="\t")
non_rhymes["rhyme"] = [0]*len(non_rhymes)


pos = rhyme_pairs.sample(n=23000, random_state=SEED)
neg = non_rhymes.sample(n=23000, random_state=SEED)
dw_df = pd.concat([pos, neg])
dw_df = dw_df.reset_index(drop=True)
dw_df

Unnamed: 0,word_a,word_b,rhyme
0,smil,kodisill,1
1,hybridbil,renkespill,1
2,vigil,till,1
3,hittil,racerbil,1
4,husvill,ambulansebil,1
...,...,...,...
45995,prillarhorn,måsefugl,0
45996,vigil,sjalusi,0
45997,bort,postkort,0
45998,munnspill,uplassert,0


In [120]:
dw_pairs = set(zip(dw_df.word_a, dw_df.word_b))
overlapping_pairs = test_pairs.intersection(dw_pairs)
len(overlapping_pairs), len(test_pairs)

(163, 4343)

In [121]:
drop_is = [dw_df[(dw_df.word_a == a)&(dw_df.word_b == b)].index.to_list()[0] for a,b in overlapping_pairs]
len(drop_is)

163

In [122]:
dw_df = dw_df.drop(drop_is)

In [123]:
dw_df['word_tokens'] = dw_df.progress_apply(
    lambda row: tokenize_inputs(row['word_a'], row['word_b'], tokenizer), axis=1
)

  0%|          | 0/45837 [00:00<?, ?it/s]

In [124]:
dw_df

Unnamed: 0,word_a,word_b,rhyme,word_tokens
0,smil,kodisill,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
1,hybridbil,renkespill,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
2,vigil,till,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
3,hittil,racerbil,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
4,husvill,ambulansebil,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
...,...,...,...,...
45995,prillarhorn,måsefugl,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
45996,vigil,sjalusi,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
45997,bort,postkort,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
45998,munnspill,uplassert,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."


In [125]:
# X_train, X_test_dw, y_train, y_test_dw = train_test_split(
#     list(dw_df["word_tokens"]), list(dw_df['rhyme']), stratify=list(dw_df['rhyme']), 
#     test_size=len(y_test), random_state=SEED
#     )

X_train_indexes, X_test_dw_indexes, y_train, y_test_dw = train_test_split(
    list(dw_df.index), list(dw_df['rhyme']), stratify=list(dw_df['rhyme']), 
    test_size=len(y_test), random_state=SEED
    )

X_train = tf.convert_to_tensor(list(dw_df.loc[X_train_indexes]["word_tokens"]))
y_train = tf.convert_to_tensor(y_train)

X_test_dw = tf.convert_to_tensor(list(dw_df.loc[X_test_dw_indexes]["word_tokens"]))
y_test_dw = tf.convert_to_tensor(y_test_dw)

In [126]:
X_test_dw.shape, y_test_dw.shape

(TensorShape([4343, 2, 64]), TensorShape([4343]))

In [127]:
X_train.shape, y_train.shape

(TensorShape([41494, 2, 64]), TensorShape([41494]))

In [143]:
model_name = "rhyme_model_dw_40k"

# # Uncomment to train model 
# model = create_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')
# early_stop = EarlyStopping(monitor='val_loss', patience=5)

# history = model.fit(
#     [X_train[:, 0], X_train[:, 1]],
#     y_train,
#     batch_size=256,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

In [144]:
# load the model
model = load_model(f"models/{model_name}.hdf5")

y_pred = model.predict([X_val[:, 0], X_val[:, 1]])
y_pred = y_pred > 0.5
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.86      0.87       724
           1       0.86      0.89      0.88       724

    accuracy                           0.88      1448
   macro avg       0.88      0.88      0.88      1448
weighted avg       0.88      0.88      0.88      1448



In [145]:
val = df.loc[X_val_indexes].copy()
val["pred1"] = y_pred
diff = val.loc[val["pred1"] != val["rhyme"]]
print(len(diff))
diff

180


Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1
6537,havet,gave,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
4288,totenslæger,beger,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
6898,bred,se,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
5490,brann,sand,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
10247,rot,berget,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
...,...,...,...,...,...
13283,besvær,slipper,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
9777,seder,døden,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
6395,mer,fler,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
2223,inn,pålandsvind,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False


In [131]:
mirror_val = get_mirrored_df(val)
mirror_val

Unnamed: 0,word_a,word_b,rhyme,word_tokens
3594,hytte,beskytte,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
2736,halm,malm,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
14351,ganger,visen,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
5492,forbi,li,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
9140,borg,ene,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
...,...,...,...,...
13766,landet,en,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
9926,morgen,hjul,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
1196,ferdselsårer,kårer,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
6586,finne,noensinne,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."


In [132]:
mirror_X_val = tf.convert_to_tensor(list(mirror_val["word_tokens"]))

mirror_y_pred = model.predict([mirror_X_val[:, 0], mirror_X_val[:, 1]])
mirror_y_pred = mirror_y_pred > 0.5
print(classification_report(y_val, mirror_y_pred))

              precision    recall  f1-score   support

           0       0.95      0.50      0.66       724
           1       0.66      0.98      0.79       724

    accuracy                           0.74      1448
   macro avg       0.81      0.74      0.72      1448
weighted avg       0.81      0.74      0.72      1448



In [133]:
val["pred2"] = mirror_y_pred
diff = val.loc[val["pred1"] != val["pred2"]]
print(len(diff))
diff

310


Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1,pred2
9140,ene,borg,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
8385,krans,bedrifter,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
11640,strime,borg,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
11356,slippe,død,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
10849,gløder,skjelver,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
...,...,...,...,...,...,...
7444,ned,berører,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
13200,hånd,komme,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
9642,svak,stenges,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
13766,en,landet,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True


In [134]:
sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]

sample_pred2 = model.predict([sample_tokens2[:, 0], sample_tokens2[:, 1]])
predictions2 = [round(pred[0], 4) for pred in sample_pred2]

for i in range(len(samples)):
    print(f"Sentence 1: {samples[i][0]}")
    print(f"Sentence 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print(f"{'Rhyme' if predictions2[i] > 0.5 else 'Non-rhyme'}({predictions2[i]})")
    print("---------------\n")

Sentence 1: se det
Sentence 2: frede
Rhyme(0.9837999939918518)
Rhyme(0.9652000069618225)
---------------

Sentence 1: tid
Sentence 2: svineri
Rhyme(0.6118999719619751)
Rhyme(0.9620000123977661)
---------------

Sentence 1: sagt
Sentence 2: sagd
Rhyme(0.6176000237464905)
Non-rhyme(0.0)
---------------

Sentence 1: vilje
Sentence 2: vanilje
Rhyme(0.9997000098228455)
Rhyme(0.9997000098228455)
---------------

Sentence 1: vokser
Sentence 2: bukser
Rhyme(0.9997000098228455)
Rhyme(0.9993000030517578)
---------------

Sentence 1: ha det
Sentence 2: badet
Rhyme(0.7821999788284302)
Rhyme(0.9057000279426575)
---------------



## Symmetric dense and wiktionary dataset

In [146]:
model_name = "rhyme_model_dw_40k_symmetric"

# # Uncomment to train model 
# model = create_symmetric_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')
# early_stop = EarlyStopping(monitor='val_loss', patience=5)

# history = model.fit(
#     [X_train[:, 0], X_train[:, 1]],
#     y_train,
#     batch_size=256,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

In [147]:
# load the model
# model = load_model(f"models/{model_name}.hdf5")

y_pred = model.predict([X_val[:, 0], X_val[:, 1]])
y_pred = y_pred > 0.5
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.86      0.87       724
           1       0.86      0.89      0.88       724

    accuracy                           0.88      1448
   macro avg       0.88      0.88      0.88      1448
weighted avg       0.88      0.88      0.88      1448



In [139]:
val = df.loc[X_val_indexes].copy()
val["pred1"] = y_pred
val.loc[val["pred1"] != val["rhyme"]]

Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1
6537,havet,gave,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
10245,brudehuset,gjeste,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
4288,totenslæger,beger,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
14059,ende,vé,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
5490,brann,sand,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
...,...,...,...,...,...
9777,seder,døden,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
2223,inn,pålandsvind,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
13200,hånd,komme,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
13766,en,landet,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True


In [140]:
sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]

for i in range(len(samples)):
    print(f"Sentence 1: {samples[i][0]}")
    print(f"Sentence 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print("---------------\n")

Sentence 1: se det
Sentence 2: frede
Rhyme(0.5407999753952026)
---------------

Sentence 1: tid
Sentence 2: svineri
Non-rhyme(0.2770000100135803)
---------------

Sentence 1: sagt
Sentence 2: sagd
Non-rhyme(0.46869999170303345)
---------------

Sentence 1: vilje
Sentence 2: vanilje
Rhyme(0.9997000098228455)
---------------

Sentence 1: vokser
Sentence 2: bukser
Rhyme(0.9987999796867371)
---------------

Sentence 1: ha det
Sentence 2: badet
Rhyme(0.995199978351593)
---------------



# Use wiktionary pairs as test

In [61]:
rhymes = pd.read_csv("tsvs/wiktionary_rhyme_pairs.tsv", sep="\t")
rhymes["rhyme"] = [1]*len(rhymes)
non_rhymes = pd.read_csv("tsvs/wiktionary_negative_rhyme_pairs.tsv", sep="\t")
non_rhymes["rhyme"] = [0]*len(non_rhymes)
len(rhymes), len(non_rhymes)

(80363, 849993)

In [62]:
pos = rhymes.copy()
neg = non_rhymes.sample(n=len(pos), random_state=SEED)
w_df = pd.concat([pos, neg])
w_df = w_df.reset_index(drop=True)
w_df

Unnamed: 0,word_a,word_b,rhyme
0,amfi,deponi,1
1,sofistikert,uartikulert,1
2,alkymi,modneri,1
3,habil,anglofil,1
4,spekulert,motivert,1
...,...,...,...
160721,avgjort,oljebrønn,0
160722,tamil,visittkort,0
160723,girt,klartekst,0
160724,de,ulykkesfugl,0


In [63]:
w_pairs = set(zip(w_df.word_a, w_df.word_b))
overlapping_pairs = test_pairs.intersection(w_pairs)
len(overlapping_pairs), len(test_pairs)

(17, 4343)

In [64]:
drop_is = [w_df[(w_df.word_a == a) & (w_df.word_b == b)].index.to_list()[0] for a,b in overlapping_pairs]
len(drop_is)

17

In [65]:
w_df = w_df.drop(drop_is)

In [66]:
w_df['word_tokens'] = w_df.progress_apply(
    lambda row: tokenize_inputs(row['word_a'], row['word_b'], tokenizer), axis=1
)

  0%|          | 0/160709 [00:00<?, ?it/s]

In [67]:
w_df

Unnamed: 0,word_a,word_b,rhyme,word_tokens
0,amfi,deponi,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
1,sofistikert,uartikulert,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
2,alkymi,modneri,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
3,habil,anglofil,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
4,spekulert,motivert,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
...,...,...,...,...
160721,avgjort,oljebrønn,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
160722,tamil,visittkort,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
160723,girt,klartekst,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
160724,de,ulykkesfugl,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."


In [68]:
X_train = tf.convert_to_tensor(list(w_df["word_tokens"]))
y_train = tf.convert_to_tensor(list(w_df['rhyme']))

In [69]:
X_train.shape, y_train.shape

(TensorShape([160709, 2, 64]), TensorShape([160709]))

In [70]:
model_name = "rhyme_model_wiktionary"

# # Uncomment to train model 
# model = create_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')
# early_stop = EarlyStopping(monitor='loss', patience=5)

# history = model.fit(
#     [X_train[:, 0], X_train[:, 1]],
#     y_train,
#     batch_size=256,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

In [71]:
# load the model
model = load_model(f"models/{model_name}.hdf5")

y_pred = model.predict([X_val[:, 0], X_val[:, 1]])
y_pred = y_pred > 0.5
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.86      0.78       724
           1       0.83      0.66      0.73       724

    accuracy                           0.76      1448
   macro avg       0.77      0.76      0.76      1448
weighted avg       0.77      0.76      0.76      1448



In [72]:
val = df.loc[X_val_indexes].copy()
val["pred1"] = y_pred
diff = val.loc[val["pred1"] != val["rhyme"]]
print(len(diff))
diff

346


Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1
3594,beskytte,hytte,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
6537,havet,gave,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
5456,lav,grav,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
351,rar,kar,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
4288,totenslæger,beger,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
...,...,...,...,...,...
1070,Oskoreien,vegen,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
2223,inn,pålandsvind,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
1196,kårer,ferdselsårer,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
6586,noensinne,finne,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False


In [73]:
mirror_val = get_mirrored_df(val)
mirror_val

Unnamed: 0,word_a,word_b,rhyme,word_tokens
3594,hytte,beskytte,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
2736,halm,malm,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
14351,ganger,visen,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
5492,forbi,li,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
9140,borg,ene,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
...,...,...,...,...
13766,landet,en,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
9926,morgen,hjul,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
1196,ferdselsårer,kårer,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
6586,finne,noensinne,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."


In [74]:
mirror_X_val = tf.convert_to_tensor(list(mirror_val["word_tokens"]))

mirror_y_pred = model.predict([mirror_X_val[:, 0], mirror_X_val[:, 1]])
mirror_y_pred = mirror_y_pred > 0.5
print(classification_report(y_val, mirror_y_pred))

              precision    recall  f1-score   support

           0       0.72      0.87      0.79       724
           1       0.84      0.66      0.74       724

    accuracy                           0.76      1448
   macro avg       0.78      0.76      0.76      1448
weighted avg       0.78      0.76      0.76      1448



In [75]:
val["pred2"] = mirror_y_pred
diff = val.loc[val["pred1"] != val["pred2"]]
print(len(diff))
diff

376


Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1,pred2
3594,beskytte,hytte,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
11356,slippe,død,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
5456,lav,grav,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
351,rar,kar,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
4288,totenslæger,beger,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
...,...,...,...,...,...,...
5177,blod,sjelebod,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
8367,kanhende,kommer,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
6835,troner,æoner,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
6586,noensinne,finne,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True


In [76]:
sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]

sample_pred2 = model.predict([sample_tokens2[:, 0], sample_tokens2[:, 1]])
predictions2 = [round(pred[0], 4) for pred in sample_pred2]

for i in range(len(samples)):
    print(f"Sentence 1: {samples[i][0]}")
    print(f"Sentence 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print(f"{'Rhyme' if predictions2[i] > 0.5 else 'Non-rhyme'}({predictions2[i]})")
    print("---------------\n")

Sentence 1: se det
Sentence 2: frede
Rhyme(1.0)
Rhyme(0.982699990272522)
---------------

Sentence 1: tid
Sentence 2: svineri
Rhyme(0.9998000264167786)
Non-rhyme(0.05270000174641609)
---------------

Sentence 1: sagt
Sentence 2: sagd
Rhyme(1.0)
Non-rhyme(0.0)
---------------

Sentence 1: vilje
Sentence 2: vanilje
Rhyme(1.0)
Rhyme(0.9991000294685364)
---------------

Sentence 1: vokser
Sentence 2: bukser
Rhyme(1.0)
Rhyme(1.0)
---------------

Sentence 1: ha det
Sentence 2: badet
Non-rhyme(9.999999747378752e-05)
Non-rhyme(9.999999747378752e-05)
---------------



# Test set best model