source: https://github.com/minoguep/rhyme_detection and https://paulminogue.com/index.php/2021/02/14/using-a-siamese-neural-network-to-create-a-simple-rhyme-detector/

In [1]:
import string
import json

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from tensorflow.keras import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Subtract, Lambda
from tensorflow.keras.callbacks import ModelCheckpoint, TerminateOnNaN, CSVLogger, EarlyStopping

tqdm.pandas()

MAX_LEN = 64
SEED = 420
# sets random, np.random and tf.random seed
tf.keras.utils.set_random_seed(
    SEED
)

2022-05-04 18:36:06.204720: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-04 18:36:06.204749: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Model architecture

In [2]:
# Pauls code
def create_model():
  
    word_a_input_tokens = Input(
      shape=(MAX_LEN, 1), name='word_a_input_tokens'
      )
    word_b_input_tokens = Input(
      shape=(MAX_LEN, 1), name='word_b_input_tokens'
      )
    
    # This is the siamese portion of the model 
    common_lstm = LSTM(64, return_sequences=False, activation="relu", name="common_lstm_layer")

    word_a_lstm_output = common_lstm(word_a_input_tokens)
    word_b_lstm_output = common_lstm(word_b_input_tokens)

    #concatenate_lstm_outputs
    concat_layer = Subtract(name="concatenate_lstm_outputs")(
      [word_a_lstm_output, word_b_lstm_output]
      )
        
    # dense layers before final classification
    dense_layers = Dense(64, activation="relu", name="first_dense_layer")(concat_layer)
    dense_layers = Dropout(0.5)(dense_layers)

    dense_layers = Dense(32, activation="relu", name="second_dense_layer")(dense_layers)
    dense_layers = Dropout(0.5)(dense_layers)

    dense_layers = Dense(8, activation="relu", name="third_dense_layer")(dense_layers)
    dense_layers = Dropout(0.5)(dense_layers)

    classification_layer = Dense(1, activation="sigmoid", name="classification_layer")(dense_layers)
    
    model = Model(
      inputs=[word_a_input_tokens, word_b_input_tokens], 
      outputs = classification_layer
      )

    model.compile(
      loss="binary_crossentropy",
      metrics=["accuracy"],
      optimizer="Adam"
    )

    return model

In [3]:
# My symmetric version
def create_symmetric_model():  
    word_a_input_tokens = Input(
      shape=(MAX_LEN, 1), name='word_a_input_tokens'
      )
    word_b_input_tokens = Input(
      shape=(MAX_LEN, 1), name='word_b_input_tokens'
      )
    
    # This is the siamese portion of the model 
    common_lstm = LSTM(64, return_sequences=False, activation="relu", name="common_lstm_layer")

    word_a_lstm_output = common_lstm(word_a_input_tokens)
    word_b_lstm_output = common_lstm(word_b_input_tokens)

    #concatenate_lstm_outputs
    concat_layer = Subtract(name="concatenate_lstm_outputs")(
      [word_a_lstm_output, word_b_lstm_output]
      )
    
    concat_layer = Lambda(lambda x: tf.math.abs(x))(concat_layer)
    
    # dense layers before final classification
    dense_layers = Dense(64, activation="relu", name="first_dense_layer")(concat_layer)
    dense_layers = Dropout(0.5)(dense_layers)

    dense_layers = Dense(32, activation="relu", name="second_dense_layer")(dense_layers)
    dense_layers = Dropout(0.5)(dense_layers)

    dense_layers = Dense(8, activation="relu", name="third_dense_layer")(dense_layers)
    dense_layers = Dropout(0.5)(dense_layers)

    classification_layer = Dense(1, activation="sigmoid", name="classification_layer")(dense_layers)
    
    model = Model(
      inputs=[word_a_input_tokens, word_b_input_tokens], 
      outputs = classification_layer
      )

    model.compile(
      loss="binary_crossentropy",
      metrics=["accuracy"],
      optimizer="Adam"
    )

    return model

# Data
We want equally many positive and negative samples of rhyme pairs

In [4]:
pos = pd.read_csv("tsvs/positive_pairs.tsv", sep="\t")
full_neg = pd.read_csv("tsvs/negative_pairs.tsv", sep="\t")
neg = full_neg.sample(n=len(pos), random_state=SEED)
df = pd.concat([pos, neg])
df = df.reset_index(drop=True)
df

Unnamed: 0,word_a,word_b,rhyme
0,stall,skrall,1
1,ving,ting,1
2,orden,horden,1
3,taler,svaler,1
4,juleskikk,blikk,1
...,...,...,...
14471,hatt,slott,0
14472,mur,stund,0
14473,hud,mave,0
14474,dør,sans,0


## Words to char tokens
Copy paste from Pauls notebook

In [5]:
def tokenize_inputs(phrase_a, phrase_b, tokenizer):
    tokenized_phrases = tokenizer.texts_to_sequences([phrase_a, phrase_b])

    # now loop through inputs and pad or reduce size if required
    tokenized_phrases_for_output = []
    for phrase in tokenized_phrases:
        if len(phrase) < MAX_LEN:
            length_to_pad = MAX_LEN - len(phrase)
            phrase_for_output = ([0] * length_to_pad) + phrase
        elif len(phrase) > MAX_LEN:
            phrase_for_output = phrase[-MAX_LEN:]
        else:
            phrase_for_output = phrase
        tokenized_phrases_for_output.append(phrase_for_output)

    return tf.constant(tokenized_phrases_for_output, dtype=tf.float64)

In [6]:
# all_data = pd.read_csv("tsvs/tita_rhymes_poems.tsv", sep="\t")

# all_text = ""
# for e in all_data.stanza:
#     all_text += e
# tokenizer = Tokenizer(char_level=True, lower=True)
# tokenizer.fit_on_texts(all_text)

# tokenizer_config = tokenizer.to_json()

# with open('siamese_lstm_tokenizer_config.json', 'w') as f:
#     f.write(tokenizer_config)

In [7]:
with open("siamese_lstm_tokenizer_config.json") as f:
    tokenizer_config = f.read()

tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenizer_config)

In [8]:
df['word_tokens'] = df.progress_apply(
    lambda row: tokenize_inputs(row['word_a'], row['word_b'], tokenizer), axis=1
)

  0%|          | 0/14476 [00:00<?, ?it/s]

2022-05-04 18:36:07.651889: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-05-04 18:36:07.651969: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-05-04 18:36:07.651990: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (tita-laptop): /proc/driver/nvidia/version does not exist
2022-05-04 18:36:07.652267: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Data set split

In [9]:
X_train_indexes, X_test_indexes, y_train, y_test = train_test_split(
    list(df.index), list(df['rhyme']), stratify=df['rhyme'], 
    test_size=0.4, random_state=SEED
    )

X_test_indexes, X_val_indexes, y_test, y_val = train_test_split(
    X_test_indexes, y_test, stratify=y_test, 
    test_size=0.25, random_state=SEED
    )

X_train = tf.convert_to_tensor(list(df.loc[X_train_indexes]["word_tokens"]))
X_val = tf.convert_to_tensor(list(df.loc[X_val_indexes]["word_tokens"]))
X_test = tf.convert_to_tensor(list(df.loc[X_test_indexes]["word_tokens"]))

y_train = tf.convert_to_tensor(y_train)
y_val = tf.convert_to_tensor(y_val)
y_test = tf.convert_to_tensor(y_test)

In [10]:
print(f"""
    Data set size: Full set: {len(df)}
    Train: {len(X_train)}
    Validation: {len(X_val)}
    Test: {len(X_test)}
""")


    Data set size: Full set: 14476
    Train: 8685
    Validation: 1448
    Test: 4343



# Baseline

In [11]:
test_text = df.loc[X_test_indexes]
test_text

Unnamed: 0,word_a,word_b,rhyme,word_tokens
1789,hugge,vugge,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
6140,snar,far,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
9312,høre,kvider,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
10727,brenner,gang,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
13969,hernede,saft,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
...,...,...,...,...
5965,sti,verdi,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
10776,nåde,grunn,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
4184,fans,krans,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
12052,sanne,rid,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."


In [12]:
def is_vowel(char):
    return char in ['e', 'a', 'i', 'o', 'u', 'å', 'ø', 'y', 'æ', 'é', 'ö', 'ä', 'á', 'à']

def last_letters(word):
    for i in range(len(word)-1, -1, -1):
        if is_vowel(word[i]):
            break
    return word[i:]

def words_rhyme(word_a, word_b):
    return last_letters(word_a) == last_letters(word_b)


baseline_pred = [words_rhyme(word_a, word_b) for word_a, word_b in zip(test_text.word_a, test_text.word_b)]

In [13]:
print("Test set accuracy")
print(classification_report(y_test, baseline_pred))

Test set accuracy
              precision    recall  f1-score   support

           0       0.88      0.93      0.90      2171
           1       0.93      0.87      0.90      2172

    accuracy                           0.90      4343
   macro avg       0.90      0.90      0.90      4343
weighted avg       0.90      0.90      0.90      4343



In [14]:
samples = [
    ["se det", "frede"], 
    ["tid", "svineri"],
    ["sagt", "sagd"],
    ["vilje", "vanilje"], 
    ["vokser", "bukser"],
    ["ha det", "badet"]
]

for a, b in samples:
    pred = words_rhyme(a,b)
    print("Sentence 1:", a)
    print("Sentence 2:", b)
    if pred:
        print("Rhyme\n")
    else:
        print("Not rhyme\n")
    

Sentence 1: se det
Sentence 2: frede
Not rhyme

Sentence 1: tid
Sentence 2: svineri
Not rhyme

Sentence 1: sagt
Sentence 2: sagd
Not rhyme

Sentence 1: vilje
Sentence 2: vanilje
Rhyme

Sentence 1: vokser
Sentence 2: bukser
Rhyme

Sentence 1: ha det
Sentence 2: badet
Rhyme



# First basic model based on Minogue (2021)
Exactly the same code as Minogue

In [15]:
model_name = "first_simple_minogue"

# model = create_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')

# history = model.fit(
#     [X_train[:, 0], X_train[:, 1]],
#     y_train,
#     batch_size=128,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

In [16]:
# load the model
model = load_model(f"models/{model_name}.hdf5")
y_pred = model.predict([X_val[:, 0], X_val[:, 1]])
y_pred = y_pred > 0.5
print("Val set accuracy")
print(classification_report(y_val, y_pred))

Val set accuracy
              precision    recall  f1-score   support

           0       0.93      0.96      0.94       724
           1       0.96      0.93      0.94       724

    accuracy                           0.94      1448
   macro avg       0.94      0.94      0.94      1448
weighted avg       0.94      0.94      0.94      1448



# Slightly improved (?) Version
Add early stop based on val loss, so we dont train unneccessary long

In [17]:
model_name = "rhyme_model_1"

# model = create_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')
# early_stop = EarlyStopping(monitor='val_loss', patience=5)

# history = model.fit(
#     [X_train[:, 0], X_train[:, 1]],
#     y_train,
#     batch_size=128,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

In [18]:
model_name = "rhyme_model_1"

# load the model
model = load_model(f"models/{model_name}.hdf5")
y_pred = model.predict([X_val[:, 0], X_val[:, 1]])
y_pred = y_pred > 0.5
print("Val set accuracy")
print(classification_report(y_val, y_pred))

# y_pred = model.predict([X_test[:, 0], X_test[:, 1]])
# y_pred = y_pred > 0.5
# print(classification_report(y_test, y_pred))

Val set accuracy
              precision    recall  f1-score   support

           0       0.92      0.96      0.94       724
           1       0.96      0.92      0.94       724

    accuracy                           0.94      1448
   macro avg       0.94      0.94      0.94      1448
weighted avg       0.94      0.94      0.94      1448



In [19]:
def get_mirrored_df(df):
    df = df.copy()
    mirror = pd.DataFrame({"word_a": df["word_b"], 
                           "word_b": df["word_a"], 
                           "rhyme": df["rhyme"], 
                           "word_tokens":[(t[1], t[0]) for t in df["word_tokens"]]})
    return mirror

In [20]:
val = df.loc[X_val_indexes].copy()
val["pred1"] = y_pred
val.loc[val["pred1"] != val["rhyme"]]

Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1
14351,visen,ganger,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
6537,havet,gave,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
6898,bred,se,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
13792,land,skritt,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
8529,sale,tale,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
...,...,...,...,...,...
4855,kjært,verdt,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
13429,dikt,raft,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
1010,uskyldsren,en,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
13283,besvær,slipper,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True


In [21]:
mirror_val = get_mirrored_df(val)
mirror_val

Unnamed: 0,word_a,word_b,rhyme,word_tokens
3594,hytte,beskytte,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
2736,halm,malm,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
14351,ganger,visen,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
5492,forbi,li,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
9140,borg,ene,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
...,...,...,...,...
13766,landet,en,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
9926,morgen,hjul,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
1196,ferdselsårer,kårer,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
6586,finne,noensinne,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."


In [22]:
mirror_X_val = tf.convert_to_tensor(list(mirror_val["word_tokens"]))

mirror_y_pred = model.predict([mirror_X_val[:, 0], mirror_X_val[:, 1]])
mirror_y_pred = mirror_y_pred > 0.5
print(classification_report(y_val, mirror_y_pred))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94       724
           1       0.96      0.91      0.93       724

    accuracy                           0.93      1448
   macro avg       0.94      0.93      0.93      1448
weighted avg       0.94      0.93      0.93      1448



In [23]:
val["pred2"] = mirror_y_pred
diff = val.loc[val["pred1"] != val["pred2"]]
print(len(diff))
diff

38


Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1,pred2
14351,visen,ganger,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
8164,fra,deg,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
8478,Graven,lukket,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
4102,hat,satt,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
6160,godt,huldreslått,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
8939,engang,li,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
8200,nær,ren,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
9025,se,sten,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
1646,munn,morgenstund,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
3178,gård,sår,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True


In [24]:
samples = [
    ["se det", "frede"], 
    ["tid", "svineri"],
    ["sagt", "sagd"],
    ["vilje", "vanilje"], 
    ["vokser", "bukser"],
    ["ha det", "badet"]
]

samples2 = [[b,a] for a,b in samples]

sample_tokens = [tokenize_inputs(lyrics[0], lyrics[1], tokenizer) for lyrics in samples]
sample_tokens = tf.convert_to_tensor(sample_tokens)

sample_tokens2 = [tokenize_inputs(lyrics[0], lyrics[1], tokenizer) for lyrics in samples2]
sample_tokens2 = tf.convert_to_tensor(sample_tokens2)

sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]

sample_pred2 = model.predict([sample_tokens2[:, 0], sample_tokens2[:, 1]])
predictions2 = [round(pred[0], 4) for pred in sample_pred2]

for i in range(len(samples)):
    print(f"Sentence 1: {samples[i][0]}")
    print(f"Sentence 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print(f"{'Rhyme' if predictions2[i] > 0.5 else 'Non-rhyme'}({predictions2[i]})")
    print("---------------\n")

Sentence 1: se det
Sentence 2: frede
Non-rhyme(0.2669999897480011)
Non-rhyme(0.27219998836517334)
---------------

Sentence 1: tid
Sentence 2: svineri
Non-rhyme(0.40950000286102295)
Rhyme(0.5325000286102295)
---------------

Sentence 1: sagt
Sentence 2: sagd
Rhyme(0.6988999843597412)
Rhyme(0.5128999948501587)
---------------

Sentence 1: vilje
Sentence 2: vanilje
Rhyme(0.9818000197410583)
Rhyme(0.9818000197410583)
---------------

Sentence 1: vokser
Sentence 2: bukser
Rhyme(0.977400004863739)
Rhyme(0.9761999845504761)
---------------

Sentence 1: ha det
Sentence 2: badet
Non-rhyme(0.2071000039577484)
Non-rhyme(0.24410000443458557)
---------------



# Try again with different 2:3 positive to negative ratio
Expand train set.  
The set is already 50/50 positive and negative.  
We want to make it 40/60 --> increase negative examples by half of what we already have

In [25]:
new_neg_train = len(X_train) // 4

new_neg_train

2171

In [26]:
# use same seed, and extract all the pairs already used + the ones we need 
negative2 = full_neg.sample(n=len(pos) + new_neg_train, random_state=SEED)

In [27]:
unused_neg = negative2[len(pos):]
unused_neg

Unnamed: 0,word_a,word_b,rhyme
9448,hår,blundet,0
15651,brenner,skatt,0
16533,siv,stille,0
16219,verdensdommen,kronet,0
13700,jord,sans,0
...,...,...,...
3741,stå,gang,0
20029,vekt,dørken,0
11242,sner,huset,0
9095,klang,rørte,0


In [28]:
# use same tokenizer as above
unused_neg['word_tokens'] = unused_neg.progress_apply(
    lambda row: tokenize_inputs(row['word_a'], row['word_b'], tokenizer), axis=1
)
unused_neg

  0%|          | 0/2171 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unused_neg['word_tokens'] = unused_neg.progress_apply(


Unnamed: 0,word_a,word_b,rhyme,word_tokens
9448,hår,blundet,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
15651,brenner,skatt,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
16533,siv,stille,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
16219,verdensdommen,kronet,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
13700,jord,sans,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
...,...,...,...,...
3741,stå,gang,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
20029,vekt,dørken,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
11242,sner,huset,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
9095,klang,rørte,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."


In [29]:
X_new = list(unused_neg["word_tokens"])
y_new = list(unused_neg["rhyme"])

In [30]:
X_train_new = tf.convert_to_tensor(list(df.loc[X_train_indexes]["word_tokens"]) +  X_new)
y_train_new = tf.convert_to_tensor(list(y_train) + y_new)


X_train_new.shape, y_train_new.shape

(TensorShape([10856, 2, 64]), TensorShape([10856]))

In [31]:
model_name = "rhyme_model_23_ratio"

# # Uncomment to train
# model = create_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')
# early_stop = EarlyStopping(monitor='val_loss', patience=5)

# history = model.fit(
#     [X_train_new[:, 0], X_train_new[:, 1]],
#     y_train_new,
#     batch_size=128,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

In [32]:
model_name = "rhyme_model_23_ratio"

# load the model
model = load_model(f"models/{model_name}.hdf5")

y_pred = model.predict([X_val[:, 0], X_val[:, 1]])
y_pred = y_pred > 0.5
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.96      0.95       724
           1       0.96      0.93      0.94       724

    accuracy                           0.95      1448
   macro avg       0.95      0.95      0.95      1448
weighted avg       0.95      0.95      0.95      1448



In [33]:
val = df.loc[X_val_indexes].copy()
val["pred1"] = y_pred
val.loc[val["pred1"] != val["rhyme"]]

Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1
6537,havet,gave,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
14059,ende,vé,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
13792,land,skritt,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
8529,sale,tale,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
10339,øyet,er,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
...,...,...,...,...,...
2308,tent,forbannet,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
4855,kjært,verdt,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
13429,dikt,raft,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
1010,uskyldsren,en,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False


In [34]:
mirror_val = get_mirrored_df(val)
mirror_X_val = tf.convert_to_tensor(list(mirror_val["word_tokens"]))

mirror_y_pred = model.predict([mirror_X_val[:, 0], mirror_X_val[:, 1]])
mirror_y_pred = mirror_y_pred > 0.5
print(classification_report(y_val, mirror_y_pred))

              precision    recall  f1-score   support

           0       0.94      0.95      0.94       724
           1       0.95      0.94      0.94       724

    accuracy                           0.94      1448
   macro avg       0.94      0.94      0.94      1448
weighted avg       0.94      0.94      0.94      1448



In [35]:
val["pred2"] = mirror_y_pred
diff = val.loc[val["pred1"] != val["pred2"]]
print(len(diff))
diff

36


Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1,pred2
4102,hat,satt,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
2068,'s,santkehans,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
6160,godt,huldreslått,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
8200,nær,ren,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
3178,gård,sår,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
12975,næring,tid,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
1493,tinnsoldater,nasjonalteater,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
7833,skjule,svake,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
8774,ruger,fanger,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
4672,alt,valgt,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True


In [36]:
samples2 = [[b,a] for a,b in samples]

sample_tokens = [tokenize_inputs(lyrics[0], lyrics[1], tokenizer) for lyrics in samples]
sample_tokens = tf.convert_to_tensor(sample_tokens)

sample_tokens2 = [tokenize_inputs(lyrics[0], lyrics[1], tokenizer) for lyrics in samples2]
sample_tokens2 = tf.convert_to_tensor(sample_tokens2)

sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]

sample_pred2 = model.predict([sample_tokens2[:, 0], sample_tokens2[:, 1]])
predictions2 = [round(pred[0], 4) for pred in sample_pred2]

for i in range(len(samples)):
    print(f"Sentence 1: {samples[i][0]}")
    print(f"Sentence 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print(f"{'Rhyme' if predictions2[i] > 0.5 else 'Non-rhyme'}({predictions2[i]})")
    print("---------------\n")

Sentence 1: se det
Sentence 2: frede
Non-rhyme(0.4851999878883362)
Non-rhyme(0.31470000743865967)
---------------

Sentence 1: tid
Sentence 2: svineri
Rhyme(0.5791000127792358)
Rhyme(0.628000020980835)
---------------

Sentence 1: sagt
Sentence 2: sagd
Rhyme(0.6969000101089478)
Non-rhyme(0.49790000915527344)
---------------

Sentence 1: vilje
Sentence 2: vanilje
Rhyme(0.9334999918937683)
Rhyme(0.9334999918937683)
---------------

Sentence 1: vokser
Sentence 2: bukser
Rhyme(0.9326000213623047)
Rhyme(0.9334999918937683)
---------------

Sentence 1: ha det
Sentence 2: badet
Non-rhyme(0.477400004863739)
Non-rhyme(0.4627000093460083)
---------------



# Use symmetric model

In [37]:
model_name = "rhyme_model_1_symmetric"

# # Uncomment to train model 

# model = create_symmetric_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5", monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')
# early_stop = EarlyStopping(monitor='val_loss', patience=5)

# history = model.fit(
#     [X_train[:, 0], X_train[:, 1]],
#     y_train,
#     batch_size=128,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

In [38]:
# load the model

model = load_model(f"models/{model_name}.hdf5")

y_pred = model.predict([X_val[:, 0], X_val[:, 1]])
y_pred = y_pred > 0.5
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94       724
           1       0.94      0.95      0.94       724

    accuracy                           0.94      1448
   macro avg       0.94      0.94      0.94      1448
weighted avg       0.94      0.94      0.94      1448



In [39]:
val = df.loc[X_val_indexes].copy()
val["pred1"] = y_pred
mirror_val = get_mirrored_df(val)
mirror_X_val = tf.convert_to_tensor(list(mirror_val["word_tokens"]))

mirror_y_pred = model.predict([mirror_X_val[:, 0], mirror_X_val[:, 1]])
mirror_y_pred = mirror_y_pred > 0.5
val["pred2"] = mirror_y_pred


val.loc[val["pred1"] != val["pred2"]]

Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1,pred2


In [40]:
sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]

sample_pred2 = model.predict([sample_tokens2[:, 0], sample_tokens2[:, 1]])
predictions2 = [round(pred[0], 4) for pred in sample_pred2]

for i in range(len(samples)):
    print(f"Sentence 1: {samples[i][0]}")
    print(f"Sentence 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print(f"{'Rhyme' if predictions2[i] > 0.5 else 'Non-rhyme'}({predictions2[i]})")
    print("---------------\n")

Sentence 1: se det
Sentence 2: frede
Rhyme(0.7731999754905701)
Rhyme(0.7731999754905701)
---------------

Sentence 1: tid
Sentence 2: svineri
Rhyme(0.5209000110626221)
Rhyme(0.5209000110626221)
---------------

Sentence 1: sagt
Sentence 2: sagd
Non-rhyme(0.4839000105857849)
Non-rhyme(0.4839000105857849)
---------------

Sentence 1: vilje
Sentence 2: vanilje
Rhyme(0.9998000264167786)
Rhyme(0.9998000264167786)
---------------

Sentence 1: vokser
Sentence 2: bukser
Rhyme(0.9718999862670898)
Rhyme(0.9718999862670898)
---------------

Sentence 1: ha det
Sentence 2: badet
Non-rhyme(0.29989999532699585)
Non-rhyme(0.29989999532699585)
---------------



## 2:3 ratio symmetric

In [41]:
model_name = "rhyme_model_23_ratio_symmetric"

# #Uncomment to train model 
# model = create_symmetric_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')
# early_stop = EarlyStopping(monitor='val_loss', patience=5)

# history = model.fit(
#     [X_train_new[:, 0], X_train_new[:, 1]],
#     y_train_new,
#     batch_size=128,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

In [42]:
# load the model
model = load_model(f"models/{model_name}.hdf5")

y_pred = model.predict([X_val[:, 0], X_val[:, 1]])
y_pred = y_pred > 0.5
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95       724
           1       0.96      0.94      0.95       724

    accuracy                           0.95      1448
   macro avg       0.95      0.95      0.95      1448
weighted avg       0.95      0.95      0.95      1448



In [43]:
val = df.loc[X_val_indexes].copy()
val["pred1"] = y_pred
mirror_val = get_mirrored_df(val)
mirror_X_val = tf.convert_to_tensor(list(mirror_val["word_tokens"]))

mirror_y_pred = model.predict([mirror_X_val[:, 0], mirror_X_val[:, 1]])
mirror_y_pred = mirror_y_pred > 0.5
val["pred2"] = mirror_y_pred


val.loc[val["pred1"] != val["pred2"]]

Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1,pred2


In [44]:
val.loc[val["pred1"] != val["rhyme"]]

Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1,pred2
6537,havet,gave,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,False
13792,land,skritt,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,True
8529,sale,tale,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,True
7996,sang,sildring,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,True
6624,nu,sous,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,False
...,...,...,...,...,...,...
7818,hender,verden,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,True
13429,dikt,raft,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,True
2202,ild,vil,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,False
13283,besvær,slipper,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,True


In [45]:
sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]

for i in range(len(samples)):
    print(f"Sentence 1: {samples[i][0]}")
    print(f"Sentence 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print("---------------\n")

Sentence 1: se det
Sentence 2: frede
Rhyme(0.8623999953269958)
---------------

Sentence 1: tid
Sentence 2: svineri
Non-rhyme(0.38440001010894775)
---------------

Sentence 1: sagt
Sentence 2: sagd
Non-rhyme(0.13850000500679016)
---------------

Sentence 1: vilje
Sentence 2: vanilje
Rhyme(0.9878000020980835)
---------------

Sentence 1: vokser
Sentence 2: bukser
Rhyme(0.967199981212616)
---------------

Sentence 1: ha det
Sentence 2: badet
Non-rhyme(0.0934000015258789)
---------------



# Use dense rhyme pair dataset

In [46]:
rhyme_pairs = pd.read_csv("tsvs/manual_repair_plus_good_buckets_pairs.tsv", sep="\t")
neg_pairs = pd.read_csv("tsvs/negative_pairs.tsv", sep="\t")

pos_pairs = rhyme_pairs.sample(n=len(neg_pairs), random_state=SEED)
pos_pairs = pos_pairs.reset_index(drop=True)
dense_df = pd.concat((pos_pairs, neg_pairs))
dense_df = dense_df.reset_index(drop=True)
dense_df

Unnamed: 0,word_a,word_b,rhyme
0,brand,Kinamann,1
1,flammespill,hvil,1
2,tømmer,strålestrømmer,1
3,grunn,elvebunn,1
4,svøpe,støpe,1
...,...,...,...
44889,frekk,kupler,0
44890,ness,snor,0
44891,seil,gynget,0
44892,LOFOTEN,Røst,0


## Remove the pairs that are in test and dev sets

In [47]:
test = df.loc[X_test_indexes].copy()
val = df.loc[X_val_indexes].copy()
test_val = pd.concat([test, val])
test_val

Unnamed: 0,word_a,word_b,rhyme,word_tokens
1789,hugge,vugge,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
6140,snar,far,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
9312,høre,kvider,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
10727,brenner,gang,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
13969,hernede,saft,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
...,...,...,...,...
13766,en,landet,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
9926,hjul,morgen,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
1196,kårer,ferdselsårer,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
6586,noensinne,finne,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."


In [48]:
test_val_pairs = set(zip(test_val.word_a, test_val.word_b))
dense_pairs = set(zip(dense_df.word_a, dense_df.word_b))
overlapping_pairs = test_val_pairs.intersection(dense_pairs)
len(overlapping_pairs), len(test_val_pairs)

(3712, 5790)

In [49]:
drop_is = [dense_df[(dense_df.word_a == a)&(dense_df.word_b == b)].index.to_list()[0] for a,b in overlapping_pairs]

In [50]:
dense_df = dense_df.drop(drop_is)

In [51]:
sum(dense_df["rhyme"]==1), sum(dense_df["rhyme"]==0)

(21632, 19550)

## Train etc

In [52]:
dense_df['word_tokens'] = dense_df.progress_apply(
    lambda row: tokenize_inputs(row['word_a'], row['word_b'], tokenizer), axis=1
)

  0%|          | 0/41182 [00:00<?, ?it/s]

In [53]:
X_train = tf.convert_to_tensor(list(dense_df["word_tokens"]))
y_train = tf.convert_to_tensor(list(dense_df['rhyme']))

In [54]:
X_train.shape, y_train.shape

(TensorShape([41182, 2, 64]), TensorShape([41182]))

In [55]:
model_name = "rhyme_model_40k" 

# # Uncomment to train model 
# model = create_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')
# early_stop = EarlyStopping(monitor='val_loss', patience=5)

# history = model.fit(
#     [X_train[:, 0], X_train[:, 1]],
#     y_train,
#     batch_size=128,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

In [56]:
# load the model
model = load_model(f"models/{model_name}.hdf5")

y_pred = model.predict([X_val[:, 0], X_val[:, 1]])
y_pred = y_pred > 0.5
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.91      0.93       724
           1       0.91      0.96      0.94       724

    accuracy                           0.93      1448
   macro avg       0.94      0.93      0.93      1448
weighted avg       0.94      0.93      0.93      1448



In [57]:
val = df.loc[X_val_indexes].copy()
val["pred1"] = y_pred
diff = val.loc[val["pred1"] != val["rhyme"]]
print(len(diff))
diff

95


Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1
14351,visen,ganger,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
14059,ende,vé,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
13792,land,skritt,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
8320,kompani,blod,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
8529,sale,tale,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
...,...,...,...,...,...
10318,kåret,bære,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
13283,besvær,slipper,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
9777,seder,døden,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
6395,mer,fler,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False


In [58]:
mirror_val = get_mirrored_df(val)
mirror_val

Unnamed: 0,word_a,word_b,rhyme,word_tokens
3594,hytte,beskytte,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
2736,halm,malm,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
14351,ganger,visen,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
5492,forbi,li,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
9140,borg,ene,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
...,...,...,...,...
13766,landet,en,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
9926,morgen,hjul,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
1196,ferdselsårer,kårer,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
6586,finne,noensinne,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."


In [59]:
mirror_X_val = tf.convert_to_tensor(list(mirror_val["word_tokens"]))

mirror_y_pred = model.predict([mirror_X_val[:, 0], mirror_X_val[:, 1]])
mirror_y_pred = mirror_y_pred > 0.5
print(classification_report(y_val, mirror_y_pred))

              precision    recall  f1-score   support

           0       0.96      0.89      0.92       724
           1       0.89      0.96      0.92       724

    accuracy                           0.92      1448
   macro avg       0.92      0.92      0.92      1448
weighted avg       0.92      0.92      0.92      1448



In [60]:
val["pred2"] = mirror_y_pred
diff = val.loc[val["pred1"] != val["pred2"]]
print(len(diff))
diff

34


Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1,pred2
9767,går,frihets-stevne,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
11256,plagg,frem,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
12231,skjær,begravet,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
8939,engang,li,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
8974,solskinnsglans,duft,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
13094,skader,tiden,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
3178,gård,sår,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
9685,plagg,fem,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
1265,ihu,gru,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
996,høvelspån,på'n,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True


In [61]:
sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]

sample_pred2 = model.predict([sample_tokens2[:, 0], sample_tokens2[:, 1]])
predictions2 = [round(pred[0], 4) for pred in sample_pred2]

for i in range(len(samples)):
    print(f"Sentence 1: {samples[i][0]}")
    print(f"Sentence 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print(f"{'Rhyme' if predictions2[i] > 0.5 else 'Non-rhyme'}({predictions2[i]})")
    print("---------------\n")

Sentence 1: se det
Sentence 2: frede
Rhyme(0.6707000136375427)
Rhyme(0.9337999820709229)
---------------

Sentence 1: tid
Sentence 2: svineri
Rhyme(0.9764000177383423)
Rhyme(0.9297000169754028)
---------------

Sentence 1: sagt
Sentence 2: sagd
Rhyme(0.5684999823570251)
Non-rhyme(0.46369999647140503)
---------------

Sentence 1: vilje
Sentence 2: vanilje
Rhyme(0.9761000275611877)
Rhyme(0.9746999740600586)
---------------

Sentence 1: vokser
Sentence 2: bukser
Rhyme(0.9725000262260437)
Rhyme(0.9725000262260437)
---------------

Sentence 1: ha det
Sentence 2: badet
Non-rhyme(0.1080000028014183)
Non-rhyme(0.13850000500679016)
---------------



## Symmetric 40k

In [62]:
model_name = "rhyme_model_40k_symmetric"

# #Uncomment to train model 
# model = create_symmetric_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')
# early_stop = EarlyStopping(monitor='val_loss', patience=5)

# history = model.fit(
#     [X_train[:, 0], X_train[:, 1]],
#     y_train,
#     batch_size=128,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

In [63]:
# load the model
model = load_model(f"models/{model_name}.hdf5")

y_pred = model.predict([X_val[:, 0], X_val[:, 1]])
y_pred = y_pred > 0.5
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.95      0.95       724
           1       0.95      0.96      0.95       724

    accuracy                           0.95      1448
   macro avg       0.95      0.95      0.95      1448
weighted avg       0.95      0.95      0.95      1448



In [64]:
val = df.loc[X_val_indexes].copy()
val["pred1"] = y_pred
diff = val.loc[val["pred1"] != val["rhyme"]]
print(len(diff))


67


In [65]:
sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]

for i in range(len(samples)):
    print(f"Sentence 1: {samples[i][0]}")
    print(f"Sentence 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print("---------------\n")

Sentence 1: se det
Sentence 2: frede
Rhyme(0.9178000092506409)
---------------

Sentence 1: tid
Sentence 2: svineri
Rhyme(0.7175999879837036)
---------------

Sentence 1: sagt
Sentence 2: sagd
Non-rhyme(0.26489999890327454)
---------------

Sentence 1: vilje
Sentence 2: vanilje
Rhyme(0.9969000220298767)
---------------

Sentence 1: vokser
Sentence 2: bukser
Rhyme(0.9797000288963318)
---------------

Sentence 1: ha det
Sentence 2: badet
Non-rhyme(0.0012000000569969416)
---------------



# Use dense and wiktionary dataset (small)

In [66]:
rhyme_pairs = pd.read_csv("tsvs/merged_buckets_pairs.tsv", sep="\t")
non_rhymes = pd.read_csv("tsvs/wiktionary_negative_rhyme_pairs.tsv", sep="\t")
non_rhymes["rhyme"] = [0]*len(non_rhymes)


pos = rhyme_pairs.sample(n=20000, random_state=SEED)
neg = non_rhymes.sample(n=20000, random_state=SEED)
dw_df = pd.concat([pos, neg])
dw_df = dw_df.reset_index(drop=True)
dw_df

Unnamed: 0,word_a,word_b,rhyme
0,smil,kodisill,1
1,hybridbil,renkespill,1
2,vigil,till,1
3,hittil,racerbil,1
4,husvill,ambulansebil,1
...,...,...,...
39995,imbesil,tre,0
39996,festspill,helsesport,0
39997,puslespill,idé,0
39998,imbesil,gni,0


In [70]:
dw_pairs = set(zip(dw_df.word_a, dw_df.word_b))
overlapping_pairs = test_val_pairs.intersection(dw_pairs)
len(overlapping_pairs), len(test_val_pairs)

(186, 5790)

In [71]:
drop_is = [dw_df[(dw_df.word_a == a)&(dw_df.word_b == b)].index.to_list()[0] for a,b in overlapping_pairs]
len(drop_is)

186

In [72]:
dw_df = dw_df.drop(drop_is)

In [73]:
dw_df['word_tokens'] = dw_df.progress_apply(
    lambda row: tokenize_inputs(row['word_a'], row['word_b'], tokenizer), axis=1
)

  0%|          | 0/39814 [00:00<?, ?it/s]

In [74]:
dw_df

Unnamed: 0,word_a,word_b,rhyme,word_tokens
0,smil,kodisill,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
1,hybridbil,renkespill,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
2,vigil,till,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
3,hittil,racerbil,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
4,husvill,ambulansebil,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
...,...,...,...,...
39995,imbesil,tre,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
39996,festspill,helsesport,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
39997,puslespill,idé,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
39998,imbesil,gni,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."


In [75]:
X_train = tf.convert_to_tensor(list(dw_df["word_tokens"]))
y_train = tf.convert_to_tensor(list(dw_df["rhyme"]))

In [76]:
X_train.shape, y_train.shape

(TensorShape([39814, 2, 64]), TensorShape([39814]))

In [77]:
X_train.shape, y_train.shape

(TensorShape([39814, 2, 64]), TensorShape([39814]))

In [78]:
model_name = "rhyme_model_dw_40k"

# # Uncomment to train model 
# model = create_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')
# early_stop = EarlyStopping(monitor='val_loss', patience=5)

# history = model.fit(
#     [X_train[:, 0], X_train[:, 1]],
#     y_train,
#     batch_size=256,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

In [79]:
# load the model
model = load_model(f"models/{model_name}.hdf5")

y_pred = model.predict([X_val[:, 0], X_val[:, 1]])
y_pred = y_pred > 0.5
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.61      0.73       724
           1       0.71      0.94      0.81       724

    accuracy                           0.77      1448
   macro avg       0.81      0.77      0.77      1448
weighted avg       0.81      0.77      0.77      1448



In [80]:
val = df.loc[X_val_indexes].copy()
val["pred1"] = y_pred
diff = val.loc[val["pred1"] != val["rhyme"]]
print(len(diff))
diff

328


Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1
7948,sort,syn,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
10245,brudehuset,gjeste,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
4288,totenslæger,beger,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
11390,skynder,når,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
14059,ende,vé,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
...,...,...,...,...,...
10286,samme,tier,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
9777,seder,døden,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
6395,mer,fler,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
13200,hånd,komme,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True


In [81]:
mirror_X_val = tf.convert_to_tensor(list(mirror_val["word_tokens"]))

mirror_y_pred = model.predict([mirror_X_val[:, 0], mirror_X_val[:, 1]])
mirror_y_pred = mirror_y_pred > 0.5
print(classification_report(y_val, mirror_y_pred))

              precision    recall  f1-score   support

           0       0.90      0.63      0.74       724
           1       0.72      0.93      0.81       724

    accuracy                           0.78      1448
   macro avg       0.81      0.78      0.78      1448
weighted avg       0.81      0.78      0.78      1448



In [82]:
val["pred2"] = mirror_y_pred
diff = val.loc[val["pred1"] != val["pred2"]]
print(len(diff))
diff

306


Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1,pred2
9140,ene,borg,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
11640,strime,borg,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
11356,slippe,død,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
10245,brudehuset,gjeste,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
4288,totenslæger,beger,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
...,...,...,...,...,...,...
1010,uskyldsren,en,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True,False
3125,inn,sin,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
11461,lege,tanker,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True
8367,kanhende,kommer,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False,True


In [83]:
sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]

sample_pred2 = model.predict([sample_tokens2[:, 0], sample_tokens2[:, 1]])
predictions2 = [round(pred[0], 4) for pred in sample_pred2]

for i in range(len(samples)):
    print(f"Sentence 1: {samples[i][0]}")
    print(f"Sentence 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print(f"{'Rhyme' if predictions2[i] > 0.5 else 'Non-rhyme'}({predictions2[i]})")
    print("---------------\n")

Sentence 1: se det
Sentence 2: frede
Rhyme(0.5766000151634216)
Rhyme(0.548799991607666)
---------------

Sentence 1: tid
Sentence 2: svineri
Non-rhyme(0.23160000145435333)
Rhyme(0.555400013923645)
---------------

Sentence 1: sagt
Sentence 2: sagd
Rhyme(0.5419999957084656)
Non-rhyme(0.47699999809265137)
---------------

Sentence 1: vilje
Sentence 2: vanilje
Rhyme(0.9908000230789185)
Rhyme(0.9916999936103821)
---------------

Sentence 1: vokser
Sentence 2: bukser
Rhyme(0.9873999953269958)
Rhyme(0.9911999702453613)
---------------

Sentence 1: ha det
Sentence 2: badet
Rhyme(0.9714999794960022)
Rhyme(0.9872999787330627)
---------------



## Symmetric dense and wiktionary dataset

In [84]:
model_name = "rhyme_model_dw_40k_symmetric"

# # Uncomment to train model 
# model = create_symmetric_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')
# early_stop = EarlyStopping(monitor='val_loss', patience=5)

# history = model.fit(
#     [X_train[:, 0], X_train[:, 1]],
#     y_train,
#     batch_size=256,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

In [85]:
# load the model
model = load_model(f"models/{model_name}.hdf5")

y_pred = model.predict([X_val[:, 0], X_val[:, 1]])
y_pred = y_pred > 0.5
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.68      0.78       724
           1       0.75      0.95      0.84       724

    accuracy                           0.81      1448
   macro avg       0.84      0.81      0.81      1448
weighted avg       0.84      0.81      0.81      1448



In [86]:
val = df.loc[X_val_indexes].copy()
val["pred1"] = y_pred
val.loc[val["pred1"] != val["rhyme"]]

Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1
9140,ene,borg,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
11640,strime,borg,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
10245,brudehuset,gjeste,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
14059,ende,vé,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
12561,til,nede,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
...,...,...,...,...,...
9777,seder,døden,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
2223,inn,pålandsvind,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
13200,hånd,komme,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
8367,kanhende,kommer,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True


In [87]:
sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]

for i in range(len(samples)):
    print(f"Sentence 1: {samples[i][0]}")
    print(f"Sentence 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print("---------------\n")

Sentence 1: se det
Sentence 2: frede
Rhyme(0.7038000226020813)
---------------

Sentence 1: tid
Sentence 2: svineri
Rhyme(0.671999990940094)
---------------

Sentence 1: sagt
Sentence 2: sagd
Non-rhyme(0.163100004196167)
---------------

Sentence 1: vilje
Sentence 2: vanilje
Rhyme(0.9921000003814697)
---------------

Sentence 1: vokser
Sentence 2: bukser
Rhyme(0.9869999885559082)
---------------

Sentence 1: ha det
Sentence 2: badet
Rhyme(0.9643999934196472)
---------------



# Use dense and wiktionary dataset (full)

In [90]:
# rhyme_pairs = pd.read_csv("tsvs/merged_buckets_pairs.tsv", sep="\t")
# non_rhymes = pd.read_csv("tsvs/wiktionary_negative_rhyme_pairs.tsv", sep="\t")
# non_rhymes["rhyme"] = [0]*len(non_rhymes)

pos = rhyme_pairs.copy()
neg = non_rhymes.sample(n=len(pos), random_state=SEED)
dw_df = pd.concat([pos, neg])
dw_df = dw_df.reset_index(drop=True)
dw_df

Unnamed: 0,word_a,word_b,rhyme
0,Å,vrå,1
1,glatt,blomsterskatt,1
2,forfulgt,skjult,1
3,urfugl,neslesommerfugl,1
4,lampeskinn,skinn,1
...,...,...,...
311671,sluseport,tårnurt,0
311672,bildeskjønn,meieri,0
311673,varebil,orrfugl,0
311674,pupill,praktærfugl,0


In [91]:
dw_pairs = set(zip(dw_df.word_a, dw_df.word_b))
overlapping_pairs = test_val_pairs.intersection(dw_pairs)
len(overlapping_pairs), len(test_val_pairs)

(1421, 5790)

In [92]:
drop_is = [dw_df[(dw_df.word_a == a)&(dw_df.word_b == b)].index.to_list()[0] for a,b in overlapping_pairs]
len(drop_is)

1421

In [93]:
dw_df = dw_df.drop(drop_is)

In [94]:
dw_df['word_tokens'] = dw_df.progress_apply(
    lambda row: tokenize_inputs(row['word_a'], row['word_b'], tokenizer), axis=1
)

  0%|          | 0/310255 [00:00<?, ?it/s]

In [95]:
X_train = tf.convert_to_tensor(list(dw_df["word_tokens"]))
y_train = tf.convert_to_tensor(list(dw_df["rhyme"]))

2022-05-04 18:42:48.278685: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 317701120 exceeds 10% of free system memory.


In [96]:
X_train.shape, y_train.shape

(TensorShape([310255, 2, 64]), TensorShape([310255]))

In [97]:
model_name = "rhyme_model_dw_300k"

# # Uncomment to train model 
# model = create_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')
# early_stop = EarlyStopping(monitor='val_loss', patience=5)

# history = model.fit(
#     [X_train[:, 0], X_train[:, 1]],
#     y_train,
#     batch_size=256,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

In [98]:
# load the model
# model = load_model(f"models/{model_name}.hdf5")

y_pred = model.predict([X_val[:, 0], X_val[:, 1]])
y_pred = y_pred > 0.5
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.68      0.78       724
           1       0.75      0.95      0.84       724

    accuracy                           0.81      1448
   macro avg       0.84      0.81      0.81      1448
weighted avg       0.84      0.81      0.81      1448



In [99]:
val = df.loc[X_val_indexes].copy()
val["pred1"] = y_pred
diff = val.loc[val["pred1"] != val["rhyme"]]
print(len(diff))
diff

269


Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1
9140,ene,borg,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
11640,strime,borg,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
10245,brudehuset,gjeste,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
14059,ende,vé,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
12561,til,nede,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
...,...,...,...,...,...
9777,seder,døden,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
2223,inn,pålandsvind,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",False
13200,hånd,komme,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
8367,kanhende,kommer,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True


In [100]:
mirror_val = get_mirrored_df(val)
mirror_val

Unnamed: 0,word_a,word_b,rhyme,word_tokens
3594,hytte,beskytte,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
2736,halm,malm,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
14351,ganger,visen,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
5492,forbi,li,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
9140,borg,ene,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
...,...,...,...,...
13766,landet,en,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
9926,morgen,hjul,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
1196,ferdselsårer,kårer,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
6586,finne,noensinne,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."


In [101]:
mirror_X_val = tf.convert_to_tensor(list(mirror_val["word_tokens"]))

mirror_y_pred = model.predict([mirror_X_val[:, 0], mirror_X_val[:, 1]])
mirror_y_pred = mirror_y_pred > 0.5
print(classification_report(y_val, mirror_y_pred))

              precision    recall  f1-score   support

           0       0.93      0.68      0.78       724
           1       0.75      0.95      0.84       724

    accuracy                           0.81      1448
   macro avg       0.84      0.81      0.81      1448
weighted avg       0.84      0.81      0.81      1448



In [102]:
val["pred2"] = mirror_y_pred
diff = val.loc[val["pred1"] != val["pred2"]]
print(len(diff))
diff

0


Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1,pred2


In [103]:
sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]

sample_pred2 = model.predict([sample_tokens2[:, 0], sample_tokens2[:, 1]])
predictions2 = [round(pred[0], 4) for pred in sample_pred2]

for i in range(len(samples)):
    print(f"Sentence 1: {samples[i][0]}")
    print(f"Sentence 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print(f"{'Rhyme' if predictions2[i] > 0.5 else 'Non-rhyme'}({predictions2[i]})")
    print("---------------\n")

Sentence 1: se det
Sentence 2: frede
Rhyme(0.7038000226020813)
Rhyme(0.7038000226020813)
---------------

Sentence 1: tid
Sentence 2: svineri
Rhyme(0.671999990940094)
Rhyme(0.671999990940094)
---------------

Sentence 1: sagt
Sentence 2: sagd
Non-rhyme(0.163100004196167)
Non-rhyme(0.163100004196167)
---------------

Sentence 1: vilje
Sentence 2: vanilje
Rhyme(0.9921000003814697)
Rhyme(0.9921000003814697)
---------------

Sentence 1: vokser
Sentence 2: bukser
Rhyme(0.9869999885559082)
Rhyme(0.9869999885559082)
---------------

Sentence 1: ha det
Sentence 2: badet
Rhyme(0.9643999934196472)
Rhyme(0.9643999934196472)
---------------



## Symmetric dense and wiktionary dataset

In [104]:
model_name = "rhyme_model_dw_300k_symmetric"

# # Uncomment to train model 
# model = create_symmetric_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')
# early_stop = EarlyStopping(monitor='val_loss', patience=5)

# history = model.fit(
#     [X_train[:, 0], X_train[:, 1]],
#     y_train,
#     batch_size=256,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

In [105]:
# load the model
model = load_model(f"models/{model_name}.hdf5")

y_pred = model.predict([X_val[:, 0], X_val[:, 1]])
y_pred = y_pred > 0.5
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.40      0.57       724
           1       0.62      0.99      0.76       724

    accuracy                           0.70      1448
   macro avg       0.80      0.70      0.67      1448
weighted avg       0.80      0.70      0.67      1448



In [106]:
val = df.loc[X_val_indexes].copy()
val["pred1"] = y_pred
val.loc[val["pred1"] != val["rhyme"]]

Unnamed: 0,word_a,word_b,rhyme,word_tokens,pred1
14351,visen,ganger,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
8385,krans,bedrifter,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
11640,strime,borg,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
9657,nord,skodden,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
10245,brudehuset,gjeste,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
...,...,...,...,...,...
9777,seder,døden,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
13200,hånd,komme,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
8367,kanhende,kommer,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True
13766,en,landet,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",True


In [107]:
sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]

for i in range(len(samples)):
    print(f"Sentence 1: {samples[i][0]}")
    print(f"Sentence 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print("---------------\n")

Sentence 1: se det
Sentence 2: frede
Rhyme(0.9063000082969666)
---------------

Sentence 1: tid
Sentence 2: svineri
Rhyme(0.8499000072479248)
---------------

Sentence 1: sagt
Sentence 2: sagd
Rhyme(0.8267999887466431)
---------------

Sentence 1: vilje
Sentence 2: vanilje
Rhyme(0.9998000264167786)
---------------

Sentence 1: vokser
Sentence 2: bukser
Rhyme(0.9994000196456909)
---------------

Sentence 1: ha det
Sentence 2: badet
Rhyme(0.9861000180244446)
---------------



# Use wiktionary pairs as second "test" set

In [108]:
rhymes = pd.read_csv("tsvs/wiktionary_rhyme_pairs.tsv", sep="\t")
rhymes["rhyme"] = [1]*len(rhymes)
non_rhymes = pd.read_csv("tsvs/wiktionary_negative_rhyme_pairs.tsv", sep="\t")
non_rhymes["rhyme"] = [0]*len(non_rhymes)
len(rhymes), len(non_rhymes)

(80363, 849993)

In [109]:
pos = rhymes.copy()
neg = non_rhymes.sample(n=2*len(pos), random_state=SEED)
neg = neg[len(pos):]
w_df = pd.concat([pos, neg])
w_df = w_df.reset_index(drop=True)
w_df

Unnamed: 0,word_a,word_b,rhyme
0,amfi,deponi,1
1,sofistikert,uartikulert,1
2,alkymi,modneri,1
3,habil,anglofil,1
4,spekulert,motivert,1
...,...,...,...
160721,tårnurt,akribi,0
160722,kli,medlemskort,0
160723,kvartmil,signalhorn,0
160724,puslespill,pli,0


In [110]:
w_pairs = set(zip(w_df.word_a, w_df.word_b))

overlapping_pairs1 = dw_pairs.intersection(w_pairs)
overlapping_pairs2 = dense_pairs.intersection(w_pairs)

overlapping_pairs = overlapping_pairs1.union(overlapping_pairs2)

len(overlapping_pairs1), len(overlapping_pairs2), len(overlapping_pairs)

(119608, 155, 119704)

In [111]:
drop_is = [w_df[(w_df.word_a == a) & (w_df.word_b == b)].index.to_list()[0] for a,b in overlapping_pairs]
len(drop_is)

119704

In [112]:
w_df = w_df.drop(drop_is)

In [113]:
w_df['word_tokens'] = w_df.progress_apply(
    lambda row: tokenize_inputs(row['word_a'], row['word_b'], tokenizer), axis=1
)

  0%|          | 0/41022 [00:00<?, ?it/s]

In [114]:
sum(w_df["rhyme"]==1), 

(36128,)

In [115]:
160726-119704

41022

In [116]:
n = w_df.loc[w_df["rhyme"]==0]
n

Unnamed: 0,word_a,word_b,rhyme,word_tokens
93508,klort,sport,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
100460,ned,de,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
105722,lort,sort,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
108814,tre,de,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
146386,fred,de,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
...,...,...,...,...
160721,tårnurt,akribi,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
160722,kli,medlemskort,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
160723,kvartmil,signalhorn,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
160724,puslespill,pli,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."


In [117]:
p = w_df.loc[w_df["rhyme"]==1]
p = p.sample(n=len(n), random_state=SEED)
p

Unnamed: 0,word_a,word_b,rhyme,word_tokens
15994,hankjønn,svigersønn,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
66417,glid,verdi,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
14878,onani,apogami,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
75977,isbil,olabil,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
58509,sort,førerkort,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
...,...,...,...,...
53479,brøytebil,xenofil,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
10375,sjøfugl,trekkfugl,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
55641,innstill,vekselspill,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
64662,sølvpil,tamil,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."


In [118]:
w_test = pd.concat([p, n])
w_test

Unnamed: 0,word_a,word_b,rhyme,word_tokens
15994,hankjønn,svigersønn,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
66417,glid,verdi,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
14878,onani,apogami,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
75977,isbil,olabil,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
58509,sort,førerkort,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
...,...,...,...,...
160721,tårnurt,akribi,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
160722,kli,medlemskort,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
160723,kvartmil,signalhorn,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
160724,puslespill,pli,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."


In [119]:
X_val_w = tf.convert_to_tensor(list(w_test["word_tokens"]))
y_val_w = tf.convert_to_tensor(list(w_test["rhyme"]))

In [120]:
for model_name in ["rhyme_model_1", "rhyme_model_1_symmetric", 
                   "rhyme_model_23_ratio", "rhyme_model_23_ratio_symmetric",
                   "rhyme_model_40k", "rhyme_model_40k_symmetric",
                   "rhyme_model_dw_40k", "rhyme_model_dw_40k_symmetric",
                    "rhyme_model_dw_300k", "rhyme_model_dw_300k_symmetric"]:
    print(f"Model: {model_name}")
    model = load_model(f"models/{model_name}.hdf5")
    y_pred = model.predict([X_val_w[:, 0], X_val_w[:, 1]])
    y_pred = y_pred > 0.5
    print("Val set accuracy")
    print(classification_report(y_val_w, y_pred))
    print("---")

Model: rhyme_model_1
Val set accuracy
              precision    recall  f1-score   support

           0       0.78      0.88      0.83      4894
           1       0.86      0.76      0.80      4894

    accuracy                           0.82      9788
   macro avg       0.82      0.82      0.82      9788
weighted avg       0.82      0.82      0.82      9788

---
Model: rhyme_model_1_symmetric
Val set accuracy
              precision    recall  f1-score   support

           0       0.85      0.87      0.86      4894
           1       0.87      0.85      0.86      4894

    accuracy                           0.86      9788
   macro avg       0.86      0.86      0.86      9788
weighted avg       0.86      0.86      0.86      9788

---
Model: rhyme_model_23_ratio
Val set accuracy
              precision    recall  f1-score   support

           0       0.81      0.86      0.84      4894
           1       0.85      0.80      0.83      4894

    accuracy                           0.83

In [121]:
baseline_pred = [words_rhyme(word_a, word_b) for word_a, word_b in list(zip(w_test.word_a, w_test.word_b))]
print(classification_report(y_val_w, baseline_pred))

              precision    recall  f1-score   support

           0       0.88      0.99      0.93      4894
           1       0.99      0.86      0.92      4894

    accuracy                           0.93      9788
   macro avg       0.93      0.93      0.92      9788
weighted avg       0.93      0.93      0.92      9788



# Test set best models

In [122]:
m1 = "rhyme_model_40k_symmetric"
m2 = "rhyme_model_23_ratio_symmetric"

model = load_model(f"models/{m1}.hdf5")
m1_pred = model.predict([X_test[:, 0], X_test[:, 1]])
m1_pred = m1_pred > 0.5

model = load_model(f"models/{m2}.hdf5")
m2_pred = model.predict([X_test[:, 0], X_test[:, 1]])
m2_pred = m2_pred > 0.5

print(f"model_name: {m1}")
print(classification_report(y_test, m1_pred))

print(f"model_name: {m2}")
print(classification_report(y_test, m2_pred))

model_name: rhyme_model_40k_symmetric
              precision    recall  f1-score   support

           0       0.96      0.95      0.96      2171
           1       0.95      0.96      0.96      2172

    accuracy                           0.96      4343
   macro avg       0.96      0.96      0.96      4343
weighted avg       0.96      0.96      0.96      4343

model_name: rhyme_model_23_ratio_symmetric
              precision    recall  f1-score   support

           0       0.94      0.97      0.96      2171
           1       0.97      0.94      0.96      2172

    accuracy                           0.96      4343
   macro avg       0.96      0.96      0.96      4343
weighted avg       0.96      0.96      0.96      4343

