source: https://github.com/minoguep/rhyme_detection and https://paulminogue.com/index.php/2021/02/14/using-a-siamese-neural-network-to-create-a-simple-rhyme-detector/

In [1]:
import string
import json

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from tensorflow.keras import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Subtract
from tensorflow.keras.callbacks import ModelCheckpoint, TerminateOnNaN, CSVLogger, EarlyStopping

tqdm.pandas()

MAX_LEN = 64
SEED = 420
# sets random, np.random and tf.random seed
tf.keras.utils.set_random_seed(
    SEED
)

2022-04-12 00:45:53.010317: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-12 00:45:53.010353: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


### Step 1: Create dataset
We want equally many positive and negative samples of rhyme pairs

In [2]:
rhyme_pairs = pd.read_csv("manual_repair_plus_good_buckets_positive_pairs.tsv", sep="\t",)
wiki_rhyme_pairs = pd.read_csv("wiktionary_rhyme_pairs.tsv", sep="\t")

rhyme_pairs

Unnamed: 0,word_a,word_b,rhyme
0,kvinner,finner,1
1,vil,vill,1
2,tordensky,solskinnsfly,1
3,strand,Cannes,1
4,Kviste,niste,1
...,...,...,...
38162,lot,rot,1
38163,sot,sjelerot,1
38164,svang,underklang,1
38165,tilsidst,kvinnelist,1


In [3]:
wiki_rhyme_pairs["rhyme"] = [1]*len(wiki_rhyme_pairs)
wiki_rhyme_pairs

Unnamed: 0,word_a,word_b,rhyme
0,rutebil,ørepil,1
1,mobil,servicebil,1
2,jugendstil,varebil,1
3,onani,lureri,1
4,bifili,selleri,1
...,...,...,...
80358,kvartmil,konebil,1
80359,møllehjul,balansehjul,1
80360,sti,infami,1
80361,kjemi,levkemi,1


In [4]:
rhymes = pd.concat((rhyme_pairs, wiki_rhyme_pairs))
rhymes

Unnamed: 0,word_a,word_b,rhyme
0,kvinner,finner,1
1,vil,vill,1
2,tordensky,solskinnsfly,1
3,strand,Cannes,1
4,Kviste,niste,1
...,...,...,...
80358,kvartmil,konebil,1
80359,møllehjul,balansehjul,1
80360,sti,infami,1
80361,kjemi,levkemi,1


In [5]:
non_rhymes = pd.read_csv("wiktionary_negative_rhyme_pairs.tsv", sep="\t")
non_rhymes["rhyme"] = [0]*len(non_rhymes)
non_rhymes

Unnamed: 0,word_a,word_b,rhyme
0,idet,sennepsgul,0
1,kupert,paradisfugl,0
2,kombinert,lettekorn,0
3,ste,vestrøn,0
4,uttært,mort,0
...,...,...,...
877424,cøliaki,avløpsventil,0
877425,tært,hermefugl,0
877426,bordbønn,ulykkesfugl,0
877427,stjert,føn,0


In [6]:
pos = rhymes.copy()
neg = non_rhymes.sample(n=len(pos), random_state=SEED)
df = pd.concat([pos, neg])
df = df.reset_index()
df

Unnamed: 0,index,word_a,word_b,rhyme
0,0,kvinner,finner,1
1,1,vil,vill,1
2,2,tordensky,solskinnsfly,1
3,3,strand,Cannes,1
4,4,Kviste,niste,1
...,...,...,...,...
237055,288389,binderi,vassfugl,0
237056,689877,empati,ukekort,0
237057,190137,kukelurt,slagbjørn,0
237058,729199,jaktkort,omspill,0


In [7]:
all_data = pd.read_csv("tita_rhymes_poems.tsv", sep="\t")

all_text = ""
for e in all_data.stanza:
    all_text += e

### Step 2: Create model
Copy paste from Pauls notebook

In [8]:
def tokenize_inputs(phrase_a, phrase_b, tokenizer):
    tokenized_phrases = tokenizer.texts_to_sequences([phrase_a, phrase_b])

    # now loop through inputs and pad or reduce size if required
    tokenized_phrases_for_output = []
    for phrase in tokenized_phrases:
        if len(phrase) < MAX_LEN:
            length_to_pad = MAX_LEN - len(phrase)
            phrase_for_output = ([0] * length_to_pad) + phrase
        elif len(phrase) > MAX_LEN:
            phrase_for_output = phrase[-MAX_LEN:]
        else:
            phrase_for_output = phrase
        tokenized_phrases_for_output.append(phrase_for_output)

    return tf.constant(tokenized_phrases_for_output, dtype=tf.float64)

In [9]:
tokenizer = Tokenizer(char_level=True, lower=True)
tokenizer.fit_on_texts(all_text)

df['word_tokens'] = df.progress_apply(
    lambda row: tokenize_inputs(row['word_a'], row['word_b'], tokenizer), axis=1
)

tokenizer_config = tokenizer.to_json()

with open('tokenizer_config.json', 'w') as f:
    f.write(tokenizer_config)

  0%|          | 0/237060 [00:00<?, ?it/s]

2022-04-12 00:45:56.271307: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-04-12 00:45:56.271365: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-04-12 00:45:56.271386: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (tita-laptop): /proc/driver/nvidia/version does not exist
2022-04-12 00:45:56.271770: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
def create_model():
  
    word_a_input_tokens = Input(
      shape=(MAX_LEN, 1), name='word_a_input_tokens'
      )
    word_b_input_tokens = Input(
      shape=(MAX_LEN, 1), name='word_b_input_tokens'
      )
    
    # This is the siamese portion of the model 
    common_lstm = LSTM(64, return_sequences=False, activation="relu", name="common_lstm_layer")

    word_a_lstm_output = common_lstm(word_a_input_tokens)
    word_b_lstm_output = common_lstm(word_b_input_tokens)

    #concatenate_lstm_outputs
    concat_layer = Subtract(name="concatenate_lstm_outputs")(
      [word_a_lstm_output, word_b_lstm_output]
      )
    
    # dense layers before final classification
    dense_layers = Dense(64, activation="relu", name="first_dense_layer")(concat_layer)
    dense_layers = Dropout(0.5)(dense_layers)

    dense_layers = Dense(32, activation="relu", name="second_dense_layer")(dense_layers)
    dense_layers = Dropout(0.5)(dense_layers)

    dense_layers = Dense(8, activation="relu", name="third_dense_layer")(dense_layers)
    dense_layers = Dropout(0.5)(dense_layers)

    classification_layer = Dense(1, activation="sigmoid", name="classification_layer")(dense_layers)
    
    model = Model(
      inputs=[word_a_input_tokens, word_b_input_tokens], 
      outputs = classification_layer
      )

    model.compile(
      loss="binary_crossentropy",
      metrics=["accuracy"],
      optimizer="Adam"
    )

    return model

In [11]:
X_train_indexes, X_test_indexes, y_train, y_test = train_test_split(
    list(df.index), list(df['rhyme']), stratify=df['rhyme'], 
    test_size=0.4, random_state=SEED
    )

X_test_indexes, X_val_indexes, y_test, y_val = train_test_split(
    X_test_indexes, y_test, stratify=y_test, 
    test_size=0.25, random_state=SEED
    )

X_train = tf.convert_to_tensor(list(df.loc[X_train_indexes]["word_tokens"]))
X_val = tf.convert_to_tensor(list(df.loc[X_val_indexes]["word_tokens"]))
X_test = tf.convert_to_tensor(list(df.loc[X_test_indexes]["word_tokens"]))

y_train = tf.convert_to_tensor(y_train)
y_val = tf.convert_to_tensor(y_val)
y_test = tf.convert_to_tensor(y_test)

2022-04-12 00:46:11.584793: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 145649664 exceeds 10% of free system memory.


In [12]:
print(f"""
    Data set size: Full set: {len(df)}
    Train: {len(X_train), len(y_train)}
    Validation: {len(X_val), len(y_val)}
    Test: {len(X_test), len(y_test)}
""")



    Data set size: Full set: 237060
    Train: (142236, 142236)
    Validation: (23706, 23706)
    Test: (71118, 71118)



In [13]:
model_name = "rhyme_model_200k_1"

In [14]:
# #Uncomment to train model 

# model = create_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')
# early_stop = EarlyStopping(monitor='loss', patience=5)

# history = model.fit(
#     [X_train[:, 0], X_train[:, 1]],
#     y_train,
#     batch_size=128,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100


In [15]:
# load the model
model = load_model(f"models/{model_name}.hdf5")

y_pred = model.predict([X_test[:, 0], X_test[:, 1]])
y_pred = y_pred > 0.5
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99     35559
           1       0.99      1.00      0.99     35559

    accuracy                           0.99     71118
   macro avg       0.99      0.99      0.99     71118
weighted avg       0.99      0.99      0.99     71118



In [16]:
test = df.loc[X_test_indexes].copy()
test["pred"] = y_pred
test["pred"] = test["pred"].apply(lambda x: int(x))
test

Unnamed: 0,index,word_a,word_b,rhyme,word_tokens,pred
61710,23543,bli,draperi,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",1
35508,35508,leve,veve,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",1
214462,491835,abortert,alpehorn,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",0
107319,69152,kvil,lapidarstil,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",1
193151,56406,nyptorn,jul,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",0
...,...,...,...,...,...,...
223739,542661,omsorgslønn,leprabasill,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",0
43065,4898,uartikulert,velorganisert,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",1
236128,728659,gjært,selvgjort,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",0
27776,27776,frø,kø,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",1


In [17]:
test[test.rhyme != test.pred]

Unnamed: 0,index,word_a,word_b,rhyme,word_tokens,pred
30537,30537,hvil,vill,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",0
39504,1337,fromasj,pistasie,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",0
156268,436525,tort,adgangskort,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",1
143083,275590,lurt,yrt,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",1
181869,31940,baufil,hul,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",1
...,...,...,...,...,...,...
147123,547553,nært,usyrt,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",1
47981,9814,hertil,fiolinspill,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",0
161393,143148,hasj,fred,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",1
6172,6172,stort,land,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",0


In [18]:
samples = [
    ["Kan du ikke se det", "Deg skal jeg lede"], 
    ["Kaker av alle slag", "Her henger Norges flagg"], 
    ["Jeg har ikke tid", "Til dette svineri"],
    ["Hva har du sagt", "Kaken er bakt"], 
    ["Barna er lagt", "Kaken er laget"],
    ["Gjorde du det med vilje", "Kaken smaker vanilje"], 
    ["Dette vokser", "Satans underbukser"],
    ["tid", "svineri"]
]

samples2 = [[b,a] for a,b in samples]
print(samples2)

sample_tokens = [tokenize_inputs(lyrics[0], lyrics[1], tokenizer) for lyrics in samples]
sample_tokens = tf.convert_to_tensor(sample_tokens)

sample_tokens2 = [tokenize_inputs(lyrics[0], lyrics[1], tokenizer) for lyrics in samples2]
sample_tokens2 = tf.convert_to_tensor(sample_tokens2)

sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]

sample_pred2 = model.predict([sample_tokens2[:, 0], sample_tokens2[:, 1]])
predictions2 = [round(pred[0], 4) for pred in sample_pred2]

for i in range(len(samples)):
    print(f"Sentence 1: {samples[i][0]}")
    print(f"Sentence 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print(f"{'Rhyme' if predictions2[i] > 0.5 else 'Non-rhyme'}({predictions2[i]})")
    print("---------------\n")

[['Deg skal jeg lede', 'Kan du ikke se det'], ['Her henger Norges flagg', 'Kaker av alle slag'], ['Til dette svineri', 'Jeg har ikke tid'], ['Kaken er bakt', 'Hva har du sagt'], ['Kaken er laget', 'Barna er lagt'], ['Kaken smaker vanilje', 'Gjorde du det med vilje'], ['Satans underbukser', 'Dette vokser'], ['svineri', 'tid']]
Sentence 1: Kan du ikke se det
Sentence 2: Deg skal jeg lede
Rhyme(0.9991999864578247)
Rhyme(1.0)
---------------

Sentence 1: Kaker av alle slag
Sentence 2: Her henger Norges flagg
Rhyme(0.9934999942779541)
Rhyme(0.5748000144958496)
---------------

Sentence 1: Jeg har ikke tid
Sentence 2: Til dette svineri
Rhyme(1.0)
Non-rhyme(0.0)
---------------

Sentence 1: Hva har du sagt
Sentence 2: Kaken er bakt
Rhyme(0.9937000274658203)
Rhyme(0.9998999834060669)
---------------

Sentence 1: Barna er lagt
Sentence 2: Kaken er laget
Rhyme(1.0)
Rhyme(0.7289999723434448)
---------------

Sentence 1: Gjorde du det med vilje
Sentence 2: Kaken smaker vanilje
Rhyme(0.999800026416

## Try again with mirrored examples included

In [19]:
def get_mirrored_df(df):
    mirror = pd.DataFrame({"word_a": df["word_b"], 
                           "word_b": df["word_a"], 
                           "rhyme": df["rhyme"], 
                           "word_tokens":[(t[1], t[0]) for t in df["word_tokens"]]})
    return pd.concat((df, mirror))

In [20]:
train_df = df.loc[X_train_indexes]
dev_df = df.loc[X_val_indexes]
len(train_df), len(dev_df)

(142236, 23706)

In [21]:
double_train = get_mirrored_df(train_df)
double_train

Unnamed: 0,index,word_a,word_b,rhyme,word_tokens
115133,76966.0,knort,sjøtransport,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
36499,36499.0,svært,verdt,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
183128,522520.0,tsjuvasj,kalori,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
77004,38837.0,monarki,biologi,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
53316,15149.0,alliert,overadministrert,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
...,...,...,...,...,...
202208,,frihjul,byport,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
59542,,prosjektil,leiebil,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
108621,,omdiskutert,blasert,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
188165,,kjemi,gjurt,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."


In [22]:
double_dev = get_mirrored_df(dev_df)
double_dev

Unnamed: 0,index,word_a,word_b,rhyme,word_tokens
84331,46164.0,døgnvill,trekkspill,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
46777,8610.0,trekkspill,mellomspill,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
231293,60728.0,sjømil,buskvekst,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
171305,798600.0,breport,bakeri,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
69491,31324.0,livsstil,racerbil,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
...,...,...,...,...,...
46112,,rundbogestil,rokokkostil,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
126681,,ill,kompani,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
213297,,dyrt,gnurt,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."
47153,,veteranbil,snøbil,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf...."


In [23]:
X_train = tf.convert_to_tensor(list(double_train["word_tokens"]))
X_val = tf.convert_to_tensor(list(double_dev["word_tokens"]))

y_train = tf.convert_to_tensor(list(double_train["rhyme"]))
y_val = tf.convert_to_tensor(list(double_dev["rhyme"]))

2022-04-12 01:36:09.100534: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 291299328 exceeds 10% of free system memory.


In [24]:
model_name = "rhyme_model_200k_mirror"

In [25]:
# # Uncomment to train model 
# model = create_model()

# model_checkpoint = ModelCheckpoint(f"models/{model_name}.hdf5",monitor="val_loss")
# terminate_on_nan = TerminateOnNaN()
# csv_logger = CSVLogger(f'logs/training_{model_name}.log')
# early_stop = EarlyStopping(monitor='loss', patience=5)

# history = model.fit(
#     [X_train[:, 0], X_train[:, 1]],
#     y_train,
#     batch_size=128,
#     epochs=100,
#     callbacks=[model_checkpoint, terminate_on_nan, csv_logger, early_stop],
#     validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
# )

Epoch 1/100


2022-04-12 01:36:09.972535: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 145649664 exceeds 10% of free system memory.
2022-04-12 01:36:10.004550: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 145649664 exceeds 10% of free system memory.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100


In [26]:
# load the model
model = load_model(f"models/{model_name}.hdf5")

y_pred = model.predict([X_test[:, 0], X_test[:, 1]])
y_pred = y_pred > 0.5
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     35559
           1       0.99      1.00      1.00     35559

    accuracy                           1.00     71118
   macro avg       1.00      1.00      1.00     71118
weighted avg       1.00      1.00      1.00     71118



In [27]:
test = df.loc[X_test_indexes].copy()
test["pred"] = y_pred
test["pred"] = test["pred"].apply(lambda x: int(x))
test
test[test.rhyme != test.pred]

Unnamed: 0,index,word_a,word_b,rhyme,word_tokens,pred
236961,413717,hasj,eg,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",1
25820,25820,fløytespill,smil,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",0
179744,233918,boretårn,havørn,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",1
39504,1337,fromasj,pistasie,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",0
156268,436525,tort,adgangskort,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",1
...,...,...,...,...,...,...
214853,230479,pipelort,ID-kort,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",1
39042,875,tort,sluseport,1,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",0
211694,764723,de,entalpi,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",1
161393,143148,hasj,fred,0,"((tf.Tensor(0.0, shape=(), dtype=float64), tf....",1


In [28]:
samples = [
    ["Kan du ikke se det", "Deg skal jeg lede"], 
    ["Kaker av alle slag", "Her henger Norges flagg"], 
    ["Jeg har ikke tid", "Til dette svineri"],
    ["Hva har du sagt", "Kaken er bakt"], 
    ["Barna er lagt", "Kaken er laget"],
    ["Gjorde du det med vilje", "Kaken smaker vanilje"], 
    ["Dette vokser", "Satans underbukser"],
    ["tid", "svineri"]
]

samples2 = [[b,a] for a,b in samples]

sample_tokens = [tokenize_inputs(lyrics[0], lyrics[1], tokenizer) for lyrics in samples]
sample_tokens = tf.convert_to_tensor(sample_tokens)

sample_tokens2 = [tokenize_inputs(lyrics[0], lyrics[1], tokenizer) for lyrics in samples2]
sample_tokens2 = tf.convert_to_tensor(sample_tokens2)

sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]

sample_pred2 = model.predict([sample_tokens2[:, 0], sample_tokens2[:, 1]])
predictions2 = [round(pred[0], 4) for pred in sample_pred2]

for i in range(len(samples)):
    print(f"Sentence 1: {samples[i][0]}")
    print(f"Sentence 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print(f"{'Rhyme' if predictions2[i] > 0.5 else 'Non-rhyme'}({predictions2[i]})")
    print("---------------\n")

Sentence 1: Kan du ikke se det
Sentence 2: Deg skal jeg lede
Rhyme(0.9957000017166138)
Rhyme(0.9952999949455261)
---------------

Sentence 1: Kaker av alle slag
Sentence 2: Her henger Norges flagg
Rhyme(0.996999979019165)
Rhyme(0.9975000023841858)
---------------

Sentence 1: Jeg har ikke tid
Sentence 2: Til dette svineri
Rhyme(0.8123999834060669)
Rhyme(0.9900000095367432)
---------------

Sentence 1: Hva har du sagt
Sentence 2: Kaken er bakt
Rhyme(0.9979000091552734)
Rhyme(0.9975000023841858)
---------------

Sentence 1: Barna er lagt
Sentence 2: Kaken er laget
Non-rhyme(0.0)
Non-rhyme(0.0007999999797903001)
---------------

Sentence 1: Gjorde du det med vilje
Sentence 2: Kaken smaker vanilje
Rhyme(0.9970999956130981)
Rhyme(0.9970999956130981)
---------------

Sentence 1: Dette vokser
Sentence 2: Satans underbukser
Rhyme(0.9970999956130981)
Rhyme(0.9970999956130981)
---------------

Sentence 1: tid
Sentence 2: svineri
Rhyme(0.9957000017166138)
Rhyme(0.9957000017166138)
---------------