source: https://github.com/minoguep/rhyme_detection and https://paulminogue.com/index.php/2021/02/14/using-a-siamese-neural-network-to-create-a-simple-rhyme-detector/

In [2]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Subtract
from tensorflow.keras.callbacks import ModelCheckpoint, TerminateOnNaN, CSVLogger
tqdm.pandas()

MAX_LEN = 64

ModuleNotFoundError: No module named 'tensorflow'

### Step 1: Create dataset
We want equally many positive and negative samples of rhyme pairs

In [36]:
seed = 420

positive = pd.read_csv("../norwegian_rhyme_scheme_corpus/annotation_tool/rhyme_pairs.tsv", sep="\t", names=["word_a", "word_b"], header=0)
negative = pd.read_csv("../norwegian_rhyme_scheme_corpus/annotation_tool/negative_rhyme_pairs.tsv", sep="\t", names=["word_a", "word_b"], header=0)
negative = negative.sample(n=len(positive), random_state=seed)
positive["rhyme"] = [1]*len(positive)
negative["rhyme"] = [0]*len(negative)

df = pd.concat([positive, negative])

In [37]:
df

Unnamed: 0,word_a,word_b,rhyme
0,huse,bruse,1
1,halen,pralen,1
2,oksepar,svar,1
3,bevare,bare,1
4,ly,våbengny,1
...,...,...,...
1757,støy,brast,0
20575,meier,kne,0
2632,synger,land,0
16537,frukt,sten,0


### Step 2: Create model
Copy paste from Pauls notebook

In [6]:
def tokenize_inputs(phrase_a, phrase_b, tokenizer):
    tokenized_phrases = tokenizer.texts_to_sequences([phrase_a, phrase_b])

    # now loop through inputs and pad or reduce size if required
    tokenized_phrases_for_output = []
    for phrase in tokenized_phrases:
        if len(phrase) < MAX_LEN:
            length_to_pad = MAX_LEN - len(phrase)
            phrase_for_output = ([0] * length_to_pad) + phrase
        elif len(phrase) > MAX_LEN:
            phrase_for_output = phrase[-MAX_LEN:]
        else:
            phrase_for_output = phrase
        tokenized_phrases_for_output.append(phrase_for_output)

    return tf.constant(tokenized_phrases_for_output, dtype=tf.float64)

In [7]:
def create_model():
  
    word_a_input_tokens = Input(
      shape=(MAX_LEN, 1), name='word_a_input_tokens'
      )
    word_b_input_tokens = Input(
      shape=(MAX_LEN, 1), name='word_b_input_tokens'
      )
    
    # This is the siamese portion of the model 
    common_lstm = LSTM(64, return_sequences=False, activation="relu", name="common_lstm_layer")

    word_a_lstm_output = common_lstm(word_a_input_tokens)
    word_b_lstm_output = common_lstm(word_b_input_tokens)

    #concatenate_lstm_outputs
    concat_layer = Subtract(name="concatenate_lstm_outputs")(
      [word_a_lstm_output, word_b_lstm_output]
      )
    
    # dense layers before final classification
    dense_layers = Dense(64, activation="relu", name="first_dense_layer")(concat_layer)
    dense_layers = Dropout(0.5)(dense_layers)

    dense_layers = Dense(32, activation="relu", name="second_dense_layer")(dense_layers)
    dense_layers = Dropout(0.5)(dense_layers)

    dense_layers = Dense(8, activation="relu", name="third_dense_layer")(dense_layers)
    dense_layers = Dropout(0.5)(dense_layers)

    classification_layer = Dense(1, activation="sigmoid", name="classification_layer")(dense_layers)
    
    model = Model(
      inputs=[word_a_input_tokens, word_b_input_tokens], 
      outputs = classification_layer
      )

    model.compile(
      loss="binary_crossentropy",
      metrics=["accuracy"],
      optimizer="Adam"
    )

    return model

In [40]:
tokenizer = Tokenizer(char_level=True, lower=True)
tokenizer.fit_on_texts(df['word_a'] + df['word_b'])

df['word_tokens'] = df.progress_apply(
    lambda row: tokenize_inputs(row['word_a'], row['word_b'], tokenizer), axis=1
)

  0%|          | 0/14506 [00:00<?, ?it/s]

In [41]:
X_train, X_test, y_train, y_test = train_test_split(
    list(df['word_tokens']), list(df['rhyme']), stratify=df['rhyme'], 
    test_size=0.4, random_state=seed
    )
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, stratify=y_test, test_size=0.25, random_state=seed
    )

In [42]:
X_train = tf.convert_to_tensor(X_train)
y_train = tf.convert_to_tensor(y_train)
X_val = tf.convert_to_tensor(X_val)
y_val = tf.convert_to_tensor(y_val)

In [43]:
model = create_model()

model_checkpoint = ModelCheckpoint("models/rhyme_model.hdf5",monitor="val_loss")
terminate_on_nan = TerminateOnNaN()
csv_logger = CSVLogger('training.log')

history = model.fit(
    [X_train[:, 0], X_train[:, 1]],
    y_train,
    batch_size=128,
    epochs=100,
    callbacks=[model_checkpoint, terminate_on_nan, csv_logger],
    validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
)


Epoch 1/100
Epoch 2/100
 3/68 [>.............................] - ETA: 2s - loss: 0.6936 - accuracy: 0.6380



Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 

In [44]:
# load the model
model = load_model("models/rhyme_model.hdf5")

X_test = tf.convert_to_tensor(X_test)
y_test = tf.convert_to_tensor(y_test)

y_pred = model.predict([X_test[:, 0], X_test[:, 1]])
y_pred = y_pred > 0.5

In [45]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.97      0.94      2176
           1       0.97      0.90      0.93      2176

    accuracy                           0.94      4352
   macro avg       0.94      0.94      0.94      4352
weighted avg       0.94      0.94      0.94      4352



In [46]:
samples = [
   ["Du sitter og ser så dum ut", "Idiot, din jævla stut"], 
   ["Er du sikker på at dette er med vilje?", "Hjertet ditt smaker vanilje"], 
   ["Dette vokser", "Satans underbukser"],
   ["Hva er meninga med livet?", "Ikke vet jeg."], 
   ["Dette sier Gud", "Ve deg! lille menneske"], 
]

sample_tokens = [tokenize_inputs(lyrics[0], lyrics[1], tokenizer) for lyrics in samples]
sample_tokens = tf.convert_to_tensor(sample_tokens)
sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]
for i in range(len(samples)):
    print(f"Lyric 1: {samples[i][0]}")
    print(f"Lyric 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print("---------------\n")

Lyric 1: Du sitter og ser så dum ut
Lyric 2: Idiot, din jævla stut
Rhyme(0.9848999977111816)
---------------

Lyric 1: Er du sikker på at dette er med vilje?
Lyric 2: Hjertet ditt smaker vanilje
Rhyme(0.9991000294685364)
---------------

Lyric 1: Dette vokser
Lyric 2: Satans underbukser
Rhyme(0.9991000294685364)
---------------

Lyric 1: Hva er meninga med livet?
Lyric 2: Ikke vet jeg.
Non-rhyme(0.03720000013709068)
---------------

Lyric 1: Dette sier Gud
Lyric 2: Ve deg! lille menneske
Non-rhyme(0.0005000000237487257)
---------------



# Try again with different 2:3 positive to negative ratio

In [4]:
seed = 420

positive = pd.read_csv("../norwegian_rhyme_scheme_corpus/annotation_tool/rhyme_pairs.tsv", sep="\t", names=["word_a", "word_b"], header=0)
negative = pd.read_csv("../norwegian_rhyme_scheme_corpus/annotation_tool/negative_rhyme_pairs.tsv", sep="\t", names=["word_a", "word_b"], header=0)
negative = negative.sample(n=len(positive)*2, random_state=seed)
positive["rhyme"] = [1]*len(positive)
negative["rhyme"] = [0]*len(negative)

df = pd.concat([positive, negative])

In [5]:
df

Unnamed: 0,word_a,word_b,rhyme
0,huse,bruse,1
1,halen,pralen,1
2,oksepar,svar,1
3,bevare,bare,1
4,ly,våbengny,1
...,...,...,...
21190,tapen,øye,0
14441,tilende,strålefjed,0
4952,takten,tusen,0
20767,mindre,ve,0


In [8]:
#copy paste from above cells
tokenizer = Tokenizer(char_level=True, lower=True)
tokenizer.fit_on_texts(df['word_a'] + df['word_b'])

df['word_tokens'] = df.progress_apply(
    lambda row: tokenize_inputs(row['word_a'], row['word_b'], tokenizer), axis=1
)

X_train, X_test, y_train, y_test = train_test_split(
    list(df['word_tokens']), list(df['rhyme']), stratify=df['rhyme'], 
    test_size=0.4, random_state=seed
    )
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, stratify=y_test, test_size=0.25, random_state=seed
    )

X_train = tf.convert_to_tensor(X_train)
y_train = tf.convert_to_tensor(y_train)
X_val = tf.convert_to_tensor(X_val)
y_val = tf.convert_to_tensor(y_val)

  0%|          | 0/21759 [00:00<?, ?it/s]

In [9]:
model = create_model()

                                    # changed model name
model_checkpoint = ModelCheckpoint("models/rhyme_model_2.hdf5",monitor="val_loss")
terminate_on_nan = TerminateOnNaN()
csv_logger = CSVLogger('training2.log')

history = model.fit(
    [X_train[:, 0], X_train[:, 1]],
    y_train,
    batch_size=128,
    epochs=100,
    callbacks=[model_checkpoint, terminate_on_nan, csv_logger],
    validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
)

Epoch 1/100
Epoch 2/100
  3/102 [..............................] - ETA: 4s - loss: 0.4896 - accuracy: 0.6406



Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 

In [1]:
# load the model
model = load_model("models/rhyme_model_2.hdf5")

X_test = tf.convert_to_tensor(X_test)
y_test = tf.convert_to_tensor(y_test)

y_pred = model.predict([X_test[:, 0], X_test[:, 1]])
y_pred = y_pred > 0.5

NameError: name 'load_model' is not defined

In [11]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96      4352
           1       0.91      0.95      0.93      2176

    accuracy                           0.95      6528
   macro avg       0.94      0.95      0.95      6528
weighted avg       0.95      0.95      0.95      6528



In [12]:
samples = [
   ["Du sitter og ser så dum ut", "Idiot, din jævla stut"], 
   ["Er du sikker på at dette er med vilje?", "Hjertet ditt smaker vanilje"], 
   ["Dette vokser", "Satans underbukser"],
   ["Hva er meninga med livet?", "Ikke vet jeg."], 
   ["Dette sier Gud", "Ve deg! lille menneske"], 
]

sample_tokens = [tokenize_inputs(lyrics[0], lyrics[1], tokenizer) for lyrics in samples]
sample_tokens = tf.convert_to_tensor(sample_tokens)
sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]
for i in range(len(samples)):
    print(f"Lyric 1: {samples[i][0]}")
    print(f"Lyric 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print("---------------\n")

Lyric 1: Du sitter og ser så dum ut
Lyric 2: Idiot, din jævla stut
Rhyme(0.8702999949455261)
---------------

Lyric 1: Er du sikker på at dette er med vilje?
Lyric 2: Hjertet ditt smaker vanilje
Rhyme(0.8730999827384949)
---------------

Lyric 1: Dette vokser
Lyric 2: Satans underbukser
Rhyme(0.8730999827384949)
---------------

Lyric 1: Hva er meninga med livet?
Lyric 2: Ikke vet jeg.
Non-rhyme(0.0)
---------------

Lyric 1: Dette sier Gud
Lyric 2: Ve deg! lille menneske
Non-rhyme(0.0)
---------------



In [14]:
samples = [
   ["Kan du ikke se det?", "Deg skal jeg lede"], 
   ["Kaker av alle slag", "Her henger Norges flagg"], 
   ["Jeg har ikke tid", "Til dette svineri"],
   ["Hva har du sagt", "Kaken er bakt"], 
   ["Barna er lagt", "Kaken er laget"],
    ["Er du sikker på at dette er med vilje?", "Hjertet ditt smaker vanilje"], 
   ["Dette vokser", "Satans underbukser"],
]

sample_tokens = [tokenize_inputs(lyrics[0], lyrics[1], tokenizer) for lyrics in samples]
sample_tokens = tf.convert_to_tensor(sample_tokens)
sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
predictions = [round(pred[0], 4) for pred in sample_pred]
for i in range(len(samples)):
    print(f"Lyric 1: {samples[i][0]}")
    print(f"Lyric 2: {samples[i][1]}")
    print(f"{'Rhyme' if predictions[i] > 0.5 else 'Non-rhyme'}({predictions[i]})")
    print("---------------\n")

Lyric 1: Kan du ikke se det?
Lyric 2: Deg skal jeg lede
Non-rhyme(0.2529999911785126)
---------------

Lyric 1: Kaker av alle slag
Lyric 2: Her henger Norges flagg
Rhyme(0.8363999724388123)
---------------

Lyric 1: Jeg har ikke tid
Lyric 2: Til dette svineri
Rhyme(0.8282999992370605)
---------------

Lyric 1: Hva har du sagt
Lyric 2: Kaken er bakt
Rhyme(0.8730999827384949)
---------------

Lyric 1: Barna er lagt
Lyric 2: Kaken er laget
Non-rhyme(0.0)
---------------

Lyric 1: Er du sikker på at dette er med vilje?
Lyric 2: Hjertet ditt smaker vanilje
Rhyme(0.8730999827384949)
---------------

Lyric 1: Dette vokser
Lyric 2: Satans underbukser
Rhyme(0.8730999827384949)
---------------

