In [1]:
# Requirements:
# run: "pip install sentence-transformers" before using
import json


with open("data/raw/poetrydb_poems.json", "r") as f:
    poem_data = json.load(f)


from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-mpnet-base-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# begins as a dictionary and each entry is a poem
poems_and_info = poem_data['items']
print(type(poems_and_info[0]))
print(list(poems_and_info)[0])
# pulling the lines from each entry, in the format of a list of strings so combining them

# note: they are seperated by lines so we can use data later if we want to count 
# the lines of line length per poem
raw_poem_texts = [" ".join(poem["lines"]) for poem in poems_and_info]


#question: what choice do we want to make on punctuation
import re

# removes punctuation, makes it lower, and makes multiple spaces into one
poem_texts = [
    re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', poem.lower())).strip()
    for poem in raw_poem_texts
]

# checks: check that these are all correctly cleaned by looking at the beginning of 7 poems
print(poem_texts[0][1:100])
print(poem_texts[1][1:100])
print(poem_texts[2][1:100])
print(poem_texts[3][1:100])
print(poem_texts[100][1:100])
print(poem_texts[101][1:100])
print(poem_texts[102][1:100])



<class 'dict'>
{'title': 'A Song of Autumn', 'author': 'Adam Lindsay Gordon', 'lines': ['‘WHERE shall we go for our garlands glad', 'At the falling of the year,', 'When the burnt-up banks are yellow and sad,', 'When the boughs are yellow and sere?', 'Where are the old ones that once we had,', 'And when are the new ones near?', 'What shall we do for our garlands glad', 'At the falling of the year?’', '‘Child! can I tell where the garlands go?', 'Can I say where the lost leaves veer', 'On the brown-burnt banks, when the wild winds blow,', 'When they drift through the dead-wood drear?', 'Girl! when the garlands of next year glow,', 'You may gather again, my dear—', 'But I go where the last year’s lost leaves go', 'At the falling of the year.’'], 'linecount': '16'}
here shall we go for our garlands glad at the falling of the year when the burntup banks are yellow
he ocean heaves around us still with long and measured swell the autumn gales our canvas fill our s
 have a rendezvous with deat

In [3]:
embeddings_poems = model.encode(poem_texts, normalize_embeddings=True)

In [4]:
import numpy as np
# checking correct type
print(type(embeddings_poems))
print(embeddings_poems[0][:10])
# checking shape
print(embeddings_poems.shape)


<class 'numpy.ndarray'>
[ 0.02050997  0.01788874 -0.0218279   0.055269   -0.06274225 -0.02267507
 -0.01332432 -0.02804054 -0.01185982 -0.00296964]
(3413, 768)


In [3]:
with open("data/processed/combined_songs_large_fixed.json", "r") as f:
        song_data = json.load(f)


In [4]:
# begins as a dictionary with multiple keys: source, total songs and items
# Items has a list of dictionaries, each representing a song
songs_and_info = song_data['items']
# one song dictionary
print(songs_and_info[0])
# Pulling each song's lyrics
# Note: each item also contains title, album and duration if we want that later

raw_song_texts = [song["lyrics"] for song in songs_and_info]


#question: what choice do we want to make on punctuation
import re
from functools import reduce

# removes verse numbers, chorus labels, newline character and all punctuation and underscores
# will replace with a space
replacements_space = [
    r"\[Verse \d+\]",   # replace with space
    r"\[Chorus\]",      # replace with space
    r"\r?\n",           # replace newlines with space
]

# removes punctuation - not replacing with space
punctuation_pattern = r"[^A-Za-z0-9\s]"

# runs all and collaspses multiple spaces into one
song_texts = [
    re.sub(
        punctuation_pattern, "",
        re.sub(
            r"\s+", " ",
            reduce(lambda text, pat: re.sub(pat, " ", text), replacements_space, song)
        )
    ).strip().lower()
    for song in raw_song_texts
]



print(song_texts[130])
print(song_texts[131])
print(song_texts[132])
print(song_texts[133])
print(song_texts[134])

# other checks:
unexpected_chars = set("".join(song_texts)) - set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ")

print("Unexpected characters:", unexpected_chars)



{'title': 'Father Figure', 'artist': 'Taylor Swift', 'spotify_artist_name': 'Taylor Swift', 'album': 'The Life of a Showgirl', 'release_date': '2025-10-03', 'duration_ms': 212777, 'popularity': 94, 'lyrics': '[Verse 1]\nWhen I found you, you were young, wayward, lost in the cold\nPulled up to you in the Jag\', turned your rags into gold\nThe winding road leads to the chateau\n"You remind me of a younger me," I saw potential\n[Chorus]\nI\'ll be your father figure,\n \nI drink that brown liquor\nI can make deals with the devil because my dick\'s bigger\nThis love is pure profit, just step into my office\nI dry your tears with my sleeve\n[Post-Chorus]\nLeave it with me, I protect the family\nLeave it with me, I protect the family\n[Verse 2]\nI pay the check before it kisses the mahogany grain\nSaid, "They wanna see you rise, they don\'t want you to reign"\nI showed you all the tricks of the trade\nAll I asked for is your loyalty, my dear protégé\n[Chorus]\nI\'ll be your father figure,\n I

In [8]:
embeddings_songs = model.encode(song_texts, normalize_embeddings=True)

In [9]:
# checking correct type
print(type(embeddings_songs))
print(embeddings_songs[0][:10])
# checking shape
print(embeddings_songs.shape)

<class 'numpy.ndarray'>
[ 5.4139685e-02  7.2890922e-02 -2.1479514e-03  1.2776041e-02
 -3.3656180e-02 -4.1403272e-03 -1.2214402e-02  8.9260209e-03
 -6.9287104e-05 -9.9582300e-03]
(3000, 768)


In [11]:
import numpy as np
from pathlib import Path

processed_dir = Path("data/processed")

np.save(processed_dir / "embeddings_poems.npy", embeddings_poems)
np.save(processed_dir / "embeddings_songs.npy", embeddings_songs)



In [9]:
# code to check retrieving them works
processed_dir = Path("data/processed")

# Load the saved numpy arrays
embeddings_poems = np.load(processed_dir / "embeddings_poems.npy")
embeddings_songs = np.load(processed_dir / "embeddings_songs.npy")
print(embeddings_songs.shape)
print(embeddings_poems.shape)

(3000, 768)
(3413, 768)


In [5]:
mini_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
mini_embeddings_poems = mini_model.encode(poem_texts, normalize_embeddings=True)


In [6]:
mini_embeddings_songs = mini_model.encode(song_texts, normalize_embeddings=True)

In [7]:
#these are less accurate faster versions
print(mini_embeddings_songs.shape)
print(mini_embeddings_poems.shape)

(3000, 384)
(3413, 384)


In [8]:
import numpy as np
from pathlib import Path

processed_dir = Path("data/processed")

np.save(processed_dir / "mini_embeddings_poems.npy", mini_embeddings_poems)
np.save(processed_dir / "mini_embeddings_songs.npy", mini_embeddings_songs)

In [None]:
# checking all the transformed data - starting with full
print("Any NaNs in poems:", np.isnan(embeddings_poems).any())
print("Any NaNs in songs:", np.isnan(embeddings_songs).any())
print("Poem mean/std:", embeddings_poems.mean(), embeddings_poems.std())
print("Song mean/std:", embeddings_songs.mean(), embeddings_songs.std())

poem_norms = np.linalg.norm(embeddings_poems, axis=1)
song_norms = np.linalg.norm(embeddings_songs, axis=1)

print("Poem norms range:", poem_norms.min(), poem_norms.max())
print("Song norms range:", song_norms.min(), song_norms.max())



Any NaNs in poems: False
Any NaNs in songs: False
Poem mean/std: -0.00026377416 0.03608343
Song mean/std: -0.00020619998 0.036083806
Poem norms range: 0.99999994 1.0000001
Song norms range: 0.99999994 1.0000001


In [12]:
from numpy import dot
from numpy.linalg import norm

def cosine(a, b): return dot(a, b) / (norm(a) * norm(b))

# Expect higher similarity between the same type (poem-poem or song-song)
print("Poem 0 vs Poem 1:", cosine(embeddings_poems[0], embeddings_poems[1]))
print("Song 0 vs Song 1:", cosine(embeddings_songs[0], embeddings_songs[1]))
print("Poem 0 vs Song 0:", cosine(embeddings_poems[0], embeddings_songs[0]))


Poem 0 vs Poem 1: 0.44382766
Song 0 vs Song 1: 0.4751011
Poem 0 vs Song 0: 0.28851935


In [13]:
print("Poems match:", len(poem_texts) == embeddings_poems.shape[0])
print("Songs match:", len(song_texts) == embeddings_songs.shape[0])


Poems match: True
Songs match: True


In [11]:
print("Any NaNs in mini poems:", np.isnan(mini_embeddings_poems).any())
print("Any NaNs in mini songs:", np.isnan(mini_embeddings_songs).any())

print("Mini poem mean/std:", mini_embeddings_poems.mean(), mini_embeddings_poems.std())
print("Mini song mean/std:", mini_embeddings_songs.mean(), mini_embeddings_songs.std())

mini_poem_norms = np.linalg.norm(mini_embeddings_poems, axis=1)
mini_song_norms = np.linalg.norm(mini_embeddings_songs, axis=1)

print("Mini poem norms range:", mini_poem_norms.min(), mini_poem_norms.max())
print("Mini song norms range:", mini_song_norms.min(), mini_song_norms.max())


Any NaNs in mini poems: False
Any NaNs in mini songs: False
Mini poem mean/std: -4.4461198e-05 0.051031016
Mini song mean/std: -0.00016751341 0.051030762
Mini poem norms range: 0.9999999 1.0000001
Mini song norms range: 0.9999999 1.0000001


In [14]:

# Expect higher similarity between same type (poem–poem, song–song)
print("Mini poem 0 vs Mini poem 1:", cosine(mini_embeddings_poems[0], mini_embeddings_poems[1]))
print("Mini song 0 vs Mini song 1:", cosine(mini_embeddings_songs[0], mini_embeddings_songs[1]))
print("Mini poem 0 vs Mini song 0:", cosine(mini_embeddings_poems[0], mini_embeddings_songs[0]))


Mini poem 0 vs Mini poem 1: 0.47302672
Mini song 0 vs Mini song 1: 0.52234554
Mini poem 0 vs Mini song 0: 0.42819285


In [15]:
print("Poems match:", len(poem_texts) == mini_embeddings_poems.shape[0])
print("Songs match:", len(song_texts) == mini_embeddings_songs.shape[0])

Poems match: True
Songs match: True


In [6]:
large_model = SentenceTransformer("sentence-t5-base")
large_embeddings_songs = large_model.encode(song_texts, normalize_embeddings=True)

In [8]:
large_embeddings_poems = large_model.encode(poem_texts, normalize_embeddings=True)

In [11]:

from numpy import dot
from numpy.linalg import norm
def cosine(a, b): return dot(a, b) / (norm(a) * norm(b))


print("Poem 0 vs Poem 1:", cosine(large_embeddings_poems[0], large_embeddings_poems[1]))
print("Song 0 vs Song 1:", cosine(large_embeddings_songs[0], large_embeddings_songs[1]))
print("Poem 0 vs Song 0:", cosine(large_embeddings_poems[0], large_embeddings_songs[0]))

Poem 0 vs Poem 1: 0.83795005
Song 0 vs Song 1: 0.88772297
Poem 0 vs Song 0: 0.8467045


In [12]:
import numpy as np
from pathlib import Path

processed_dir = Path("data/processed")

np.save(processed_dir / "large_embeddings_poems.npy", large_embeddings_poems)
np.save(processed_dir / "large_embeddings_songs.npy", large_embeddings_songs)