In [84]:
# Requirements:
# run: "pip install sentence-transformers" before using
import json


with open("data/raw/poetrydb_poems.json", "r") as f:
    poem_data = json.load(f)


from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-mpnet-base-v2')

In [None]:
# begins as a dictionary and each entry is a poem
poems_and_info = poem_data['items']
print(type(poems_and_info[0]))
print(list(poems_and_info)[0])
# pulling the lines from each entry, in the format of a list of strings so combining them

# note: they are seperated by lines so we can use data later if we want to count 
# the lines of line length per poem
raw_poem_texts = [" ".join(poem["lines"]) for poem in poems_and_info]


#question: what choice do we want to make on punctuation
import re

# removes punctuation, makes it lower, and makes multiple spaces into one
poem_texts = [
    re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', poem.lower())).strip()
    for poem in raw_poem_texts
]


print(poem_texts[0])
print(poem_texts[1])
print(poem_texts[2])
print(poem_texts[3])
print(poem_texts[100])
print(poem_texts[101])
print(poem_texts[102])



<class 'dict'>
{'title': 'A Song of Autumn', 'author': 'Adam Lindsay Gordon', 'lines': ['‘WHERE shall we go for our garlands glad', 'At the falling of the year,', 'When the burnt-up banks are yellow and sad,', 'When the boughs are yellow and sere?', 'Where are the old ones that once we had,', 'And when are the new ones near?', 'What shall we do for our garlands glad', 'At the falling of the year?’', '‘Child! can I tell where the garlands go?', 'Can I say where the lost leaves veer', 'On the brown-burnt banks, when the wild winds blow,', 'When they drift through the dead-wood drear?', 'Girl! when the garlands of next year glow,', 'You may gather again, my dear—', 'But I go where the last year’s lost leaves go', 'At the falling of the year.’'], 'linecount': '16'}
where shall we go for our garlands glad at the falling of the year when the burntup banks are yellow and sad when the boughs are yellow and sere where are the old ones that once we had and when are the new ones near what shall w

In [87]:
# have not normalized embeddings
embeddings_poems = model.encode(
    poem_texts,
    batch_size=32,
    show_progress_bar=True
)

Batches:   3%|▎         | 3/107 [08:26<4:52:46, 168.91s/it]


KeyboardInterrupt: 

In [28]:
import numpy as np
# checking correct type
print(type(embeddings_song))
print(embeddings_song[0][:10])
# checking shape
print(embeddings_song.shape)


<class 'numpy.ndarray'>
[ 0.02050997  0.01788874 -0.0218279   0.055269   -0.06274225 -0.02267507
 -0.01332432 -0.02804054 -0.01185982 -0.00296964]
(3413, 768)


In [None]:
with open("data/processed/combined_songs_large_fixed.json", "r") as f:
        song_data = json.load(f)


In [None]:
# begins as a dictionary with multiple keys: source, total songs and items
# Items has a list of dictionaries, each representing a song
songs_and_info = song_data['items']
# one song dictionary
print(songs_and_info[0])
# Pulling each song's lyrics
# Note: each item also contains title, album and duration if we want that later

raw_song_texts = [song["lyrics"] for song in songs_and_info]


#question: what choice do we want to make on punctuation
import re
from functools import reduce

# removes verse numbers, chorus labels, newline character and all punctuation and underscores
# will replace with a space
replacements_space = [
    r"\[Verse \d+\]",   # replace with space
    r"\[Chorus\]",      # replace with space
    r"\r?\n",           # replace newlines with space
]

# removes punctuation - not replacing with space
punctuation_pattern = r"[^A-Za-z0-9\s]"

# runs all and collaspses multiple spaces into one
song_texts = [
    re.sub(
        punctuation_pattern, "",
        re.sub(
            r"\s+", " ",
            reduce(lambda text, pat: re.sub(pat, " ", text), replacements_space, song)
        )
    ).strip().lower()
    for song in raw_song_texts
]



print(song_texts[130])
print(song_texts[131])
print(song_texts[132])
print(song_texts[133])
print(song_texts[134])

# other checks:
unexpected_chars = set("".join(song_texts)) - set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ")

print("Unexpected characters:", weird_chars)



{'title': 'Father Figure', 'artist': 'Taylor Swift', 'spotify_artist_name': 'Taylor Swift', 'album': 'The Life of a Showgirl', 'release_date': '2025-10-03', 'duration_ms': 212777, 'popularity': 94, 'lyrics': '[Verse 1]\nWhen I found you, you were young, wayward, lost in the cold\nPulled up to you in the Jag\', turned your rags into gold\nThe winding road leads to the chateau\n"You remind me of a younger me," I saw potential\n[Chorus]\nI\'ll be your father figure,\n \nI drink that brown liquor\nI can make deals with the devil because my dick\'s bigger\nThis love is pure profit, just step into my office\nI dry your tears with my sleeve\n[Post-Chorus]\nLeave it with me, I protect the family\nLeave it with me, I protect the family\n[Verse 2]\nI pay the check before it kisses the mahogany grain\nSaid, "They wanna see you rise, they don\'t want you to reign"\nI showed you all the tricks of the trade\nAll I asked for is your loyalty, my dear protégé\n[Chorus]\nI\'ll be your father figure,\n I

In [None]:
embeddings_song = model.encode(song_texts, normalize_embeddings=True)

In [81]:
import numpy as np
# checking correct type
print(type(embeddings_song))
print(embeddings_song[0][:10])
# checking shape
print(embeddings_song.shape)

<class 'numpy.ndarray'>
[ 5.4139685e-02  7.2890922e-02 -2.1479514e-03  1.2776041e-02
 -3.3656180e-02 -4.1403272e-03 -1.2214402e-02  8.9260209e-03
 -6.9287104e-05 -9.9582300e-03]
(3000, 768)


In [90]:
import numpy as np
from pathlib import Path

raw_dir = Path("data/raw")

#np.save(raw_dir / "embeddings_poems.npy", embeddings_poems)
np.save(raw_dir / "embeddings_songs.npy", embeddings_song)

