
# Poem→Song KNN Retrieval
This notebook reuses the aligned embedding pipeline to find the nearest songs for each poem, then keeps only the matches that meet a similarity/distance threshold.


In [18]:

from __future__ import annotations

import json
from pathlib import Path

import numpy as np
import pandas as pd
from numpy.typing import NDArray
from sklearn.preprocessing import normalize

from modality_alignment import (
    DATA_PROCESSED,
    POEM_EMB_PATH,
    SONG_EMB_PATH,
    build_dataframe,
    coral_align,
    load_metadata,
    per_modality_zscore,
    remove_modality_direction_lr,
    run_umap,
)

print("Data directory:", DATA_PROCESSED)


Data directory: /Users/shirleyzhang/Desktop/CS229/cs229FinalProject/data/processed


In [22]:
# Retrieval + filtering hyperparameters (tweak + re-run cells below)
EMBEDDING_VARIANT = "original"  # options: mpnet, original, mini, large
APPLY_CORAL = True
K_NEIGHBORS = 10
SIMILARITY_THRESHOLD = 0.4  # cosine similarity >= threshold for accepted matches
DISTANCE_THRESHOLD = -1.0   # cosine distance <= threshold (-1 disables)
MIN_SIM_FOR_KNN = 0.5       # discard candidates whose raw cosine is below this before ranking
UMAP_DIM = 32
UMAP_NEIGHBORS = 80
UMAP_MIN_DIST = 0.05
SONG_LANG_WHITELIST = {"en"}  # empty set disables
SONG_MIN_WORDS = 20
SONG_MAX_WORDS = 400
OUTPUT_PATH = DATA_PROCESSED / "poem_song_knn_matches.json"


In [23]:
EMBED_PATHS = {
    "mpnet": (
        DATA_PROCESSED / "mpnet_embeddings_songs.npy",
        DATA_PROCESSED / "mpnet_embeddings_poems.npy",
    ),
    "original": (
        DATA_PROCESSED / "embeddings_songs.npy",
        DATA_PROCESSED / "embeddings_poems.npy",
    ),
    "mini": (
        DATA_PROCESSED / "mini_embeddings_songs.npy",
        DATA_PROCESSED / "mini_embeddings_poems.npy",
    ),
    "large": (
        DATA_PROCESSED / "large_embeddings_songs.npy",
        DATA_PROCESSED / "large_embeddings_poems.npy",
    ),
}
if EMBEDDING_VARIANT not in EMBED_PATHS:
    raise ValueError(f"Unknown embedding variant '{EMBEDDING_VARIANT}'. Options: {list(EMBED_PATHS)}")
SONG_EMB_PATH, POEM_EMB_PATH = EMBED_PATHS[EMBEDDING_VARIANT]
print(f"Using embedding variant {EMBEDDING_VARIANT} songs -> {SONG_EMB_PATH.name} poems -> {POEM_EMB_PATH.name}")

Using embedding variant original songs -> embeddings_songs.npy poems -> embeddings_poems.npy


In [24]:
from sklearn.preprocessing import normalize

def load_aligned_embeddings():
    if not SONG_EMB_PATH.exists() or not POEM_EMB_PATH.exists():
        raise FileNotFoundError("Run generate_mpnet_embeddings.py to produce MPNet embeddings first.")
    songs = np.load(SONG_EMB_PATH).astype(np.float32)
    poems = np.load(POEM_EMB_PATH).astype(np.float32)
    song_std = songs.std(axis=0).mean()
    poem_std = poems.std(axis=0).mean()
    print(f"Song emb std mean={song_std:.4f} | Poem emb std mean={poem_std:.4f}")
    if song_std < 1e-2 or poem_std < 1e-2:
        raise RuntimeError("Embeddings appear collapsed (std < 0.01). Regenerate MPNet embeddings.")
    songs_aligned, poems_aligned = remove_modality_direction_lr(songs, poems)
    if APPLY_CORAL:
        poems_aligned = coral_align(songs_aligned, poems_aligned)
    combined = np.vstack([songs_aligned, poems_aligned])
    combined = normalize(combined)
    aligned = run_umap(
        combined,
        n_components=UMAP_DIM,
        n_neighbors=UMAP_NEIGHBORS,
        min_dist=UMAP_MIN_DIST,
    )
    aligned = normalize(aligned)
    song_count = len(songs)
    return aligned[:song_count], aligned[song_count:]

song_emb_aligned_full, poem_emb_aligned = load_aligned_embeddings()
song_emb_aligned_full.shape, poem_emb_aligned.shape

Song emb std mean=0.0262 | Poem emb std mean=0.0269


  warn(


((3000, 32), (3413, 32))

In [25]:
from sklearn.preprocessing import normalize

def cosine_knn(poems: NDArray[np.float32], songs: NDArray[np.float32], k: int, min_similarity: float | None = None):
    if k <= 0:
        raise ValueError("k must be positive")
    poem_norm = normalize(poems)
    song_norm = normalize(songs)
    scores = poem_norm @ song_norm.T
    if min_similarity is not None:
        scores = np.where(scores >= min_similarity, scores, -1.0)
    k_eff = min(k, song_norm.shape[0])
    idx = np.argpartition(-scores, kth=k_eff - 1, axis=1)[:, :k_eff]
    top = np.take_along_axis(scores, idx, axis=1)
    order = np.argsort(-top, axis=1)
    idx_sorted = np.take_along_axis(idx, order, axis=1)
    scores_sorted = np.take_along_axis(top, order, axis=1)
    return idx_sorted.astype(np.int64), scores_sorted.astype(np.float32)

# Use the filtered/processed song embeddings if available; fall back to the full aligned array.
# This avoids a NameError when this cell is executed before the cell that defines `song_emb_aligned`.
song_emb_input = song_emb_aligned if "song_emb_aligned" in globals() else song_emb_aligned_full

top_idx, top_scores = cosine_knn(poem_emb_aligned, song_emb_input, K_NEIGHBORS, min_similarity=MIN_SIM_FOR_KNN)
top_idx.shape

(3413, 10)

In [26]:
songs_meta, poems_meta = load_metadata()
df = build_dataframe(songs_meta, poems_meta)
poem_df = df[df["modality"] == "poem"].reset_index(drop=True)
song_df_full = df[df["modality"] == "song"].reset_index(drop=True)

song_mask = pd.Series(True, index=song_df_full.index)
if SONG_LANG_WHITELIST and "language" in song_df_full.columns:
    song_mask &= song_df_full["language"].isin(SONG_LANG_WHITELIST)

word_counts = song_df_full["text"].fillna("").str.split().str.len()
song_mask &= word_counts.between(SONG_MIN_WORDS, SONG_MAX_WORDS)

song_df = song_df_full[song_mask].reset_index(drop=True)
song_emb_aligned = song_emb_aligned_full[song_mask.to_numpy()]

len(song_df), len(poem_df)

(2096, 3413)

In [27]:
def filter_matches(sim_threshold: float, dist_threshold: float):
    records = []
    use_sim = sim_threshold >= 0.0
    use_dist = dist_threshold >= 0.0
    kept_pairs = 0
    for poem_idx, (indices, scores) in enumerate(zip(top_idx, top_scores)):
        poem_row = poem_df.iloc[poem_idx]
        matches = []
        for idx, score in zip(indices, scores):
            if idx < 0 or idx >= len(song_df):
                continue
            if score < 0:
                continue
            cosine_dist = 1.0 - float(score)
            if use_sim and float(score) < sim_threshold:
                continue
            if use_dist and cosine_dist > dist_threshold:
                continue
            song_row = song_df.iloc[int(idx)]
            matches.append(
                {
                    "song_index": int(song_row["index"]),
                    "title": song_row.get("title"),
                    "creator": song_row.get("creator"),
                    "cosine_similarity": float(score),
                    "cosine_distance": cosine_dist,
                }
            )
        if matches:
            kept_pairs += len(matches)
            records.append(
                {
                    "poem_index": int(poem_row["index"]),
                    "title": poem_row.get("title"),
                    "creator": poem_row.get("creator"),
                    "matches": matches,
                }
            )
    return records, kept_pairs

matches, kept = filter_matches(SIMILARITY_THRESHOLD, DISTANCE_THRESHOLD)
len(matches), kept

(3413, 34130)

In [29]:

OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
OUTPUT_PATH.write_text(json.dumps(matches, indent=2))
print(f"Saved {len(matches)} poem entries ({sum(len(m['matches']) for m in matches)} pairs) to {OUTPUT_PATH}.")
matches[:1]


Saved 3413 poem entries (34130 pairs) to /Users/shirleyzhang/Desktop/CS229/cs229FinalProject/data/processed/poem_song_knn_matches.json.


[{'poem_index': 0,
  'title': 'A Song of Autumn',
  'creator': 'Adam Lindsay Gordon',
  'matches': [{'song_index': 1030,
    'title': 'Amazing (Interlude)',
    'creator': 'Future',
    'cosine_similarity': 0.7469168901443481,
    'cosine_distance': 0.25308310985565186},
   {'song_index': 1036,
    'title': 'Jimmy Cooks (feat. 21 Savage)',
    'creator': '21 Savage',
    'cosine_similarity': 0.7464072108268738,
    'cosine_distance': 0.2535927891731262},
   {'song_index': 2125,
    'title': 'jawbreaker',
    'creator': 'MGK',
    'cosine_similarity': 0.7453938722610474,
    'cosine_distance': 0.25460612773895264},
   {'song_index': 1017,
    'title': 'SOUTH OF FRANCE',
    'creator': 'Future',
    'cosine_similarity': 0.7443405389785767,
    'cosine_distance': 0.25565946102142334},
   {'song_index': 2155,
    'title': 'Stop Playing With Me',
    'creator': 'Tyler, The Creator',
    'cosine_similarity': 0.7443330883979797,
    'cosine_distance': 0.25566691160202026},
   {'song_index': 4

In [30]:
# Flatten poem→song matches into a DataFrame for per-pair analysis.
import pandas as pd

def build_matches_df(match_entries):
    rows = []
    for entry in match_entries:
        p_idx = entry["poem_index"]
        for item in entry["matches"]:
            rows.append(
                {
                    "poem_index": p_idx,
                    "song_index": item["song_index"],
                    "poem_title": entry.get("title"),
                    "song_title": item.get("title"),
                    "poem_creator": entry.get("creator"),
                    "song_creator": item.get("creator"),
                    "cosine_similarity": item.get("cosine_similarity", 0.0),
                    "cosine_distance": item.get("cosine_distance", 0.0),
                }
            )
    return pd.DataFrame(rows)

matches_pairs_df = build_matches_df(matches)
print(f"Total accepted poem→song pairs: {len(matches_pairs_df)}")
matches_pairs_df.head()


Total accepted poem→song pairs: 34130


Unnamed: 0,poem_index,song_index,poem_title,song_title,poem_creator,song_creator,cosine_similarity,cosine_distance
0,0,1030,A Song of Autumn,Amazing (Interlude),Adam Lindsay Gordon,Future,0.746917,0.253083
1,0,1036,A Song of Autumn,Jimmy Cooks (feat. 21 Savage),Adam Lindsay Gordon,21 Savage,0.746407,0.253593
2,0,2125,A Song of Autumn,jawbreaker,Adam Lindsay Gordon,MGK,0.745394,0.254606
3,0,1017,A Song of Autumn,SOUTH OF FRANCE,Adam Lindsay Gordon,Future,0.744341,0.255659
4,0,2155,A Song of Autumn,Stop Playing With Me,Adam Lindsay Gordon,"Tyler, The Creator",0.744333,0.255667


In [31]:
# A. Similarity sanity check: compare matched vs random cosine similarities.
from sklearn.preprocessing import normalize

if matches_pairs_df.empty:
    raise ValueError("No matches available. Re-run retrieval with looser thresholds.")

poem_norm = normalize(poem_emb_aligned)
song_norm = normalize(song_emb_aligned)
matched_sims = matches_pairs_df["cosine_similarity"].to_numpy()
n = len(matched_sims)
rand_poems = np.random.choice(len(poem_norm), size=n, replace=True)
rand_songs = np.random.choice(len(song_norm), size=n, replace=True)
rand_sims = np.sum(poem_norm[rand_poems] * song_norm[rand_songs], axis=1)

print(f"Matched mean cosine: {matched_sims.mean():.4f} ± {matched_sims.std():.4f}")
print(f"Random  mean cosine: {rand_sims.mean():.4f} ± {rand_sims.std():.4f}")
print(f"Gap (matched - random): {matched_sims.mean() - rand_sims.mean():.4f}")


Matched mean cosine: 0.7446 ± 0.0031
Random  mean cosine: 0.9987 ± 0.0014
Gap (matched - random): -0.2541


In [None]:
# B. Sentiment correlation using VADER scores for poems/songs.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

def ensure_sentiment(df: pd.DataFrame, label: str):
    if "sentiment_compound" not in df.columns:
        df["sentiment_compound"] = (
            df["text"].fillna("").astype(str).apply(lambda t: analyzer.polarity_scores(t)["compound"])
        )
    else:
        df["sentiment_compound"] = df["sentiment_compound"].astype(float)
    return df

poem_df = ensure_sentiment(poem_df, "poem")
song_df = ensure_sentiment(song_df, "song")

merged = matches_pairs_df.merge(
    poem_df[["index", "sentiment_compound"]].rename(columns={"sentiment_compound": "sentiment_compound_poem"}),
    left_on="poem_index",
    right_on="index",
    how="left",
).merge(
    song_df[["index", "sentiment_compound"]].rename(columns={"sentiment_compound": "sentiment_compound_song"}),
    left_on="song_index",
    right_on="index",
    how="left",
)

corr = merged["sentiment_compound_poem"].corr(merged["sentiment_compound_song"])
print(f"Sentiment correlation (poem vs song): {corr:.3f}")


In [None]:
# C. Content-word overlap as a loose lexical similarity proxy.
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

if matches_pairs_df.empty:
    print("No matches available; skip content-word overlap analysis.")
else:
    poem_text_map = poem_df.set_index("index")["text"].to_dict()
    song_text_map = song_df.set_index("index")["text"].to_dict()
    _word_pattern = re.compile(r"\b[a-zA-Z']+\b")

    def content_words(text):
        if not isinstance(text, str):
            text = str(text or "")
        tokens = _word_pattern.findall(text.lower())
        return {t for t in tokens if t not in ENGLISH_STOP_WORDS and len(t) > 3}

    def overlap_count(row):
        poem_words = content_words(poem_text_map.get(int(row["poem_index"]), ""))
        song_words = content_words(song_text_map.get(int(row["song_index"]), ""))
        return len(poem_words & song_words)

    matches_pairs_df["content_overlap"] = matches_pairs_df.apply(overlap_count, axis=1)
    print(
        f"Average content-word overlap: {matches_pairs_df['content_overlap'].mean():.2f}"
    )
    matches_pairs_df["content_overlap"].describe()

In [None]:
# Optional: semantic textual similarity using a pretrained STS model (downloads weights on first run).
try:
    from sentence_transformers import SentenceTransformer, util

    stsb_model = SentenceTransformer("stsb-roberta-large")
    poem_texts = [poem_text_map[idx] for idx in matches_pairs_df["poem_index"]]
    song_texts = [song_text_map[idx] for idx in matches_pairs_df["song_index"]]
    poem_emb = stsb_model.encode(poem_texts, convert_to_tensor=True, show_progress_bar=False)
    song_emb = stsb_model.encode(song_texts, convert_to_tensor=True, show_progress_bar=False)
    sts_scores = util.cos_sim(poem_emb, song_emb).diagonal().cpu().numpy()
    print(f"Average STS semantic similarity: {float(sts_scores.mean()):.4f}")
except Exception as exc:
    print(f"Skipping STS evaluation: {exc}")


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Average STS semantic similarity: 0.3406


### Qualitative spot checks (manual/LLM assisted)
- Sample a few rows from `matches_pairs_df` and read the poem/song lyric snippets to confirm the thematic match.
- Optionally call an LLM (e.g., GPT-4 or `openchat`) with a prompt describing each pair: _"Summarize how similar the following poem and song lyrics are in tone, emotion, or theme."_ and record the response as a pseudo-human rating.
- For a higher-level view, feed several poem/song texts into your favorite summarizer and ask, _"What kinds of themes appear across these matched pairs?"_


In [34]:
# Qualitative deep-dive: pick famous poems and inspect their top matches.
FAMOUS_POEM_TITLES = [
    "The Raven",
    "If—",
    "Sonnet 18",
    "Still I Rise",
    "Because I could not stop for Death",
]
TOP_K_PER_POEM = 5

# Prepare helper functions.
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

sentiment_analyzer = SentimentIntensityAnalyzer()

poem_lookup = poem_df.set_index("title")

pairs_summary = []
for title in FAMOUS_POEM_TITLES:
    if title not in poem_lookup.index:
        print(f"Poem '{title}' not found; skipping.")
        continue
    poem_row = poem_lookup.loc[title]
    poem_idx = int(poem_row["index"])
    matches_for_poem = matches_pairs_df[matches_pairs_df["poem_index"] == poem_idx]
    if matches_for_poem.empty:
        print(f"No matches for poem '{title}'.")
        continue
    top_matches = matches_for_poem.nlargest(TOP_K_PER_POEM, "cosine_similarity")
    poem_text = poem_row["text"]
    poem_sent = sentiment_analyzer.polarity_scores(poem_text)["compound"]
    # Keyword extraction for the poem.
    vectorizer = TfidfVectorizer(max_features=20, stop_words="english")
    poem_keywords = vectorizer.fit([poem_text]).get_feature_names_out()
    for _, match in top_matches.iterrows():
        song_row = song_df[song_df["index"] == match["song_index"]].iloc[0]
        song_text = song_row["text"]
        song_sent = sentiment_analyzer.polarity_scores(song_text)["compound"]
        vectorizer_song = TfidfVectorizer(max_features=20, stop_words="english")
        song_keywords = vectorizer_song.fit([song_text]).get_feature_names_out()
        overlap = set(poem_keywords) & set(song_keywords)
        commentary = {
            "poem_title": title,
            "poem_creator": poem_row.get("creator"),
            "poem_sentiment": poem_sent,
            "poem_keywords": list(poem_keywords),
            "song_title": song_row.get("title"),
            "song_creator": song_row.get("creator"),
            "song_sentiment": song_sent,
            "song_keywords": list(song_keywords),
            "keyword_overlap": list(overlap),
            "cosine_similarity": match["cosine_similarity"],
        }
        pairs_summary.append(commentary)

pairs_summary[:5]


Poem 'If—' not found; skipping.
Poem 'Sonnet 18' not found; skipping.
Poem 'Still I Rise' not found; skipping.
Poem 'Because I could not stop for Death' not found; skipping.


[{'poem_title': 'The Raven',
  'poem_creator': 'Edgar Allan Poe',
  'poem_sentiment': -0.9911,
  'poem_keywords': ['angels',
   'bird',
   'bust',
   'came',
   'chamber',
   'door',
   'floor',
   'lenore',
   'nevermore',
   'prophet',
   'quoth',
   'raven',
   'said',
   'shore',
   'soul',
   'tapping',
   'tell',
   'thee',
   'thy',
   'word'],
  'song_title': 'Amazing (Interlude)',
  'song_creator': 'Future',
  'song_sentiment': 0.8957,
  'song_keywords': ['amazin',
   'black',
   'caucasians',
   'cause',
   'exotic',
   'future',
   'girl',
   'girls',
   'going',
   'got',
   'kissin',
   'kissing',
   'look',
   'neck',
   'nigga',
   'outro',
   'talk',
   'throw',
   'wild',
   'ya'],
  'keyword_overlap': [],
  'cosine_similarity': 0.7452190518379211},
 {'poem_title': 'The Raven',
  'poem_creator': 'Edgar Allan Poe',
  'poem_sentiment': -0.9911,
  'poem_keywords': ['angels',
   'bird',
   'bust',
   'came',
   'chamber',
   'door',
   'floor',
   'lenore',
   'nevermore',

In [35]:
# Present 3-5 curated pairs with human-readable commentary
from pprint import pprint

NUM_EXAMPLES = min(5, len(pairs_summary))
print(f"Showing {NUM_EXAMPLES} qualitative pairs:")
for entry in pairs_summary[:NUM_EXAMPLES]:
    print("-" * 80)
    print(f"Poem: {entry['poem_title']} ({entry['poem_creator']}) | sentiment {entry['poem_sentiment']:.2f}")
    print(f"  Keywords: {', '.join(entry['poem_keywords'][:8])}")
    print(f"Song: {entry['song_title']} ({entry['song_creator']}) | sentiment {entry['song_sentiment']:.2f}")
    print(f"  Keywords: {', '.join(entry['song_keywords'][:8])}")
    if entry['keyword_overlap']:
        print(f"  Shared motifs: {', '.join(entry['keyword_overlap'])}")
    print(f"Cosine similarity: {entry['cosine_similarity']:.3f}")


Showing 5 qualitative pairs:
--------------------------------------------------------------------------------
Poem: The Raven (Edgar Allan Poe) | sentiment -0.99
  Keywords: angels, bird, bust, came, chamber, door, floor, lenore
Song: Amazing (Interlude) (Future) | sentiment 0.90
  Keywords: amazin, black, caucasians, cause, exotic, future, girl, girls
Cosine similarity: 0.745
--------------------------------------------------------------------------------
Poem: The Raven (Edgar Allan Poe) | sentiment -0.99
  Keywords: angels, bird, bust, came, chamber, door, floor, lenore
Song: Jimmy Cooks (feat. 21 Savage) (21 Savage) | sentiment -0.97
  Keywords: better, dogg, don, getting, klaim, kommunity, kool, life
Cosine similarity: 0.745
--------------------------------------------------------------------------------
Poem: The Raven (Edgar Allan Poe) | sentiment -0.99
  Keywords: angels, bird, bust, came, chamber, door, floor, lenore
Song: jawbreaker (MGK) | sentiment 0.28
  Keywords: beach, c