## Imports

In [None]:
# imports: standard library first, then third-party
import os
import pickle

import openai
import pandas as pd
from dotenv import find_dotenv, load_dotenv
# NOTE(review): openai.embeddings_utils ships with the pre-1.0 openai package only —
# confirm the environment pins a compatible version.
from openai.embeddings_utils import (
    chart_from_components,
    distances_from_embeddings,
    get_embedding,
    indices_of_nearest_neighbors_from_distances,
    tsne_components_from_embeddings,
)

# configuration: read the local .env file once, then hand the API key to the client
# (a second bare load_dotenv() call was redundant and has been dropped)
_ = load_dotenv(find_dotenv())
openai.api_key = os.getenv("OPENAI_API_KEY")

# constants
EMBEDDING_MODEL = "text-embedding-ada-002"

## Load data

In [3]:
# Load the filtered SpotGenTrack track table from the local data directory.
# NOTE(review): the previous comment linked to the AG news-article corpus, which is not
# the file read here — TODO record where this CSV actually comes from.
df = pd.read_csv("app/data/SpotGenTrack/filtered_track_df.csv")

# Preview the first few rows (as the cell's last expression, the head() result is rendered).
n_examples = 5
df.head(n_examples)

Unnamed: 0,artists_id,acousticness,danceability,disc_number,duration_ms,energy,href,id,instrumentalness,key,...,speechiness,tempo,time_signature,track_href,uri,valence,release_date,artists_name,genres,release_year
0,1RyvyyTE3xzB2ZywiAwp0i,0.0151,0.888,1.0,161758.0,0.665,https://api.spotify.com/v1/tracks/6QFzUXTIZXOL...,6QFzUXTIZXOLesQcgmGOsR,0.0,5.0,...,0.093,78.021,4.0,https://api.spotify.com/v1/tracks/6QFzUXTIZXOL...,6QFzUXTIZXOLesQcgmGOsR,0.568,2019-01-18,Future,"['atl hip hop', 'pop', 'pop rap', 'rap', 'sout...",2019
1,0yN7xI1blow9nYIK0R8nM7,0.00129,0.501,1.0,209133.0,0.964,https://api.spotify.com/v1/tracks/5eaTDXlQLlfN...,5eaTDXlQLlfNLfJALhxRyj,0.00393,7.0,...,0.0793,75.038,4.0,https://api.spotify.com/v1/tracks/5eaTDXlQLlfN...,5eaTDXlQLlfNLfJALhxRyj,0.486,2019-01-11,Buckcherry,"['alternative metal', 'glam metal', 'hard rock...",2019
2,4SqTiwOEdYrNayaGMkc7ia,0.0171,0.722,1.0,234894.0,0.781,https://api.spotify.com/v1/tracks/0rN1Jv3hGnYZ...,0rN1Jv3hGnYZ2zmAXbpnkc,0.0,8.0,...,0.0321,137.907,4.0,https://api.spotify.com/v1/tracks/0rN1Jv3hGnYZ...,0rN1Jv3hGnYZ2zmAXbpnkc,0.644,2019-02-15,LÉON,"['dance pop', 'indie cafe pop', 'indie poptimi...",2019
3,5JMLG56F1X5mFmWNmS0iAp,0.926,0.616,1.0,210887.0,0.21,https://api.spotify.com/v1/tracks/5hNR2TXslTho...,5hNR2TXslThoEgqqdDlt8F,0.0,7.0,...,0.0371,70.478,4.0,https://api.spotify.com/v1/tracks/5hNR2TXslTho...,5hNR2TXslThoEgqqdDlt8F,0.447,2019-03-13,Chelsea Cutler,"['dance pop', 'electropop', 'indie poptimism',...",2019
4,2oX42qP5ineK3hrhBECLmj,0.0875,0.641,1.0,196188.0,0.537,https://api.spotify.com/v1/tracks/31Y7dmcZFeJs...,31Y7dmcZFeJsMKXtJNeQya,9e-06,0.0,...,0.0282,112.864,4.0,https://api.spotify.com/v1/tracks/31Y7dmcZFeJs...,31Y7dmcZFeJsMKXtJNeQya,0.0681,2019-02-13,Andy Grammer,"['neo mellow', 'pop', 'pop rap', 'pop rock', '...",2019


## Build cache to save embeddings

In [4]:
# establish a cache of embeddings to avoid recomputing
# cache is a dict of tuples (text, model) -> embedding, saved as a pickle file

# set path to embedding cache (a relative path, so it resolves against the working directory)
embedding_cache_path = "app/data/recommendations_embeddings_cache.pkl"

# load the cache if it exists, otherwise start from an empty dict
# CAUTION: unpickling runs whatever the file encodes — only load a cache file you created yourself
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}
# write the cache straight back to disk (on a first run this creates the file with an empty dict)
with open(embedding_cache_path, "wb") as embedding_cache_file:
    pickle.dump(embedding_cache, embedding_cache_file)

# look an embedding up in the cache first; only on a miss call the API and persist the result
def embedding_from_string(
    string: str,
    model: str = EMBEDDING_MODEL,
    embedding_cache=embedding_cache
) -> list:
    """Fetch the embedding of `string` for `model`, memoised in `embedding_cache`.

    Args:
        string: Text to embed.
        model: Embedding model name, forwarded to `get_embedding`.
        embedding_cache: Mapping keyed by `(string, model)`; defaults to the
            module-level cache and is updated in place on a miss.

    Returns:
        The cached embedding, or the freshly requested one after it has been
        stored in the cache and the whole cache re-pickled to
        `embedding_cache_path`.
    """
    cache_key = (string, model)

    # fast path: this text/model pair has been embedded before
    if cache_key in embedding_cache:
        return embedding_cache[cache_key]

    # cache miss: request the embedding, remember it, and rewrite the pickle on disk
    fresh_embedding = get_embedding(string, model)
    embedding_cache[cache_key] = fresh_embedding
    with open(embedding_cache_path, "wb") as cache_file:
        pickle.dump(embedding_cache, cache_file)
    return fresh_embedding

In [11]:
# as an example, take the lyrics of the first track in the dataset
example_string = df["lyrics"].values[0]

# (for a quick smoke test, a short literal such as "iloveyou" can be substituted for example_string)
print(f"\nExample string: {example_string}")

# print the first 10 dimensions of the embedding (fetched via the cache-aware helper)
example_embedding = embedding_from_string(example_string)
print(f"\nExample embedding: {example_embedding[:10]}...")


Example string: 

It's okay, 'kay (Wheezy, Pluto)
Yeah, baby ain't play, play
Last name Hndrxx whatever I do
(Wheezy outta here)

Woo, everything we do, we goin' dummy (Woo)
Whatever I do, I hope I got that Tommy (Hrrr)
Just in case a nigga try to play, play, play (Just in case)
Yeah, every watch I own on tsunami (Brrr)
Chain on frost, ask Sonny (Go ask)
Tryna pose a threat, go at your mommy
It's okay, 'kay (Okay)

Intercontineezy, got a spaceship parked
Fine Europeans look good in the dark (Fine)
Meet me in the jacuzzi, girl, we made it (We made it)
Christian Dior already slated
Yeah, you try to leave me, downgradin' (Downgrade)
I can't save you from the apes out the cages (Out the cages)
Phone book in my jeans like pages (Freebandz)
This money I crave, it is so contagious
Sometimes get caught up in that mission, don't get caught inside the loop
V.I.P. with raw fishes, I'ma show ya what it do
We get splashy, we get Percs for no reason
Got some tall, wet blondie at Four Seasons (Yee)


## Recommend similar songs based on lyric embeddings

In [8]:
def print_recommendations_from_strings(
    strings: list[str],
    index_of_source_string: int,
    k_nearest_neighbors: int = 1,
    model=EMBEDDING_MODEL,
) -> list[int]:
    """Print the closest matches to one string and return the full neighbor ordering.

    Args:
        strings: Candidate texts; every one of them is embedded.
        index_of_source_string: Position in `strings` of the text to find neighbors for.
        k_nearest_neighbors: Maximum number of recommendations to print.
        model: Embedding model name passed to `embedding_from_string`.

    Returns:
        The index ordering produced by `indices_of_nearest_neighbors_from_distances`
        for all of `strings` (presumably closest first — confirm against the helper),
        not just the printed ones.
    """
    # embed every candidate; the cache-aware helper avoids repeat API calls
    embeddings = []
    for text in strings:
        embeddings.append(embedding_from_string(text, model=model))

    # cosine distance from the source embedding to every embedding, the source included
    source_embedding = embeddings[index_of_source_string]
    distances = distances_from_embeddings(source_embedding, embeddings, distance_metric="cosine")
    ranked_indices = indices_of_nearest_neighbors_from_distances(distances)

    # show what we are searching from
    source_text = strings[index_of_source_string]
    print(f"Source string: {source_text}")

    # walk the ranking until k recommendations have been printed
    shown = 0
    for candidate in ranked_indices:
        if shown >= k_nearest_neighbors:
            break
        # exact duplicates of the source text (including the source itself) are not recommendations
        if source_text == strings[candidate]:
            continue
        shown += 1

        print(
            f"""
        --- Recommendation #{shown} (nearest neighbor {shown} of {k_nearest_neighbors}) ---
        String: {strings[candidate]}
        Distance: {distances[candidate]:0.3f}"""
        )

    return ranked_indices

In [14]:
# lyrics of every track, as a NumPy array in dataframe row order
lyrics = df["lyrics"].values

# NOTE: despite its name, track_lyrics receives the function's return value — the ordering of
# neighbor indices — not lyric text.
track_lyrics = print_recommendations_from_strings(
    strings=lyrics,  # base similarity on each track's lyrics
    index_of_source_string=0,  # look for tracks similar to the first song in the dataset
    k_nearest_neighbors=2,  # print the 2 most similar tracks
)

## Visualisation

In [None]:
# get embeddings for all track lyrics (cache hits cost nothing; misses call the API)
embeddings = [embedding_from_string(string) for string in lyrics]
# compress the embeddings (1536-dimensional for text-embedding-ada-002, per OpenAI's model docs)
# into 2 dimensions using t-SNE
tsne_components = tsne_components_from_embeddings(embeddings)
# get the per-track labels used for coloring the chart
# NOTE(review): no "label" column is visible in the (truncated) dataframe preview; this looks like a
# leftover from the news-article version of the notebook — confirm the column exists or pick the
# intended categorical column before running.
labels = df["label"].tolist()

chart_from_components(
    components=tsne_components,
    labels=labels,
    # pass a plain list rather than the NumPy array: the helper is typed for lists and
    # (NOTE(review): per the openai 0.x source) truth-tests this argument, which a
    # multi-element array rejects with ValueError
    strings=list(lyrics),
    width=600,
    height=500,
    title="t-SNE components of song lyrics",
)

In [None]:
def nearest_neighbor_labels(
    list_of_indices: list[int],
    k_nearest_neighbors: int = 5
) -> list[str]:
    """Return a list of labels to color the source and its k nearest neighbors.

    Args:
        list_of_indices: Neighbor ordering whose first entry is the source item and
            whose following entries are its neighbors; each entry is a position in
            the returned label list.
        k_nearest_neighbors: How many entries after the source to mark as neighbors.
            Values larger than the number of available neighbors are clamped, and
            values <= 0 mark none.

    Returns:
        One label per entry of `list_of_indices`: "Source" at the source position,
        "Nearest neighbor (top k)" at the neighbor positions, "Other" elsewhere.
        An empty input yields an empty list.
    """
    # len() rather than truthiness so NumPy index arrays are accepted as well as lists
    total = len(list_of_indices)
    labels = ["Other"] * total
    if total == 0:
        return labels

    labels[list_of_indices[0]] = "Source"

    # the label text keeps the requested k so it matches chart category names built from it
    neighbor_label = f"Nearest neighbor (top {k_nearest_neighbors})"
    # slicing stops at the end of the sequence, so asking for more neighbors than exist is safe
    for neighbor_index in list_of_indices[1 : 1 + max(k_nearest_neighbors, 0)]:
        labels[neighbor_index] = neighbor_label
    return labels


# Example usage for lyrics
lyric_indices = track_lyrics  # the neighbor ordering returned by print_recommendations_from_strings

# Create labels for the recommended lyrics
# (marks the top 5 neighbors here, although only 2 recommendations were printed earlier)
lyric_labels = nearest_neighbor_labels(lyric_indices, k_nearest_neighbors=5)

In [None]:
# Re-draw the t-SNE chart, colored by each track's role relative to the source track.
chart_from_components(
    components=tsne_components,
    # use the role labels ("Source" / "Nearest neighbor (top 5)" / "Other") computed by
    # nearest_neighbor_labels — the raw neighbor indices can never match category_orders below
    labels=lyric_labels,
    # pass a plain list rather than the NumPy array: the helper is typed for lists and
    # (NOTE(review): per the openai 0.x source) truth-tests this argument, which a
    # multi-element array rejects with ValueError
    strings=list(lyrics),
    width=600,
    height=500,
    # the source is the first track in the dataset (index_of_source_string=0)
    title="Nearest neighbors of the first track's lyrics",
    category_orders={"label": ["Other", "Nearest neighbor (top 5)", "Source"]},
)