# 1. Import data

In [68]:
import pandas as pd

In [69]:
# Import numerical data and clean
df_9k_numerical = pd.read_csv('../raw_data/20250609_9k_spotify_metrics_lyrics_data.csv')
df_9k_numerical.drop(columns=['link', 'track_id', 'Unnamed: 0', 'text', 'song'], inplace=True)

# Import embedded data
df_9k_embedded = pd.read_csv('../raw_data/20250611_ft3_lyrics_clustered_with_embeddings.csv')

# Merge data
df = df_9k_numerical.merge(df_9k_embedded, on=['artist', 'title_cleaned'], how='inner')

# 2. Create KNN model and find top 50 songs

In [70]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

### Faire un modèle KNN et le fitter
# Ne pas fiter dans une fonction


# Categorize columns
categ_columns = ['genre']
num_columns = ['popularity', 'year', 'danceability', 'energy',
    'key', 'loudness', 'mode', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
    'time_signature']

# Call encoders and scalers
ohe = OneHotEncoder(sparse_output=False)
minmax = MinMaxScaler()

# Make encoding pipeline
pipe = make_column_transformer(
    (ohe, categ_columns),
    (minmax, num_columns),
    remainder='drop'
).set_output(transform="pandas")

#Transform df
X_transformed = pipe.fit_transform(df_9k_numerical)

# Call model
model_knn = NearestNeighbors(n_neighbors=100, algorithm='auto', metric='euclidean')

# Fit model
model_knn.fit(X_transformed)


In [71]:
### Trouver la chanson et les knn

def find_song(song_name, artist_name, df, model_knn):
    # Find song index
    song_idx = df.index[(df['title_cleaned'] == song_name) & (df['artist'] == artist_name)].tolist()[0]

    # Choose a song to query by index
    song_transformed = pipe.transform(df.iloc[[song_idx]])

    # Find 20 nearest neighbors (including the song itself)
    distances, indices = model_knn.kneighbors(song_transformed, n_neighbors=101)

    # Exclude the first index if it is the song itself
    neighbor_indices = indices[0][1:]

    # Retrieve metadata for neighbors
    neighbors_df = df.iloc[neighbor_indices][['artist', 'title_cleaned', 'text', 'embedding']]

    # Add searched song
    searched_song_df = df[(df['title_cleaned'] == song_name) & (df['artist'] == artist_name)][['artist', 'title_cleaned', 'embedding']]
    neighbors_df = pd.concat([searched_song_df, neighbors_df], axis=0)

    return neighbors_df

## 2.1 Test functions

In [72]:
neighbors_df = find_song('Money', 'Pink Floyd', df, model_knn)
neighbors_df

Unnamed: 0,artist,title_cleaned,embedding,text
2322,Pink Floyd,Money,"[-0.014121033251285553, 0.019402118399739265, ...",
7309,Pink Floyd,Comfortably Numb,"[-0.01901412010192871, -0.04353988543152809, 0...",Hello? Is there anybody in there? Just nod if ...
4917,Grateful Dead,Lost Sailor,"[0.03301208093762398, -0.021575020626187325, -...",Compass card is spinning Helm is swinging to a...
4923,Grateful Dead,Morning Dew,"[0.008461940102279186, 0.04476252570748329, -0...","Walk me out in the morning dew my honey, Walk ..."
4911,Grateful Dead,Let It Grow,"[0.008294593542814255, -0.004948828835040331, ...","Morning comes, she follows the path to the riv..."
...,...,...,...,...
4140,Def Leppard,Billy's Got A Gun,"[-0.008196419104933739, 0.006126804742962122, ...","Billy's got a gun, he's on the run Confusion i..."
891,Grateful Dead,Scarlet Begonias,"[0.017752934247255325, 0.008365767076611519, -...","As I was walkin' down Grosvenor Square, Not a ..."
3688,The Beatles,Glass Onion,"[-0.003423981834203005, 0.01932707242667675, -...",I told you about strawberry fields You know th...
4880,Grateful Dead,Caution,"[-0.03617602586746216, -0.00026380899362266064...","(do not stop on tracks) I went down one day, I..."


# 3. Find top 3 songs with embeddings

In [73]:
import torch
import pandas as pd
import torch.nn.functional as F
import ast

### Trouver le top plus similaire grâce aux embeddings

def get_top_similar_songs(df, song, artist, top_n=3):

    neighbors_df = find_song(song, artist, df, model_knn)

    # Embedding change
    neighbors_df['embedding'] = neighbors_df['embedding'].apply(ast.literal_eval)

    # recupérer la chanson
    input_song = neighbors_df[(neighbors_df['title_cleaned'] == song) & (neighbors_df['artist'] == artist)]

    # si chanson non trouvée msg d'erreur
    if input_song.empty:
        raise ValueError("Song not found.")

    # récupérer le cluster de la chanson et son embedding
    ## Recupérer la chanson et l'embedding
    song_embedding = torch.tensor(input_song.iloc[0]['embedding'])

    # récupérer les chansons du mm cluster (sauf la chanson input)
    ## Récuperer le neighbors_df short de la fonction get_song
    label_songs = neighbors_df[~((neighbors_df['title_cleaned'] == song) & (neighbors_df['artist'] == artist))]

    # similarité
    def compute_similarity(row):
        emb = torch.tensor(row['embedding'])
        return F.cosine_similarity(song_embedding, emb, dim=0).item()

    label_songs['similarity'] = label_songs.apply(compute_similarity, axis=1)

    # top n songs similaires
    top_songs = label_songs.sort_values(by='similarity', ascending=False).head(top_n)

    return top_songs

## 3.1 Test functions

In [74]:
list(df[df.artist == 'Eric Clapton'].title_cleaned)

['32-20 Blues',
 'After Midnight',
 'Badge',
 'Bell Bottom Blues',
 "Can't Find My Way Home",
 'Cocaine',
 'Come Back Baby',
 'Crossroads',
 'Danger',
 'Dead End Road',
 "Don't Cry Sister",
 "Don't Let Me Be Lonely Tonight",
 'Every Little Thing',
 'Eyesight To The Blind',
 'Find Myself',
 'Five Long Years',
 'Forever Man',
 'Goodnight Irene',
 'I Shot The Sheriff',
 'I Wanna Be',
 'If I Needed Someone',
 'Marry You',
 'Missing Person',
 "Nobody Knows You When You're Down And Out",
 'Back Home',
 'Broken Down',
 "It's Easy",
 'Key To The Highway',
 "Knockin' On Heaven's Door",
 'Last Fair Deal Gone Down',
 'Lay Down Sally',
 'Layla',
 'Let It Grow',
 'Little Wing',
 "Milkcow's Calf Blues",
 'Modern Girl']

In [75]:
get_top_similar_songs(df, "Knockin' On Heaven's Door", 'Eric Clapton', top_n=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_songs['similarity'] = label_songs.apply(compute_similarity, axis=1)


Unnamed: 0,artist,title_cleaned,embedding,text,similarity
4531,Eric Clapton,Modern Girl,"[-0.011848488822579384, -0.002734000561758876,...",[Chorus:] She's a modern girl in a modern worl...,0.63344
7742,Santana,Evil Ways,"[0.0009233258897438645, 0.03372199088335037, -...","You've got to change your evil ways, baby Befo...",0.607493
4739,Free,Heartbreaker,"[0.04407769814133644, -0.03818242996931076, -0...",Well my maker Must've been a hard heartbreaker...,0.604073
3538,Alice Cooper,Deeper,"[0.021550260484218597, 0.013255621306598186, -...","We must complete the trip, try not to lose you...",0.591605
2740,Stevie Ray Vaughan,Cold Shot,"[-0.04151754081249237, -0.023807289078831673, ...","Once was a sweet thang baby, held our love in ...",0.589765


# 4. Find themes through a GenAI agent

In [76]:
import pandas as pd

from langchain.chat_models import init_chat_model
from langchain_core.tools import tool
from langgraph.prebuilt import create_react_agent
from langchain.schema import HumanMessage


# Create a dataframe
df = df

# Get lyrics from dataframe
@tool
def get_lyrics_top_songs(song_title : str, artist_name : str) -> str:
    """ Input a song and artist and get the top 5 songs similar in beat and lyrics.
    Use the artist name and song title in the query as artist_name and song_title """

    top_songs = get_top_similar_songs(df, song_title, artist_name, top_n=5)[['artist', 'title_cleaned', 'text']]
    searched_song = df[(df['title_cleaned'] == song_title) & (df['artist'] == artist_name)][['artist', 'title_cleaned', 'text']]

    songs = pd.concat([searched_song, top_songs], axis=0)

    if songs.empty:
        return f"No songs found for this {artist_name} and {song_title}."

    results = []
    for _, row in songs.iterrows():
        results.append(f"Artist: {row['artist']}\nTitle: {row['title_cleaned']}\nLyrics: {row['text']}\n")
    return "\n".join(results)

# Prompt Gemini model
def model_gemini(song_title, artist_name):

    ### Instantiate Gemini model ###
    model = init_chat_model("gemini-2.0-flash", model_provider="google_genai")

    ### Instantiate variables ###
    # Tools
    tools = [get_lyrics_top_songs]

    # Prompt
    system_prompt = """
        With the name of an artist and a song title as an input use the tool to find the 5 most similar songs based on beats and lyrics.
        Then, analyze the lyrics of all songs and explain the similarities between the lyrics in 5-10 lines.
        Make sure that every time you mention a song, you also mention the artist. """

    ### Create agent
    agent = create_react_agent(model, tools, prompt=system_prompt)

    # Input query
    query = f"Find the top 5 similar songs to {artist_name}'s {song_title}. Summarize why the lyrics are similar"

    # Get response
    response = agent.invoke({"messages": [HumanMessage(content=query)]})

    return response["messages"][-1].content


## 4.1 Test function

In [77]:
model_gemini("Knockin' On Heaven's Door", 'Eric Clapton')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_songs['similarity'] = label_songs.apply(compute_similarity, axis=1)


'The lyrics of "Knockin\' On Heaven\'s Door" by Eric Clapton, along with the other identified songs, share themes of struggle, despair, and transformation. Eric Clapton\'s "Knockin\' On Heaven\'s Door" expresses a sense of resignation and plea for release. Similarly, Free\'s "Heartbreaker" conveys emotional pain and a desire for a fresh start. Alice Cooper\'s "Deeper" uses metaphorical descent to represent inner turmoil. Santana\'s "Evil Ways" speaks of relationship troubles and a need for change, while Stevie Ray Vaughan\'s "Cold Shot" reflects on lost love and emotional coldness. Lastly, Eric Clapton\'s "Modern Girl" explores themes of resilience and navigating a challenging world.'