In [1]:
import pandas as pd
from IPython.display import Markdown
import re

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Reload dotenv in notebook
from dotenv import load_dotenv
dot_env_loaded = load_dotenv()
dot_env_loaded

True

# 1. Lyrics Database

In [4]:
lyrics_df_all = pd.read_csv('../raw_data/20250609_17k_lyrics_eng_fr.csv')

In [5]:
lyrics_df_all.drop(columns='Unnamed: 0', inplace=True)
lyrics_df_all.reset_index(drop=True, inplace=True)

In [6]:
lyrics_df_all.isna().sum()

artist                     0
track_title_clean          0
lyrics_clean               3
artist_lower               0
track_title_clean_lower    0
dtype: int64

In [7]:
lyrics_df_all.groupby('artist').count()['lyrics_clean'].sort_values(ascending=False).head(50)

artist
Booba                       90
Marilyn Manson              89
Tim McGraw                  89
Rascal Flatts               89
Kenny Chesney               86
Grateful Dead               85
Shy'm                       85
George Strait               83
Phish                       83
Muse                        80
Christine and the queens    78
Kery James                  78
Def Leppard                 73
Lionel Richie               73
Jenifer                     72
Carla Bruni                 72
Lara Fabian                 71
Rush                        70
Gradur                      69
Lefa                        68
Zazie                       68
Aya Nakamura                68
Weezer                      68
PNL                         68
Slayer                      67
Korn                        67
Norah Jones                 67
Foo Fighters                66
Céline Dion                 66
Hamza                       66
Lynyrd Skynyrd              66
Megadeth                    65
M

In [8]:
lyrics_df_all.head()

Unnamed: 0,artist,track_title_clean,lyrics_clean,artist_lower,track_title_clean_lower
0,ABBA,"Andante, Andante","Take it easy with me, please Touch me gently l...",abba,"andante, andante"
1,ABBA,As Good As New,I'll never know why I had to go Why I had to p...,abba,as good as new
2,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,abba,bang-a-boomerang
3,ABBA,Cassandra,Down in the street they're all singing and sho...,abba,cassandra
4,ABBA,Chiquitita,"Chiquitita, tell me what's wrong You're enchai...",abba,chiquitita


In [9]:
lyrics_df_all.to_csv('../raw_data/data_17k_lyrics.csv', index=False)

In [10]:
df = pd.read_csv('../raw_data/data_17k_lyrics.csv')
df

Unnamed: 0,artist,track_title_clean,lyrics_clean,artist_lower,track_title_clean_lower
0,ABBA,"Andante, Andante","Take it easy with me, please Touch me gently l...",abba,"andante, andante"
1,ABBA,As Good As New,I'll never know why I had to go Why I had to p...,abba,as good as new
2,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,abba,bang-a-boomerang
3,ABBA,Cassandra,Down in the street they're all singing and sho...,abba,cassandra
4,ABBA,Chiquitita,"Chiquitita, tell me what's wrong You're enchai...",abba,chiquitita
...,...,...,...,...,...
17299,Therapie Taxi,Candide Crush,"Je suis si sage-sage d'habitude, mon chéri Com...",therapie taxi,candide crush
17300,Therapie Taxi,J'en ai marre,Et j'en ai marre de toi Je sais c'est dur mais...,therapie taxi,j'en ai marre
17301,Therapie Taxi,Avec ta zouz,"N'aie pas le seum, fais-moi la bise Ce soir c'...",therapie taxi,avec ta zouz
17302,Minuit,Flash,"Lumière crue, lumière sans, flash! Étrange sen...",minuit,flash


## 1.1 Clean lyrics

In [11]:
lyrics_df_all

Unnamed: 0,artist,track_title_clean,lyrics_clean,artist_lower,track_title_clean_lower
0,ABBA,"Andante, Andante","Take it easy with me, please Touch me gently l...",abba,"andante, andante"
1,ABBA,As Good As New,I'll never know why I had to go Why I had to p...,abba,as good as new
2,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,abba,bang-a-boomerang
3,ABBA,Cassandra,Down in the street they're all singing and sho...,abba,cassandra
4,ABBA,Chiquitita,"Chiquitita, tell me what's wrong You're enchai...",abba,chiquitita
...,...,...,...,...,...
17299,Therapie Taxi,Candide Crush,"Je suis si sage-sage d'habitude, mon chéri Com...",therapie taxi,candide crush
17300,Therapie Taxi,J'en ai marre,Et j'en ai marre de toi Je sais c'est dur mais...,therapie taxi,j'en ai marre
17301,Therapie Taxi,Avec ta zouz,"N'aie pas le seum, fais-moi la bise Ce soir c'...",therapie taxi,avec ta zouz
17302,Minuit,Flash,"Lumière crue, lumière sans, flash! Étrange sen...",minuit,flash


In [12]:
lyrics_df_all.dropna(inplace=True)

In [13]:
import re

def no_backslash(lyrics):
    lyrics = lyrics.replace('\\', '')
    return lyrics


In [14]:
lyrics_df_all['lyrics_clean'] = lyrics_df_all['lyrics_clean'].apply(no_backslash)

In [15]:
lyrics_df_all[(lyrics_df_all.artist == 'Booba') & (lyrics_df_all.track_title_clean == 'Bénigni')]['lyrics_clean']

16194    Avant, j'voulais la plus sexy, maintenant j've...
Name: lyrics_clean, dtype: object

# 3. Download Gemini model

In [16]:
import os
google_api_key_exists = 'GOOGLE_API_KEY' in os.environ
google_api_key_exists

True

## 3.1 Import GenAI Gemini model

In [17]:
from langchain.chat_models import init_chat_model

model = init_chat_model("gemini-2.0-flash", model_provider="google_genai")

## 3.2 Create a tool to get lyrics

In [18]:
from langchain_core.tools import tool

@tool
def get_lyrics(artist_name : str) -> str:
    """ Get song titles and lyrics of a specific artist's name.
    Use the artist name in the query as artist_name """
    songs = df[df['artist'].isin([artist_name])]
    if songs.empty:
        return "No songs found for this artist."

    results = []
    for _, row in songs.iterrows():
        results.append(f"Title: {row['track_title_clean']}\nLyrics: {row['lyrics_clean']}\n")
    return "\n".join(results)

In [19]:
tools = [get_lyrics]

## 3.3 Create a system prompt

In [20]:
system_prompt = """
    With the name of an artist as an input, and the lyrics of their songs from the get_lyrics tool,
    you are tasked to summarize the main themes in their lyrics. The output would follow the following format:
    Here are the 5 main themes in Adele's song “Chasing Pavements” with quotes to illustrate each:
    1. Uncertainty and Doubt
        The song revolves around the question of whether to keep pursuing a relationship or give up. This doubt is clear in the repeated lines:
        “Should I give up? Or should I just keep chasing pavements? Even if it leads nowhere.”

    2. Love vs. Rationality
        Adele expresses the tension between emotional conviction and logical thinking. She knows it’s love but questions if it’s worth continuing:
        “This ain't lust, I know this is love, but if I tell the world I'll never say enough.”

    3. Fear of Waste and Futility
        There is a strong fear that continuing the pursuit might be pointless or a waste of effort:
        “Or would it be a waste? Even if I knew my place, should I leave it there?”

    4. Hope and Perseverance
        Despite doubts, she shows a desire to keep trying, to keep “chasing pavements,” hoping for a positive outcome even if the path is unclear:
        “I build myself up and fly around in circles, waiting as my heart drops.”

    5. Self-reflection and Decision-Making
        The song is about introspection and the difficult process of making a decision about love and whether to keep going or move on:
        “I've made up my mind, don't need to think it over, if I'm wrong, I am right.”

    These themes together create a nuanced emotional landscape about love, uncertainty, and the struggle between hope and letting go.
    """

## 3.4 Create an agent executor

### 3.4.1 Test Jule's version

In [21]:
# Import
from langgraph.prebuilt import create_react_agent

# Create the `agent_executor` with a model and tools
agent_executor = create_react_agent(model, tools)

In [22]:
from langchain_core.messages import HumanMessage

query = "Summarize booba's songs"
response = agent_executor.invoke(
    {"messages": [HumanMessage(content=query)]}
)
response["messages"][-1].content

'Sorry, I can not summarize the songs of the artist Booba, because I need the ability to access the song titles and lyrics of the artist first. The available tools do not have the ability to summarize songs.'

### 3.4.2 Test upgraded version

In [23]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.agents import initialize_agent, AgentType
from langchain.schema import HumanMessage

In [24]:
# Initialize the agent with your tool and model
agent_executor = initialize_agent(
    tools,
    model,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True
)

artist = 'Adele'

query = f"Summarize the top 5 main themes in {artist}'s lyrics. For each theme please quote some lyrics"
response = agent_executor.run(query)
Markdown(response)

  agent_executor = initialize_agent(
  response = agent_executor.run(query)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to analyze Adele's lyrics to identify the top 5 recurring themes and provide supporting quotes. I can use the `get_lyrics` tool to access her lyrics.
Action: get_lyrics
Action Input: Adele[0m
Observation: [36;1m[1;3mTitle: All I Ask
Lyrics: [Verse 1] I will leave my heart at the door I won't say a word They've all been said before, you know So why don't we just play pretend Like we're not scared of what's coming next Or scared of having nothing left [Pre-Chorus] Look, don't get me wrong I know there is no tomorrow All I ask is [Chorus] If this is my last night with you Hold me like I'm more than just a friend Give me a memory I can use Take me by the hand while we do what lovers do It matters how this ends Cause what if I never love again? [Verse 2] I don't need your honesty It's already in your eyes and I'm sure my eyes, they speak for me No one knows me like you do And since you're the only one that matters, tell 

The top 5 main themes in Adele's lyrics are: Heartbreak and Loss, Regret and Apology, Longing and Yearning, Moving On and Forgiveness, and Nostalgia and Memories. I have provided example lyrics for each theme.

## 3.5 Prompt AI agent executor

In [25]:
system_prompt = """
With the name of an artist as an input, and the lyrics of their songs from the get_lyrics tool,
you are tasked to summarize the top 5 themes in their lyrics.

Below is an example of the format to use for the response:

    Here are the 5 main themes in Adele's songs:

    1. Heartbreak and Loss
        Adele often sings about the intense pain and sorrow that come with the end of a relationship.
        “Go easy on me, I was still a child” (Easy On Me)
        “Baby, let the water wash away all our tears” (Water Under the Bridge)
        “Never mind, I’ll find someone like you” (Someone Like You)

    2. Regret and Remorse
        Many of her songs reflect on past mistakes and express regret over things said or done in relationships.
        “Hello, can you hear me? I’m in California dreaming about who we used to be” (Hello)
        “I should have treated you right” (Take It All)
        “I regret the things I never said” (Remedy)

    3. Resilience and Moving On
        Despite the pain, Adele's lyrics often show a strength and determination to overcome heartbreak and move forward.
        “I’m gonna make it through” (Make You Feel My Love)
        “We could have had it all, rolling in the deep” (Rolling in the Deep)
        “I’ve gotta let go of us” (Love In The Dark)

    4. Longing and Yearning
        A strong sense of longing for a lost love or a past relationship is a recurring theme in her music.
        “When we were young, the world was so much brighter” (When We Were Young)
        "Don't forget me, I beg, I remember you said, Sometimes it lasts in love, but sometimes it hurts instead" (Someone Like You)
        “Oh, how the time flies, as we get older” (To Be Loved)

    5. Self-Reflection and Growth
        Adele's songs often involve introspection and a journey of self-discovery and personal growth.
        “I’m not the girl I used to be” (Million Years Ago)
        “I’m trying to find myself” (Send My Love (To Your New Lover))
        “I’ve changed my mind, I’ll live and learn” (Set Fire to the Rain)

    These themes combine to create a rich tapestry of emotions, exploring the complexities of love, loss, and personal growth.
"""

In [26]:
agent_2 = create_react_agent(model, tools, prompt=system_prompt)

In [27]:
artist = 'Bob Marley'

query = f"Summarize the top 5 themes of {artist} lyrics, explain them in one line. For each theme quote 3 different lyrics and put the name of the song in parentheses"

response = agent_2.invoke({"messages": [HumanMessage(content=query)]})

response["messages"][-1].content


'Here are the 5 main themes in Bob Marley\'s songs:\n\n1.  **Social Justice and Revolution**\n    Marley uses his music to advocate for equality, justice, and the liberation of oppressed people.\n    "Get up, stand up, stand up for your rights" (Get Up, Stand Up)\n    "We refuse to be what you wanted us to be" (Babylon System)\n    "Emancipate yourselves from mental slavery" (Redemption Song)\n\n2.  **Love and Relationships**\n    Many of his songs explore the complexities of love, relationships, and the importance of unity.\n    "One love, one heart, let\'s get together and feel alright" (One Love)\n    "Could you be loved and be loved?" (Could You Be Loved)\n    "Is this love, is this love, is this love that I\'m feeling?" (Is This Love)\n\n3.  **Spirituality and Rastafarianism**\n    Marley\'s faith and spiritual beliefs are central themes, promoting peace, love, and connection with Jah (God).\n    "Jah live, children, yeah!" (Exodus)\n    "We know where we\'re going, we know where 

In [28]:
query = f"Summarize the top 5 themes of {artist} lyrics, explain them in one line. For each theme quote 3 different lyrics and put the name of the song in parentheses"

response = agent_2.invoke({"messages": [HumanMessage(content=query)]})
response['messages'][-1].pretty_print()


Here are the 5 main themes in Bob Marley's songs:

1.  **Social Justice and Resistance**
    Marley's lyrics frequently call for equality, justice, and resistance against oppression.
    "Get up, stand up, stand up for your right" (Get Up, Stand Up)
    "We refuse to be what you wanted us to be" (Babylon System)
    "Emancipate yourselves from mental slavery" (Redemption Song)

2.  **Love and Unity**
    Many of his songs emphasize the importance of love, peace, and unity among all people.
    "One love, one heart, let's get together and feel alright" (One Love)
    "Let's unite to fight this holy Armageddon" (Iron Lion Zion)
    "Love would never leave us alone" (Is This Love)

3.  **Spirituality and Rastafarianism**
    Marley's deep faith and connection to Rastafarianism are prominent, with references to Jah (God), Zion, and spiritual liberation.
    "Jah live, children" (Exodus)
    "We're the survivors, yes, the black survivors" (Black Survivors)
    "Open your eyes and look with

In [29]:
df[df.artist == 'Pink Floyd']

Unnamed: 0,artist,track_title_clean,lyrics_clean,artist_lower,track_title_clean_lower
2321,Pink Floyd,Eclipse,All that you touch All that you see All that y...,pink floyd,eclipse
2322,Pink Floyd,Money,"Money, get away Get a good job with more pay a...",pink floyd,money
2323,Pink Floyd,Wish You Were Here,"So, so you think you can tell Heaven from Hell...",pink floyd,wish you were here
7306,Pink Floyd,Arnold Layne,Arnold Layne had a strange hobby Collecting cl...,pink floyd,arnold layne
7307,Pink Floyd,Astronomy Domine,"Lime and limpid green, a second scene A fight ...",pink floyd,astronomy domine
7308,Pink Floyd,Brain Damage,The lunatic is on the grass. The lunatic is on...,pink floyd,brain damage
7309,Pink Floyd,Comfortably Numb,Hello? Is there anybody in there? Just nod if ...,pink floyd,comfortably numb
7310,Pink Floyd,Hey You,Hey you out there in the cold Getting lonely g...,pink floyd,hey you
7311,Pink Floyd,High Hopes,Beyond the horizon of the place we lived when ...,pink floyd,high hopes
7312,Pink Floyd,Jugband Blues,It's awfully considerate of you to think of me...,pink floyd,jugband blues


# 4. Exploration

In [30]:
import pandas as pd

from langchain.chat_models import init_chat_model
from langchain_core.tools import tool
from langgraph.prebuilt import create_react_agent
from langchain.schema import HumanMessage


# Get lyrics from dataframe
@tool
def get_lyrics(artist_name : str) -> str:
    """ Get song titles and lyrics of a specific artist's name.
    Use the artist name in the query as artist_name """
    songs = df[df['artist'].isin([artist_name])]
    if songs.empty:
        return f"No songs found for this {artist}."

    results = []
    for _, row in songs.iterrows():
        results.append(f"Title: {row['track_title_clean']}\nLyrics: {row['lyrics_clean']}\n")
    return "\n".join(results)

# Prompt Gemini model
def model_gemini(artist):

    ### Instantiate Gemini model ###
    model = init_chat_model("gemini-2.0-flash", model_provider="google_genai")

    ### Instantiate variables ###
    # Tools
    tools = [get_lyrics]

    # Prompt
    system_prompt = """
        With the name of an artist as an input, and the lyrics of their songs from the get_lyrics tool,
        you are tasked to summarize the top 5 themes in their lyrics. The answer will be two parts: text and a json.

        For the text format: Below is an example of the  format to use, please keep the same format:

            Here are the 5 main themes in Adele's songs:

            1. Heartbreak and Loss
                Adele often sings about the intense pain and sorrow that come with the end of a relationship.
                “Go easy on me, I was still a child” (Easy On Me)
                “Baby, let the water wash away all our tears” (Water Under the Bridge)
                “Never mind, I’ll find someone like you” (Someone Like You)

            2. Regret and Remorse
                Many of her songs reflect on past mistakes and express regret over things said or done in relationships.
                “Hello, can you hear me? I’m in California dreaming about who we used to be” (Hello)
                “I should have treated you right” (Take It All)
                “I regret the things I never said” (Remedy)

            3. Resilience and Moving On
                Despite the pain, Adele's lyrics often show a strength and determination to overcome heartbreak and move forward.
                “I’m gonna make it through” (Make You Feel My Love)
                “We could have had it all, rolling in the deep” (Rolling in the Deep)
                “I’ve gotta let go of us” (Love In The Dark)

            4. Longing and Yearning
                A strong sense of longing for a lost love or a past relationship is a recurring theme in her music.
                “When we were young, the world was so much brighter” (When We Were Young)
                "Don't forget me, I beg, I remember you said, Sometimes it lasts in love, but sometimes it hurts instead" (Someone Like You)
                “Oh, how the time flies, as we get older” (To Be Loved)

            5. Self-Reflection and Growth
                Adele's songs often involve introspection and a journey of self-discovery and personal growth.
                “I’m not the girl I used to be” (Million Years Ago)
                “I’m trying to find myself” (Send My Love (To Your New Lover))
                “I’ve changed my mind, I’ll live and learn” (Set Fire to the Rain)

            These themes combine to create a rich tapestry of emotions, exploring the complexities of love, loss, and personal growth.

        For the json: please share the list artist, themes and unique songs you quoted as below
                {'artist' : 'Adele',
                'themes' : ['Heartbreak and Loss', 'Regret and Remorse', 'Resilience and Moving On', 'Longing and Yearning', 'Self-Reflection and Growth']
                'songs' : ['Easy On Me', 'Water Under the Bridge', 'Someone Like You', 'Hello', 'Take It All', 'Remedy', 'Make You Feel My Love', 'Rolling in the Deep', 'Love In The Dark', 'When We Were Young', 'To Be Loved', 'Million Years Ago', 'Send My Love (To Your New Lover)', 'Set Fire to the Rain']}
    """

    ### Create agent
    agent = create_react_agent(model, tools, prompt=system_prompt)

    # Input query
    query = f"Summarize the top 5 themes of {artist} lyrics, explain them in one line. For each theme quote 3 different lyrics and put the name of the song in parentheses"

    # Get response
    response = agent.invoke({"messages": [HumanMessage(content=query)]})

    return print(response["messages"][-1].content)

In [31]:
df.artist.unique()

array(['ABBA', 'Adele', 'Aerosmith', 'Alabama', 'Alice Cooper',
       'Alice In Chains', 'Allman Brothers Band', 'Ariana Grande',
       'Avril Lavigne', 'Backstreet Boys', 'The Beatles', 'Bee Gees',
       'Billie Holiday', 'Billy Joel', 'Black Sabbath', 'Bob Dylan',
       'Bob Seger', 'Bon Jovi', 'Bonnie Raitt', 'Britney Spears',
       'Bruce Springsteen', 'Bruno Mars', 'Chaka Khan', 'Cheap Trick',
       'Chris Brown', 'Christina Aguilera', 'Chuck Berry', 'Cinderella',
       'Coldplay', 'Counting Crows', 'Culture Club', 'Dave Matthews Band',
       'David Guetta', 'Death', 'Deep Purple', 'Def Leppard',
       'Demi Lovato', 'Depeche Mode', 'Diana Ross', 'Don Henley',
       'Don McLean', 'Donna Summer', 'Drake', 'Dream Theater',
       'Ed Sheeran', 'Eddie Cochran', 'Ella Fitzgerald', 'Ellie Goulding',
       'Elton John', 'Elvis Costello', 'Elvis Presley', 'Eminem',
       'Emmylou Harris', 'Enigma', 'Enrique Iglesias', 'Eric Clapton',
       'Etta James', 'Evanescence', 'Everc

In [32]:
response = model_gemini('PNL')
response

Here are the 5 main themes in PNL's songs:

1.  Struggle and Hustle
    PNL frequently explores the themes of street life, drug dealing, and the daily struggles to survive in the "jungle" of their environment.
    “Au DD J\'la passe, la détaille, la pé-cou, la vi-sser, des regrets d\'vant ton bébé” (Au DD)
    “J\'suis dans mon monde j\'ai ma bouée, j\'ai ma p\'tite niaks\' Mon playback sur Acadian” (Hasta la vista)
    “J\'ai que des tics (sniff), à l\'odeur de l\'argent (j\'comptais) J\'me cache, j\'compte cash le cash dans la chambre” (Blanka)

2.  Brotherhood and Loyalty
    The bond between the two brothers, Ademo and N.O.S, and their loyalty to their crew (QLF - Que La Famille) are central themes.
    “Papa nous a cogné tête contre tête, nous a dit: "J'veux un amour en fer J'veux personne entre vous, même pas moi, même pas les anges de l'Enfer"” (Deux Frères)
    “Que la mif, que la mif Que la mif, que la mif Rien n\'a changé dans ma putain d\'tête” (Autre Monde)
    “Rien de mie

In [33]:
df[df.artist == 'Therapie Taxi']

Unnamed: 0,artist,track_title_clean,lyrics_clean,artist_lower,track_title_clean_lower
17298,Therapie Taxi,Hit Sale,"Y'a les phrases que tu dis, les phrases de mec...",therapie taxi,hit sale
17299,Therapie Taxi,Candide Crush,"Je suis si sage-sage d'habitude, mon chéri Com...",therapie taxi,candide crush
17300,Therapie Taxi,J'en ai marre,Et j'en ai marre de toi Je sais c'est dur mais...,therapie taxi,j'en ai marre
17301,Therapie Taxi,Avec ta zouz,"N'aie pas le seum, fais-moi la bise Ce soir c'...",therapie taxi,avec ta zouz


In [34]:
df.groupby('artist').count()

Unnamed: 0_level_0,track_title_clean,lyrics_clean,artist_lower,track_title_clean_lower
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ABBA,57,57,57,57
Aaron,41,41,41,41
Abou Tall,21,21,21,21
Achile,3,3,3,3
Adamo,23,23,23,23
...,...,...,...,...
Zero 7,24,24,24,24
Zeromancer,17,17,17,17
Ziak,35,35,35,35
Zola,1,1,1,1


In [35]:
df.groupby('artist').count()['lyrics_clean'][df.groupby('artist').count()['lyrics_clean'] >= 3]

artist
ABBA          57
Aaron         41
Abou Tall     21
Achile         3
Adamo         23
              ..
Zebda         37
Zero 7        24
Zeromancer    17
Ziak          35
Édith Piaf     9
Name: lyrics_clean, Length: 567, dtype: int64

# 5. Exploration KNN

## 5.1 Numerical features

In [None]:
df_9k_numerical = pd.read_csv('../raw_data/20250609_9k_spotify_metrics_lyrics_data.csv')
df_9k_numerical.drop(columns=['link', 'track_id', 'Unnamed: 0', 'text', 'song'], inplace=True)

In [None]:
df_9k_numerical = pd.read_csv('../raw_data/20250609_9k_spotify_metrics_lyrics_data.csv')

In [None]:
df_9k_numerical.drop(columns=['link', 'track_id', 'Unnamed: 0', 'text', 'song'], inplace=True)

In [46]:
df_9k_numerical.head()

Unnamed: 0,artist,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,title_cleaned
0,ABBA,32,2014,swedish,0.277,0.479,10,-7.798,1,0.0282,0.587,0.000282,0.0745,0.367,202.887,280427,3,"Andante, Andante"
1,ABBA,30,2014,swedish,0.408,0.727,5,-11.898,1,0.0556,0.0305,0.0,0.777,0.662,148.697,206080,4,As Good As New
2,ABBA,21,2012,swedish,0.498,0.585,5,-8.67,1,0.0289,0.00723,1.3e-05,0.342,0.841,131.61,184880,4,Bang-A-Boomerang
3,ABBA,29,2014,swedish,0.38,0.559,0,-6.489,1,0.0362,0.604,0.0,0.114,0.495,159.803,292027,3,Cassandra
4,ABBA,27,2012,swedish,0.517,0.433,9,-12.984,1,0.0321,0.716,5e-06,0.308,0.369,84.256,327867,4,Chiquitita


### 5.1.2 Set up processing pipeline

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

In [68]:
# Categorize columns
categ_columns = ['genre']
num_columns = ['popularity', 'year', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature']

# Call encoders and scalers
ohe = OneHotEncoder(sparse_output=False)
minmax = MinMaxScaler()

# Make encoding pipeline
pipe = make_column_transformer(
    (ohe, categ_columns),
    (minmax, num_columns),
    remainder='drop'
).set_output(transform="pandas")

#Transform df
X_transformed = pipe.fit_transform(df_9k_numerical)

In [70]:
X_transformed.head()

Unnamed: 0,onehotencoder__genre_acoustic,onehotencoder__genre_alt-rock,onehotencoder__genre_black-metal,onehotencoder__genre_blues,onehotencoder__genre_classical,onehotencoder__genre_country,onehotencoder__genre_dance,onehotencoder__genre_dancehall,onehotencoder__genre_death-metal,onehotencoder__genre_disco,...,minmaxscaler__loudness,minmaxscaler__mode,minmaxscaler__speechiness,minmaxscaler__acousticness,minmaxscaler__instrumentalness,minmaxscaler__liveness,minmaxscaler__valence,minmaxscaler__tempo,minmaxscaler__duration_ms,minmaxscaler__time_signature
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.707873,1.0,0.00632,0.590543,0.000284,0.059738,0.353327,0.913523,0.147828,0.5
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.557387,1.0,0.037803,0.030684,0.0,0.773443,0.660523,0.590539,0.103139,0.75
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.675867,1.0,0.007124,0.007274,1.3e-05,0.331505,0.846923,0.488696,0.090396,0.75
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.755919,1.0,0.015512,0.607646,0.0,0.099868,0.486619,0.656733,0.154801,0.5
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.517526,1.0,0.010801,0.720322,5e-06,0.296962,0.35541,0.206456,0.176344,0.75


In [None]:
# Call model
model_knn = NearestNeighbors(n_neighbors=100, algorithm='auto', metric='euclidean')

# Fit model
model_knn.fit(X_transformed)

In [None]:
""" Rajouter artist name """

def find_song(song_name):
    # Find song index
    song_idx = df_9k_numerical.index[df_9k_numerical['title_cleaned'] == song_name].tolist()[0]

    # Choose a song to query by index
    song_transformed = pipe.transform(df_9k_numerical.iloc[[song_idx]])

    # Find 20 nearest neighbors (including the song itself)
    distances, indices = model_knn.kneighbors(song_transformed, n_neighbors=101)

    # Exclude the first index if it is the song itself
    neighbor_indices = indices[0][1:]

    # Retrieve metadata for neighbors
    neighbors_metadata = df_9k_numerical.iloc[neighbor_indices][['artist', 'title_cleaned']]

    return neighbors_metadata

In [99]:
find_song('Believe In Yourself').head(50)

Unnamed: 0,artist,title_cleaned
4723,Frankie Valli,Fallen Angel
3715,Bee Gees,I Started A Joke
452,Diana Ross,More Today Than Yesterday
3927,Chaka Khan,I Feel For You
6220,Lionel Richie,You Are
6207,Lionel Richie,My Love
3722,Bee Gees,Massachusetts
165,Bee Gees,Jive Talkin'
1623,Lionel Richie,Out Of My Head
4724,Frankie Valli,Grease


## 5.2 Embedded lyrics

In [39]:
df_9k_embedded = pd.read_csv('../raw_data/20250611_ft3_lyrics_clustered_with_embeddings.csv')

In [102]:
df_9k_embedded

Unnamed: 0,artist,title_cleaned,text,label,embedding,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,...,embedding_758,embedding_759,embedding_760,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767
0,Yellowcard,Miles Apart,If I could I would do all of this again Travel...,3,"[0.019293159246444702, 0.02275143377482891, -0...",0.019293,0.022751,-0.037651,0.027542,0.058273,...,-0.042401,-0.030389,-0.014984,-0.028456,0.019748,-0.002454,0.071282,0.040992,0.001366,0.036016
1,Roy Orbison,Coming Home,It's a long and lonely highway. When you're tr...,8,"[-0.01818583346903324, 0.01762726716697216, -0...",-0.018186,0.017627,-0.044172,0.001227,0.019453,...,0.022084,-0.044688,-0.014683,0.031072,-0.007822,0.001406,0.035424,0.011209,-0.002338,0.003329
2,John Legend,Tomorrow,With good understanding and a lot of patience ...,7,"[-0.0030277669429779053, 0.031162232160568237,...",-0.003028,0.031162,-0.078237,0.018235,0.033328,...,-0.005760,-0.054076,-0.021342,-0.027279,0.004642,-0.016758,0.070787,0.000192,0.016111,0.016156
3,Guided By Voices,I Am Columbus,Gather around See is mine Disappear that man O...,6,"[-0.005865388549864292, 0.05365801602602005, -...",-0.005865,0.053658,-0.008006,0.013593,0.026096,...,-0.040121,-0.018065,-0.037919,0.019524,0.006835,-0.037575,0.055972,-0.021865,0.012754,0.000006
4,OneRepublic,Counting Stars,"Lately I've been, I've been losing sleep Dream...",9,"[-0.03060612827539444, 0.019974103197455406, -...",-0.030606,0.019974,-0.041886,0.032175,0.007943,...,-0.040384,-0.028813,0.007124,-0.040075,0.003795,-0.001913,0.049674,0.032278,0.002382,-0.014442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8885,Roy Orbison,Lana,"Zuhm-ma zhum-ma-zhum-ma, Zuhm-ma zhum-ma-zhum-...",1,"[-0.005385252181440592, 0.02448173053562641, -...",-0.005385,0.024482,-0.073894,-0.005794,0.056154,...,0.022459,-0.043896,-0.065906,-0.009375,0.008410,-0.020645,0.023039,0.029706,-0.004498,-0.001945
8886,Marilyn Manson,Burning Flag,"They wanna sell it out, buy it up And dumb it ...",5,"[0.0001660230482229963, 0.01828096993267536, -...",0.000166,0.018281,-0.023797,-0.007704,0.016762,...,0.006197,-0.004927,-0.022333,-0.012346,0.008672,0.004688,0.053972,0.022368,-0.008053,0.011309
8887,Bob Dylan,A Hard Rain's A-Gonna Fall,"Oh, where have you been, my blue-eyed son? And...",6,"[0.036018699407577515, -0.012850928120315075, ...",0.036019,-0.012851,-0.056249,-0.029555,0.001914,...,0.031773,0.055180,0.008137,0.024572,-0.002618,-0.023996,0.081041,-0.009193,0.000283,-0.020081
8888,Lynyrd Skynyrd,Double Trouble,"Eleven times I been busted, eleven times I bee...",8,"[-0.026216328144073486, 0.0233945082873106, -0...",-0.026216,0.023395,-0.026408,-0.030484,0.010623,...,-0.013221,-0.000224,0.013485,-0.037816,0.018382,0.008729,0.039519,0.007946,0.013991,0.048969


## 5.3 Merge numerical and embedded dataframes

In [195]:
df_9k_numerical = pd.read_csv('../raw_data/20250609_9k_spotify_metrics_lyrics_data.csv')
df_9k_numerical.drop(columns=['link', 'track_id', 'Unnamed: 0', 'song'], inplace=True)

df_9k_embedded = pd.read_csv('../raw_data/20250611_ft3_lyrics_clustered_with_embeddings.csv')

df_9k_all = df_9k_numerical.merge(df_9k_embedded, on=['artist', 'title_cleaned'], how='inner')

In [199]:
df_9k_all.text_x

0       Take it easy with me, please Touch me gently l...
1       I'll never know why I had to go Why I had to p...
2       Making somebody happy is a question of give an...
3       Down in the street they're all singing and sho...
4       Chiquitita, tell me what's wrong You're enchai...
                              ...                        
8885    Cocaine flame in my bloodstream Sold my coat w...
8886    Spent the night with a friend of mine and a ha...
8887    (A throne in Heaven sat empty for 33 years.) W...
8888    We are a new creation, the old has gone, the n...
8889    Once again to strive, to beat it out Over and ...
Name: text_x, Length: 8890, dtype: object

### 5.3.1 KNN song function

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

### Définir le modèle knn + le fitter

def knn_model(df):
    # Categorize columns
    categ_columns = ['genre']
    num_columns = ['popularity', 'year', 'danceability', 'energy',
        'key', 'loudness', 'mode', 'speechiness', 'acousticness',
        'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
        'time_signature']

    # Call encoders and scalers
    ohe = OneHotEncoder(sparse_output=False)
    minmax = MinMaxScaler()

    # Make encoding pipeline
    pipe = make_column_transformer(
        (ohe, categ_columns),
        (minmax, num_columns),
        remainder='drop'
    ).set_output(transform="pandas")

    #Transform df
    X_transformed = pipe.fit_transform(df_9k_numerical)

    # Call model
    model_knn = NearestNeighbors(n_neighbors=100, algorithm='auto', metric='euclidean')

    # Fit model
    model_knn.fit(X_transformed)

    return model_knn

In [None]:
### Trouver la chanson et les knn

def find_song(song_name, artist_name, df, model_knn):
    # Find song index
    song_idx = df.index[(df['title_cleaned'] == song_name) & (df['artist'] == artist_name)].tolist()[0]

    # Choose a song to query by index
    song_transformed = pipe.transform(df.iloc[[song_idx]])

    # Find 20 nearest neighbors (including the song itself)
    distances, indices = model_knn.kneighbors(song_transformed, n_neighbors=101)

    # Exclude the first index if it is the song itself
    neighbor_indices = indices[0][1:]

    # Retrieve metadata for neighbors
    neighbors_df = df.iloc[neighbor_indices][['artist', 'title_cleaned', 'text_x', 'embedding']]

    # Add searched song
    searched_song_df = df[(df['title_cleaned'] == song_name) & (df['artist'] == artist_name)][['artist', 'title_cleaned', 'embedding']]
    neighbors_df = pd.concat([searched_song_df, neighbors_df], axis=0)

    return neighbors_df

### 5.3.2 Get top songs, cosine similarity

In [None]:
import torch
import pandas as pd
import torch.nn.functional as F
import ast

### Trouver le top plus similaire grâce aux embeddings

def get_top_similar_songs(df, song, artist, top_n=3):

    model_knn = knn_model(df)

    neighbors_df = find_song(song, artist, df, model_knn)

    # Embedding change
    neighbors_df['embedding'] = neighbors_df['embedding'].apply(ast.literal_eval)

    # recupérer la chanson
    input_song = neighbors_df[(neighbors_df['title_cleaned'] == song) & (neighbors_df['artist'] == artist)]

    # si chanson non trouvée msg d'erreur
    if input_song.empty:
        raise ValueError("Song not found.")

    # récupérer le cluster de la chanson et son embedding
    ## Recupérer la chanson et l'embedding
    song_embedding = torch.tensor(input_song.iloc[0]['embedding'])

    # récupérer les chansons du mm cluster (sauf la chanson input)
    ## Récuperer le neighbors_df short de la fonction get_song
    label_songs = neighbors_df[~((neighbors_df['title_cleaned'] == song) & (neighbors_df['artist'] == artist))]

    # similarité
    def compute_similarity(row):
        emb = torch.tensor(row['embedding'])
        return F.cosine_similarity(song_embedding, emb, dim=0).item()

    label_songs['similarity'] = label_songs.apply(compute_similarity, axis=1)

    # top n songs similaires
    top_songs = label_songs.sort_values(by='similarity', ascending=False).head(top_n)

    return top_songs

In [212]:
df_9k_all[df_9k_all.artist == "Pink Floyd"]['title_cleaned']

2321                       Eclipse
2322                         Money
2323            Wish You Were Here
7306                  Arnold Layne
7307              Astronomy Domine
7308                  Brain Damage
7309              Comfortably Numb
7310                       Hey You
7311                    High Hopes
7312                 Jugband Blues
7313                  Keep Talking
7314                 Run Like Hell
7315                See Emily Play
7316    The Fletcher Memorial Home
7317                   Us And Them
Name: title_cleaned, dtype: object

In [214]:
get_top_similar_songs(df_9k_all, 'Money', 'Pink Floyd', top_n=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_songs['similarity'] = label_songs.apply(compute_similarity, axis=1)


Unnamed: 0,artist,title_cleaned,embedding,text_x,similarity
1657,Lou Reed,Change,"[0.01923578977584839, -0.017707567662000656, -...",The only thing constantly changing is change A...,0.714741
1661,Lou Reed,Lady Day,"[0.021348197013139725, 0.011830992065370083, -...",When she walked on down the street She was lik...,0.696213
2323,Pink Floyd,Wish You Were Here,"[0.016658535227179527, -0.01941368542611599, -...","So, so you think you can tell Heaven from Hell...",0.694774
6611,The Monkees,Tapioca Tundra,"[0.019749097526073456, -0.021163253113627434, ...",Reasoned verse some prose or rhyme Lose themse...,0.672989
3923,Chaka Khan,Do You Love What You Feel,"[0.00474343728274107, -0.010908550582826138, -...",Chorus: Do you love what you feel Cause I love...,0.668404


### 5.3.3 Understand lyrics

In [None]:
df = df_9k_all

def get_lyrics(song_name : str, artist_name : str) -> str:
    """ Get song titles and lyrics of a specific artist's name.
    Use the artist name in the query as artist_name """
    top_songs = get_top_similar_songs(df_9k_all, song_name, artist_name, top_n=3)[['artist', 'title_cleaned', 'text_x']]
    searched_song = df[(df['title_cleaned'] == song_name) & (df['artist'] == artist_name)][['artist', 'title_cleaned', 'text_x']]

    songs = pd.concat([searched_song, top_songs], axis=0)

    return songs

In [219]:
get_lyrics('Money', 'Pink Floyd')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_songs['similarity'] = label_songs.apply(compute_similarity, axis=1)


Unnamed: 0,artist,title_cleaned,text_x
2322,Pink Floyd,Money,"Money, get away Get a good job with more pay a..."
1657,Lou Reed,Change,The only thing constantly changing is change A...
1661,Lou Reed,Lady Day,When she walked on down the street She was lik...
2323,Pink Floyd,Wish You Were Here,"So, so you think you can tell Heaven from Hell..."


In [None]:
import pandas as pd

from langchain.chat_models import init_chat_model
from langchain_core.tools import tool
from langgraph.prebuilt import create_react_agent
from langchain.schema import HumanMessage


# Create a dataframe
df = df_9k_all

# Get lyrics from dataframe
@tool
def get_lyrics_top_songs(song_title : str, artist_name : str) -> str:
    """ Input a song and artist and get the top 3 songs similar in beat and lyrics.
    Use the artist name and song title in the query as artist_name and song_title """

    top_songs = get_top_similar_songs(df_9k_all, song_title, artist_name, top_n=3)[['artist', 'title_cleaned', 'text_x']]
    searched_song = df[(df['title_cleaned'] == song_title) & (df['artist'] == artist_name)][['artist', 'title_cleaned', 'text_x']]

    songs = pd.concat([searched_song, top_songs], axis=0)

    if songs.empty:
        return f"No songs found for this {artist_name} and {song_title}."

    results = []
    for _, row in songs.iterrows():
        results.append(f"Artist: {row['artist']}\nTitle: {row['title_clean']}\nLyrics: {row['text_n']}\n")
    return "\n".join(results)

# Prompt Gemini model
def model_gemini(song_title, artist_name):

    ### Instantiate Gemini model ###
    model = init_chat_model("gemini-2.0-flash", model_provider="google_genai")

    ### Instantiate variables ###
    # Tools
    tools = [get_lyrics_top_songs]

    # Prompt
    system_prompt = """
        With the name of an artist and a song title as an input use the tool to find the 3 most similar songs based on beats and lyrics.
        Then, analyze the lyrics of all 4 songs to get the top 5 themes. Make sure to use the song entered by the person as the main input.
    """

    ### Create agent
    agent = create_react_agent(model, tools, prompt=system_prompt)

    # Input query
    query = f"Find the top 3 similar songs to {artist_name}'s {song_title}. Compare the lyrics of this song to the lyrics of the top 3 similar songs to get the top 5 themes. For each theme quote 3 different lyrics and put the name of the song in parentheses"

    # Get response
    response = agent.invoke({"messages": [HumanMessage(content=query)]})

    return print(response["messages"][-1].content)

In [None]:
model_gemini(song_title, artist_name)
'Pink Floyd'