In [1]:
import pandas as pd

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Reload dotenv in notebook
from dotenv import load_dotenv
dot_env_loaded = load_dotenv()
dot_env_loaded

True

# 1. Import data

In [4]:
lyrics_df = pd.read_csv('../raw_data/20250609_17k_lyrics_eng_fr.csv')
lyrics_df.drop(columns='Unnamed: 0', inplace=True)

In [5]:
lyrics = lyrics_df['lyrics_clean'].iloc[6]

# 2. Play around with models

In [6]:
from transformers import pipeline

lyrics = """
You can dance, you can jive, having the time of your life
See that girl, watch that scene, diggin' the Dancing Queen
Friday night and the lights are low
Looking out for the place to go
Where they play the right music, getting in the swing
You come in to look for a king
Anybody could be that guy
Night is young and the music's high
With a bit of rock music, everything is fine
You're in the mood for a dance
And when you get the chance
You are the Dancing Queen, young and sweet, only seventeen
Dancing Queen, feel the beat from the tambourine
You can dance, you can jive, having the time of your life
See that girl, watch that scene, diggin' the Dancing Queen
You're a teaser, you turn 'em on
Leave them burning and then you're gone
Looking out for another, anyone will do
You're in the mood for a dance
And when you get the chance
You are the Dancing Queen, young and sweet, only seventeen
Dancing Queen, feel the beat from the tambourine
You can dance, you can jive, having the time of your life
See that girl, watch that scene, diggin' the Dancing Queen
"""

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Load summarization pipelines
bart = pipeline("summarization", model="facebook/bart-large-cnn")

# Generate summaries (max_length controls roughly summary size)
bart_summary = bart(lyrics, max_length=100, min_length=60, do_sample=True)[0]['summary_text']

print("BART summary:\n", bart_summary)


BART summary:
 Friday night and the lights are low. Look out for the place to go where they play the right music, getting in the swing. You come in to look for a king. You're a teaser, you turn 'em on. Leave them burning and then you're gone. Anyone could be that guy.


In [8]:
# Load summarization pipelines
t5 = pipeline("summarization", model="t5-base")

# Generate summaries (max_length controls roughly summary size)
t5_summary = t5(lyrics, max_length=100, min_length=60, do_sample=True)[0]['summary_text']

print("\nT5 summary:\n", t5_summary)


T5 summary:
 only seventeen Dancing Queen, you turn 'em on Leave them burning and then you're gone Looking out for another, anyone will do You can dance, you can jive, having the time of your life See that girl, watch that scene, diggin' the Dancing queen Friday night and the lights are low You're in the mood for a dance And when you get the chance You are the dancing queen, young and sweet, only seventeen da


In [9]:
# Load summarization pipelines
pegasus = pipeline("summarization", model="google/pegasus-xsum")

# Generate summaries (max_length controls roughly summary size)
pegasus_summary = pegasus(lyrics, max_length=100, min_length=60, do_sample=True)[0]['summary_text']

print("\nPegasus summary:\n", pegasus_summary)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Pegasus summary:
 You're a teaser, you turn 'em on Leave them burning and then you're gone Looking out for another, anyone will do You're in the mood for a dance And when you get the chance You are the Dancing Queen, young and sweet, only seventeen Dancing Queen, feel the beat from the tambourine


# 3. Refine best model - Bart

In [28]:
bart_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

summary = bart_summarizer(
    lyrics,
    max_length=10,
    min_length=3,
    do_sample=True)[0]['summary_text']

print("BART summary with temperature and sampling:\n", summary)

BART summary with temperature and sampling:
 Friday night and the lights are low


# 4. Test on FR

In [11]:
lyrics_df[lyrics_df['artist'] == 'Charles Aznavour']

Unnamed: 0,artist,track_title_clean,lyrics_clean,artist_lower,track_title_clean_lower
17146,Charles Aznavour,La bohème,"Je vous parle d'un temps, Que les moins de vin...",charles aznavour,la bohème
17147,Charles Aznavour,Hier encore,"Hier encore, J'avais vingt ans, Je carressais ...",charles aznavour,hier encore
17148,Charles Aznavour,Emmenez-moi,Vers les docks où le poids et l'ennui Me courb...,charles aznavour,emmenez-moi
17149,Charles Aznavour,For Me... Formidable,You are the one For me for me for me Formidabl...,charles aznavour,for me... formidable
17150,Charles Aznavour,Comme ils disent,J'habite seul avec maman Dans un très vieil ap...,charles aznavour,comme ils disent
17151,Charles Aznavour,Que c'est triste Venise,"Que c'est triste Venise, Au temps des amours m...",charles aznavour,que c'est triste venise
17152,Charles Aznavour,Je m'voyais déjà,À dix-huit ans j'ai quitté ma province Bien dé...,charles aznavour,je m'voyais déjà
17153,Charles Aznavour,Je t'attends,"Mes jours passent, mes nuits pleurent Et pleur...",charles aznavour,je t'attends


In [12]:
french_lyrics = lyrics_df['lyrics_clean'].iloc[17146]
french_lyrics

"Je vous parle d'un temps, Que les moins de vingt ans, Ne peuvent pas connaître, Montmartre en ce temps là, Accrochait ses lilas, Jusque sous nos fenêtres, Et si l'humble garni, Qui nous servait de nid, Ne payait pas de mine, C'est là qu'on s'est connu, Moi qui criait famine et toi, Qui posait nue, La Bohème, la Bohème, Ca voulait dire, on est heureux, La Bohème, la Bohème, Nous ne mangions, Qu'un jour sur deux. Dans les cafés voisins, Nous étions quelques uns, Qui attendions la gloire, Et bien que miséreux, Avec le ventre creux, Nous ne cessions d'y croire, Et quand quelques bistrots, Contre un bon repas chaud, Nous prenaient une toile, Nous récitions des vers, Grouppés autour du poêle, En oubliant l'hiver. La Bohème, la Bohème, Ca voulait dire, Tu es jolie, La Bohème, la Bohème, Et nous avions tous du génie. Souvent il m'arrivait, Devant mon chevalet, De passer des nuits blanches, Retouchant le dessin, De la ligne d'un sein, Du galbe d'une hanche, Et ce n'est qu'au matin, Qu'on s'ass

In [22]:
from transformers import pipeline

# Initialize the translation pipeline
pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en", max_length=500)

In [23]:
# Translate French text to English
result = pipe(french_lyrics)  # "Hello everyone"

print(result[0]['translation_text'])

Your input_length: 495 is bigger than 0.9 * max_length: 500. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


I tell you about a time, that the under twenty years old, can not know, Montmartre at that time, Accrochae his lilacs, Up under our windows, And if the humble garnished, Who served us as a nest, Do not pay a mine, This is where we knew each other, I who cried famine and you, Who posed naked, The Bohemian, the Bohemian, That meant, we are happy, The Bohemian, the Bohemian, We did not eat, Only one day in two. In the cafes neighbors, We were some, Who waited for the glory, And though miserous, With the hollow belly, We do not ced to believe in it, And when some bistros, Against a good hot meal, We took a canvas, We recited worms, Grouped around the stove, By forgetting the winter. The Bohemian, the Bohemian, It meant, You are pretty, The Bohemian, the Bohemian, And when some bistrots, From a good meal, We did all the same, We did the Bohian, We did the same the same, The Bohian, I did the same, I, I did the same, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, 

# 5. Detect language

In [None]:
from langdetect import detect, DetectorFactory
import pandas as pd

# Fix seed for consistent results from langdetect
DetectorFactory.seed = 0

def detect_language(df, text_column='lyrics_clean'):
    """
    Detect language of text in the specified column of a DataFrame
    and add a new 'language' column with detected language codes ('fr', 'en', etc.).

    Parameters:
        df (pd.DataFrame): Input DataFrame containing the lyrics.
        text_column (str): Name of the column containing text to detect language.

    Returns:
        pd.DataFrame: DataFrame with added 'language' column.
    """
    def safe_detect(text):
        try:
            return detect(text)
        except:
            return 'unknown'  # In case of empty or problematic text

    df['language'] = df[text_column].apply(safe_detect)
    return df

In [36]:
lyrics_df.head()

Unnamed: 0,artist,track_title_clean,lyrics_clean,artist_lower,track_title_clean_lower
0,ABBA,"Andante, Andante","Take it easy with me, please Touch me gently l...",abba,"andante, andante"
1,ABBA,As Good As New,I'll never know why I had to go Why I had to p...,abba,as good as new
2,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,abba,bang-a-boomerang
3,ABBA,Cassandra,Down in the street they're all singing and sho...,abba,cassandra
4,ABBA,Chiquitita,"Chiquitita, tell me what's wrong You're enchai...",abba,chiquitita


In [38]:
detect_language(lyrics_df)

Unnamed: 0,artist,track_title_clean,lyrics_clean,artist_lower,track_title_clean_lower,language
0,ABBA,"Andante, Andante","Take it easy with me, please Touch me gently l...",abba,"andante, andante",en
1,ABBA,As Good As New,I'll never know why I had to go Why I had to p...,abba,as good as new,en
2,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,abba,bang-a-boomerang,en
3,ABBA,Cassandra,Down in the street they're all singing and sho...,abba,cassandra,en
4,ABBA,Chiquitita,"Chiquitita, tell me what's wrong You're enchai...",abba,chiquitita,en
...,...,...,...,...,...,...
17299,Therapie Taxi,Candide Crush,"Je suis si sage-sage d'habitude, mon chéri Com...",therapie taxi,candide crush,fr
17300,Therapie Taxi,J'en ai marre,Et j'en ai marre de toi Je sais c'est dur mais...,therapie taxi,j'en ai marre,fr
17301,Therapie Taxi,Avec ta zouz,"N'aie pas le seum, fais-moi la bise Ce soir c'...",therapie taxi,avec ta zouz,fr
17302,Minuit,Flash,"Lumière crue, lumière sans, flash! Étrange sen...",minuit,flash,fr


In [51]:
lyrics_df.groupby('language').count()['artist'].sort_values(ascending=False)*100/len(lyrics_df)

language
en         57.449145
fr         41.308368
ro          0.496995
es          0.242718
it          0.138696
ca          0.092464
tl          0.046232
id          0.040453
nl          0.040453
pt          0.028895
sw          0.023116
unknown     0.017337
et          0.011558
de          0.011558
so          0.011558
cy          0.011558
af          0.011558
bg          0.005779
sl          0.005779
da          0.005779
Name: artist, dtype: float64

In [57]:
lyrics_df[lyrics_df.language == 'sl']

Unnamed: 0,artist,track_title_clean,lyrics_clean,artist_lower,track_title_clean_lower,language
10525,Hugues Aufray,Ja-Da,Hop! Ho! Jada (Jada) jada (Jada) jada jada jin...,hugues aufray,ja-da,sl


# 99. Lyric embedding

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

MODEL_NAME = 'brunokreiner/lyrics-bert'

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Sentences we want sentence embeddings for
sentences = [lyrics]

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

print("Sentence embeddings:")
print(sentence_embeddings)

Sentence embeddings:
tensor([[ 0.0987,  0.0067,  0.0855,  0.0722, -0.3017,  0.0526,  0.1079, -0.1781,
         -0.1205, -0.0810,  0.1493, -0.0775, -0.0405,  0.1038, -0.0239,  0.0541,
         -0.0612,  0.2090,  0.0347, -0.0575, -0.0892,  0.0766, -0.0770, -0.2105,
          0.0516, -0.0606,  0.0469, -0.1791, -0.1525, -0.1947,  0.1100,  0.1123,
         -0.2183,  0.0280, -0.0283,  0.0258, -0.1947, -0.1529, -0.2569, -0.0696,
          0.1022,  0.1029,  0.1547, -0.0765, -0.0602,  0.0637, -0.0264,  0.0090,
          0.2063, -0.1180, -0.1545, -0.2151,  0.1095, -0.0984, -0.0811, -0.2056,
         -0.0988,  0.0018,  0.1317,  0.0967, -0.1389, -0.0777,  0.0765, -0.0837,
          0.1378, -0.1690,  0.0611, -0.0309, -0.0983,  0.1052, -0.0178, -0.0103,
          0.1225,  0.0462,  0.0487,  0.0433,  0.0942, -0.0698,  0.1399,  0.0430,
          0.0724, -0.0031,  0.1632,  0.1287,  0.1393, -0.1206,  0.0284, -0.1318,
          0.0558,  0.0030,  0.3121, -0.0125, -0.0227, -0.1393,  0.1122,  0.2191,
       