# Data preprocessing

## combining dataframes into one dataframe

In [1]:
import pandas as pd
import os

In [2]:
directory = './csv/'

In [3]:
# Read every CSV file in `directory` into its own DataFrame.
# os.listdir returns names in arbitrary order, so the names are sorted first:
# the row labels of the concatenated frame (a later cell drops a row by label)
# then no longer depend on how the filesystem happens to enumerate the files.
csv_filenames = sorted(
    filename for filename in os.listdir(directory) if filename.endswith('.csv')
)

dataframes = [
    pd.read_csv(os.path.join(directory, filename)) for filename in csv_filenames
]

# ignore_index=True discards the per-file row labels and renumbers the rows
# of the combined frame from 0.
combined_df = pd.concat(dataframes, ignore_index=True)

In [4]:
combined_df.shape

(5749, 7)

In [5]:
combined_df.sample(10)

Unnamed: 0.1,Artist,Title,Album,Date,Lyric,Year,Unnamed: 0
16,Ariana Grande,​R.E.M.,Sweetener,2018-08-17,ariana grande mmm last night boy i met you yea...,2018.0,
3705,Lady Gaga,I Wanna Be With U,Artpop Act II (Scrapped),2013-09-01,it's off it's on the party's just begun take o...,2013.0,85.0
1673,Drake,I Do (Remix Verse),,,uh please put away the cameras cause we just i...,,320.0
3162,Justin Bieber,Breathe,Unreleased Songs,,lyrics for this song have yet to be released p...,,279.0
1187,Coldplay,Till Kingdom Come,,,one two steal my heart and hold my tongue i fe...,,178.0
5459,Taylor Swift,Long Live/New Year’s Day,Taylor Swift,2018-05-08,long live i said remember this moment in the b...,2018.0,189.0
578,Beyoncé,LIFE,Unreleased Songs,,kobalt music publishing ltd sonyatv music publ...,,270.0
4739,Rihanna,Nobody’s Business,Unapologetic,2012-11-19,rihanna you'll always be mine sing it to the w...,2012.0,49.0
4477,Nicki Minaj,Roman In Moscow [Edited Version] by Nicki Minaj,,,im big bully no bike gear i told you bitches l...,,258.0
3006,Justin Bieber,I Would,Believe Acoustic,2013-01-29,if i could take away the pain and put a smile ...,2013.0,123.0


## checking for duplicates

In [9]:
combined_df.duplicated().sum()

0

## drop unimportant columns

In [10]:
# Remove the metadata columns; combined_df itself is left untouched.
df2 = combined_df.drop(columns=['Album', 'Date', 'Year', 'Unnamed: 0'])

In [11]:
df2.shape

(5749, 3)

In [12]:
df2.head()

Unnamed: 0,Artist,Title,Lyric
0,Ariana Grande,"​thank u, next",thought i'd end up with sean but he wasn't a m...
1,Ariana Grande,7 rings,yeah breakfast at tiffany's and bottles of bub...
2,Ariana Grande,​God is a woman,you you love it how i move you you love it how...
3,Ariana Grande,Side To Side,ariana grande nicki minaj i've been here all ...
4,Ariana Grande,​​no tears left to cry,right now i'm in a state of mind i wanna be in...


## checking for NA values and dropping them

In [13]:
df2.isna().sum()

Artist     0
Title      0
Lyric     38
dtype: int64

In [14]:
# Drop every row that has a missing value in any column.
df3 = df2.dropna(axis=0, how='any')

In [15]:
df3.shape

(5711, 3)

In [16]:
# Remove the row whose index label is 2.
# NOTE(review): the reason for dropping this specific row is not recorded in
# the notebook, and the label depends on the order the CSV files were read in
# — confirm that it still targets the intended song.
df4 = df3.drop(index=[2])

In [17]:
df4.head(10)

Unnamed: 0,Artist,Title,Lyric
0,Ariana Grande,"​thank u, next",thought i'd end up with sean but he wasn't a m...
1,Ariana Grande,7 rings,yeah breakfast at tiffany's and bottles of bub...
3,Ariana Grande,Side To Side,ariana grande nicki minaj i've been here all ...
4,Ariana Grande,​​no tears left to cry,right now i'm in a state of mind i wanna be in...
5,Ariana Grande,​​breathin,lacigam gnihtemos od ot thgin laiceps ruoy s't...
6,Ariana Grande,"​break up with your girlfriend, i’m bored",you got me some type of way hmm ain't used to ...
7,Ariana Grande,​positions,heaven sent you to me i'm just hopin i don't r...
8,Ariana Grande,34+35,hmm you might think i'm crazy the way i've b...
9,Ariana Grande,​imagine,step up the two of us nobody knows us get in t...
10,Ariana Grande,​needy,if you take too long to hit me back i can't pr...


In [18]:
df4.Artist.nunique()

20

In [19]:
df4.shape

(5710, 3)

In [20]:
# df4.to_csv('modified_file.csv',index=False)

## importing spaCy and loading the language model used for preprocessing

In [21]:
import spacy


In [22]:
nlp = spacy.load("en_core_web_lg")

In [23]:
df4.head(50)

Unnamed: 0,Artist,Title,Lyric
0,Ariana Grande,"​thank u, next",thought i'd end up with sean but he wasn't a m...
1,Ariana Grande,7 rings,yeah breakfast at tiffany's and bottles of bub...
3,Ariana Grande,Side To Side,ariana grande nicki minaj i've been here all ...
4,Ariana Grande,​​no tears left to cry,right now i'm in a state of mind i wanna be in...
5,Ariana Grande,​​breathin,lacigam gnihtemos od ot thgin laiceps ruoy s't...
6,Ariana Grande,"​break up with your girlfriend, i’m bored",you got me some type of way hmm ain't used to ...
7,Ariana Grande,​positions,heaven sent you to me i'm just hopin i don't r...
8,Ariana Grande,34+35,hmm you might think i'm crazy the way i've b...
9,Ariana Grande,​imagine,step up the two of us nobody knows us get in t...
10,Ariana Grande,​needy,if you take too long to hit me back i can't pr...


In [24]:
df4.head()

Unnamed: 0,Artist,Title,Lyric
0,Ariana Grande,"​thank u, next",thought i'd end up with sean but he wasn't a m...
1,Ariana Grande,7 rings,yeah breakfast at tiffany's and bottles of bub...
3,Ariana Grande,Side To Side,ariana grande nicki minaj i've been here all ...
4,Ariana Grande,​​no tears left to cry,right now i'm in a state of mind i wanna be in...
5,Ariana Grande,​​breathin,lacigam gnihtemos od ot thgin laiceps ruoy s't...


## function using lemmatization and stop-word removal

### text preprocessing for lyrics

In [25]:
def preprocess_lyrics(text):
    """Lower-case `text`, parse it with the module-level `nlp` pipeline, and
    return the lemmas of the retained tokens joined by single spaces.

    A token is retained only when it is not a stop word, is not punctuation,
    and its text consists solely of alphabetic characters.
    """
    parsed = nlp(text.lower())
    lemmas = [
        tok.lemma_
        for tok in parsed
        if not (tok.is_stop or tok.is_punct or not tok.text.isalpha())
    ]
    return ' '.join(lemmas)

In [26]:
# Overwrite the Lyric column with its preprocessed text, one song at a time.
df4['Lyric'] = df4['Lyric'].map(preprocess_lyrics)

In [27]:
df4.head()

Unnamed: 0,Artist,Title,Lyric
0,Ariana Grande,"​thank u, next",think end sean match write song ricky listen l...
1,Ariana Grande,7 rings,yeah breakfast tiffany bottle bubble girl tatt...
3,Ariana Grande,Side To Side,ariana grande nicki minaj night ariana day nic...
4,Ariana Grande,​​no tears left to cry,right state mind wanna like time be get tear l...
5,Ariana Grande,​​breathin,lacigam gnihtemos od ot thgin laiceps ruoy tub...


In [28]:
df4.sample(40)

Unnamed: 0,Artist,Title,Lyric
7,Ariana Grande,​positions,heaven send hopin repeat history pre boy tryna...
5113,Selena Gomez,A Sweeter Place,kid cudi yeah yeah yeah yeah ah selena gomez p...
4863,Rihanna,Love Looks Like Us*,lyric song release check song release
4796,Rihanna,Push Up on Me,break break break break break break break brea...
19,Ariana Grande,Let Me Love You,ariana grande break ex m single not know s be ...
3691,Lady Gaga,Heavy Metal Lover,heavy metal lover heavy metal lover heavy meta...
1855,Dua Lipa,Running,turn gold dust trust lose darkness inside igno...
32,Ariana Grande,​everytime,time tired noshow tired control yuh tell let l...
1356,Drake,One Dance,kyla baby like style drake grip waist way way ...
1775,Drake,Drake’s Voice Mail Box #2,message kim damn ve try week not message okay ...


## vectorization using sentence transformers

In [29]:
from sentence_transformers import SentenceTransformer


In [30]:
model = SentenceTransformer('all-mpnet-base-v2')

In [31]:
def get_sentence_embedding(sentence):
    """Encode `sentence` with the module-level `model` and return the result."""
    return model.encode(sentence)

# Encode all lyrics in a single call instead of one call per row, so the model
# can batch the texts internally. Given a list, encode returns one embedding
# per input in the same order; list(...) splits that result into one vector per
# row for the new column.
# NOTE(review): batched encoding can differ from per-row encoding by
# floating-point rounding — confirm that is acceptable for downstream use.
df4['Lyric_Embedding'] = list(model.encode(df4['Lyric'].tolist()))

In [54]:
# `dtypes` is a property, not a method: calling it raises TypeError.
df4.dtypes

Artist             object
Title              object
Lyric              object
Lyric_Embedding    object
dtype: object

In [37]:
df4.head()

Unnamed: 0,Artist,Title,Lyric,Lyric_Embedding
0,Ariana Grande,"​thank u, next",think end sean match write song ricky listen l...,"[-0.003785647, 0.023360051, -0.0039056526, -0...."
1,Ariana Grande,7 rings,yeah breakfast tiffany bottle bubble girl tatt...,"[0.046609636, 0.09596844, 0.0039622225, 0.0083..."
3,Ariana Grande,Side To Side,ariana grande nicki minaj night ariana day nic...,"[0.0010204625, 0.051407225, -0.009481423, 0.02..."
4,Ariana Grande,​​no tears left to cry,right state mind wanna like time be get tear l...,"[-0.02434266, 0.03272228, 0.014845281, 0.00261..."
5,Ariana Grande,​​breathin,lacigam gnihtemos od ot thgin laiceps ruoy tub...,"[-0.0033835403, 0.017899824, 0.005919852, 0.02..."


## cosine similarity

In [49]:
import numpy as np
import pandas as pd

In [52]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def get_sentence_embedding(sentence):
    """Return the encoding of `sentence` produced by the module-level `model`."""
    embedding = model.encode(sentence)
    return embedding


def find_similar_songs(input_lyrics, data, top_n=5):
    """Return the songs in `data` whose embeddings are most similar to `input_lyrics`.

    Parameters
    ----------
    input_lyrics : str
        Text to look up; it is encoded with `get_sentence_embedding`.
    data : pandas.DataFrame
        Must provide the columns 'Artist', 'Title' and 'Lyric_Embedding'
        (one embedding vector per row).
    top_n : int, default 5
        Maximum number of results. Values <= 0 give an empty list.

    Returns
    -------
    list of (artist, title, score) tuples, ordered from highest to lowest
    cosine similarity. Empty when `data` has no rows or `top_n` <= 0.
    """
    # Guard the degenerate cases up front: np.vstack raises on an empty
    # sequence, and a negative top_n would otherwise slice off the *last*
    # entries instead of limiting the result.
    if top_n <= 0 or len(data) == 0:
        return []

    # NOTE(review): in this notebook the stored embeddings come from lyrics
    # cleaned by preprocess_lyrics, while input_lyrics is encoded as given —
    # confirm that this asymmetry is intended.
    input_embedding = get_sentence_embedding(input_lyrics)

    # Stack the per-row vectors into one (n_songs, dim) matrix.
    all_embeddings = np.vstack(data['Lyric_Embedding'].values)

    # cosine_similarity returns a (1, n_songs) matrix; keep its only row.
    scores = cosine_similarity([input_embedding], all_embeddings)[0]

    # Positions of the highest scores, best first.
    top_indices = np.argsort(scores)[::-1][:top_n]

    # iloc: top_indices are positions, not index labels.
    top_songs = data.iloc[top_indices]

    return [
        (artist, title, score)
        for artist, title, score in zip(
            top_songs['Artist'], top_songs['Title'], scores[top_indices]
        )
    ]




In [53]:
find_similar_songs("what the world needs now is love sweet love no not just for some but for everyone " ,df4)

[('Justin Bieber', 'We Were Born For This', 0.6169882),
 ('Coldplay', 'What the World Needs Now', 0.5924143),
 ('Lady Gaga', 'In Like With You', 0.5911218),
 ('Coldplay', 'Earth Angel (Will You Be Mine)', 0.53806674),
 ('Beyoncé', 'When You Wish Upon a Star', 0.5343462)]

In [45]:
# Stack the per-song vectors into one 2-D array and write it to disk.
np.save('lyric_embeddings2.npy', np.vstack(df4['Lyric_Embedding'].to_numpy()))
