In [2]:
import numpy as np 
import pandas as pd 

In [3]:
temp = pd.read_csv("song_lyrics.csv")
print("Length:", len(temp))


Length: 5134856


In [10]:
temp.columns

Index(['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics', 'id',
       'language_cld3', 'language_ft', 'language', 'word_count', 'num_parts',
       'unique_word_count', 'has_chorus', 'has_intro', 'has_outro',
       'has_bridge', 'cleaned_lyrics', 'sentiment', 'negative_sentiment',
       'neutral_sentiment', 'positive_sentiment', 'average_word_length',
       'stopword_count'],
      dtype='object')

In [4]:
temp = (
    temp
    .loc[temp["language"] == "en"]
    .loc[temp["tag"] != "misc"]
    .loc[temp["tag"] != ""]
    .loc[temp["tag"] != None]
    .loc[temp["lyrics"] != None]
    .loc[temp["lyrics"] != ""]
    .loc[temp["lyrics"] != "[Instrumental]"] 
)
temp = temp.sort_values(by="id")
print(len(temp))

3233212


In [5]:
print(temp.loc[temp["id"] == 154314].values)

[['Do I Wanna Know?' 'rock' 'Arctic Monkeys' 2013 2965702 '{}'
  "[Verse 1]\nHave you got colour in your cheeks?\nDo you ever get that fear that you can't shift the type\nThat sticks around like summat in your teeth?\nAre there some aces up your sleeve?\nHave you no idea that you're in deep?\nI've dreamt about you nearly every night this week\nHow many secrets can you keep?\n'Cause there's this tune I found\nThat makes me think of you somehow an' I play it on repeat\nUntil I fall asleep, spillin' drinks on my settee\n\n[Pre-Chorus]\n(Do I wanna know?) If this feelin' flows both ways?\n(Sad to see you go) Was sorta hopin' that you'd stay\n(Baby, we both know) That the nights were mainly made\nFor sayin' things that you can't say tomorrow day\n\n[Chorus]\nCrawlin' back to you\nEver thought of callin' when\nYou've had a few?\n'Cause I always do\nMaybe I'm too\nBusy bein' yours\nTo fall for somebody new\nNow, I've thought it through\nCrawlin' back to you\n[Verse 2]\nSo have you got the gut

In [6]:
import re
def fix_lyrics(lyrics):
    parts = re.split(r"([\n\[\]\(\)])", lyrics)
    output = " ".join(filter(None, parts))
    output = re.sub(r"([?.,!:;])",'',output)
    output = re.sub(r"in'(?= \w|\.|,|$)","ing",output.lower())
    return output


temp["lyrics"] = temp["lyrics"].apply(fix_lyrics)



In [7]:
print("Rows with null values:")
print(temp[temp.isnull().any(axis=1)])
temp = temp.dropna()
print(len(temp))

Rows with null values:
        title   tag         artist  year  views          features  \
55462     NaN   rap         Boojee  2011    128                {}   
97756     NaN  rock     Touch Amor  2011  19414  {"Touché Amoré"}   
215002    NaN   rap    Ahfueaefasf  2013    294                {}   
287274    NaN   rap        RATKING  2014   7920                {}   
322441    NaN   rap         A-Reed  2014     37                {}   
...       ...   ...            ...   ...    ...               ...   
4944516   NaN   rap       KKAUTAMA  2021     61                {}   
4965697   NaN   rap   StonedAKhana  2021      2                {}   
5079347   NaN  rock     Cloudwatch  2022      2                {}   
5101642   NaN   rap       DJ Lucas  2022      4                {}   
5125809   NaN   pop  Haarshaan . n  2022      8                {}   

                                                    lyrics       id  \
55462    what i wanna do i wanna make it out of school ...    59093   
97756 

In [8]:


def count_words_without_sections(lyrics):
 
    clean_lyrics = re.sub(r'\[\s*.*?\s*\]', '', lyrics)  # Remove [sections]
    return len([word for word in clean_lyrics.split() if word != '\\n'])

temp['word_count'] = temp['lyrics'].apply(count_words_without_sections)


temp['num_parts'] = temp['lyrics'].apply(
    lambda x: len(re.findall(r'\[\s*.*?\s*\]', x))  
)

def unique_word_count(lyrics):
  
    clean_lyrics = re.sub(r'\[\s*.*?\s*\]', '', lyrics) 
  
    return len(set(word for word in clean_lyrics.split() if word != '\\n'))

temp['unique_word_count'] = temp['lyrics'].apply(unique_word_count)


# (Handles Spaces Within Brackets)
temp['has_chorus'] = temp['lyrics'].str.contains(r'\[\s*chorus\s*\]', case=False, regex=True)
temp['has_intro'] = temp['lyrics'].str.contains(r'\[\s*intro\s*\]', case=False, regex=True)
temp['has_outro'] = temp['lyrics'].str.contains(r'\[\s*outro\s*\]', case=False, regex=True)
temp['has_bridge'] = temp['lyrics'].str.contains(r'\[\s*bridge\s*\]', case=False, regex=True)

def clean_lyrics(lyrics):
    return re.sub(r'\[\s*.*?\s*\]', '', lyrics)

temp['cleaned_lyrics'] = temp['lyrics'].apply(clean_lyrics)


from textblob import TextBlob


temp['sentiment'] = temp['cleaned_lyrics'].apply(lambda x: TextBlob(x).sentiment.polarity)
temp['negative_sentiment'] = temp['sentiment'].apply(lambda x: 1 if x < 0 else 0)
temp['neutral_sentiment'] = temp['sentiment'].apply(lambda x: 1 if x == 0 else 0)
temp['positive_sentiment'] = temp['sentiment'].apply(lambda x: 1 if x > 0 else 0)






import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')


stop_words = set(stopwords.words('english'))

# Average word length (on cleaned lyrics)
temp['average_word_length'] = temp['cleaned_lyrics'].apply(
    lambda x: sum(len(word) for word in x.split() if word.isalpha()) / len(x.split()) if len(x.split()) > 0 else 0
)

# Stopword count (on cleaned lyrics)
temp['stopword_count'] = temp['cleaned_lyrics'].apply(
    lambda x: sum(1 for word in x.split() if word.lower() in stop_words)
)




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aldri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
print(f"Original dataset shape: {temp.shape}")
temp = temp[temp['num_parts'] > 0]

print(f"Filtered dataset shape: {temp.shape}")
# Display the shape of the filtered DataFrame


# Preview the first few rows of the filtered DataFrame
temp.head()

Original dataset shape: (3233094, 25)
Filtered dataset shape: (1529698, 25)


Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,...,has_intro,has_outro,has_bridge,cleaned_lyrics,sentiment,negative_sentiment,neutral_sentiment,positive_sentiment,average_word_length,stopword_count
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}",[ chorus opera steve & cam'ron ] \n killa cam ...,1,en,en,...,False,False,False,\n killa cam killa cam cam \n killa cam killa...,0.005455,0,0,1,3.366828,184
1,Can I Live,rap,JAY-Z,1996,468624,{},[ produced by irv gotti ] \n \n [ intro ] \n y...,3,en,en,...,True,False,False,\n \n \n yeah hah yeah roc-a-fella \n we inv...,-0.022526,1,0,0,3.929236,237
2,Forgive Me Father,rap,Fabolous,2003,4743,{},maybe cause i'm eatin \n and these bastards fi...,4,en,en,...,False,False,False,maybe cause i'm eatin \n and these bastards fi...,-0.019329,1,0,0,3.618881,271
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[ produced by kanye west and brian miller ] \n...,5,en,en,...,False,False,False,\n \n \n ugh killa \n baby \n kanye this tha...,-0.058609,1,0,0,3.326115,300
4,Fly In,rap,Lil Wayne,2005,78271,{},"[ intro ] \n so they ask me \n ""young boy \n w...",6,en,en,...,True,False,False,"\n so they ask me \n ""young boy \n what you g...",-0.035069,1,0,0,3.509302,188
