In [None]:
# Install the Kaggle client into the kernel's own environment
# (%pip targets the running kernel; !pip may hit a different interpreter)
%pip install -q kaggle

In [None]:
import os
import json
import zipfile
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from wordcloud import WordCloud

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [None]:
# Load the Kaggle API credentials; the context manager closes the file handle
# immediately instead of leaving it to the garbage collector
with open('kaggle.json') as config_file:
    kaggle_config = json.load(config_file)


In [None]:
# Show which fields the credentials file contains (field names only, not their values)
kaggle_config.keys()

dict_keys(['username', 'key'])

In [None]:
# Copy the Kaggle credentials from the loaded config into environment variables

for env_name, config_field in (('KAGGLE_USERNAME', 'username'), ('KAGGLE_KEY', 'key')):
    os.environ[env_name] = kaggle_config[config_field]

In [None]:
# Download the dataset archive with the Kaggle command-line client

!kaggle datasets download notshrirang/spotify-million-song-dataset

Dataset URL: https://www.kaggle.com/datasets/notshrirang/spotify-million-song-dataset
License(s): CC0-1.0
spotify-million-song-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
# List the working directory to confirm the archive is present
!ls

kaggle.json  spotify-million-song-dataset.zip
sample_data  spotify_millsongdata.csv


In [None]:
# Unpack every member of the downloaded archive into the current working directory
with zipfile.ZipFile('spotify-million-song-dataset.zip', mode='r') as archive:
    archive.extractall()

In [None]:
# The archive is extracted into the working directory, so read the CSV from
# there with a relative path instead of a hard-coded Colab location (/content)
df = pd.read_csv('spotify_millsongdata.csv')
df.shape

(57650, 4)

In [None]:
# Preview the first rows of the freshly loaded dataset
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [None]:
# Summarise column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57650 entries, 0 to 57649
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  57650 non-null  object
 1   song    57650 non-null  object
 2   link    57650 non-null  object
 3   text    57650 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB


In [None]:
# Count missing values per column
df.isnull().sum()

Unnamed: 0,0
artist,0
song,0
link,0
text,0


In [None]:
# Work on a random subset to keep the similarity matrix small.
# The seed makes the subset (and every result derived from it) reproducible
# across kernel restarts; the size is clamped so a smaller frame does not
# make sample() raise.
SAMPLE_SIZE = 10000
RANDOM_STATE = 42

df = (
    df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_STATE)
    .drop('link', axis=1)      # the lyrics URL is not used by the recommender
    .reset_index(drop=True)    # row labels become 0..n-1 again after sampling
)

In [None]:
# Merge every available lyric into one corpus string
all_lyrics = " ".join(df['text'].dropna())

# Configure the cloud first, then feed it the corpus
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
)
wordcloud = wordcloud.generate(all_lyrics)

In [None]:
# Preview the sampled frame (the 'link' column has been dropped)
df.head()

Unnamed: 0,artist,song,text
0,Omd,Only Tears,Only tears are meant to fall \r\nOnly once an...
1,Prince,A Case Of U,I am a lonely painter \r\nI live in a box of ...
2,Wanda Jackson,Last Letter,Why must you treat me as if I were only a frie...
3,Qntal,Ad Mortem Festinamus,Scribere probosui de contemptu mundano \r\nUt...
4,Elvis Presley,A Big Hunk O' Love,"Hey baby, I ain't askin' much of you \r\nNo n..."


In [None]:
# Render the word cloud as an image on a 10x6 inch figure
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
# Hide the axes: tick marks carry no meaning for a word cloud image
plt.axis("off")
plt.title("Most common words in lyrics")
plt.show()

NameError: name 'plt' is not defined

Data Preprocessing

In [None]:
# Fetch the NLTK resources used below: the stopword corpus and the punkt tokenizer data

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Keep the English stopwords in a set so membership checks during filtering are fast
stop_words = set(stopwords.words('english'))

NameError: name 'stopwords' is not defined

In [None]:
def preprocess_text(text):
    """
    Normalise raw lyrics into a space-separated string of lowercase content words.

    Reads the module-level `stop_words` set to decide which tokens to drop.

    Parameters:
    text (str): Raw lyrics. Non-string values (e.g. NaN for missing lyrics)
        are treated as empty text.

    Returns:
    str: Lowercased tokens with non-letters and English stopwords removed,
        joined by single spaces ("" when nothing remains).
    """
    # Missing lyrics arrive from pandas as float NaN; re.sub would raise on them
    if not isinstance(text, str):
        return ""

    # Replace (rather than delete) every non-letter with a space. Deleting
    # glues words that are separated only by punctuation ("face,it's" ->
    # "faceits") and turns contractions into tokens such as "dont" that the
    # stopword list does not contain. Splitting yields fragments like "don",
    # "t", "ll", "re", which NLTK's English stopword list does include.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text.lower())

    tokens = word_tokenize(text)
    # Filter out stopwords
    tokens = [word for word in tokens if word not in stop_words]

    return " ".join(tokens)

In [None]:
# Clean every song's lyrics, then store the result in a new column

cleaned_lyrics = df['text'].apply(preprocess_text)
df['cleaned_text'] = cleaned_lyrics

In [None]:
# Preview the frame with the new 'cleaned_text' column
df.head()

Unnamed: 0,artist,song,text,cleaned_text
0,Omd,Only Tears,Only tears are meant to fall \r\nOnly once an...,tears meant fall start lose fight nothing else...
1,Prince,A Case Of U,I am a lonely painter \r\nI live in a box of ...,lonely painter live box paints used 2 frighten...
2,Wanda Jackson,Last Letter,Why must you treat me as if I were only a frie...,must treat friend done 's made different cold ...
3,Qntal,Ad Mortem Festinamus,Scribere probosui de contemptu mundano \r\nUt...,scribere probosui de contemptu mundano ut dege...
4,Elvis Presley,A Big Hunk O' Love,"Hey baby, I ain't askin' much of you \r\nNo n...","hey baby , ai n't askin ' much baby , ai n't a..."


In [None]:
# TF-IDF vectoriser for the cleaned lyrics; max_features bounds the vocabulary size
tfidf_vectorizer = TfidfVectorizer(max_features=10000)

# Learn the vocabulary and vectorise every song in one pass
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])


In [None]:
# Pairwise cosine similarity between all songs; when only one matrix is given
# it is compared against itself
cosine_sim = cosine_similarity(tfidf_matrix)


In [None]:
def recommend_songs(song_name, df=df, cosine_sim=cosine_sim, top_n=5):
    """
    Recommend top N similar songs based on cosine similarity.

    Parameters:
    song_name (str): Name of the song to search (matched case-insensitively;
        the first match is used when several songs share the title)
    df (DataFrame): Dataset containing song and artist columns; row i must
        correspond to row i of cosine_sim
    cosine_sim (array): Cosine similarity matrix
    top_n (int): Number of recommendations to return (values below 1 yield
        an empty result)

    Returns:
    DataFrame or str: Recommended songs (artist, song) ordered from most to
        least similar, or an error message if the song is not found
    """

    # Locate the song by row POSITION, not index label: both cosine_sim and
    # the final .iloc lookup are positional, so labels would only be correct
    # for a default RangeIndex.
    is_match = (df['song'].str.lower() == song_name.lower()).to_numpy()
    match_positions = np.flatnonzero(is_match)

    if len(match_positions) == 0:
        return "Song not found in the dataset!"

    song_pos = int(match_positions[0])

    # Similarity of the chosen song to every song in the dataset
    scores = np.asarray(cosine_sim[song_pos], dtype=float).ravel()

    # Highest similarity first; the stable sort keeps ties in row order
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query song explicitly. It is not guaranteed to sort first:
    # another song with identical lyrics also scores 1.0 and may precede it.
    ranked = ranked[ranked != song_pos]

    # Clamp so a negative top_n gives an empty result instead of a
    # surprising negative-index slice
    top_positions = ranked[:max(top_n, 0)]

    # Return recommended songs
    return df[['artist', 'song']].iloc[top_positions]


In [None]:
# Look up the title stored at row label 2
df.loc[2, 'song']

'Last Letter'

In [None]:
print("\nRecommendations for the song 'Last Letter':\n")

# Fetch the songs whose lyrics are most similar to "Last Letter" and show them
recommendations = recommend_songs("Last Letter")
print(recommendations)


Recommendations for the song 'Last Letter':

                        artist                          song
3551                      Cake  Friend Is A Four Letter Word
9572  Electric Light Orchestra     From The End Of The World
1572              Donna Summer                      Our Love
6348                Roxy Music                Same Old Scene
6276           The Temptations            Just One Last Look
