In [1]:
!pip install kagglehub --upgrade



In [2]:
import kagglehub

In [3]:
# download dataset from kaggle

data_path = kagglehub.dataset_download("undefinenull/million-song-dataset-spotify-lastfm")

In [4]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# segregate 2 data's

data_path = Path(data_path)

songs_data_path = data_path / 'Music Info.csv'
users_data_path = data_path / 'User Listening History.csv'

In [6]:
# load the songs dataset

df_songs = pd.read_csv(songs_data_path)
df_songs.head()

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",,2004,222200,0.355,...,1,-4.36,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,4
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,https://p.scdn.co/mp3-preview/d012e536916c927b...,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",,2006,258613,0.409,...,2,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4
2,TROUVHL128F426C441,Come as You Are,Nirvana,https://p.scdn.co/mp3-preview/a1c11bb1cb231031...,0keNu0t0tqsWtExGM3nT1D,"rock, alternative, alternative_rock, 90s, grunge",RnB,1991,218920,0.508,...,4,-5.783,0,0.04,0.000175,0.000459,0.0878,0.543,120.012,4
3,TRUEIND128F93038C4,Take Me Out,Franz Ferdinand,https://p.scdn.co/mp3-preview/399c401370438be4...,0ancVQ9wEcHVd0RrGICTE4,"rock, alternative, indie, alternative_rock, in...",,2004,237026,0.279,...,9,-8.851,1,0.0371,0.000389,0.000655,0.133,0.49,104.56,4
4,TRLNZBD128F935E4D8,Creep,Radiohead,https://p.scdn.co/mp3-preview/e7eb60e9466bc3a2...,01QoK9DA7VTeTSE3MNzp4I,"rock, alternative, indie, alternative_rock, in...",RnB,2008,238640,0.515,...,7,-9.935,1,0.0369,0.0102,0.000141,0.129,0.104,91.841,4


In [7]:
# size of dataset

df_songs.shape

(50683, 21)

In [8]:
# data info

df_songs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50683 entries, 0 to 50682
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   track_id             50683 non-null  object 
 1   name                 50683 non-null  object 
 2   artist               50683 non-null  object 
 3   spotify_preview_url  50683 non-null  object 
 4   spotify_id           50683 non-null  object 
 5   tags                 49556 non-null  object 
 6   genre                22348 non-null  object 
 7   year                 50683 non-null  int64  
 8   duration_ms          50683 non-null  int64  
 9   danceability         50683 non-null  float64
 10  energy               50683 non-null  float64
 11  key                  50683 non-null  int64  
 12  loudness             50683 non-null  float64
 13  mode                 50683 non-null  int64  
 14  speechiness          50683 non-null  float64
 15  acousticness         50683 non-null 

# Data Cleaning



1.   remove duplicates
2.   since genre has 56% values as missing, we will drop it
3.   assign missing values of tags as "no tags"
4.   convert columns to lowercase
5.   drop unique columns (like Name, Id, etc.) on which similarity can't be calculated





In [9]:
# reset the index

df_songs.reset_index(drop=True, inplace=True)

In [10]:
# duplicates in the data based on spotify_id

df_songs.duplicated(subset="spotify_id").sum()

np.int64(9)

In [11]:
# drop duplicates

df_songs.drop_duplicates(subset=["spotify_id", "year", "duration_ms"], inplace=True)

In [12]:
# check for duplicates

df_songs.duplicated(subset=["spotify_id", "year", "duration_ms"]).sum()

np.int64(0)

In [13]:
# remove columns not required for collaborative filtering

cols_to_remove = ['track_id', 'name', 'spotify_id', 'genre', 'spotify_preview_url']
df_content_filtering = df_songs.drop(columns=cols_to_remove)
df_content_filtering

Unnamed: 0,artist,tags,year,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,The Killers,"rock, alternative, indie, alternative_rock, in...",2004,222200,0.355,0.918,1,-4.360,1,0.0746,0.001190,0.000000,0.0971,0.240,148.114,4
1,Oasis,"rock, alternative, indie, pop, alternative_roc...",2006,258613,0.409,0.892,2,-4.373,1,0.0336,0.000807,0.000000,0.2070,0.651,174.426,4
2,Nirvana,"rock, alternative, alternative_rock, 90s, grunge",1991,218920,0.508,0.826,4,-5.783,0,0.0400,0.000175,0.000459,0.0878,0.543,120.012,4
3,Franz Ferdinand,"rock, alternative, indie, alternative_rock, in...",2004,237026,0.279,0.664,9,-8.851,1,0.0371,0.000389,0.000655,0.1330,0.490,104.560,4
4,Radiohead,"rock, alternative, indie, alternative_rock, in...",2008,238640,0.515,0.430,7,-9.935,1,0.0369,0.010200,0.000141,0.1290,0.104,91.841,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50678,アンティック-珈琲店-,,2008,273440,0.438,0.933,6,-3.062,0,0.1650,0.003120,0.000000,0.1300,0.421,166.956,4
50679,ACIDMAN,"rock, alternative_rock, japanese, cover",2004,275133,0.351,0.693,0,-6.811,1,0.1200,0.000940,0.000049,0.1920,0.450,200.350,4
50680,coldrain,"metal, metalcore, post_hardcore",2014,254826,0.434,0.975,10,-3.092,0,0.2680,0.000108,0.001410,0.1630,0.282,158.025,4
50681,アンティック-珈琲店-,,2008,243293,0.513,0.902,4,-3.914,0,0.0530,0.000715,0.001350,0.0571,0.618,109.923,4


In [14]:
# check for missing values

df_content_filtering.isna().sum()

Unnamed: 0,0
artist,0
tags,1126
year,0
duration_ms,0
danceability,0
energy,0
key,0
loudness,0
mode,0
speechiness,0


In [15]:
# fill the tags column missing values with string "no tags"

df_content_filtering.fillna({"tags": "no_tags"}, inplace=True)

In [16]:
# check for missing values

df_content_filtering.isna().sum()

Unnamed: 0,0
artist,0
tags,0
year,0
duration_ms,0
danceability,0
energy,0
key,0
loudness,0
mode,0
speechiness,0


In [17]:
# artists names as lowercase

df_content_filtering["artist"] = df_content_filtering["artist"].str.lower()

In [18]:
# number of unique artists

df_songs.loc[:,'artist'].nunique()

8317

In [19]:
# number of unique year values

df_songs.loc[:,'year'].nunique()

75

In [20]:
# min & max of numerical columns

df_songs.select_dtypes(include="int").agg(["min", "max"])

Unnamed: 0,year,duration_ms,key,mode,time_signature
min,1900,1439,0,0,0
max,2022,3816373,11,1,5


In [21]:
# value counts for the tags

(
    df_songs
    .loc[:,'tags']
    .str.lower()
    .str.split(',')
    .explode()
    .str.strip()
    .value_counts()
)

Unnamed: 0_level_0,count
tags,Unnamed: 1_level_1
rock,10681
indie,7284
electronic,6592
alternative,6271
pop,4650
...,...
dark_ambient,602
japanese,489
polish,411
j_pop,213


In [22]:
# tags whose value is greater than 1K

(
    df_songs
    .loc[:,'tags']
    .str.lower()
    .str.split(',')
    .explode()
    .str.strip()
    .value_counts()
    .loc[lambda ser: ser>=1000]
)

Unnamed: 0_level_0,count
tags,Unnamed: 1_level_1
rock,10681
indie,7284
electronic,6592
alternative,6271
pop,4650
...,...
ska,1088
gothic_metal,1072
grindcore,1040
french,1018


# Transform the data

In [23]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [24]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from category_encoders.count import CountEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer

In [25]:
df_content_filtering.head(3)

Unnamed: 0,artist,tags,year,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,the killers,"rock, alternative, indie, alternative_rock, in...",2004,222200,0.355,0.918,1,-4.36,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,4
1,oasis,"rock, alternative, indie, pop, alternative_roc...",2006,258613,0.409,0.892,2,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4
2,nirvana,"rock, alternative, alternative_rock, 90s, grunge",1991,218920,0.508,0.826,4,-5.783,0,0.04,0.000175,0.000459,0.0878,0.543,120.012,4


In [26]:
df_content_filtering.shape

(50674, 16)

In [27]:
# columns to transform

frequency_encode_cols = ["year"]
ohe_cols = ["artist", "time_signature", "key"]
tfidf_cols = 'tags' # TfidfVectorizer expects a 1D text-like input — a Series, not a DataFrame
standard_scale_cols = ["duration_ms", "loudness", "tempo"]
min_max_scale_cols = ["danceability", "energy", "speechiness", "acousticness", "instrumentalness", "liveness", "valence"]

In [28]:
len(frequency_encode_cols + ohe_cols + standard_scale_cols + min_max_scale_cols)

14

In [29]:
# transform the data

transformer = ColumnTransformer(transformers=[
    ("frequency_encode", CountEncoder(normalize=True, return_df=True), frequency_encode_cols),
    ("ohe", OneHotEncoder(handle_unknown="ignore"), ohe_cols),
    ("tfidf", TfidfVectorizer(max_features=85), tfidf_cols),
    ("standard_scale", StandardScaler(), standard_scale_cols),
    ("min_max_scale", MinMaxScaler(), min_max_scale_cols)
], remainder="passthrough", n_jobs=-1, force_int_remainder_cols=False)

transformer

In [30]:
# fit the transformer

transformer.fit(df_content_filtering)

In [31]:
# transform the data

transformed_df = transformer.transform(df_content_filtering)

In [32]:
transformed_df.shape

(50674, 8431)

In [33]:
transformed_df # it's a sparse matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 907911 stored elements and shape (50674, 8431)>

In [34]:
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
# featch songs where artist in Shakira

df_songs.loc[df_songs['artist'] == "Shakira", :]

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
1025,TRLWDVU128F932B093,"Whenever, Wherever",Shakira,https://p.scdn.co/mp3-preview/09ddeb4ae33ee6e8...,07PHBDuUmOeZ7jeKSbAbKi,"rock, pop, female_vocalists, singer_songwriter...",,2012,196826,0.787,...,1,-4.967,0,0.0474,0.298,5e-06,0.206,0.86,107.674,4
2068,TRILOWN128F426080F,Underneath Your Clothes,Shakira,https://p.scdn.co/mp3-preview/6c5a56058ce04371...,07qRl4PT2lA6O3KN40McLz,"rock, pop, female_vocalists, singer_songwriter...",Pop,2013,224893,0.707,...,8,-5.293,1,0.0298,0.691,0.0,0.103,0.407,82.784,4
2205,TRXLMFJ12903CC06F7,She Wolf,Shakira,https://p.scdn.co/mp3-preview/4dc802fd3b06fcb5...,075xFXR0JDBwFPVinG1ig5,"electronic, pop, female_vocalists, experimenta...",,2009,187866,0.857,...,7,-6.48,1,0.0428,0.323,0.00322,0.314,0.868,121.994,4
2469,TRSQAWU128F92EA20F,La Tortura,Shakira,https://p.scdn.co/mp3-preview/e62ae4649f029729...,0ofDrTTcinCUxm7wqCLPQa,"electronic, pop, female_vocalists, singer_song...",Latin,2011,213106,0.741,...,0,-5.904,1,0.0421,0.023,0.000788,0.12,0.834,100.001,4
3374,TRINUNP12903CD84D9,Did It Again,Shakira,https://p.scdn.co/mp3-preview/5477eae2283113ff...,0eMNEdcC5OImvrfn79J9dU,"electronic, pop, female_vocalists, experimenta...",,2009,227333,0.869,...,5,-5.069,0,0.0896,0.509,0.0,0.0741,0.599,137.955,4
4319,TROLUWR128F92E5858,Te Dejo Madrid,Shakira,https://p.scdn.co/mp3-preview/bcaccba847a5cf53...,0IESLhxv5iqXvMH5mm3z88,"pop, experimental, singer_songwriter, dance, l...",,2001,187333,0.741,...,11,-5.011,1,0.0379,0.0244,1.5e-05,0.283,0.926,131.017,4
4600,TRLNLES128F932DA8E,Fool,Shakira,https://p.scdn.co/mp3-preview/4ce283ae87d032f5...,0MttJjfO3pkTHJgVdXqPcP,"rock, pop, female_vocalists, acoustic, pop_rock",Rap,2001,230333,0.64,...,3,-5.848,1,0.0241,0.0456,0.0,0.0988,0.42,100.994,4
4618,TRUIPZL12903CA0BFE,Rules,Shakira,https://p.scdn.co/mp3-preview/e5c557dc2b36006a...,1f7TZQzs4UikFQIzeWOtOj,"pop, female_vocalists, singer_songwriter, pop_...",,2001,219106,0.692,...,5,-6.365,0,0.0418,0.0359,2e-06,0.185,0.775,149.0,4
6047,TRAAKDG128F42A0ECB,Hips Don't Lie,Shakira,https://p.scdn.co/mp3-preview/3859547944f57cfb...,01Yj2MCGpjZs34PRlGgz4K,"pop, female_vocalists, singer_songwriter, danc...",Pop,2001,217453,0.777,...,10,-5.867,0,0.0734,0.284,0.0,0.43,0.76,100.003,4
6819,TRBAHID128F4278EAF,Objection (Tango),Shakira,https://p.scdn.co/mp3-preview/bf65095d5ce58358...,0p9QhtUdbyDAQ6k14hQ2i3,"pop, female_vocalists, singer_songwriter, danc...",Pop,2001,222533,0.603,...,11,-5.282,0,0.0677,0.0147,0.0,0.0246,0.705,179.344,4


In [36]:
# build input vector

song_input = df_content_filtering[df_songs['name'] == "Whenever, Wherever"]
song_input

Unnamed: 0,artist,tags,year,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
1025,shakira,"rock, pop, female_vocalists, singer_songwriter...",2012,196826,0.787,0.828,1,-4.967,0,0.0474,0.298,5e-06,0.206,0.86,107.674,4


In [37]:
# input vector to calculate similarity

input_vector = transformer.transform(song_input)
input_vector

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 20 stored elements and shape (1, 8431)>

In [38]:
# calculate the similarity matrix

similarity_scores = cosine_similarity(transformed_df, input_vector)
similarity_scores

array([[0.99999914],
       [0.99999847],
       [0.99999921],
       ...,
       [0.99999877],
       [0.9999992 ],
       [0.99999891]])

In [39]:
similarity_scores.shape

(50674, 1)

In [40]:
top_10_songs_indexes = np.argsort(similarity_scores.ravel())[-11:][::-1] # argsort() orts in ascending order (-1 to reverse)
top_10_songs_indexes

array([ 1025, 12305,  6046,  6129, 17241,  6133,  7172,  6121,  6526,
       38383,  6287])

In [41]:
top_10_songs_names = df_content_filtering.loc[top_10_songs_indexes, :]
top_10_songs_names

Unnamed: 0,artist,tags,year,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
1025,shakira,"rock, pop, female_vocalists, singer_songwriter...",2012,196826,0.787,0.828,1,-4.967,0,0.0474,0.298,5e-06,0.206,0.86,107.674,4
12305,beck,"rock, alternative, experimental, funk",1996,215160,0.71,0.886,1,-7.247,1,0.351,0.05,0.00818,0.0526,0.564,174.46,4
6046,iyaz,"pop, dance, rap, hip_hop, rnb, male_vocalists,...",2010,181946,0.721,0.746,9,-6.565,1,0.0644,0.163,0.0,0.195,0.182,91.033,4
6129,magic!,"pop, reggae, male_vocalists",2014,223830,0.747,0.752,1,-5.315,1,0.0354,0.047,0.0,0.0703,0.915,144.047,4
17241,lady gaga,"pop, dance, american, love",2011,316520,0.368,0.833,6,-5.869,0,0.0464,0.000154,0.00651,0.0545,0.482,143.982,4
6133,corinne bailey rae,"pop, female_vocalists, jazz, singer_songwriter...",2006,240175,0.586,0.222,5,-15.114,1,0.0395,0.665,0.814,0.0981,0.0792,122.766,4
7172,vanessa carlton,"pop, female_vocalists, singer_songwriter, pian...",2002,238440,0.319,0.671,0,-3.323,1,0.0329,0.226,0.0,0.169,0.409,185.256,3
6121,nena,"pop, female_vocalists, 80s, german, new_wave",2007,234800,0.404,0.433,4,-12.84,1,0.0616,0.0936,5e-06,0.112,0.573,192.711,4
6526,demi lovato,"pop, female_vocalists, beautiful",2011,220146,0.58,0.539,7,-5.229,1,0.0285,0.105,0.0,0.122,0.198,103.813,4
38383,steve vai,"instrumental, guitar",1990,351160,0.545,0.77,5,-8.854,1,0.0474,0.000287,0.75,0.718,0.63,127.058,4


In [42]:
def recommend(song_name, songs_data, transformed_data, k=10):
  """
  Recommends top k songs similar to the given song basedon content-based filtering.

  Parameters:
  song_name (str): the name of the song to base the recommendations on.
  song_data (DataFrame): the dataframe containing information.
  transformed_data (ndarray): the transformed data matrix for similarity calculations.
  k (int, optional): number of similar songs to recommend. default is 10.

  Returns:
  DataFrame: a dataframe containing the top k recommended songs with their names, artists, and Spotify preview URLs.
  """

  # filter out the song from data
  song_row = songs_data.loc[songs_data['name'] == song_name,:]
  if song_row.empty:
    print('Song not found in the dataset')
  else:
    # get the index of the song
    song_index = song_row.index[0]
    print(song_index)

    # generate the input vector
    input_vector = transformed_data[song_index].reshape(1, -1)

    # calculate similarity scores
    similarity_scores = cosine_similarity(input_vector, transformed_data)
    print(similarity_scores.shape)

    # get the top k songs
    top_k_songs_indexes = np.argsort(similarity_scores.ravel())[-k-1:][::-1]
    print(top_10_songs_indexes)

    # get the top k song names
    top_k_songs_names = songs_data.iloc[top_k_songs_indexes]

    # print the top k songs
    top_k_list = top_k_songs_names[['name', 'artist', 'spotify_preview_url']].reset_index(drop=True)

    return top_k_list

In [43]:
# recommend song using the function

recommend("Whenever, Wherever", songs_data=df_songs, transformed_data=transformed_df,k=10)

1025
(1, 50674)
[ 1025 12305  6046  6129 17241  6133  7172  6121  6526 38383  6287]


Unnamed: 0,name,artist,spotify_preview_url
0,"Whenever, Wherever",Shakira,https://p.scdn.co/mp3-preview/09ddeb4ae33ee6e8...
1,Why Wait,Shakira,https://p.scdn.co/mp3-preview/d78c90c5cb5626be...
2,Hips Don't Lie,Shakira,https://p.scdn.co/mp3-preview/3859547944f57cfb...
3,Oops!...I Did It Again,Britney Spears,https://p.scdn.co/mp3-preview/7fb86827422540ad...
4,Perfect Lover,Britney Spears,https://p.scdn.co/mp3-preview/52671e54d36f077e...
5,Bootylicious,Destiny's Child,https://p.scdn.co/mp3-preview/7e327ccb1e4c52b2...
6,Wild Things,Alessia Cara,https://p.scdn.co/mp3-preview/c13f00088525d0b2...
7,La Isla Bonita,Madonna,https://p.scdn.co/mp3-preview/d8f3cafe99c1f0cd...
8,Cruel Summer,Bananarama,https://p.scdn.co/mp3-preview/47d13ef240a58bef...
9,Dreams for Plans,Shakira,https://p.scdn.co/mp3-preview/6e2c021846087a88...
