In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# loading the dataset that contains lyrics data with word count for each track.
merged_df = pd.read_csv('/Users/vivekrambha/Documents/QMUL Slides and Notes/Music recommendation system/combined_data.csv')

In [3]:
# created a table where each row corresponds to a track, each column corresponding to a word and values are the word counts for each track 
track_word_matrix = merged_df.pivot_table(index='track_id', columns='word', values='count', fill_value=0)

In [4]:
# lyrics of the input song which is same as the MP3 file present in part 2
input_lyrics = """It's been so long
That I haven't seen your face
I'm tryna be strong
But the strength I have is washing away
It won't be long
Before I get you by my side
And just hold you, tease you, squeeze you
Tell you what's been on my mind
I wanna make up right now, na, na
I wanna make up right now, na, na
Wish we never broke up right now, na, na
We need to link up right now, na, na
I wanna make up right now, na, na
I wanna make up right now, na, na
Wish we never broke up right now, na, na
We need to link up right now, na, na
Girl, I know
Mistakes were made between us two
And we showed our ass that night
Even said some things weren't true
Watched you go
And haven't seen my girl since then
Why can't it be the way it was
'Cause you were my homie, lover and friend
I wanna make up right now, na, na
I wanna make up right now, na, na
Wish we never broke up right now, na, na
We need to link up right now, na, na
I wanna make up right now, na, na
I wanna make up right now, na, na
Wish we never broke up right now, na, na
We need to link up right now, na, na
I can't lie (I miss you much)
Watchin' every day that goes by (I miss you much)
Until I get you back, I'm gon' try (yes, I miss you much)
'Cause you are the apple of my eye (girl, I miss you much)
(I miss you much)
I can't lie (I miss you much)
Watchin' every day that goes by (I miss you much)
Until I get you back, I'm gon' try (yes, I miss you much)
'Cause you are the apple of my eye (girl, I miss you much)
(I miss you much)
I wanna make up right now, na, na
I wanna make up right now, na, na
Wish we never broke up right now, na, na
We need to link up right now, na, na
I wanna make up right now, na, na
I wanna make up right now, na, na
Wish we never broke up right now, na, na
We need to link up right now, na, na
I want you to fly with me (want you to fly)
I miss how you lie with me (miss how you lie)
Just wish you could dine with me (wish you could dine with me)
The one that'll grind with me (said the one that'll grind with me)
I want you to fly with me (want you to fly)
I miss how you lie with me (oh, miss how you lie)
Just wish you could dine with me (wish you could dine)
The one that'll grind with me (oh, one that'll grind, yeah, yeah)
I wanna make up right now, na, na
I wanna make up right now, na, na (make up)
Wish we never broke up right now, na, na
We need to link up right now, na, na (link up, baby)
I wanna make up right now, na, na
I wanna make up right now, na, na
Wish we never broke up right now, na, na
We need to link up right now, na, na (link up)
"""

In [5]:
#initialize a countvectorizer
vectorizer = CountVectorizer(vocabulary=track_word_matrix.columns)

# transform the input lyrics into a veector based on the word counts
input_vector = vectorizer.transform([input_lyrics])

In [6]:
# Convert the sparse matrix to a dataframe
input_df = pd.DataFrame(input_vector.toarray(), columns=vectorizer.get_feature_names_out())

In [7]:
# computing cosine similarity between lyrics vector and each track in the dataset
similarity_scores = cosine_similarity(input_df, track_word_matrix)

In [8]:
# convert similarity score into a dataframe to get all values
similarity_df = pd.DataFrame(similarity_scores.T, index=track_word_matrix.index, columns=['similarity'])

In [9]:
# getting top five recommendations
top_n = 5
recommendations = similarity_df.sort_values(by='similarity', ascending=False).head(top_n)

In [10]:
print(recommendations)

                    similarity
track_id                      
TRAWSJK128F42767F2    0.557790
TRAEROB12903CE3E7D    0.534168
TRARBBK128F427ED68    0.506242
TRAWYNV128F92F5E7E    0.490976
TRADGWP128F42618E4    0.434340


In [11]:
recommended_tracks = merged_df.merge(recommendations, left_on='track_id', right_index=True)

In [12]:
recommended_tracks = recommended_tracks[['track_id', 'similarity', 'artist_name', 'song_title', 'album_name']].drop_duplicates()

In [13]:
for index, row in recommended_tracks.iterrows():
    print(f"Song ID: {row['track_id']}, Similarity: {row['similarity']:.4f}, Artist: {row['artist_name']}, Title: {row['song_title']}, Album: {row['album_name']}")

Song ID: TRARBBK128F427ED68, Similarity: 0.5062, Artist: Koffi Olomide, Title: Elle Et Moi, Album: Tcha Tcho
Song ID: TRAWYNV128F92F5E7E, Similarity: 0.4910, Artist: Guns N' Roses, Title: Catcher In The Rye, Album: Chinese Democracy
Song ID: TRADGWP128F42618E4, Similarity: 0.4343, Artist: Madilu System, Title: Vincent, Album: La Bonne Humeur
Song ID: TRAEROB12903CE3E7D, Similarity: 0.5342, Artist: Pepper, Title: Stormtrooper (Live), Album: Kona Gold
Song ID: TRAWSJK128F42767F2, Similarity: 0.5578, Artist: Ini Kamoze, Title: Here Comes The Hotstepper, Album: 100 R&B Classics - The Anthems
