# Music Recommendations Based on Lyrics Similarity
Latent Semantic Analysis

In [1]:
import math
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from collections import Counter
import re
from string import punctuation


import scipy.linalg as la
from sparsesvd import sparsesvd 
from scipy.sparse import csc_matrix

from scipy.spatial.distance import pdist, cdist


In [2]:
# Read the lyrics dataset from 'lyrics.csv' (path relative to the working directory)
# and preview the first rows.
data = pd.read_csv('lyrics.csv')
data.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [122]:
# Add a "<song> - <artist>" label column. Song names alone might be duplicated,
# so this combined 's-a' label is what the rest of the notebook uses to identify each song.
# NOTE(review): nothing here checks that the combined label is actually unique.
data['s-a'] = data['song'] + ' - ' + data['artist']
data.head()

Unnamed: 0,artist,song,link,text,s-a
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd...",Ahe's My Kind Of Girl - ABBA
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl...","Andante, Andante - ABBA"
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...,As Good As New - ABBA
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,Bang - ABBA
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,Bang-A-Boomerang - ABBA


In [123]:
# (rows, columns) of the full dataset.
data.shape

(57650, 5)

In [191]:
# only use the first 10000 songs as an example
data10000 = data.iloc[:10000, :] 
data10000.shape

(10000, 5)

In [192]:
# Show the characters contained in string.punctuation, for reference.
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [193]:
#1 term frequency
def tf(docs, stop_words=None):
    """Count term occurrences for every song in ``docs``.

    Each lyric is stripped of every character that is not an ASCII letter or
    whitespace, lower-cased, split on whitespace, and filtered against the
    stop-word list before counting.

    Parameters
    ----------
    docs : pandas.DataFrame
        Must contain the columns ``'s-a'`` (song label) and ``'text'`` (lyrics).
        Rows are read in positional order, so any index is accepted.
    stop_words : iterable of str, optional
        Terms to drop. When omitted, NLTK's English stop-word list is used
        (this requires the NLTK ``stopwords`` corpus to be available).

    Returns
    -------
    pandas.DataFrame
        One row per term and one column per ``'s-a'`` label; a cell holds the
        number of times the term occurs in that song (0 when absent). If two
        rows share the same label, the later row replaces the earlier one.
    """
    if stop_words is None:
        # Imported lazily so callers that supply their own list do not need nltk.
        from nltk.corpus import stopwords
        stop_words = stopwords.words('english')
    # Built once, outside the loop: set membership is O(1) per term.
    stop_words = set(stop_words)

    wc_dic = {}
    # Iterate positionally instead of docs.loc[i, ...] so frames whose index is
    # not 0..n-1 (e.g. a filtered slice) work without reset_index().
    for name, text in zip(docs['s-a'], docs['text']):
        terms = re.sub(r'[^a-zA-Z\s]', '', text).lower().split()
        wc_dic[name] = Counter(t for t in terms if t not in stop_words)

    return pd.DataFrame(wc_dic).fillna(0)

In [194]:
# 2 inverse document frequency
def idf(docs):
    """Return the inverse document frequency of every term in ``docs``.

    For each term the weight is ``log(n / (1 + d))``, where ``n`` is the number
    of rows in ``docs`` and ``d`` is the number of songs whose term count is
    positive. Because of the ``1 +`` in the denominator, a term that occurs in
    every song gets a negative weight (``log(n / (n + 1))``).

    Returns a pandas Series indexed by term.
    """
    counts = tf(docs)
    n_docs = len(docs)
    songs_with_term = (counts > 0).sum(axis=1)
    return np.log(n_docs / (1 + songs_with_term))

In [195]:
# 3 tf-idf table
def tf_idf(docs):
    """Return the tf-idf table for ``docs`` (rows are terms, columns are songs).

    Every term count from ``tf(docs)`` is multiplied by that term's inverse
    document frequency, ``log(n / (1 + d))`` with ``n = len(docs)`` and ``d`` the
    number of songs containing the term. This is the same weighting ``idf(docs)``
    produces; it is computed here from the already-built count table so that
    ``tf(docs)`` is evaluated once rather than twice.
    """
    termf = tf(docs)
    docf = (termf > 0).sum(axis=1)
    weights = np.log(len(docs) / (1 + docf))
    # axis=0 aligns the per-term weights with the row (term) index.
    return termf.multiply(weights, axis=0)

In [196]:
# Build the tf-idf table (rows are terms, columns are 's-a' labels) for the 10000-song subset.
# this could take a few minutes
df = tf_idf(data10000)

In [197]:
# (number of terms, number of songs) in the tf-idf table.
df.shape

(36029, 10000)

In [198]:
# Rank-k sparse SVD of the tf-idf table; k is the number of latent dimensions kept.
k = 100
T, s, D = sparsesvd(csc_matrix(df), k)  
# T is (k, n_terms), s is (k,), D is (k, n_songs):
# each col of T is a term; each col of D is a song
print(T.shape, s.shape, D.shape)
# Scale every song's latent coordinates by the singular values; each col of X is a song.
X = np.diag(s) @ D
print('X shape:', X.shape)

(100, 36029) (100,) (100, 10000)
X shape: (100, 10000)


In [199]:
# Look up the query song in the full dataset by its 's-a' label.
data.loc[data['s-a']=='Rolling In The Deep - Adele']

Unnamed: 0,artist,song,link,text,s-a
22979,Adele,Rolling In The Deep,/a/adele/rolling+in+the+deep_20894941.html,There's a fire starting in my heart \nReachin...,Rolling In The Deep - Adele


In [236]:
# input current song
new_data = data.loc[data['s-a']=='Rolling In The Deep - Adele'].reset_index(drop=True)
new_sa = new_data['s-a'][0]
all_terms = tf_idf(new_data)

# Align the query's term weights with the corpus vocabulary (the rows of df):
# terms the query does not contain become 0, and query terms that are not in the
# corpus vocabulary are dropped. Reindexing only the query column avoids copying
# the whole term-by-song table and also works when the query song is itself a
# column of df (a join would fail on the duplicate column name).
# NOTE(review): tf_idf on a one-song frame uses n = 1, so every query term is
# weighted tf * log(1/2), i.e. negatively. The ranking step sorts by descending
# cosine distance, which undoes that sign flip - change the two together.
v = all_terms[new_sa].reindex(df.index).fillna(0)

# Fold the query into the k-dimensional latent space: q = v^T T^T S^-1
q = (v @ T.T @ np.diag(1.0/s)).reshape(-1,1)
print('q shape:', q.shape)

q shape: (100, 1)


In [237]:
# Rank the indexed songs against the query vector q.
# cdist returns a (1, n_songs) matrix; take its single row so the titles are
# indexed with a 1-D order. Indexing a pandas Index with a 2-D array (as the
# previous one-liner did) is deprecated since pandas 1.0 and rejected by pandas 2.x.
# NOTE(review): [::-1] puts the LARGEST cosine distance first. That yields
# "most similar first" only because q is built from negatively weighted query
# terms (log(1/2) for a one-song tf-idf), which flips the sign of the cosine.
query_dist = cdist(q.T, X.T, 'cosine')[0]
ranked_reviews = df.columns.values[np.argsort(query_dist)][::-1]

In [238]:
# Print the query song, then the five top-ranked songs, each followed by the
# first 400 characters of its lyrics.
PREVIEW_LEN = 400
RULE = '-'*40

print('='*10, "Current Song", '='*10, '\n')
print('<', new_sa, '>')
print(RULE)
print(new_data['text'][0][:PREVIEW_LEN])
print('...\n')

print('='*10, "Top 5 Recommended", '='*10, '\n')

for rank, title in enumerate(ranked_reviews[:5], start=1):
    # First row of the indexed subset carrying this label.
    lyrics = data10000.loc[data10000['s-a'] == title, 'text'].iloc[0]
    print('%03d' % rank, '<', title, '>')
    print(RULE)
    print(lyrics[:PREVIEW_LEN])
    print('...\n')



< Rolling In The Deep - Adele >
----------------------------------------
There's a fire starting in my heart  
Reaching a fever pitch and it's bringing me out the dark  
Finally I can see you crystal clear  
Go 'head and sell me out and I'll lay your ship bare  
See how I leave with every piece of you  
Don't underestimate the things that I will do  
There's a fire starting in my heart  
Reaching a fever pitch and its bringing me out the dark  
  
The scars of your lov
...


001 < I'm Not Gonna Miss You - Glen Campbell >
----------------------------------------
I'm still here, but yet I'm gone  
I don't play guitar or sing my songs  
They never defined who I am  
The man that loves you 'til the end  
  
You're the last person I will love  
You're the last face I will recall  
And best of all, I'm not gonna miss you  
Not gonna miss you  
  
I'm never gonna hold you like I did  
Or say, "I love you" to the kids  
You're never gonna see it in my eyes  
It'
...

002 < Cherizar - Gino Van

### Let's try a new song

In [232]:
# Look up the second query song in the full dataset by its 's-a' label.
data.loc[data['s-a']=='Dj Got Us Falling In Love - Usher']

Unnamed: 0,artist,song,link,text,s-a
20299,Usher,Dj Got Us Falling In Love,/u/usher/dj+got+us+falling+in+love_20604379.html,Usher (yeah man) \nSo we back in the club \n...,Dj Got Us Falling In Love - Usher


In [233]:
# input current song
new_data = data.loc[data['s-a']=='Dj Got Us Falling In Love - Usher'].reset_index(drop=True)
new_sa = new_data['s-a'][0]
all_terms = tf_idf(new_data)

# Align the query's term weights with the corpus vocabulary (the rows of df):
# terms the query does not contain become 0, and query terms that are not in the
# corpus vocabulary are dropped. Reindexing only the query column avoids copying
# the whole term-by-song table and also works when the query song is itself a
# column of df (a join would fail on the duplicate column name).
# NOTE(review): tf_idf on a one-song frame uses n = 1, so every query term is
# weighted tf * log(1/2), i.e. negatively. The ranking step sorts by descending
# cosine distance, which undoes that sign flip - change the two together.
v = all_terms[new_sa].reindex(df.index).fillna(0)

# Fold the query into the k-dimensional latent space: q = v^T T^T S^-1
q = (v @ T.T @ np.diag(1.0/s)).reshape(-1,1)
print('q shape:', q.shape)

q shape: (100, 1)


In [234]:
# Rank the indexed songs against the query vector q.
# cdist returns a (1, n_songs) matrix; take its single row so the titles are
# indexed with a 1-D order. Indexing a pandas Index with a 2-D array (as the
# previous one-liner did) is deprecated since pandas 1.0 and rejected by pandas 2.x.
# NOTE(review): [::-1] puts the LARGEST cosine distance first. That yields
# "most similar first" only because q is built from negatively weighted query
# terms (log(1/2) for a one-song tf-idf), which flips the sign of the cosine.
query_dist = cdist(q.T, X.T, 'cosine')[0]
ranked_reviews = df.columns.values[np.argsort(query_dist)][::-1]

In [235]:
# Print the query song, then the five top-ranked songs, each followed by the
# first 500 characters of its lyrics.
PREVIEW_LEN = 500
RULE = '-'*40

print('='*10, "Current Song", '='*10, '\n')
print('<', new_sa, '>')
print(RULE)
print(new_data['text'][0][:PREVIEW_LEN])
print('...\n')

print('='*10, "Top 5 Recommended", '='*10, '\n')

for rank, title in enumerate(ranked_reviews[:5], start=1):
    # First row of the indexed subset carrying this label.
    lyrics = data10000.loc[data10000['s-a'] == title, 'text'].iloc[0]
    print('%03d' % rank, '<', title, '>')
    print(RULE)
    print(lyrics[:PREVIEW_LEN])
    print('...\n')



< Dj Got Us Falling In Love - Usher >
----------------------------------------
Usher (yeah man)  
So we back in the club  
With our bodies rockin' from side to side (side to side)  
Thank God the week is done  
I feel like a zombie goin' back to life (back to life)  
Hands up, and suddenly we all got our hands up  
No control of my body  
Ain't I seen you before?  
I think I remember those eyes, eyes, eyes, eyes  
  
'Cause baby tonight,  
The D-J got us falling in love again  
Yeah, baby tonight,  
The D-J got us falling in love again  
So dance, dance,  
Like it's the la
...


001 < Love Shines - Fleetwood Mac >
----------------------------------------
You've got a sweet heart  
Never will you be replaced  
And it's so hard  
The memory can't be erased  
Love shines when I think of you  
You make it happen  
You make it true  
(love shines)  
Love shines there can be no doubt  
What this feeling is all about  
Oh baby your  
Love shines, love shines so bright  
(so bright)  
Did you