# Song Recommendation Based on Lyrics Similarity
Latent Sematic Analysis

In [1]:
import math
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from collections import Counter
import re
from string import punctuation


import scipy.linalg as la
from sparsesvd import sparsesvd 
from scipy.sparse import csc_matrix

from scipy.spatial.distance import pdist, cdist


In [2]:
data = pd.read_csv('lyrics.csv')
data.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [122]:
data['s-a'] = data['song'] + ' - ' + data['artist'] # song name might be duplicated, we use s-a to identify each obs
data.head()

Unnamed: 0,artist,song,link,text,s-a
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd...",Ahe's My Kind Of Girl - ABBA
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl...","Andante, Andante - ABBA"
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...,As Good As New - ABBA
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,Bang - ABBA
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,Bang-A-Boomerang - ABBA


In [123]:
data.shape

(57650, 5)

In [124]:
# only use first 5000 songs as an example
data5000 = data.iloc[:5000, :] 
data5000.shape

(5000, 5)

In [125]:
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [126]:
#1 term frequency
def tf(docs):
    wc_dic = {}
    for i in range(docs.shape[0]):
        # str.maketrans: If there is a third argument, it must be a string, whose characters will be mapped to None in the result.
        name = docs.loc[i, 's-a']
        text = docs.loc[i, 'text']
        terms = re.sub(r'[^a-zA-Z\s]','', text).lower().split()
     
        # filter out stop words
        from nltk.corpus import stopwords
        stop_words = set(stopwords.words('english'))
        terms = [t for t in terms if not t in stop_words]
        
        count = Counter(terms)
        wc_dic[name] = count
    df = pd.DataFrame(wc_dic).fillna(0)
    return df

In [127]:
# 2 inverse document frequency 
def idf(docs): 
    termf = tf(docs)
    docf = (termf > 0).sum(axis=1)
    n = len(docs)
    return np.log(n/(1+docf))

In [128]:
# 3 tf-idf table
def tf_idf(docs):
    return tf(docs).multiply(idf(docs), axis=0)

In [129]:
df = tf_idf(data5000)
#

In [130]:
df.shape # repeated song

(22544, 5000)

In [131]:
k = 100
T, s, D = sparsesvd(csc_matrix(df), k)  
print(T.shape, s.shape, D.shape) # each col of T is a term; each col of D is a doc
X = np.diag(s) @ D # each col is a doc
print('X shape:', X.shape)

(100, 22544) (100,) (100, 5000)
X shape: (100, 5000)


In [132]:
new_data = data.iloc[5000:5001, :].reset_index(drop=True)
all_terms = tf_idf(new_data) 
new_sa = new_data['s-a'][0]
v = df.join(all_terms).fillna(0)[new_sa] 

q = (v.T @ T.T @ np.diag(1.0/s)).reshape(-1,1) 
print('q shape:', q.shape) 

q shape: (100, 1)


In [133]:
ranked_reviews = df.columns[np.argsort(cdist(q.T, X.T, 'cosine'))].values[0][::-1]

In [139]:
print('='*10, "Current Song", '='*10, '\n')
print('<', new_sa, '>')
print('-'*40)
print(new_data['text'][0][:200])
print('...\n')

print('='*10, "Top 5 Recommended", '='*10, '\n')

for i, title in enumerate(ranked_reviews[:5]):
    print('%03d' % (i+1), '<', title, '>')
    print('-'*40)
    print(data5000.loc[data5000['s-a'] == title, 'text'].reset_index(drop=True)[0][:200])
    print('...\n')



< A Big Hunk O' Love - Elvis Presley >
----------------------------------------
Hey baby, I ain't askin' much of you  
No no no no no no no no  
Baby, I ain't askin' much of you  
Just a big a big a big a hunk o' love will do  
  
Don't be a stingy little mama  
You're 'bout to s
...


001 < Me And My Baby - Chicago >
----------------------------------------
Roxie.  
Look at my baby my baby and me..  
  
Me and my baby  
My baby and me  
We're 'bout as happy as babies can be  
What if I find  
That I'm causght in a storm?  
I don't care  
My baby's there 
...

002 < River Deep - Mountain High - Cyndi Lauper >
----------------------------------------
When I was a little girl I had a rag doll  
The only doll I've ever owned  
Now I love you just the way I loved that rag doll  
But only now my love has grown  
And it gets stronger in every way  
And
...

003 < Make It Big - Beach Boys >
----------------------------------------
Make it big  
So big  
  
You can make it big  
You can make 