In [54]:
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity   

In [55]:
df = pd.read_csv("spotify_millsongdata.csv")
df.tail(3)

Unnamed: 0,artist,song,link,text
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [56]:
df.shape

(57650, 4)

In [57]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [58]:
df = df.sample(5000).drop('link', axis=1).reset_index(drop=True)
df.head(2)

Unnamed: 0,artist,song,text
0,Hillsong United,Air I Breathe,"Hold me, in your arms \r\nLike a new born chi..."
1,Yes,New State Of Mind,Waiting for the moment when the moment has bee...


In [59]:
df['text'][0]

"Hold me, in your arms  \r\nLike a new born child  \r\nI'm desperate, lord  \r\nFor more of you  \r\n  \r\nTouch me, with your love  \r\nDeep within my heart  \r\nI'm waiting, lord  \r\nFor more of you  \r\n  \r\nMay your words of love  \r\nWash over me  \r\nMay your songs of grace  \r\nCover me  \r\n  \r\nMore than the air I breathe today  \r\nI need you  \r\nMore than the desert needs the rain  \r\nI need you  \r\nMore than the air breathe today  \r\nI need you  \r\nMore than to live another day  \r\nI need you\r\n\r\n"

In [60]:
df['text'] = df['text'].str.lower().replace(r'^a-zA-Z-0-9', " ").replace(r'\n', " ", regex=True)
df["text"]

0       hold me, in your arms  \r like a new born chil...
1       waiting for the moment when the moment has bee...
2       you know that i won't be untrue  \r you know t...
3       here's the thing  \r we started out friends  \...
4       there were rooms of forgiveness  \r in the hou...
                              ...                        
4995    it's been so long, darlin'  \r since i had to ...
4996    you wanna play, you wanna stay, you wanna have...
4997    she had that  \r camarillo brillo  \r flamin' ...
4998    light switch, yellow fever, crawling up your b...
4999    babe, i can't change my tune for you  \r but i...
Name: text, Length: 5000, dtype: object

In [61]:
Stemmer = PorterStemmer()

In [62]:
def token(txt):
    tok = nltk.word_tokenize(txt)
    a = [Stemmer.stem(w) for w in tok]
    return " ".join(a)

In [63]:
token("you are beautiful, beauty")

'you are beauti , beauti'

In [64]:
df["text"].apply(lambda x : token(x))

0       hold me , in your arm like a new born child i ...
1       wait for the moment when the moment ha been wa...
2       you know that i wo n't be untru you know that ...
3       here 's the thing we start out friend it wa co...
4       there were room of forgiv in the hous that we ...
                              ...                        
4995    it 's been so long , darlin ' sinc i had to go...
4996    you wan na play , you wan na stay , you wan na...
4997    she had that camarillo brillo flamin ' out alo...
4998    light switch , yellow fever , crawl up your ba...
4999    babe , i ca n't chang my tune for you but i 'l...
Name: text, Length: 5000, dtype: object

In [65]:
tfid = TfidfVectorizer(analyzer='word', stop_words='english')
matrix = tfid.fit_transform(df["text"])

In [66]:
similar = cosine_similarity(matrix)
similar

array([[1.        , 0.06043297, 0.0073961 , ..., 0.02146218, 0.        ,
        0.02737702],
       [0.06043297, 1.        , 0.04705823, ..., 0.01053046, 0.07498604,
        0.03330675],
       [0.0073961 , 0.04705823, 1.        , ..., 0.03017098, 0.0335461 ,
        0.01807353],
       ...,
       [0.02146218, 0.01053046, 0.03017098, ..., 1.        , 0.01627245,
        0.00517212],
       [0.        , 0.07498604, 0.0335461 , ..., 0.01627245, 1.        ,
        0.        ],
       [0.02737702, 0.03330675, 0.01807353, ..., 0.00517212, 0.        ,
        1.        ]])

In [67]:
df[df['song'] == "Desire"].index[0]

np.int64(417)

In [68]:
def recommender(song_name):
    idx = df[df['song']== song_name].index[0]
    distance = sorted(list(enumerate(similar[idx])), reverse=True, key= lambda x: x[1])
    song = []
    for s_id in distance[1:5]:
        song.append(df.iloc[s_id[0]].song)
    return song

In [70]:
# recommender("Come With Me")

In [71]:
import pickle

In [72]:
pickle.dump(similar, open("similarity.pkl", "wb"))

In [73]:
pickle.dump(df, open("df.pkl", "wb"))