In [154]:
import pandas as pd

In [156]:
df = pd.read_csv("spotify_millsongdata.csv")

In [157]:
df.shape

(57650, 4)

In [158]:
df.isnull().sum() #no missing array

artist    0
song      0
link      0
text      0
dtype: int64

In [159]:
df = df.sample(5000).drop('link',axis=1).reset_index(drop=True) #dont need link so dropped

In [160]:
df['text'][0] # have a look at lyrics of 1st song

"Jingle bells, jingle bells,  \r\nJingle all the way!  \r\nOh what fun it is to ride,  \r\nIn a one-horse open sleigh,  \r\nHey!  \r\n  \r\nJingle bells, jingle bells,  \r\nJingle all the way!  \r\nOh what fun it is to ride,  \r\nIn a one-horse open sleigh!  \r\n  \r\nDashing through the snow (jingle, jing, jing, jingle!)  \r\nIn a one-horse open sleigh (jingle, jing, jing, jingle!)  \r\nOver the fields we go (o'er the fields, jingle, jingle!)  \r\nLaughing all the way!  \r\nBells on Bob-Tail ring (jingle, jing, jing, jingle!)  \r\nMaking spirits bright (la, la, la la!)  \r\nWhat fun it is to ride and sing,  \r\nA sleighing song tonight!  \r\n  \r\nOh! Jingle bells, jingle bells,  \r\nJingle all the way!  \r\nOh what fun it is to ride,  \r\nIn a one-horse open sleigh,  \r\nHey!  \r\n  \r\nJingle bells, jingle bells,  \r\nJingle all the way!  \r\nOh what fun it is to ride,  \r\nIn a one-horse open sleigh!  \r\n  \r\nA day or two ago (A day or two ago)  \r\nI thought I'd take a ride (I t

In [161]:

df.shape

(5000, 3)

Lets do Text Cleaning & Pre-processing

In [163]:
df['text']  = df['text'].str.lower().replace(r'^\w\s', '').replace(r'\n', '', regex = True)

In [164]:
df.tail(5)

Unnamed: 0,artist,song,text
4995,Alabama,Hey Baby,written by bruce channel and margaret cobb \r...
4996,Human League,The Touchables,"speaking of which, what is your choice \ryour..."
4997,John Denver,Isabel,isabel is waiting \rin a room of many shadows...
4998,Jackson Browne,Rock Me On The Water,"oh people, look around you \rthe signs are ev..."
4999,Roy Orbison,Domino,there's a cat in town that you might know \rh...


In [165]:
import nltk
from nltk.stem.porter import PorterStemmer

In [166]:
stemmer = PorterStemmer()

In [167]:
def token(txt):
    token = nltk.word_tokenize(txt)
    a= [stemmer.stem(w) for w in token]
    return " ".join(a)

In [168]:
token("you are beautiful")

'you are beauti'

In [169]:
df['text'].apply(lambda x: token(x))

0       jingl bell , jingl bell , jingl all the way ! ...
1       you hear the peopl talk you walk the wrong str...
2       die by violenc in thi world is noth strang for...
3       i close my eye and i can feel that you 're a p...
4       good-day sunlight i 'd like to say how truli b...
                              ...                        
4995    written by bruce channel and margaret cobb hey...
4996    speak of which , what is your choic your conve...
4997    isabel is wait in a room of mani shadow her ey...
4998    oh peopl , look around you the sign are everyw...
4999    there 's a cat in town that you might know he ...
Name: text, Length: 5000, dtype: object

In [170]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [171]:
tfid = TfidfVectorizer(analyzer = 'word', stop_words='english')

In [172]:
matrix = tfid.fit_transform(df['text'])

In [173]:
similar = cosine_similarity(matrix)

In [174]:
similar[0]

array([1.        , 0.0092294 , 0.00784281, ..., 0.        , 0.00633274,
       0.00380391])

In [175]:
df[df['song']=='Lady Luck'].index[0]

4685

The Recommender Function!

In [195]:
def recommendation(song_name):
    idx = df[df['song'] == song_name].index[0]
    distances = sorted(list(enumerate(similar[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:6]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [201]:
recommendation('Hey Baby')

['Hey Girl',
 'Thunderhead',
 'I Was Made To Love Him',
 'Hey Hey, My My',
 'No Need For Conversation']

In [203]:
import pickle

In [207]:
pickle.dump(similar, open("similar", "wb"))

In [209]:
pickle.dump(df, open("df", "wb"))