In [1]:
import nltk
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
stemmer = PorterStemmer()

In [2]:
df = pd.read_csv("spotifySongData.csv")

In [3]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
df.shape

(57650, 4)

In [5]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [6]:
df =df.sample(10000).drop('link', axis=1).reset_index(drop=True)

In [7]:
df.head(10)

Unnamed: 0,artist,song,text
0,Enya,Paint The Sky With Stars,Suddenly before my eyes \r\nHues of indigo ar...
1,Britney Spears,I Run Away,"You toke your love away,to fast \r\nLeft no c..."
2,Britney Spears,Burning Up,I'm on fire! \r\n \r\nDon't put me off 'caus...
3,Rihanna,Stupid In Love,Mm \r\nStupid in love \r\nOh \r\nStupid in ...
4,Michael Jackson,Jam,"Nation to nation, all the world must come toge..."
5,Rammstein,Good-Bye,He lays the needle in the vein \r\nAnd he ask...
6,Lady Gaga,Another Time,Lalalala \r\nNow ew aw \r\nBut it's not the ...
7,Carpenters,Have Yourself A Merry Little Christmas,Christmas future is far away \r\nChristmas pa...
8,Leonard Cohen,Hallelujah,Now I've heard there was a secret chord \r\nT...
9,Kris Kristofferson,The Bigger The Fool,"Hey, doesn't it still look easy as it did to y..."


In [8]:
df['text'][0]

'Suddenly before my eyes  \r\nHues of indigo arise  \r\nWith them how my spirit sighs  \r\nPaint the sky with stars  \r\n  \r\nOnly night will ever know  \r\nWhy the heavens never show  \r\nAll the dreams there are to know  \r\nPaint the sky with stars  \r\n  \r\nWho has placed the midnight sky  \r\nSo a spirit has to fly?  \r\nAs the heavens seem so far now,  \r\nWho will paint the midnight star?  \r\n  \r\nNight has brought to those who sleep  \r\nOnly dreams they cannot keep  \r\nI have legends in the deep  \r\nPaint the sky with stars  \r\n  \r\nWho has placed the midnight sky  \r\nSo a spirit has to fly?  \r\nAs the heavens seem so far now,  \r\nWho will paint the midnight star?  \r\n  \r\nPlace a name upon the night  \r\nOne to set your heart alight  \r\nAnd to make the darkness bright  \r\nPaint the sky with stars.  \r\n\r\n'

In [9]:
df.shape

(10000, 3)

In [10]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [11]:
def process_text(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [12]:
df['text'] = df['text'].apply(lambda x: process_text(x))

In [13]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [14]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [15]:
df['song'][1000]

'Serial Killer'

In [16]:
recommendation(df['song'][1000])

['Love Songs',
 "I'm In Love Again",
 'Out Of My Head',
 "Can't Get You Out Of My Head",
 'Ann',
 'Jersey Girl',
 'Jersey Girl',
 'If U Seek Amy',
 'Everything Is Sound',
 'I Hate',
 'Question!',
 "Passenger (As Performed On BBC's 'Later' With Jools Holland)",
 'Red Death At 6:14',
 'Another Star',
 'Oo La La La',
 'Brown Eyed Girl',
 'Diddy',
 'Brown Eyed Girl',
 'Baby Come Out Tonight',
 'Brown Eyed Girl']

In [17]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))

In [39]:
similarity.shape

(10000, 10000)