In [28]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [29]:
df = pd.read_csv('songdata.csv')

In [30]:
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [31]:
df.tail()

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \nLet the angels fly l...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \nMore power \nPower to...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \nis something i'll believe \nf...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \nam i frightened \nwhere can ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \nmake yourself at home \ni'm a bit ...


In [32]:
df.shape

(57650, 4)

In [33]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [34]:
#Dropping link column
df = df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [35]:
df.head()

Unnamed: 0,artist,song,text
0,Rihanna,Breakin Dishes,I don't know who you think I am \nI don't kno...
1,Face To Face,Don't Turn Away,god knows i've tried i've waited in spite of m...
2,Pat Benatar,Christmas In America,It's Christmas in America \nThere's carols in...
3,Il Divo,When A Child Is Born,[Sebastien:] \nA ray of hope flickers in the ...
4,Ella Fitzgerald,Blue Moon,Once upon a time \nBefore I took up smiling ...


In [36]:
df['text'][0]

"I don't know who you think I am  \nI don't know who you think I am  \nI don't know who you think I am  \nI don't know who you think I am  \n  \nI don't know who you think I am  \nI don't know who you think I am  \nI don't know who you think I am  \nI don't know who you think I am  \n  \nHe been gone since three thirty  \nAnd coming home lately at three thirty  \nI'm super cool I've been a fool  \nBut now I'm hot and baby you gone get it  \nNow I ain't tripping ah! I ain't twisting ah!  \nI ain't demented ah! well just a lil' bit  \nI'm kicking asses I'm taking names  \nI'm on flame don't come home babe  \n  \nI'm breaking dishes up in here  \nAll night (Oh-oh)  \nI ain't go stop until I see police lights  \nI'm a fight a man  \nI'm a fight a man  \nI'm a fight a man  \n  \nA man, a man, a ma-a-a-an  \nA man, a man, a ma-a-a-an  \n  \nI'm still waiting, come through the door  \nI am killing time, you know bleaching your clothes  \nI am roasting marshmallows on the fire  \nAnd what I am

In [37]:
#Text Cleaning
df['text'] = df['text'].str.lower().replace(r'^\w\s','').replace(r'\n','', regex = True)

In [38]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return "".join(stemming)

In [39]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [40]:
tfidvector = TfidfVectorizer(analyzer='word', stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [41]:
similarity[0]

array([1., 0., 0., ..., 0., 0., 0.])

In [42]:
df[df['song'] == 'Hello']

Unnamed: 0,artist,song,text
197,Kelly Clarkson,Hello,yeistumblintothenightwe'retouchbutifeellikeyou...
1730,Evanescence,Hello,playgroundschoolbellringagainraincloudcometopl...
4398,Electric Light Orchestra,Hello,"hello,helloit'sgreattoseeyouoncagainit'sbeenso..."


In [43]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [44]:
recommendation('Hello')

['Lord Have Mercy',
 'Happy Joyous Hanukkah',
 'Lie To Me',
 'Let It Die',
 'Caught In A Trap',
 'Ode To Solitude',
 'One Single Flame',
 'Lost Angel',
 'Light My Fire',
 "El Meod Na'ala",
 'One Sweet Day',
 'Faith In Each Other',
 'Automatic',
 'I Feel Loved',
 'Without You',
 'Watchdogs',
 'All Day',
 'Hello Good Morning',
 'Dead Skin Mask',
 'Confide In Me']

In [45]:
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))