In [38]:
import numpy as np
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sharma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sharma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [39]:
df = pd.read_csv('songdata.csv')
# shuffled data
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [40]:
df.shape

(57650, 4)

## Reduce the size of dataset to balance it to 10000 rows

In [41]:
df= df.sample(n=10000).drop('link', axis=1).reset_index(drop=True)

# Song = song name , text = lyrics, artist = artist name

## Cleaning

#### (We will extract keywords from the from text i.e. lyrics and try to find out the song based on the keywords)

In [42]:
df['song'][0], df['text'][0]

('One Promise Too Late',
 "I would have waited forever  \nIf I'd known that you'd be here  \nWe could have shared our lives together  \nAnd held each other close all through the years  \nBut I've met someone before you  \nAnd my heart just couldn't wait  \nSo no matter how much I adore you  \nI've got to stand behind the promise that I made  \n  \n[Chorus]  \nWhere were you  \nWhen I could have loved you  \nWhere were you  \nWhen I gave my heart away  \nAll my life I've been dreaming of you  \nYou came along one promise too late  \nYou came along one promise too late  \n  \nI won't say that I'm sorry that I met you  \nI can't have you but I never will forget you  \n  \n[Chorus: x2]\n\n")

In [43]:
df['text']= df['text'].str.lower().replace(r'[^a-zA-Z0-9]','').replace(r'\n','',regex=True) # =>> (r'[^\w\s]', '')
df['text'][0]

"i would have waited forever  if i'd known that you'd be here  we could have shared our lives together  and held each other close all through the years  but i've met someone before you  and my heart just couldn't wait  so no matter how much i adore you  i've got to stand behind the promise that i made    [chorus]  where were you  when i could have loved you  where were you  when i gave my heart away  all my life i've been dreaming of you  you came along one promise too late  you came along one promise too late    i won't say that i'm sorry that i met you  i can't have you but i never will forget you    [chorus: x2]"

### Tokenization
(Spliting in other words and along with stemming)

In [44]:


ps = PorterStemmer()  # Brings the word to its root form

def tokenization(text):
    stemming =[]
    tokens = nltk.word_tokenize(text)
    for w in tokens:
        stemming.append(ps.stem(w))

    return " ".join(stemming)

In [47]:
df['text']= df['text'].apply(lambda x: tokenization(x))

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [51]:
tfid = TfidfVectorizer(stop_words='english')
matrix = tfid.fit_transform(df['text'])

matrix, matrix.shape

(<Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 548635 stored elements and shape (10000, 26031)>,
 (10000, 26031))

In [53]:
similarity = cosine_similarity(matrix)
similarity[0]

array([1.        , 0.05444512, 0.05823902, ..., 0.01767247, 0.06246485,
       0.00433177], shape=(10000,))

In [57]:
print(df['song'][0])
df[df['song'] == "One Promise Too Late"] ## Sample

One Promise Too Late


Unnamed: 0,artist,song,text
0,Reba Mcentire,One Promise Too Late,i would have wait forev if i 'd known that you...


In [68]:
def recommendation(song):

    idx = df[df['song'] == song].index[0]
    # This will give the index of the song and the similarity score 0th index is the song itself having similarity score 1
    distances = sorted(list(enumerate(similarity[0])), reverse=False, key=lambda x:x[1])

    songs = []
    for i in distances[1:21]:
        songs.append(df.iloc[i[0]].song)

    return songs

In [70]:
recommendation('Alabamy Bound')

["It's No Game",
 'Pray',
 'The Sun Goes Down',
 "Jamaica's Alright If You Like Homophobes",
 'Vegetable',
 'Everything Grows',
 'Bring Me My Longbow',
 'Christmas In New Orleans',
 'Ikrar',
 'Alabamy Bound',
 'Mustang Sally',
 'Commando',
 'America',
 'Jehovah-Jireh',
 'Aspirasi Putih',
 'Colorblind',
 'Jesus',
 'Song From The End Of The World',
 'Buku Ini Aku Pinjam',
 'I Hear Music']

In [73]:
import pickle

pickle.dump(df, open('DF.pkl', 'wb'))
pickle.dump(similarity, open('Similarity.pkl', 'wb'))