In [163]:
import numpy as np
import pandas as pd

In [164]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [165]:
# Load the raw lyrics dataset from a CSV file that sits next to the notebook.
songs = pd.read_csv("songdata.csv")

In [166]:
# Preview the first rows to check which columns were loaded.
songs.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [167]:
# (rows, columns) of the full dataset, before any sampling.
songs.shape

(57650, 4)

In [168]:
# Down-sample to 5000 songs (the later cosine_similarity call builds an n x n
# matrix), drop the unused 'link' column and renumber the rows 0..n-1.
# random_state pins the sample so a Restart & Run All picks the same songs
# every time; without it each run recommends from a different subset.
songs = (
    songs.sample(n=5000, random_state=42)
    .drop(columns='link')
    .reset_index(drop=True)
)

In [169]:
# Confirm the 'link' column is gone and the index restarts at 0 after sampling.
songs.head()

Unnamed: 0,artist,song,text
0,R. Kelly,Break Up (Thats All We Do),"Make up, and break up \r\nThat's all we do \..."
1,Zeromancer,Mosquito Coil,It's looking out \r\nFrom inside \r\nA Chris...
2,Great Big Sea,Wave Over Wave,Only name Table Rogers a share man am I \r\nO...
3,Dan Fogelberg,Faces Of America,"There was a time, a simpler time \r\nWhen a m..."
4,Elvis Costello,Joe Porterhouse,The children sit upon the stairs \r\nHigh abo...


In [170]:
# Expect 5000 rows and the 3 remaining columns (artist, song, text).
songs.shape

(5000, 3)

In [171]:
# Strip the carriage returns that come with the CRLF line endings in the lyrics.
# regex=False forces a literal match, so the result does not depend on the
# pandas version's default for `regex` (True before 2.0, False from 2.0 on).
songs['text'] = songs['text'].str.replace('\r', '', regex=False)

In [172]:
# Turn line feeds into spaces rather than deleting them: if a lyric line has no
# trailing whitespace, deleting the break would fuse its last word with the
# first word of the next line and corrupt the tokens fed to the vectorizer.
# regex=False keeps the match literal regardless of the pandas version.
songs['text'] = songs['text'].str.replace('\n', ' ', regex=False)

In [173]:
# Check that the line-break characters no longer show up in the lyrics preview.
songs.head()

Unnamed: 0,artist,song,text
0,R. Kelly,Break Up (Thats All We Do),"Make up, and break up That's all we do Then ..."
1,Zeromancer,Mosquito Coil,It's looking out From inside A Christmas bon...
2,Great Big Sea,Wave Over Wave,Only name Table Rogers a share man am I On a ...
3,Dan Fogelberg,Faces Of America,"There was a time, a simpler time When a man c..."
4,Elvis Costello,Joe Porterhouse,The children sit upon the stairs High above a...


In [174]:
# Turn each song's lyrics into a TF-IDF vector: word-level tokens, with
# scikit-learn's built-in English stop-word list removed.
# lyrics_matrix has one row per song, in the same order as the rows of `songs`.
tfidf = TfidfVectorizer(analyzer='word', stop_words='english')
lyrics_matrix = tfidf.fit_transform(songs['text'])

In [175]:
# Pairwise cosine similarity between all rows of lyrics_matrix:
# cosine_similarities[i][j] compares song i with song j (n x n for n songs).
cosine_similarities = cosine_similarity(lyrics_matrix) 

In [176]:
# Maps song title -> list of (similarity, title, artist) tuples; filled by the loop below.
similarities = {}

In [177]:
# For every song, store its most similar *other* songs, best match first, as
# (similarity, title, artist) tuples.
# NOTE: entries are keyed by title alone, so songs that share a title overwrite
# one another and only the last row with that title is kept.
n_neighbours = 48  # number of recommendations kept per song
for i in range(len(cosine_similarities)):
    # Indices of all songs ordered from most to least similar to song i.
    ranked = cosine_similarities[i].argsort()[::-1]
    # Remove song i by index instead of assuming it sorts first: a song with
    # identical lyrics ties at similarity 1.0 and may be ranked ahead of it,
    # in which case slicing off position 0 would drop the duplicate and leave
    # song i in its own recommendation list.
    similar_indices = ranked[ranked != i][:n_neighbours]
    # Positional (.iloc) lookups throughout, matching how the key is read.
    similarities[songs['song'].iloc[i]] = [
        (cosine_similarities[i][x], songs['song'].iloc[x], songs['artist'].iloc[x])
        for x in similar_indices
    ]

In [178]:
class ContentBasedRecommender:
    """Look up and print precomputed content-based song recommendations.

    The recommender does no similarity computation itself; it only reads
    from the mapping it is given at construction time.
    """

    def __init__(self, matrix):
        """Store the precomputed similarity mapping.

        Args:
            matrix: Mapping of song title -> sequence of recommendation
                entries. Each entry is indexable; index 1 is printed as the
                recommended title and index 2 as its artist.
        """
        self.matrix_similar = matrix

    def _print_message(self, song, recom_song):
        """Print a header for ``song`` and one "<title> by <artist>" line per entry.

        Args:
            song: Title the recommendations were requested for.
            recom_song: Sequence of recommendation entries to print, in order.
        """
        print(f'The recommended songs for {song} are:')
        for entry in recom_song:
            print(f"{entry[1]} by {entry[2]}")

    def recommend(self, recommendation):
        """Print the first ``number_songs`` recommendations for a song.

        Args:
            recommendation: Dict with the keys ``'song'`` (title to look up)
                and ``'number_songs'`` (how many entries to print).

        Raises:
            KeyError: If either key is missing from ``recommendation``, or if
                the stored mapping has no entry for the requested title.
        """
        song = recommendation['song']
        number_songs = recommendation['number_songs']
        try:
            candidates = self.matrix_similar[song]
        except KeyError:
            # Same exception type as a plain lookup, but with a message that
            # says what was missing instead of just echoing the title.
            raise KeyError(f"No similarity data for song {song!r}") from None
        recom_song = candidates[:number_songs]
        self._print_message(song=song, recom_song=recom_song)

In [179]:
# Recommender backed by the precomputed `similarities` mapping.
# NOTE: the variable name is misspelled ("recommedations"); later cells call it
# by this exact name, so rename them all together if it is ever corrected.
recommedations = ContentBasedRecommender(similarities)

In [180]:
# Request for the recommender: the title stored at row 10 of `songs` and how
# many recommendations to print.
recommendation = {
    "song": songs['song'].iloc[10],
    "number_songs": 4 
}

In [181]:
# Print the recommendations for the request defined in `recommendation`.
recommedations.recommend(recommendation)

The recommended songs for Running For Cover are:
Keep On Running by Stevie Wonder
Running Man by Hanson
Save Me A Place by Fleetwood Mac
Watercolours In The Rain by Roxette


In [182]:
# Second request: same shape as `recommendation`, using the title at row 120.
recommendation2 = {
    "song": songs['song'].iloc[120],
    "number_songs": 4 
}

In [161]:
# Print the recommendations for the request defined in `recommendation2`.
recommedations.recommend(recommendation2)

The recommended songs for No Way are:
Replacement Girl by Drake
Guadalajara by Elvis Presley
Soul Deep by Roxette
I Put A Spell On You by Bette Midler


In [162]:
import pickle

# Persist the similarity lookup and the song table (as a plain dict).
# `with` closes each file as soon as it is written, so the data is flushed to
# disk and no handle is left open for the lifetime of the kernel.
# Both files should be written in the same run so they describe the same
# sample of songs.
with open("sim.pkl", "wb") as sim_file:
    pickle.dump(similarities, sim_file)
with open('songs.pkl', 'wb') as songs_file:
    pickle.dump(songs.to_dict(), songs_file)