# Recommender Systems 3 - Content-based Filtering

### Import libraries

In [1]:
import pandas as pd
import string
from pprint import pprint
from random import randrange
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Load data

In [2]:
all_songs = pd.read_csv('data/songdata.csv')
all_songs.shape

(57650, 4)

In [3]:
pd.set_option('max_colwidth', 100)

In [4]:
# Display first few rows
all_songs.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd it means something special to me \nLook at the w..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gently like a summer evening breeze \nTake your time, ..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,"I'll never know why I had to go \nWhy I had to put up such a lousy rotten show \nBoy, I was to..."
3,ABBA,Bang,/a/abba/bang_20598415.html,"Making somebody happy is a question of give and take \nYou can learn how to show it so come on,..."
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,"Making somebody happy is a question of give and take \nYou can learn how to show it so come on,..."


### Set global variables

In [5]:
global test_song

### Preprocess data

In [6]:
# Prepare smaller data subset of 5000 randomly selected songs, and drop column 'link'
songs = all_songs.sample(n=5000).drop('link', axis=1).reset_index(drop=True)
songs.head()

Unnamed: 0,artist,song,text
0,Bill Withers,Hello Like Before,Hello like before \nI'd never come here \nIf I'd known that you were here \nI must admit thou...
1,Ozzy Osbourne,Psycho Man,The midnight hour approaches \nThe killing chill takes over him \nHis victims will not know \...
2,Marianne Faithfull,As Tears Go By,"It is the evening of the day, \nI sit and watch the children play. \nSmiling faces I can see ..."
3,Cher,Dressed To Kill,"I slip into my shoes and slide into the night \nI'm on the loose, you're in my sights \nYou kn..."
4,Iggy Pop,Plastic Concrete,"Plastic and concrete, baby \nThese are the facts of life \nI'm a nightmare child \nStuck on m..."


In [7]:
# Randomly select a test song
test_song = randrange(5000)     # Select an integer between 0 and 4999
print(test_song)
print(songs['artist'][test_song])
print(songs['song'][test_song])

4758
Celine Dion
All The Way


In [8]:
# Display text (lyrics) for test song
songs['text'][test_song]

"When somebody loves you  \nIt's no good unless he loves you all the way  \nHappy to be near you  \nWhen you need someone to cheer you all the way  \n  \nTaller than the tallest tree is  \nThat's how it's got to feel  \nDeeper than the deep blue sea is  \nThat's how deep it goes if its real  \n  \nWhen somebody needs you  \nIt's no good unless he needs you all the way  \nThrough the good or lean years  \nAnd for all the in between years, come what way  \n  \nWho know where the road will lead us  \nOnly a fool would say  \nBut if you'll let me love you  \nIt's for sure I'm gonna love you, all the way, all the way\n\n"

In [9]:
# Clean-up data: remove newline characters
songs['text'] = songs['text'].str.replace(r'\n', '', regex=True)
songs.head()

Unnamed: 0,artist,song,text
0,Bill Withers,Hello Like Before,Hello like before I'd never come here If I'd known that you were here I must admit though Th...
1,Ozzy Osbourne,Psycho Man,The midnight hour approaches The killing chill takes over him His victims will not know When ...
2,Marianne Faithfull,As Tears Go By,"It is the evening of the day, I sit and watch the children play. Smiling faces I can see But ..."
3,Cher,Dressed To Kill,"I slip into my shoes and slide into the night I'm on the loose, you're in my sights You know i..."
4,Iggy Pop,Plastic Concrete,"Plastic and concrete, baby These are the facts of life I'm a nightmare child Stuck on my own ..."


In [10]:
# Create function to remove punctuation and lowercase all text
def cleanup_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    return text

# Remove punctuation and lowercase all text
songs['text'] = songs['text'].apply(lambda x: cleanup_text(x))

In [11]:
# Display text (lyrics) for same song after cleaning
songs['text'][test_song]

'when somebody loves you  its no good unless he loves you all the way  happy to be near you  when you need someone to cheer you all the way    taller than the tallest tree is  thats how its got to feel  deeper than the deep blue sea is  thats how deep it goes if its real    when somebody needs you  its no good unless he needs you all the way  through the good or lean years  and for all the in between years come what way    who know where the road will lead us  only a fool would say  but if youll let me love you  its for sure im gonna love you all the way all the way'

### Run TF-IDF vectorizer

In [12]:
# Calculate the TF-IDF score for each song lyric, word by word
tfidf = TfidfVectorizer(analyzer='word', stop_words='english')

# Create a lyric_matrix variable where we store the matrix
#  containing each word and its TF-IDF score with regard to each song lyric
lyrics_matrix = tfidf.fit_transform(songs['text'])
lyrics_matrix

<5000x26297 sparse matrix of type '<class 'numpy.float64'>'
	with 277676 stored elements in Compressed Sparse Row format>

### Generate lyrics matrix to make recommendations

In [13]:
# Calculate the cosine similarity of each item with every other item in the dataset
cosine_similarities = cosine_similarity(lyrics_matrix)
cosine_similarities.shape

(5000, 5000)

In [14]:
cosine_similarities

array([[1.00000000e+00, 4.58948433e-04, 1.27182549e-02, ...,
        5.33922667e-02, 6.10744149e-03, 4.36415745e-02],
       [4.58948433e-04, 1.00000000e+00, 1.43500038e-03, ...,
        1.22254415e-03, 3.27296391e-04, 5.55850464e-04],
       [1.27182549e-02, 1.43500038e-03, 1.00000000e+00, ...,
        6.65594659e-03, 7.19727664e-04, 1.68154576e-03],
       ...,
       [5.33922667e-02, 1.22254415e-03, 6.65594659e-03, ...,
        1.00000000e+00, 3.20819971e-03, 1.38451302e-02],
       [6.10744149e-03, 3.27296391e-04, 7.19727664e-04, ...,
        3.20819971e-03, 1.00000000e+00, 1.11032696e-02],
       [4.36415745e-02, 5.55850464e-04, 1.68154576e-03, ...,
        1.38451302e-02, 1.11032696e-02, 1.00000000e+00]])

In [15]:
# Print sample cosine_similarities row
cosine_similarities[test_song]

array([0.03366853, 0.00842925, 0.        , ..., 0.0088328 , 0.00928133,
       0.02914885])

In [16]:
# Print index values for sample cosine_similarities row
#    argsort performs an indirect sort on a numpy array
#    It returns an array of indicies in sorted order
cosine_similarities[test_song].argsort()

array([4444, 2067, 2061, ..., 2895, 3864, 4758])

In [17]:
# Print top 10 indicies from sample cosine_similarities row in descending sorted order
cosine_similarities[test_song].argsort()[:-10:-1]

array([4758, 3864, 2895, 4725, 2410, 2054, 2162, 2699, 2793])

In [18]:
%%time

# Store the names of the 50 most similar songs for each song in our dataset
similarities = {}
for i in range(len(cosine_similarities)):
    # Sort each element in cosine_similarities and get the indexes of the 50 most similar songs
    similar_indices = cosine_similarities[i].argsort()[:-50:-1]
    
    # Store in similarities each name of the 50 most similar songs
    # Note: the first one is the same song
    similarities[songs['song'].iloc[i]] = [(cosine_similarities[i][x], songs['song'][x], songs['artist'][x])
                                           for x in similar_indices][1:]

similarities

CPU times: user 3.72 s, sys: 51.6 ms, total: 3.77 s
Wall time: 3.84 s


{'Hello Like Before': [(0.4378653779832861, 'Hello Goodbye', 'Paul McCartney'),
  (0.43234449794684093, 'Smells Like Teen Spirit (Live)', 'Avril Lavigne'),
  (0.3772149070256412,
   "Hello, I Love You (Let's Get Tested For AIDS)",
   'Bob Rivers'),
  (0.37605977261296675, 'I Guess I Like It Like That', 'Kylie Minogue'),
  (0.29943890108565957, 'Hello, Young Lovers', 'The Temptations'),
  (0.2693046819928457, 'Something', 'Drake'),
  (0.24308409037536033, "I Don't Wanna Know", 'Indigo Girls'),
  (0.23351439945192637, 'Leaves That Are Green', 'Paul Simon'),
  (0.22767579266036705, 'Heroin', 'Lou Reed'),
  (0.21784269491725466, 'If We Ever Meet Again', 'Katy Perry'),
  (0.20742456615883167, 'After All', 'Cher'),
  (0.2026641821327671, 'Doing Fine', 'Cliff Richard'),
  (0.20085707813803483, 'Propinquity', 'Nitty Gritty Dirt Band'),
  (0.1977732817960481, 'Goodbye Sam - Hello Samantha', 'Cliff Richard'),
  (0.1821669798183236, 'Last To Know', 'P!nk'),
  (0.1806848041903198, "Guess I'm Falli

In [19]:
# Display first element in dictionary
pprint(list(similarities.items())[test_song])

('At My Window',
 [(0.28144560509684935, 'Fly Away', 'Bread'),
  (0.23304202139944036, 'Angel', 'Rod Stewart'),
  (0.18285653312926614, 'Down Came The World', 'Waylon Jennings'),
  (0.17988040427774896, 'Beautiful Bluebird (2007)', 'Neil Young'),
  (0.17959373689321043, 'Knock On Any Door', 'Jackson Browne'),
  (0.15855507637869912, 'Blackbird', 'Nina Simone'),
  (0.15618374260182297, "Rain Keeps Fallin'", 'Grand Funk Railroad'),
  (0.1543196549392247, "God Ain't No Stained Glass Window", 'Johnny Cash'),
  (0.1436789745824404, 'Since Jesus Came Into My Heart', 'Randy Travis'),
  (0.138038438920131, 'Dirty Blvd.', 'Lou Reed'),
  (0.13684450790987995, 'Remember When', 'Leann Rimes'),
  (0.12835271574360788, 'Little Girl', 'Reba Mcentire'),
  (0.11910419896802356, 'All My Love', 'Cliff Richard'),
  (0.11895918729169286, "I'm Alive", 'Celine Dion'),
  (0.11883601516080641, 'Lab Monkey', 'Alice In Chains'),
  (0.11663771852263205, 'Come In With The Rain', 'Demi Lovato'),
  (0.11650670336546

### Create song recommender

In [20]:
# Use the similarity scores (generated above) to access the most similar items and make recommendations
# Create Content based recommender class
class ContentBasedRecommender:
    def __init__(self, matrix):
        self.matrix_similar = matrix

    def _print_songs(self, song, recom_songs):
        rec_items = len(recom_songs)
        
        print(f'The {rec_items} recommended songs for {song} are:')
        for i in range(rec_items):
            print(f"\t{i+1}: {recom_songs[i][1]} by {recom_songs[i][2]} (with {round(recom_songs[i][0], 3)}" +
                   " similarity score)")
        
    def recommend(self, song, rec_num_songs=3):
        recom_songs = self.matrix_similar[song][:rec_num_songs]
        # Print each recommend song
        self._print_songs(song=song, recom_songs=recom_songs)

In [21]:
# Instantiate Content based recommender class with the precomputed similarities
recommedations = ContentBasedRecommender(similarities)

### Make song recommendations

In [22]:
# Pick a song and make a recommendation
recommedations.recommend(songs['song'].iloc[10])

The 3 recommended songs for I've Got A Pocketful Of Dreams are:
	1: I Feel Lucky by Pat Benatar (with 0.19 similarity score)
	2: Lucky Me by Chris Brown (with 0.177 similarity score)
	3: One Life by Kenny Rogers (with 0.174 similarity score)


In [23]:
# Pick another song and make a recommendation
recommedations.recommend(songs['song'].iloc[120], 5)

The 5 recommended songs for Try It Baby are:
	1: Not The One by Zoegirl (with 0.393 similarity score)
	2: All Screwed Up by Ramones (with 0.356 similarity score)
	3: I Know You Well by Face To Face (with 0.353 similarity score)
	4: Do It To Me by Lionel Richie (with 0.341 similarity score)
	5: Cry Baby by Cheap Trick (with 0.334 similarity score)
