# Content Based Recommender System

In [1]:
import pandas as pd
import numpy as np
import re
import os

from numpy import dot
from numpy.linalg import norm

import warnings
warnings.filterwarnings("ignore")

In [4]:
# constants
d_PATH = './data/data_o.csv'

## Import Data

In [5]:
df = pd.read_csv(d_PATH)
print(df.shape)

(170653, 19)


In [6]:
df['release_date']= pd.to_datetime(df['release_date'])
df['year'] = df['release_date'].dt.year

In [7]:
cols = [
    'valence', 'acousticness', 'danceability', 'energy', 'liveness', 'loudness', 'tempo', 'speechiness'
]
for c in cols:
    print(c, df[c].describe()['min'], df[c].describe()['max'])

# normalize tempo & loudness

valence 0.0 1.0
acousticness 0.0 0.996
danceability 0.0 0.988
energy 0.0 1.0
liveness 0.0 1.0
loudness -60.0 3.855
tempo 0.0 243.507
speechiness 0.0 0.97


In [8]:
def normalize_col(data_points):
    '''
    This function will normalize a column to be between 0 and 1
    '''
    min_val = min(data_points)
    if min_val < 0:
        data_points = [x + abs(min_val) for x in data_points]
        
    max_val = max(data_points)
    data_points = [x/max_val for x in data_points]
    return data_points
        

In [9]:
df['norm_tempo'] = normalize_col(df['tempo'].values)
df['norm_loudness'] = normalize_col(df['loudness'].values)

In [10]:
#simple function to create OHE features
#this gets passed later on
def ohe_prep(df, column, new_name): 
    """ 
    Create One Hot Encoded features of a specific column

    Parameters: 
        df (pandas dataframe): Spotify Dataframe
        column (str): Column to be processed
        new_name (str): new column name to be used
        
    Returns: 
        tf_df: One hot encoded features 
    """
    
    tf_df = pd.get_dummies(df[column])
    feature_names = tf_df.columns
    tf_df.columns = [new_name + "_" + str(i) for i in feature_names]
    tf_df.reset_index(drop = True, inplace = True)    
    return tf_df

In [11]:
keep_cols = [
    'year', 'liveness', 'speechiness', 'tempo', 'norm_tempo', 'norm_loudness', 'valence', 'acousticness', 
    'danceability', 'instrumentalness', 'id' #, 'artists'
]

In [12]:
fin_df = df[keep_cols]

In [13]:
rec_df = pd.concat([ohe_prep(fin_df, 'year', 'year'), fin_df], axis = 1)

In [14]:
rec_df = rec_df.set_index(['id'])

In [15]:
def get_cosine_sim(vec1,vec2):
    return dot(vec1,vec2) / (norm(vec1) * norm(vec2))

In [16]:
def recommender(df, song_id, n_rec):
    '''
    This function will calculate the consine similarity of one song with respect to all other songs in the dataset and return
    the top N songs most similar to it.
    '''
    
    input_vector = df.loc[song_id].values
    df['similarity'] = df.apply(lambda row: get_cosine_sim(input_vector, row.values), axis=1)
    return df.nlargest(columns = 'similarity', n = n_rec)

In [17]:
def id_to_song(df, ids):
    '''
    Given a list of song ids this function will convert them back into their song names
    '''
    songs = df[df.id.isin(ids)]
    return songs.name.unique()

In [18]:
%time similar_songs = recommender(rec_df, song_id = '0KkIkfsLEJbrcIhYsCL7L5', n_rec = 5).index

CPU times: user 2.01 s, sys: 137 ms, total: 2.15 s
Wall time: 2.32 s


In [19]:
id_to_song(df = df, ids = similar_songs)

array(['Telepathy', 'Stacy', 'Long Beach', 'Breathe Deeper', 'China'],
      dtype=object)