# Content Based Recommender System

In [1]:
import pandas as pd
import numpy as np
import re
import os

from numpy import dot
from numpy.linalg import norm

import warnings
warnings.filterwarnings("ignore")

In [2]:
# constants
tracks_PATH = '../data/tracks.csv'
genre_PATH = '../data/data_by_genres_o.csv'
d_PATH = '../data/data_o.csv'

## Import Data

In [3]:
tracks_df = pd.read_csv(tracks_PATH, converters = {'artists' : eval, 'id_artists' : eval})
genre_df = pd.read_csv(genre_PATH)
df = pd.read_csv(d_PATH)
print(tracks_df.shape, genre_df.shape, df.shape)

(586672, 20) (2973, 14) (170653, 19)


In [None]:
# d = pd.read_csv("https://raw.githubusercontent.com/vatsal220/reference_data/main/data_by_artist_o.csv")

In [5]:
df['release_date']= pd.to_datetime(df['release_date'])
df['year'] = df['release_date'].dt.year

In [6]:
cols = [
    'valence', 'acousticness', 'danceability', 'energy', 'liveness', 'loudness', 'tempo', 'speechiness'
]
for c in cols:
    print(c, df[c].describe()['min'], df[c].describe()['max'])

# normalize tempo & loudness

valence 0.0 1.0
acousticness 0.0 0.996
danceability 0.0 0.988
energy 0.0 1.0
liveness 0.0 1.0
loudness -60.0 3.855
tempo 0.0 243.507
speechiness 0.0 0.97


In [9]:
def normalize_col(data_points):
    '''
    This function will normalize a column to be between 0 and 1
    '''
    min_val = min(data_points)
    if min_val < 0:
        data_points = [x + abs(min_val) for x in data_points]
        
    max_val = max(data_points)
    data_points = [x/max_val for x in data_points]
    return data_points
        

In [10]:
df['norm_tempo'] = normalize_col(df['tempo'].values)
df['norm_loudness'] = normalize_col(df['loudness'].values)

In [11]:
#simple function to create OHE features
#this gets passed later on
def ohe_prep(df, column, new_name): 
    """ 
    Create One Hot Encoded features of a specific column

    Parameters: 
        df (pandas dataframe): Spotify Dataframe
        column (str): Column to be processed
        new_name (str): new column name to be used
        
    Returns: 
        tf_df: One hot encoded features 
    """
    
    tf_df = pd.get_dummies(df[column])
    feature_names = tf_df.columns
    tf_df.columns = [new_name + "_" + str(i) for i in feature_names]
    tf_df.reset_index(drop = True, inplace = True)    
    return tf_df

In [12]:
keep_cols = [
    'year', 'liveness', 'speechiness', 'tempo', 'norm_tempo', 'norm_loudness', 'valence', 'acousticness', 
    'danceability', 'instrumentalness', 'id' #, 'artists'
]

In [13]:
fin_df = df[keep_cols]

In [14]:
rec_df = pd.concat([ohe_prep(fin_df, 'year', 'year'), fin_df], axis = 1)

In [15]:
rec_df = rec_df.set_index(['id'])

In [16]:
def get_cosine_sim(vec1,vec2):
    return dot(vec1,vec2) / (norm(vec1) * norm(vec2))

In [18]:
def recommender(df, song_id, n_rec):
    '''
    This function will calculate the consine similarity of one song with respect to all other songs in the dataset and return
    the top N songs most similar to it.
    '''
    
    input_vector = df.loc[song_id].values
    df['similarity'] = df.apply(lambda row: get_cosine_sim(input_vector, row.values), axis=1)
    return df.nlargest(columns = 'similarity', n = n_rec)

In [24]:
def id_to_song(df, ids):
    '''
    Given a list of song ids this function will convert them back into their song names
    '''
    songs = df[df.id.isin(ids)]
    return songs.name.unique()

In [20]:
%time similar_songs = recommender(rec_df, song_id = '0KkIkfsLEJbrcIhYsCL7L5', n_rec = 5).index

CPU times: user 1.95 s, sys: 67.4 ms, total: 2.02 s
Wall time: 2.02 s


In [25]:
id_to_song(df = df, ids = similar_songs)

array(['Telepathy', 'Stacy', 'Long Beach', 'Breathe Deeper', 'China'],
      dtype=object)

In [13]:
genre_df['genres'].values[0]

'21st century classical'

In [8]:
# preprocessing year to be integer
tracks_df['release_date']= pd.to_datetime(tracks_df['release_date'])
tracks_df['year'] = tracks_df['release_date'].dt.year

# 

In [9]:
tracks_df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,year
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,...,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3,1922
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,...,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1,1922
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,...,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5,1922
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,...,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3,1922
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922-01-01,0.402,0.158,...,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4,1922
