# Recommend artists
Build a content-based recommendation engine which suggests artists

In [43]:
import pandas as pd
import numpy as np

data = pd.read_csv('data\data_by_artist.csv')
data.head()

Unnamed: 0,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count
0,Francisco Canaro,0.983072,0.654711,177776.513454,0.292622,0.490675,0.201118,-11.733373,0.111007,123.608786,0.746469,0.054257,2,1,2267
1,Frédéric Chopin,0.989961,0.340087,251871.948502,0.106874,0.876899,0.155677,-22.575578,0.042913,90.977772,0.203644,5.748127,1,1,1068
2,Ludwig van Beethoven,0.955019,0.340157,439361.331606,0.153176,0.69077,0.164078,-20.107704,0.05555,104.833536,0.260255,5.237306,0,1,965
3,Wolfgang Amadeus Mozart,0.962084,0.353895,327808.037306,0.138348,0.514837,0.187091,-20.214154,0.067756,108.59492,0.332855,9.62487,7,1,965
4,Johann Sebastian Bach,0.958405,0.354224,203461.757872,0.201932,0.744322,0.165837,-20.936518,0.048844,106.551869,0.570526,14.704669,7,1,921


drop columns not useful

In [44]:
df = data.drop(['artists', 'duration_ms', 'key', 'mode', 'count'],axis=1)
df.describe()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity
count,27606.0,27606.0,27606.0,27606.0,27606.0,27606.0,27606.0,27606.0,27606.0,27606.0
mean,0.511213,0.54054,0.48907,0.171924,0.203372,-11.342185,0.094928,115.405597,0.512366,34.211749
std,0.369982,0.176387,0.254124,0.29677,0.140081,5.897531,0.113778,25.17744,0.244332,22.026644
min,0.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0
25%,0.133185,0.424693,0.277,4e-06,0.111333,-14.235,0.039402,98.636917,0.329013,13.5
50%,0.507,0.552,0.494,0.001869,0.163,-10.262017,0.052167,114.936204,0.524,39.0
75%,0.906106,0.669,0.695,0.205225,0.247,-6.960013,0.0958,129.861625,0.702,51.0
max,0.996,0.986,1.0,1.0,0.991,1.342,0.964,217.743,0.999,97.0


check for nulls

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27606 entries, 0 to 27605
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   acousticness      27606 non-null  float64
 1   danceability      27606 non-null  float64
 2   energy            27606 non-null  float64
 3   instrumentalness  27606 non-null  float64
 4   liveness          27606 non-null  float64
 5   loudness          27606 non-null  float64
 6   speechiness       27606 non-null  float64
 7   tempo             27606 non-null  float64
 8   valence           27606 non-null  float64
 9   popularity        27606 non-null  float64
dtypes: float64(10)
memory usage: 2.1 MB


Let's normalize

In [55]:
from sklearn.preprocessing import MinMaxScaler

feat_matrix= MinMaxScaler().fit_transform(df)
feat_matrix.shape

(27606, 10)

Build a user profile

In [167]:
#artists_list = data['artists'].sample(n=10, random_state=5).values
artists_list = np.array(['Snoop Dogg','Nate Dogg','Xzibit','50 Cent','Solomon Burke','Carl Anderson','Koka Kambon'])
ratings = np.random.randint(2,11,size=len(artists_list))  # randomly generate ratings for artists (1-10)
user_x = list(zip(artists_list,ratings))
user_x

[('Snoop Dogg', 2),
 ('Nate Dogg', 9),
 ('Xzibit', 8),
 ('50 Cent', 9),
 ('Solomon Burke', 9),
 ('Carl Anderson', 5),
 ('Koka Kambon', 6)]

Convert user profile to features matrix using only top-rated artists

In [189]:
art,rat = list(zip(*user_x)) # unzip user profile
ratings = np.array(rat)
artists = np.array(art)
artists_liked = artists_list[np.where(ratings > 5)]   # select artists rated 6 minimum
vals = data.index[data['artists'].isin(artists_liked)].tolist()

X = feat_matrix[vals,:]
X.shape

(5, 10)

In [190]:
artists_liked

array(['Snoop Dogg', 'Xzibit', '50 Cent', 'Carl Anderson', 'Koka Kambon'],
      dtype='<U13')

Find similar artists using minimum distance between song features

In [188]:
m,n = X.shape
recommendations = []
for i in range(m): 
    artist = artists_liked[i]
    dist = np.linalg.norm(feat_matrix - X[i], axis=1)
    index_array = np.argsort(dist)
    recomm = data.artists.iloc[index_array[:5]].tolist()
    if artist in recomm:                  
        del recomm[recomm.index(artist)]
    #print('Recommendations for', artist)
    #print(pd.DataFrame(recomm, columns=['Artist']))
    #print('')
    recommendations = list(set().union(recommendations,recomm))
print(pd.DataFrame(recommendations, columns=['Artist']))

               Artist
0              Defari
1         Lloyd Banks
2      Shirley Bassey
3               Diddy
4         Gene Miller
5                 D12
6              Kurupt
7       The Time Bomb
8   Buckshot LeFonque
9             Lil Jon
10         Margie Day
11             Xzibit
12        Teresa Teng
13          Nate Dogg
14         Lil' Cease
15            50 Cent
16               2Pac
17            Ja Rule
18          Brooklynn
19           Lil' Kim
20                Nas
21              JAY-Z


In [145]:
recomm.index('Brooklynn')

3

In [197]:
from sklearn.feature_extraction.text import TfidfVectorizer
dat_genre = pd.read_csv('data\data_w_genres.csv')
genres = TfidfVectorizer(max_df=0.7, min_df = 1).fit_transform(dat_genre['genres'])

In [199]:
indx = dat_genre['artists'].index[dat_genre['artists'].isin(artists_liked)].tolist()
user_genre = genres[indx,:]

In [204]:
from sklearn.metrics.pairwise import linear_kernel

similar = linear_kernel(user_genre, genres)
indices = similar.argsort()

for i in range(m):
    artist = artists_liked[i]
    dist = np.linalg.norm(genres - user_genre[i], axis=1)
    index_array = np.argsort(dist)
    recomm = dat_genre.artists.iloc[index_array[:5]].tolist()
    print(pd.DataFrame(recomm, columns=['title']))
    print('')

ValueError: inconsistent shapes

In [207]:
list(user_genre)

[<1x1503 sparse matrix of type '<class 'numpy.float64'>'
 	with 7 stored elements in Compressed Sparse Row format>,
 <1x1503 sparse matrix of type '<class 'numpy.float64'>'
 	with 8 stored elements in Compressed Sparse Row format>,
 <1x1503 sparse matrix of type '<class 'numpy.float64'>'
 	with 10 stored elements in Compressed Sparse Row format>,
 <1x1503 sparse matrix of type '<class 'numpy.float64'>'
 	with 0 stored elements in Compressed Sparse Row format>,
 <1x1503 sparse matrix of type '<class 'numpy.float64'>'
 	with 0 stored elements in Compressed Sparse Row format>]