## Last.fm Data

Data source: http://ocelma.net/MusicRecommendationDataset/lastfm-360K.html

From a shell (e.g. Git Bash), extract the archive with `tar -xvzf lastfm-dataset-360K.tar.gz`.

Open each extracted .tsv file in Excel and save it as .csv (I named them users.csv and user_country.csv).

Data includes individual user ids matched up with artist id/names. I used some modeling examples by Nick Becker (https://beckernick.github.io/).

In [213]:
#Import dependencies
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [214]:
# Load the per-user artist play counts (exported from the Last.fm .tsv) into a DataFrame
user_df = pd.read_csv(filepath_or_buffer = 'data/users.csv')
user_df.head(n = 5)

Unnamed: 0,user_id,artist_id,artist_name,plays
0,02ccf45baa7fe62f0935b8a6a64ff8869a7b0387,b202beb7-99bd-47e7-8b72-195c8d72ebdd,christina aguilera,135392
1,09d12dfa05a0852053a9017121034a837fa4019e,4d7928cd-7ed2-4282-8c29-c0c9f966f1bd,alice cooper,134993
2,0b2956b319a3ac466b0cf1a8c49fa73498d0898c,31745282-b1ea-4d62-939f-226b14d68e7c,in flames,112989
3,00a20b9791abd8b29903a8a43e343ae93a98d9fd,ac9a487a-d9d2-4f27-bb23-0f4686488345,lil wayne,107758
4,082279c9db5330c25a4e0ceae275a9fc79c753c4,847e8a0c-cc20-4213-9e16-975515c2a926,celine dion,86132


In [215]:
# Build a per-artist total of plays. Rows without an artist name cannot be
# grouped, so they are dropped first; the remaining plays are summed per artist.
if user_df['artist_name'].isnull().any():
    user_df = user_df.dropna(axis = 0, subset = ['artist_name'])

artist_plays = (
    user_df
    .groupby(by = ['artist_name'])['plays']
    .sum()
    .reset_index()
    .rename(columns = {'plays': 'total_artist_plays'})
    [['artist_name', 'total_artist_plays']]
)
artist_plays.head()

Unnamed: 0,artist_name,total_artist_plays
0,cours de la somme,9
1,oliver shanti & friends,3
2,!!!,19814
3,!5:b>@ 3070,33
4,!action pact!,143


In [216]:
# Attach each artist's total play count to every user/artist row (left join keeps all user rows)
users_plus_artist = pd.merge(
    user_df,
    artist_plays,
    how = 'left',
    left_on = 'artist_name',
    right_on = 'artist_name',
)
users_plus_artist.head()

Unnamed: 0,user_id,artist_id,artist_name,plays,total_artist_plays
0,02ccf45baa7fe62f0935b8a6a64ff8869a7b0387,b202beb7-99bd-47e7-8b72-195c8d72ebdd,christina aguilera,135392,328864
1,09d12dfa05a0852053a9017121034a837fa4019e,4d7928cd-7ed2-4282-8c29-c0c9f966f1bd,alice cooper,134993,212932
2,0b2956b319a3ac466b0cf1a8c49fa73498d0898c,31745282-b1ea-4d62-939f-226b14d68e7c,in flames,112989,814097
3,00a20b9791abd8b29903a8a43e343ae93a98d9fd,ac9a487a-d9d2-4f27-bb23-0f4686488345,lil wayne,107758,285681
4,082279c9db5330c25a4e0ceae275a9fc79c753c4,847e8a0c-cc20-4213-9e16-975515c2a926,celine dion,86132,146320


In [217]:
# Load the user profile file, which carries each user's country
user_country = pd.read_csv(filepath_or_buffer = 'data/user_country.csv')
user_country.head(n = 5)

Unnamed: 0,user_id,sex,age,country,date
0,00000c289a1829a808ac09c00daf10bc3c4e223b,f,22.0,Germany,1-Feb-07
1,00001411dc427966b17297bf4d69e7e193135d89,f,,Canada,4-Dec-07
2,00004d2ac9316e22dc007ab2243d6fcb239e707d,,,Germany,1-Sep-06
3,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,m,19.0,Mexico,28-Apr-08
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,m,28.0,United States,27-Jan-06


In [218]:
# Left-join the user profile columns onto the play data, matching rows on user_id
combined_df = pd.merge(
    users_plus_artist,
    user_country,
    how = 'left',
    left_on = 'user_id',
    right_on = 'user_id',
)
combined_df.head()

Unnamed: 0,user_id,artist_id,artist_name,plays,total_artist_plays,sex,age,country,date
0,02ccf45baa7fe62f0935b8a6a64ff8869a7b0387,b202beb7-99bd-47e7-8b72-195c8d72ebdd,christina aguilera,135392,328864,f,20.0,Brazil,1-Sep-07
1,09d12dfa05a0852053a9017121034a837fa4019e,4d7928cd-7ed2-4282-8c29-c0c9f966f1bd,alice cooper,134993,212932,m,39.0,United Kingdom,10-Aug-05
2,0b2956b319a3ac466b0cf1a8c49fa73498d0898c,31745282-b1ea-4d62-939f-226b14d68e7c,in flames,112989,814097,m,26.0,Russian Federation,19-Sep-07
3,00a20b9791abd8b29903a8a43e343ae93a98d9fd,ac9a487a-d9d2-4f27-bb23-0f4686488345,lil wayne,107758,285681,,22.0,United States,12-May-08
4,082279c9db5330c25a4e0ceae275a9fc79c753c4,847e8a0c-cc20-4213-9e16-975515c2a926,celine dion,86132,146320,m,27.0,United States,3-Aug-07


In [219]:
# Keep only US users: a smaller, more workable frame with artists that are
# more familiar for this exercise
usa_users = combined_df.query(
    "country == 'United States'"
)
usa_users.head()

Unnamed: 0,user_id,artist_id,artist_name,plays,total_artist_plays,sex,age,country,date
3,00a20b9791abd8b29903a8a43e343ae93a98d9fd,ac9a487a-d9d2-4f27-bb23-0f4686488345,lil wayne,107758,285681,,22.0,United States,12-May-08
4,082279c9db5330c25a4e0ceae275a9fc79c753c4,847e8a0c-cc20-4213-9e16-975515c2a926,celine dion,86132,146320,m,27.0,United States,3-Aug-07
13,028b91859a012251da23c3dbfd2215154a789f9f,1c3919b2-43ca-4a4a-935d-9d50135ec0ef,afi,59169,264433,f,22.0,United States,18-Jan-06
15,073689cd85d6f876b0b1123598c53194b2d21198,a16d1433-ba89-4f72-a47b-a370add0bb55,boa,50530,128850,m,19.0,United States,27-Jun-05
22,0033ee7378661b88b245b1f67cc622ff63a51061,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,the beatles,39655,1896944,m,,United States,5-Jun-06


In [220]:
# Each (user, artist) pair should carry exactly one play count. If any pair is
# repeated, keep its first row and report how many rows were removed.
if usa_users.duplicated(subset = ['user_id', 'artist_name']).any():
    initial_rows = len(usa_users)
    print('Initial dataframe shape {0}'.format(usa_users.shape))

    usa_users = usa_users.drop_duplicates(subset = ['user_id', 'artist_name'])

    current_rows = len(usa_users)
    print('New dataframe shape {0}'.format(usa_users.shape))
    print('Removed {0} rows'.format(initial_rows - current_rows))

Initial dataframe shape (197144, 9)
New dataframe shape (197142, 9)
Removed 2 rows


## K-Nearest Neighbors

This approach looks at artists that individual users routinely listen to together and uses that overlap to recommend similar artists.

In [221]:
# Reshape to one row per artist and one column per user, holding play counts
# (0 where a user never played the artist), then wrap the values in a SciPy
# CSR sparse matrix for the nearest-neighbours model.
artist_df = (
    usa_users
    .pivot(index = 'artist_name', columns = 'user_id', values = 'plays')
    .fillna(value = 0)
)
artist_df_sparse = csr_matrix(artist_df.values)

In [222]:
# Fit a scikit-learn nearest-neighbours model that compares artist rows by cosine distance
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(
    metric = 'cosine',
)
model_knn.fit(X = artist_df_sparse)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [224]:
#Use a random query to choose an artist and give 5 recommendations
# The query row must come from artist_df, the frame model_knn was fitted on.
# The cell previously read `wide_artist_data`, which is never defined in this
# notebook and raised NameError on a fresh kernel.
query_index = np.random.choice(artist_df.shape[0])
query_vector = artist_df.iloc[query_index, :].values.reshape(1, -1)

# Ask for 6 neighbours: position 0 is treated as the query artist itself and
# is replaced by the header line, leaving 5 recommendations.
distances, indices = model_knn.kneighbors(query_vector, n_neighbors = 6)

flat_distances = distances.flatten()
flat_indices = indices.flatten()
for i in range(0, len(flat_distances)):
    if i == 0:
        print('Recommendations for {0}:\n'.format(artist_df.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, artist_df.index[flat_indices[i]], flat_distances[i]))

Recommendations for paul weller:

1: scarce, with distance of 0.06884879557098622:
2: small factory, with distance of 0.06884879557098622:
3: sean lowry, with distance of 0.06884879557098622:
4: afterhours, with distance of 0.07198383472440706:
5: brian regan, with distance of 0.09296348696175838:


## Recommended Artists

Fuzzywuzzy is a Python library that finds strings that approximately match a given pattern. This allows you to enter an artist name to get similar recommendations, and it will still work if you misspell the name or don't enter it exactly as it appears in the dataset.

In [225]:
import string
from fuzzywuzzy import fuzz

In [228]:
def print_artist_recommendations(query_artist, artist_plays_matrix, knn_model, k, min_ratio = 75):
    """Print the k nearest neighbours of the artist that best matches a name.

    The query is compared case-insensitively with every label in
    ``artist_plays_matrix.index`` using ``fuzz.ratio``. All labels scoring at
    least ``min_ratio`` are printed as possible matches, and the highest
    scoring one (the first of them on a tie) is used as the query row.

    Parameters
    ----------
    query_artist : str
        Artist name to look up; it does not have to be spelled exactly.
    artist_plays_matrix : pandas.DataFrame
        Frame whose index holds artist names. The selected row is passed to
        ``knn_model.kneighbors`` as a single-row 2-D array.
    knn_model : object
        Fitted model exposing ``kneighbors(X, n_neighbors=...)`` and returning
        ``(distances, indices)`` arrays.
    k : int
        Number of recommendations to print.
    min_ratio : int, optional
        Minimum ``fuzz.ratio`` score for a label to count as a match
        (default 75, the value that was previously hard-coded).

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    query_lower = query_artist.lower()

    # Collect (name, ratio, row position) for every sufficiently similar label.
    # enumerate supplies the row position directly, so no list search is
    # needed for each match.
    ratio_tuples = []
    for position, name in enumerate(artist_plays_matrix.index):
        ratio = fuzz.ratio(name.lower(), query_lower)
        if ratio >= min_ratio:
            ratio_tuples.append((name, ratio, position))

    print('Possible matches: {0}\n'.format([(x[0], x[1]) for x in ratio_tuples]))

    # Explicit emptiness check instead of a bare except around max(), so
    # unrelated errors are no longer swallowed.
    if not ratio_tuples:
        print('Your artist didn\'t match any artists in the data. Try again')
        return None

    query_index = max(ratio_tuples, key = lambda x: x[1])[2] # get the index of the best artist match in the data

    # Series.reshape is not available in current pandas; reshape the
    # underlying NumPy array into the single-row 2-D input kneighbors expects.
    query_vector = artist_plays_matrix.iloc[query_index, :].values.reshape(1, -1)

    # k + 1 neighbours are requested because position 0 is treated as the
    # query artist itself and is replaced by the header line.
    distances, indices = knn_model.kneighbors(query_vector, n_neighbors = k + 1)

    flat_distances = distances.flatten()
    flat_indices = indices.flatten()
    for i in range(0, len(flat_distances)):
        if i == 0:
            print('Recommendations for {0}:\n'.format(artist_plays_matrix.index[query_index]))
        else:
            print('{0}: {1}, with distance of {2}:'.format(i, artist_plays_matrix.index[flat_indices[i]], flat_distances[i]))

    return None

In [229]:
#Enter a band/artist to get some recommendations of other artists you may like
# The call below needs artist_df_zero_one and model_nn_binary, but neither is
# defined anywhere in this notebook, so a fresh kernel raised NameError.
# Both are built here from artist_df.
# NOTE(review): the names suggest a binary listened / not-listened matrix, so
# play counts are collapsed to 0/1 with np.sign -- confirm this matches the
# cells that originally produced these variables.
artist_df_zero_one = np.sign(artist_df)
model_nn_binary = NearestNeighbors(metric = 'cosine')
model_nn_binary.fit(csr_matrix(artist_df_zero_one.values))

print_artist_recommendations('mariah carey', artist_df_zero_one, model_nn_binary, k = 10)

Possible matches: [('ian carey', 76), ('marc cary', 76), ('maria rerych', 75), ('mariah carey', 100)]

Recommendations for mariah carey:

1: beyonce, with distance of 0.6048546149770182:
2: janet jackson, with distance of 0.6137486429448573:
3: ciara, with distance of 0.6231488268259087:
4: brandy, with distance of 0.6482066027526531:
5: rihanna, with distance of 0.663183465145696:
6: britney spears, with distance of 0.6856637710326403:
7: the pussycat dolls, with distance of 0.6881684820961433:
8: aaliyah, with distance of 0.6900993459733483:
9: fergie, with distance of 0.704764030857187:
10: keyshia cole, with distance of 0.7106147163995169:


