By Size, Time and Metrics

In [None]:
run analysis_functions.ipynb #import all helper functions
import numpy as np
import pandas as pd

In [None]:
#import full dataset
#read in full dataset
col_names = ["user_id","artist_mbid","artist_name","plays"]
df_all = pd.read_csv("usersha1-artmbid-artname-plays.tsv", sep = "\t", header = None, names = col_names)
df_all = df_all[df_all.user_id != "sep 20, 2008"]

In [None]:
#Use this to sample full dataset
def get_users(df, n):
    sample_userid = df["user_id"].unique()
    sample_userid = np.random.choice(sample_userid, size = n, replace = False)

    #grab rows with sample user id
    df_sample = df[df.user_id.isin(sample_userid)].reset_index(drop = True)

    return df_sample

In [None]:
from __future__ import print_function
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
import timeit 

sizes = [9000, 20000, 40000, 60000, 80000]

precision_all = []
recal_all = []
coverage_all = []
train_time = []
recommend_time = []

for size in sizes: 
    df = get_users(df_all, size)
    df = df.drop(['Unnamed: 0'], axis=1)
    
    #create FM model with tuned parameters
    model_fm = LightFM(learning_rate=0.05, loss='warp')
    
    #create sparse matrix
    plays_sparse = create_sparse_matrix(i).astype('float')
    print('Matrix Sparsity:', calculate_sparsity(plays_sparse))
    
    #split train test
    train,test = random_train_test_split(plays_sparse)
    
    start = timeit.timeit()
    
    #train model
    model_fm.fit(train, epochs = 30)
    
    end = timeit.timeit()
    train_t = end-start
    
    #evaluate model 
    #coverage, precision, recall, ndcg = evaluate(model_fm_vanilla, "lightfm", test, plays_sparse,liked=train.T.tocsr())
    coverage, precision, recall = evaluate_lightfm(model_fm, plays_sparse, train, test)
    print("Precision:",precision*100,'%')
    print("Recall:",recall*100,'%')
    print("Coverage:",coverage*100,'%')
    
    #time recommend time
    start = timeit.timeit()
    
    (-model.predict(0,np.arange(original.shape[1]))).argsort()[:20]
    
    end = timeit.timeit()
    
    recommend_t = end-start
    
    precision_all.append(precision)
    recal_all.append(recall)
    coverage_all.append(coverage)
    train_time.append(train_t)
    recommend_time.append(recommend_t)

Qualitative Results

In [16]:
df = pd.read_csv('lastfm_9000_users.csv', na_filter=False)
df = df.drop(['Unnamed: 0'], axis=1)
users = list(np.sort(df.user_id.unique()))
plays_sparse = create_sparse_matrix(df).astype('float')

Creating sparse matrix...




In [320]:
#list of artists

artists_rock = [
    'the beatles',
    'the rolling stones',
    'led zeppelin',
    'queen',
    'pink floyd',
    'ac/dc',
    'guns n\' roses',
    'aerosmith'
]

artists_rap = [
    'kanye west',
    '2pac',
    'lil wayne',
    'eminem',
    'young jeezy',
    'jay-z',
    'drake'
]

artists_tim = [
    'daft punk',
    'deadmau5',
    'john mayer',
    'hans zimmer',
    'coldplay'
]

# tim's brother
# artists_deus = [
#     'coldplay',
#     'john mayer',
#     'justin timberlake'
# ]

artists_char = [
           'solange', 
           'sufjan stevens',  
           'beirut', 
           'yo la tengo', 
           'little dragon', 
           'crystal castles', 
           'a tribe called quest', 
           'radiohead', 
           'faye wong',
           'the beach boys',
           'van morrison',
           'marvin gaye',
           'whitney houston',
           'kanye west',
           'curtis mayfield',
           '2pac',
           'eagles',
           'david bowie',
           'prince'
]

new_users = [('rap', artists_rap),('rock', artists_rock), ('tim', artists_tim), ('char', artists_char)]

In [321]:
#get artist_name from artist_id
#def get_artist_name(artist_idx):

def add_user(data, new_user, value = 1000):
    col_names = ['user_id','artist_mbid','artist_name','plays']
    mbid_dict = pd.Series(data.artist_mbid.values, index = data.artist_name).to_dict()
    final = []
    
    user_name = new_user[0]
    user_artists = new_user[1]
    
    for artist in user_artists:
        final.append([user_name, mbid_dict[artist], artist, value])
    return pd.concat([data, pd.DataFrame(final, columns = col_names)])

def create_artist_mapping(data):
    artists = list(data.artist_mbid.unique())
    artist_categories = data.artist_mbid.astype('category', categories=artists).cat.categories
    name_dict = pd.Series(data.artist_name.values, index = data.artist_mbid).to_dict()
    artist_map = dict((i, name_dict[x]) for (i, x) in enumerate(artist_categories))
    return artist_map

def create_user_mapping(data):
    users = list(data.user_id.unique())
    user_categories = data.user_id.astype('category', categories=users).cat.categories
    user_map = dict((x, i) for (i, x) in enumerate(user_categories))
    return user_map

def get_artist_name(list_idx, artist_mapping):
    return [artist_mapping[idx] for idx in list_idx]

In [322]:
run analysis_functions.ipynb #import all helper functions

## ALS

In [323]:
#use implicit to get recommended for users
model_als = implicit.als.AlternatingLeastSquares(factors = 30, regularization = 0.01)

for new in new_users: 
    new_users_df = add_user(df, new, 1000)

    #create mappings
    artist_mapping = create_artist_mapping(new_users_df)
    user_mapping = create_user_mapping(new_users_df)

    plays_sparse = create_sparse_matrix(new_users_df)
    train, test, user_count = split_train_test_per_user(plays_sparse, 3, 10)

    # Train model
    print("Fitting model...")
    model_als.fit(train)

    recs = model_als.recommend(user_mapping[new[0]], train.T.tocsr(), N=20, filter_already_liked_items=True)#, artist_mapping) #returns (item_id, score)
    recs = [x[0] for x in recs]
    
    print("User ", new[0], " :\n", get_artist_name(recs, artist_mapping))

    



Creating sparse matrix...




HBox(children=(IntProgress(value=0, max=9001), HTML(value=u'')))

  0%|          | 0/15 [00:00<?, ?it/s]

Fitting model...


100%|██████████| 15.0/15 [00:05<00:00,  2.92it/s]


User  rap  :
 ['lupe fiasco', 'lil wayne', '2pac', '50 cent', 't.i.', 'ludacris', 'the game', 'jay-z', 'kanye west', 'eminem', 'snoop dogg', 'nas', 'notorious b.i.g.', 'dr. dre', 'bone thugs-n-harmony', 'drake', 'akon', 'young jeezy', 'ice cube', 't-pain']
Creating sparse matrix...


HBox(children=(IntProgress(value=0, max=9001), HTML(value=u'')))

  3%|▎         | 0.5/15 [00:00<00:04,  3.20it/s]

Fitting model...


100%|██████████| 15.0/15 [00:05<00:00,  2.76it/s]


User  rock  :
 ['martingo', 'the beatles', 'ac/dc', 'queen', "guns n' roses", 'pink floyd', 'led zeppelin', 'red hot chili peppers', 'nirvana', 'the rolling stones', 'aerosmith', 'metallica', 'u2', 'the doors', 'muse', 'radiohead', 'coldplay', 'r.e.m.', 'johnny cash', 'the who']
Creating sparse matrix...


HBox(children=(IntProgress(value=0, max=9001), HTML(value=u'')))

  3%|▎         | 0.5/15 [00:00<00:04,  3.06it/s]

Fitting model...


100%|██████████| 15.0/15 [00:06<00:00,  2.51it/s]


User  tim  :
 ['martingo', 'coldplay', 'michael jackson', 'daft punk', 'kanye west', 'hans zimmer', 'john mayer', 'soundtrack', 'linkin park', 'red hot chili peppers', 'muse', 'eminem', 'the prodigy', 'the killers', 'moby', 'the beatles', 'deadmau5', 'pendulum', 'david guetta', 'gorillaz']
Creating sparse matrix...


HBox(children=(IntProgress(value=0, max=9001), HTML(value=u'')))

  3%|▎         | 0.5/15 [00:00<00:04,  3.15it/s]

Fitting model...


100%|██████████| 15.0/15 [00:06<00:00,  2.48it/s]

User  char  :
 ['martingo', 'die \xc3\x84rzte', 'die toten hosen', 'red hot chili peppers', '[unknown]', 'mando diao', 'the beatles', 'clueso', 'farin urlaub', 'beatsteaks', 'johnny cash', 'jack johnson', 'sportfreunde stiller', 'deichkind', 'die fantastischen vier', 'b\xc3\xb6hse onkelz', 'peter fox', 'coldplay', 'madsen', 'linkin park']





## LightFM

In [329]:
#use implicit to get recommended for users
from lightfm.cross_validation import random_train_test_split

#create FM model with tuned parameters
model_fm = LightFM(learning_rate=0.05, loss='warp')

for new in new_users: 
    new_users_df = add_user(df, new, 1000)

    #create mappings
    artist_mapping = create_artist_mapping(new_users_df)
    user_mapping = create_user_mapping(new_users_df)
    
    #create sparse matrix
    plays_sparse = create_sparse_matrix(new_users_df)
    
    #split train test
    train,test = random_train_test_split(plays_sparse)

    print("training...")
    #train model
    model_fm.fit(train, epochs = 30)
    
    print("recommending...")
    #get recommendation from model
    recs = (-model_fm.predict(0,np.arange(plays_sparse.shape[1]))).argsort()[:20]
    
    print("User ", new[0], " :\n", get_artist_name(recs, artist_mapping))
    
    
    



Creating sparse matrix...




User  rap  :
 ['martingo', 'the cure', 'radiohead', 'joy division', 'aphex twin', 'david bowie', 'depeche mode', 'boards of canada', 'sigur r\xc3\xb3s', 'the beatles', 'bj\xc3\xb6rk', 'pixies', 'the smiths', 'sonic youth', 'nirvana', 'portishead', 'beirut', 'new order', 'nick cave and the bad seeds', 'pink floyd']
Creating sparse matrix...
User  rock  :
 ['martingo', 'pink floyd', 'tom waits', 'the beatles', 'johnny cash', 'radiohead', 'nine inch nails', 'the cure', 'nick cave and the bad seeds', 'joy division', 'tool', 'david bowie', 'the doors', 'sigur r\xc3\xb3s', 'the clash', 'led zeppelin', 'bob dylan', 'porcupine tree', 'the smashing pumpkins', 'leonard cohen']
Creating sparse matrix...
User  tim  :
 ['martingo', 'the beatles', 'radiohead', 'tom waits', 'the cure', 'david bowie', 'pink floyd', 'bob dylan', 'the clash', 'joy division', 'pj harvey', 'pixies', 'the doors', 'the velvet underground', 'nick cave and the bad seeds', 'the rolling stones', 'the smiths', 'siouxsie and the 

## Baseline

In [263]:
#use baseline to get recommended users
model_baseline = Baseline(n_recs = 20)
model_baseline.fit(train)
get_artist_name(model_baseline.predict(), artist_mapping)

Fitting baseline...
[332922  34681  68905 ...     28     27     24]


['martingo',
 'the beatles',
 'radiohead',
 'madonna',
 'coldplay',
 'nine inch nails',
 'marilyn manson',
 'metallica',
 'muse',
 'pink floyd',
 'linkin park',
 'red hot chili peppers',
 'system of a down',
 'death cab for cutie',
 'placebo',
 'nightwish',
 'depeche mode',
 '2pac',
 'the killers',
 'in flames']

LightFM Troubleshoot Fit Partial

In [52]:
from __future__ import print_function
from lightfm import LightFM

In [53]:
run analysis_functions.ipynb #import all helper functions

In [54]:
# Import main dataset
df = pd.read_csv('lastfm_9000_users.csv', na_filter=False)
df = df.drop(['Unnamed: 0'], axis=1)

#create sparse matrix
plays_sparse = create_sparse_matrix(df).astype('float')
print('Matrix Sparsity:', calculate_sparsity(plays_sparse))

import lightfm
from lightfm.data import Dataset
from lightfm import cross_validation

# Initialize lightfm Dataset()
int_df = df.drop(['artist_name'], axis=1)
int_data = lightfm.data.Dataset(user_identity_features=True, item_identity_features=True)

# Create tuples for int_df values, as well as separate tuples for user_id and artist_id
tuples = [tuple(x) for x in int_df.values]
user_id = [tuple(x) for x in df.user_id]
artist_id = [tuple(x) for x in df.artist_mbid]

# Fit int_data
int_data.fit(df.user_id, df.artist_mbid)

# Build interactions, weights
interactions, weights = int_data.build_interactions(tuples)

# Create training and test sets for lightFM
train_light, test_light = lightfm.cross_validation.random_train_test_split(plays_sparse, test_percentage=0.6, random_state=None)

model_fm_vanilla = LightFM(learning_rate=0.05, loss='warp')

# Train Model
print("Fitting model...")
model_fm_vanilla.fit(train_light, epochs=10, verbose = True)

Creating sparse matrix...




Matrix Sparsity: 99.8965986346
Fitting model...
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9


<lightfm.lightfm.LightFM at 0x1a1a86db50>