In [None]:
import scipy as sp
import numpy as np
import pandas as pd
import time
import lightfm.evaluation
from lightfm import LightFM
from implicit.als import AlternatingLeastSquares
import sklearn.metrics
import math
import random

In [3]:
data = pd.read_table("/mnt/data/lastfm/360k/lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv", 
                         usecols=[0, 2, 3], 
                         names=['user', 'artist', 'plays'],na_filter=False)
data['user'] = data['user'].astype("category")
data['artist'] = data['artist'].astype("category")
plays = sp.sparse.coo_matrix((data['plays'].astype(float), 
                   (data['user'].cat.codes,
                    data['artist'].cat.codes)), dtype = np.double)

In [86]:
#small_data = pd.read_csv('/mnt/data/lastfm/2k/user_artists.dat', sep = '\t')
#plays = sp.sparse.coo_matrix((small_data['weight'].astype(float), 
#                                             (small_data['userID'], 
#                                              small_data['artistID'])))

In [77]:
def split_train_test(plays, train_rate = 0.8):
    user_index = range(plays.shape[0])
    train = plays.copy().tolil()
    test = sp.sparse.lil_matrix(plays.shape)
    
    min_rows = int(1/(1 - train_rate))
    for uindex in user_index:
        rows = plays.getrow(uindex).indices
        if len(rows) <= min_rows:
            continue
        testindics = np.random.choice(plays.getrow(uindex).indices, 
                                        size=int(len(rows) * (1 - train_rate)), 
                                        replace=False)
        train[uindex, testindics] = 0.
        test[uindex, testindics] = plays[uindex, testindics]
    
    train = train.tocsr()
    train.eliminate_zeros()
    return train, test.tocsr()

def train_pair_wise_model_and_evaluate(train, test = None, factors = 50, epochs = 100, learning_rate = 0.05, loss = 'bpr', eva = True):
    tic = time.time()
    model = LightFM(no_components = factors, learning_rate=learning_rate, loss=loss)
    model.fit(train, epochs=epochs, num_threads = 2)
    toc = time.time()
    print("LightFM training cost %.2f seconds" % (toc - tic))
    
    if test is not None and eva:
        eva_test = lightfm.evaluation.auc_score(model, test, num_threads = 2)
        print("User auc mean = %.2f, std = %.2f (on testing dataset)" % (eva_test.mean(), eva_test.std()))
    
    return model

In [68]:
train_mat, test_mat = split_train_test(plays.tocsr())

In [67]:
model = train_pair_wise_model_and_evaluate(train_mat, test = test_mat)

LightFM training cost 0.58 seconds
User auc mean = 0.77, std = 0.17 (on testing dataset)


In [169]:
def generate_hot_item_list(plays, top = 1000):
    item_indexs, item_counts = np.unique(plays.col, return_counts = True)
    items_played_count = filter(lambda item_pair: item_pair[1] > 10, list(zip(item_indexs, item_counts)))
    
    return sorted(list(items_played_count), key = lambda i:i[1], reverse = True)[: top]

为一个指定的用户产生负样本。产生的方式是：

1. 从热门Item中去掉用户已收听的Item
2. 按照热门程度加权采样

In [None]:
def weighted_sampling(sequence, k):
    """
    parameters:
    
    sequence -- list-like [(item1, weight1), ...]
    
    k -- number of selected items
    
    return:
    
    list that selected.
    """
    
    weighted_list = []
    for elements in sequence:
        weighted_list.append((elements[0], random.expovariate(elements[1])))
    
    return sorted(weighted_list, key = lambda x : x[1])[:k]

In [7]:
def generate_negative_samples(uindex, plays, hot_items, negative_count = 5):
    history = set(plays.getrow(2).indices)
    candidates = []
    for (item, weight) in hot_items:
        if item in history:
            continue
        candidates.append((item, weight))
    if negative_count > len(candidates):
        negative_count = len(candidates)
    return weighted_sampling(candidates, negative_count)

#negative_samples = generate_negative_samples(2, plays.tocsr(), hot_items, 50)

In [6]:
def evaluate_point_wise_model(model, plays, test, num_test_users = -1):
    hot_items = generate_hot_item_list(test_mat.tocoo())
    user_indexes = range(plays.shape[0])
    aucs = []
    if num_test_users > 0:
        user_indexes = np.random.choice(user_indexes, num_test_users)
    for uindex in user_indexes:
        positive_samples = test_mat.tocsr().getrow(uindex).indices
        negative_samples = generate_negative_samples(uindex, plays.tocsr(), hot_items, len(positive_samples))
        if len(negative_samples) == 0:
            continue
        user_factor = model.user_factors[uindex].reshape((1, model.factors))
        user_samples = np.concatenate((positive_samples,  negative_samples), axis = 0)
        user_feedback = np.concatenate((np.full(len(positive_samples), 1), np.full(len(negative_samples), 0)), axis = 0)
        item_factors = model.item_factors[user_samples]
        scores = np.dot(user_factor, item_factors.transpose()).reshape(len(user_feedback))
        fpr, tpr, thresholds = sklearn.metrics.roc_curve(user_feedback, scores, pos_label=1)
        auc = sklearn.metrics.auc(fpr, tpr)
        aucs.append(auc)
    return np.array(aucs)

In [5]:
def train_point_wise_model_and_evaluate(train, plays = None, test = None, factors = 100, epochs = 100, learning_rate = 0.05,num_test_users = -1, eva = True):
    tic = time.time()
    model = AlternatingLeastSquares(factors = factors, iterations = epochs)
    model.fit(train.transpose())
    toc = time.time()
    print("ALS training cost %.2f seconds" % (toc - tic))
    
    if eva:
        eva_test = evaluate_point_wise_model(model, plays, test, num_test_users)
        print("User auc mean = %.2f, std = %.2f (on testing dataset)" % (eva_test.mean(), eva_test.std()))
    return model

In [188]:
model2 = train_point_wise_model_and_evaluate(train_mat, plays, test_mat, factors = 50, epochs = 100, num_test_users = 1000)



ALS training cost 5.95 seconds
User auc mean = 0.50, std = 0.23 (on testing dataset)


In [None]:
model1 = train_pair_wise_model_and_evaluate(plays, test = None, factors = 100, epochs = 100, eva = False)

In [74]:
model2 = train_point_wise_model_and_evaluate(plays, factors = 100, epochs = 100, eva = False)



ALS training cost 89.32 seconds


In [None]:
class Recommender(object):
    def __init__(self, models = {}, plays = None, artists = None):
        self.models = models
        self.plays = plays
        self.artists = artists
        self.artistsDict = None
        if artists:
            index, names = zip(*list(enumerate(self.artists)))
            self.artistsDict = dict(zip(names, index))
        
    def recommend(self, userid, modelname = 'bpr', top = 10):
        if modelname not in self.models:
            return []
        
        recommend_list = []
        if modelname == 'bpr':
            recommend_list = self._recommend_with_bpr(userid, top)
        elif modelname == 'als':
            recommend_list = self._recommend_with_als(userid, top)
        
        return self._output_more(userid, None, recommend_list)
    
    def similar_items(self, artist_name, top = 10):
        if artist_name not in self.artistsDict:
            return {}
        itemid = self.artistsDict[artist_name]
        model = self.models['als']
        similar_items = model.similar_items(itemid, top) 
        return self._output_more(None, itemid, similar_items)
    
    def _recommend_with_bpr(self, userid, top):
        """
        compute recommendation for user
        """
        model = self.models['bpr']
        item_factors = model.get_item_representations()
        user_factors = model.get_user_representations()
        item_factors = np.c_[item_factors[1], item_factors[0]]
        user_factors = np.c_[user_factors[1], user_factors[0]]
        scores = np.dot(user_factors[userid], item_factors.transpose())
        items = range(item_factors.shape[0])
        
        """
        filter the items the user has consumed. 
        """
        sorted_items = sorted(zip(items, scores), key = lambda x : x[1], reverse = True)
        history = set(self.plays.getrow(2).indices)
        
        recommendations = []
        for item in sorted_items:
            if item[0] in history:
                continue
            recommendations.append(item)
            if len(recommendations) == top:
                break
        return recommendations
    
    def _recommend_with_als(self, userid, top):
        model = self.models['als']
        return model.recommend(userid, self.plays, N = top)
    
    def _output_more(self, userid, itemid, item_list):
        userinfo = []
        output_iteminfo = []
        input_iteminfo = []
        if userid:
            userinfo = self._output_user_more_info(userid)
        if item_list:
            output_iteminfo = self._output_items_more_info(item_list)
        if itemid:
            input_iteminfo = self._output_items_more_info([(itemid, 1)])
        return {'user': userinfo, 'item':input_iteminfo, 'items': output_iteminfo}
    
    def _output_user_more_info(self, userid):
        history = self.artists[self.plays.getrow(userid).indices]
        playcount = self.plays.getrow(userid).data
        
        return list(zip(history, playcount))
    
    def _output_items_more_info(self, items):
        itemids, scores = zip(*items)
        iteminfo = self.artists[list(itemids)]
        return list(zip(iteminfo, scores))

In [88]:
recommender = Recommender({'bpr': model1, 'als':model2}, plays.tocsr(), data.artist.cat.categories)

In [111]:
recommendation1 = recommender.recommend(87120, modelname='bpr')
recommendation2 = recommender.recommend(87120, modelname='als')

{'item': [],
 'items': [('chris brown', 4.6462021),
  ('ne-yo', 4.5142355),
  ('the pussycat dolls', 4.4670238),
  ('jordin sparks', 4.3777504),
  ('ciara', 4.3308463),
  ('danity kane', 4.2887182),
  ('all time low', 4.286067),
  ('a day to remember', 4.2578568),
  ('ashley tisdale', 4.2392902),
  ('hilary duff', 4.1141906)],
 'user': [('2pm', 18.0),
  ('98 degrees', 11.0),
  ('all 4 one', 3.0),
  ('big bang', 102.0),
  ('blue', 7.0),
  ('boa', 6.0),
  ('brian mcnight', 5.0),
  ('brown eyed girls', 43.0),
  ('chin', 2.0),
  ('def tech', 3.0),
  ('golf-mike', 165.0),
  ('jay chou', 57.0),
  ('jewelry', 9.0),
  ('kara', 28.0),
  ('lim jung hee', 2.0),
  ('mariah carey', 11.0),
  ('orange range', 22.0),
  ('shinee', 43.0),
  ('smtown', 4.0),
  ('ss501', 3.0),
  ('super junior', 42.0),
  ('super junior m', 22.0),
  ('taku iwasaki', 3.0),
  ('wada kouji', 35.0),
  ('ไอซ์ ศรัณยู', 14.0),
  ('동방신기', 175.0),
  ('믹키유천', 10.0),
  ('비', 5.0),
  ('소녀시대', 186.0),
  ('영웅재중', 15.0),
  ('이효리', 14.0)]

In [116]:
similar_items = recommender.similar_items('291703', 100)

{'item': [('동방신기', 1)],
 'items': [('동방신기', 1.0000000000000002),
  ('super junior', 0.99441452243596229),
  ('shoujo lolita 23q', 0.98569538147384261),
  ('duke 듀크', 0.97972346457582726),
  ('ayumi hamasaki', 0.9787968827628587),
  ('se7en', 0.97763852248927019),
  ('kat-tun', 0.97714908512757037),
  ('Ｐ’ＵＮＫ～ＥＮ～ＣＩＥＬ', 0.97542793979118014),
  ('news', 0.97522034593182794),
  ('trinití', 0.97389688123598783),
  ('micky', 0.97202826714271406),
  ('morikawa toshiyuki', 0.9692017185374735),
  ('anson hu (胡彦斌)', 0.96816953049726706),
  ('abe mao (阿部真央)', 0.96754087862429672),
  ('ｂｉｓ', 0.96679765586453303),
  ('[¸áąÂÁ÷] flytothesky 3Áý', 0.9661531460322027),
  ('예성', 0.96555176734572878),
  ('sophie ellis bextor', 0.96461654437179079),
  ('nana kitade', 0.964551143832589),
  ('[Ä·ÇÁmp3]ÀÌÁ¤Çö3Áý', 0.96420082429624487),
  ('porno graffitti', 0.96401613192446423),
  ('???;??', 0.96398236034751794),
  ('epik high', 0.96372563928268018),
  ('cheb mami feat. zaho', 0.96269300358238863),
  ('shine