In [190]:
import annoy as an
import scipy as sp
import numpy as np
import pandas as pd
import time
import lightfm.evaluation
from lightfm import LightFM
from implicit.als import AlternatingLeastSquares
import sklearn.metrics
import math

In [None]:
data = pd.read_table("/mnt/data/lastfm/360k/lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv", 
                         usecols=[0, 2, 3], 
                         names=['user', 'artist', 'plays'],na_filter=False)
data['user'] = data['user'].astype("category")
data['artist'] = data['artist'].astype("category")
plays = sp.sparse.coo_matrix((data['plays'].astype(float), 
                   (data['user'].cat.codes,
                    data['artist'].cat.codes)))

In [86]:
#small_data = pd.read_csv('/mnt/data/lastfm/2k/user_artists.dat', sep = '\t')
#plays = sp.sparse.coo_matrix((small_data['weight'].astype(float), 
#                                             (small_data['userID'], 
#                                              small_data['artistID'])))

In [8]:
def save_to_file(mat, file):
    with open(file, 'w') as output:
        for i in range(mat.data.shape[0]):
            output.write("%s %s %s\n" % (mat.row[i], mat.col[i], mat.data[i]))

In [9]:
#save_to_file(train, "/home/kaijiang/mybooks/lastfm_train.coo")
#save_to_file(test, '/home/kaijiang/mybooks/lastfm_test.coo')

In [194]:
def split_train_test(plays, train_rate = 0.8):
    user_index = range(plays.shape[0])
    train = plays.copy().tolil()
    test = sp.sparse.lil_matrix(plays.shape)
    
    min_rows = int(1/(1 - train_rate))
    for uindex in user_index:
        rows = plays.getrow(uindex).indices
        if len(rows) <= min_rows:
            continue
        testindics = np.random.choice(plays.getrow(uindex).indices, 
                                        size=int(len(rows) * (1 - train_rate)), 
                                        replace=False)
        train[uindex, testindics] = 0.
        test[uindex, testindics] = plays[uindex, testindics]
    
    train = train.tocsr()
    train.eliminate_zeros()
    return train, test.tocsr()

def train_pair_wise_model_and_evaluate(train, test = None, factors = 10, epochs = 10, learning_rate = 0.05, loss = 'bpr', eva = True):
    tic = time.time()
    model = LightFM(no_components = factors, learning_rate=learning_rate, loss=loss)
    model.fit(train, epochs=epochs, num_threads = 2)
    toc = time.time()
    print("LightFM training cost %.2f seconds" % (toc - tic))
    
    if test is not None and eva:
        eva_test = lightfm.evaluation.auc_score(model, test, num_threads = 2)
        print("User auc mean = %.2f, std = %.2f (on testing dataset)" % (eva_test.mean(), eva_test.std()))
    
    return model

In [193]:
train_mat, test_mat = split_train_test(plays.tocsr())

In [67]:
model = train_pair_wise_model_and_evaluate(train_mat, test = test_mat)

LightFM training cost 0.58 seconds
User auc mean = 0.77, std = 0.17 (on testing dataset)


In [169]:
def generate_hot_item_list(plays, top = 1000):
    item_indexs, item_counts = np.unique(plays.col, return_counts = True)
    items_played_count = filter(lambda item_pair: item_pair[1] > 10, list(zip(item_indexs, item_counts)))
    
    return sorted(list(items_played_count), key = lambda i:i[1], reverse = True)[: top]

为一个指定的用户产生负样本。产生的方式是：

1. 从热门Item中去掉用户已收听的Item
2. 按照热门程度加权采样

In [183]:
def generate_negative_samples(uindex, plays, hot_items, negative_count = 5):
    history = set(plays.getrow(2).indices)
    softmax_total = 0.0
    candidates = []
    probability = []
    for (item, weight) in hot_items:
        if item in history:
            continue
        weight = math.exp(weight)
        candidates.append(item)
        probability.append(weight)
        softmax_total += weight
    if negative_count > len(candidates):
        negative_count = len(candidates)
    probability = np.array(probability)/softmax_total
    if len(candidates) == 0:
        return []
    return np.random.choice(np.array(candidates), negative_count, p = probability, replace=False)

#negative_samples = generate_negative_samples(2, plays.tocsr(), hot_items, 50)

In [178]:
def evaluate_point_wise_model(model, plays, test, num_test_users = -1):
    hot_items = generate_hot_item_list(test_mat.tocoo())
    user_indexes = range(plays.shape[0])
    aucs = []
    if num_test_users > 0:
        user_indexes = np.random.choice(user_indexes, num_test_users)
    for uindex in user_indexes:
        positive_samples = test_mat.tocsr().getrow(uindex).indices
        negative_samples = generate_negative_samples(uindex, plays.tocsr(), hot_items, len(positive_samples))
        if len(negative_samples) == 0:
            continue
        user_factor = model.user_factors[uindex].reshape((1, model.factors))
        user_samples = np.concatenate((positive_samples,  negative_samples), axis = 0)
        user_feedback = np.concatenate((np.full(len(positive_samples), 1), np.full(len(negative_samples), 0)), axis = 0)
        item_factors = model.item_factors[user_samples]
        scores = np.dot(user_factor, item_factors.transpose()).reshape(len(user_feedback))
        fpr, tpr, thresholds = sklearn.metrics.roc_curve(user_feedback, scores, pos_label=1)
        auc = sklearn.metrics.auc(fpr, tpr)
        aucs.append(auc)
    return np.array(aucs)

In [195]:
def train_point_wise_model_and_evaluate(train, plays = None, test = None, factors = 10, epochs = 10, learning_rate = 0.05,num_test_users = -1, eva = True):
    tic = time.time()
    model = AlternatingLeastSquares(factors = factors, iterations = epochs)
    model.fit(train.transpose())
    toc = time.time()
    print("ALS training cost %.2f seconds" % (toc - tic))
    
    if eva:
        eva_test = evaluate_point_wise_model(model, plays, test, num_test_users)
        print("User auc mean = %.2f, std = %.2f (on testing dataset)" % (eva_test.mean(), eva_test.std()))
    return model

In [188]:
model2 = train_point_wise_model_and_evaluate(train_mat, plays, test_mat, factors = 50, epochs = 100, num_test_users = 1000)



ALS training cost 5.95 seconds
User auc mean = 0.50, std = 0.23 (on testing dataset)


In [196]:
model1 = train_pair_wise_model_and_evaluate(train_mat, test = None, eva = False)
model2 = train_point_wise_model_and_evaluate(train_mat, eva = False)



LightFM training cost 196.06 seconds
ALS training cost 45.03 seconds


In [257]:
class Recommender(object):
    def __init__(self, models = {}, plays = None, artists = None):
        self.models = models
        self.plays = plays
        self.artists = artists
        
    def recommend(self, userid, modelname = 'bpr', top = 10):
        if modelname not in self.models:
            return []
        
        recommend_list = []
        if modelname == 'bpr':
            recommend_list = self._recommend_with_bpr(userid, top)
        elif modelname == 'als':
            recommend_list = self._recommend_with_als(userid, top)
        
        return self._output_more(userid, None, recommend_list)
    
    def similar_items(self, itemid, top = 10):
        model = self.models['als']
        similar_items = model.similar_items(itemid, top) 
        return self._output_more(None, itemid, similar_items)
    
    def _recommend_with_bpr(self, userid, top):
        """
        compute recommendation for user
        """
        model = self.models['bpr']
        item_factors = model.get_item_representations()
        user_factors = model.get_user_representations()
        item_factors = np.c_[item_factors[1], item_factors[0]]
        user_factors = np.c_[user_factors[1], user_factors[0]]
        scores = np.dot(user_factors[userid], item_factors.transpose())
        items = range(item_factors.shape[0])
        
        """
        filter the items the user has consumed. 
        """
        sorted_items = sorted(zip(items, scores), key = lambda x : x[1], reverse = True)
        history = set(self.plays.getrow(2).indices)
        
        recommendations = []
        for item in sorted_items:
            if item[0] in history:
                continue
            recommendations.append(item)
            if len(recommendations) == top:
                break
        return recommendations
    
    def _recommend_with_als(self, userid, top):
        model = self.models['als']
        return model.recommend(userid, self.plays, N = top)
    
    def _output_more(self, userid, itemid, item_list):
        userinfo = []
        output_iteminfo = []
        input_iteminfo = []
        if userid:
            userinfo = self._output_user_more_info(userid)
        if item_list:
            output_iteminfo = self._output_items_more_info(item_list)
        if itemid:
            input_iteminfo = self._output_items_more_info([itemid])
        return {'user': userinfo, 'item':input_iteminfo, 'items': iteminfo}
    
    def _output_user_more_info(self, userid):
        history = self.artists[self.plays.getrow(userid).indices]
        playcount = self.plays.getrow(userid).data
        
        return list(zip(history, playcount))
    
    def _output_items_more_info(self, items):
        itemids, scores = zip(*items)
        iteminfo = self.artists[list(itemids)]
        return list(zip(iteminfo, scores))

In [258]:
recommender = Recommender({'bpr': model1, 'als':model2}, plays, data.artist.as_matrix())

In [271]:
recommendation = recommender.recommend(0)
recommendation

{'items': [('american hamburger', 0.22671124),
  ('渡辺美里', 0.22374471),
  ('flyleaf', 0.22352177),
  ('sonny landreth', 0.22162895),
  ('architecture in helsinki', 0.21808676),
  ('akon', 0.21637484),
  ('talking heads', 0.21613717),
  ('chicks on speed', 0.21517316),
  ('yann tiersen', 0.21392792),
  ('akon', 0.21333641)],
 'user': []}

In [263]:
data[data.user == '00000c289a1829a808ac09c00daf10bc3c4e223b']

Unnamed: 0,user,artist,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706
5,00000c289a1829a808ac09c00daf10bc3c4e223b,red hot chili peppers,691
6,00000c289a1829a808ac09c00daf10bc3c4e223b,magica,545
7,00000c289a1829a808ac09c00daf10bc3c4e223b,the black dahlia murder,507
8,00000c289a1829a808ac09c00daf10bc3c4e223b,the murmurs,424
9,00000c289a1829a808ac09c00daf10bc3c4e223b,lunachicks,403
