In [1]:
import pandas as pd

In [2]:
data_dir = '/home/vadim/playlist_generation/data/random_data'

In [3]:
tracks = pd.read_csv('{}/tracks.csv'.format(data_dir), index_col=0)
artists = pd.read_csv('{}/artists.csv'.format(data_dir), index_col=0)
albums = pd.read_csv('{}/albums.csv'.format(data_dir), index_col=0)
transactions = pd.read_csv('{}/transactions.csv'.format(data_dir), index_col=0)
playlists = pd.read_csv('{}/playlists.csv'.format(data_dir), index_col=0)

In [4]:
def get_full_dataset(transactions, tracks, playlists):
    full_dataset = pd.merge(transactions, tracks, how='left', on='trackid')
    full_dataset = pd.merge(full_dataset, playlists, how='left', on='pid')
    return full_dataset

In [5]:
full_dataset = get_full_dataset(transactions, tracks, playlists)
full_dataset['rating'] = 1
full_dataset.head()

Unnamed: 0,pid,trackid,popular,artistid,albumid,name,num_followers,rating
0,822314,1149,6600,636,821,going out,1,1
1,822314,230,5313,95,138,going out,1,1
2,822314,4937,4965,876,3355,going out,1,1
3,822595,23973,96,7340,14054,FALL '16,5,1
4,822595,4171,694,1845,2862,FALL '16,5,1


In [6]:
from polara.recommender.models import RecommenderModel
from polara.recommender.data import RecommenderData
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import KFold
from scipy import sparse

In [7]:
class LocalCollectiveEmbeddings(RecommenderModel):
    import scipy.sparse
    
    def __init__(self, *args, **kwargs):
        super(LocalCollectiveEmbeddings, self).__init__(*args, **kwargs)
        self.method = 'LCE'
        
    def reindex_content(self, content_data, col, sort=True, inplace=True):
        grouper = content_data.groupby(col, sort=sort).grouper
        new_val = grouper.group_info[1]
        old_val = grouper.levels[0]
        val_transform = pd.DataFrame({'old': old_val, 'new': new_val})
        new_data = grouper.group_info[0]

        if inplace:
            result = val_transform
            content_data.loc[:, col] = new_data
        else:
            result = (new_data, val_transform)
        return result
        
    def reindex_content_columns(self, content_data, columns):
        index_content = {}
        for col in columns:
            index_content[col] = self.reindex_content(content_data, col)
        return index_content
    
    def get_train_content(self, content_data):
        self.train_content = content_data
    
    def get_content_shape(self):
        self.content_shape = {}
        for col in self.train_content.columns:
            self.content_shape[col] = self.train_content[col].max() + 1

    def get_training_content_matrix(self):
        self.get_content_shape()
        idx_pid = self.data.training[self.data.fields[0]].values
        val = np.ones(self.data.training.shape[0])
        
        i = 0
        Xu = []
        
        for col in self.train_content.columns:
            idx = self.train_content[col].values
            shp = (idx_pid.max() + 1, 
                   self.content_shape[col])
        
            Xu_new = sparse.csr_matrix((val, (idx_pid, idx)), 
                                    shape=shp)
            
            if i == 0:
                Xu = Xu_new
            else:
                Xu = sparse.hstack((Xu, Xu_new))
            
            i += 1
        
        return Xu
    
    def get_test_content_matrix(self):
        self.get_content_shape()
        idx_pid = self.data.test.testset[self.data.fields[0]].values
        val = np.ones(self.data.test.testset.shape[0])
        
        i = 0
        Xu = []
        
        for col in self.train_content.columns:
            idx = self.data.test.testset[col].values
            shp = (idx_pid.max() + 1, 
                   self.content_shape[col])
        
            Xu_new = sparse.csr_matrix((val, (idx_pid, idx)), 
                                       shape=shp)
            
            if i == 0:
                Xu = Xu_new
            else:
                Xu = sparse.hstack((Xu, Xu_new))
            
            i += 1
        
        
        return Xu
        
        
    def construct_A(self, X, k=15, binary=False):
        nbrs = NearestNeighbors(n_neighbors=1 + k).fit(X)
        if binary:
            A = nbrs.kneighbors_graph(X)
        else:
            A = nbrs.kneighbors_graph(X, mode='distance')
            
        return A
    
    def build(self, content_data, 
              k=10, alpha=0.1, beta=0.05, lamb=0.001, 
              epsilon=0.01, maxiter=150, verbose=True):
        
        self.get_train_content(content_data)
        
        R = self.get_training_matrix(dtype='float64')
        Xu = self.get_training_content_matrix()
        A = self.construct_A(Xu, k)
        
        n = R.shape[0]
        v1 = R.shape[1]
        v2 = Xu.shape[1]
        
        W = np.abs(sparse.rand(n, k, 0.5, 'csr', dtype=R.dtype))
        Hi = np.abs(sparse.rand(k, v1, 0.5, 'csr', dtype=R.dtype))
        Hu = np.abs(sparse.rand(k, v2, 0.5, 'csr', dtype=R.dtype))
            
        D = sparse.dia_matrix((A.sum(axis=0), 0), A.shape)

        
        gamma = 1. - alpha
        
        trRtR = tr(R, R)
        trXutXu = tr(Xu, Xu)

        WtW = W.T.dot(W)
        WtR = W.T.dot(R)
        WtXu = W.T.dot(Xu)
        WtWHi = WtW.dot(Hi)
        WtWHu = WtW.dot(Hu)
        DW = D.dot(W)
        AW = A.dot(W)

        itNum = 1
        delta = 2.0 * epsilon

        ObjHist = []

        while True:
            
            # update H
            Hi_1 = np.divide(
                (alpha * WtR), (alpha * WtWHi + lamb * Hi).maximum(1e-6))
    
            Hi = Hi.multiply(Hi_1)
            
            Hu_1 = np.divide(
                (gamma * WtXu), (gamma * WtWHu + lamb * Hu).maximum(1e-6))
            Hu = Hu.multiply(Hu_1)
            
            # update W
            W_t1 = alpha * R.dot(Hi.T) + gamma * Xu.dot(Hu.T) + beta * AW
            
            W_t2 = alpha * W.dot(Hi.dot(Hi.T)) + gamma * \
            W.dot(Hu.dot(Hu.T)) + beta * DW + lamb * W
            
            W_t3 = np.divide(W_t1, (W_t2).maximum(1e-6))
            W = W.multiply(W_t3)

            # calculate objective function
            WtW = W.T.dot(W)
            WtR = W.T.dot(R)
            WtXu = W.T.dot(Xu)
            WtWHi = WtW.dot(Hi)
            WtWHu = WtW.dot(Hu)
            DW = D.dot(W)
            AW = A.dot(W)

            tr1 = alpha * (trRtR - 2. * tr(Hi, WtR) + tr(Hi, WtWHi))
            tr2 = gamma * (trXutXu - 2. * tr(Hu, WtXu) + tr(Hu, WtWHu))
            tr3 = beta * (tr(W, DW) - tr(W, AW))
            tr4 = lamb * (WtW.diagonal().sum() + tr(Hi, Hi) + tr(Hu, Hu))

            Obj = tr1 + tr2 + tr3 + tr4
            ObjHist.append(Obj)

            if itNum > 1:
                delta = abs(ObjHist[-1] - ObjHist[-2])
                if verbose:
                    print ("Iteration: ", itNum, "Objective: ", Obj, "Delta: ", delta)
                if itNum > maxiter or delta < epsilon:
                    break

            itNum += 1
            
        self.W = W
        self.Hu = Hu 
        self.Hi = Hi
        
        
    def get_recommendations(self):
        Xu = self.get_test_content_matrix()
        Wt = np.linalg.lstsq(self.Hu.T.toarray(), Xu.T.toarray(), rcond=-1)[0]
        R = Wt.T.dot(self.Hi.toarray())
        return np.flip(np.argsort(R, axis=1), axis=1)[:self.topk]

In [8]:
def reindex_content(content_data, col, sort=True, inplace=True):
    grouper = content_data.groupby(col, sort=sort).grouper
    new_val = grouper.group_info[1]
    old_val = grouper.levels[0]
    val_transform = pd.DataFrame({'old': old_val, 'new': new_val})
    new_data = grouper.group_info[0]

    if inplace:
        result = val_transform
        content_data.loc[:, col] = new_data
    else:
        result = (new_data, val_transform)
    return result
        
def reindex_content_columns(content_data, columns):
    index_content = {}
    for col in columns:
        index_content[col] = reindex_content(content_data, col)
    return index_content

In [9]:
def tr(A, B):
    x = A.multiply(B)
    return (x.sum(axis=0)).sum(axis=1)

In [10]:
index_content = reindex_content_columns(full_dataset, ['artistid', 'albumid'])

In [11]:
data_model = RecommenderData(full_dataset,'pid', 'trackid', 'rating', seed=0)
data_model.prepare_training_only()

Preparing data...
Done.


In [12]:
a = LocalCollectiveEmbeddings(data_model)

In [13]:
a.build(full_dataset[['artistid', 'albumid']], verbose=False)



In [14]:
def cross_validation_lce(data, n_splits=5, seed=1, test_size=0.05, topk=500, rank=10):
    
    index_content = reindex_content_columns(data, ['artistid', 'albumid'])
    
    kf = KFold(n_splits=n_splits, random_state=seed)
    i = 1
    scores = {'precision': [], 'recall': [], 'miss_rate': [], 'nDCG': []}
    for users_ids, unseen_users_ids in kf.split(data['pid'].drop_duplicates()):
        print('=========================Fold {}============================='.format(i))
        i += 1
        users = data['pid'].drop_duplicates().values[users_ids]
        unseen_users = data['pid'].drop_duplicates().values[unseen_users_ids]
        train = data.query('pid in @users')
        test = data.query('pid in @unseen_users')
        train_albums = train.albumid.unique()
        train_artists = train.artistid.unique()
        test = test.query('albumid in @train_albums')
        test = test.query('artistid in @train_artists')
        test_sampled = test.sample(frac=1-test_size, random_state=seed).sort_values('pid')
        test_holdout = test[~test.index.isin(test_sampled.index)]
        
        data_model = RecommenderData(train,'pid', 'trackid', 'rating', seed=seed)
        data_model.prepare_training_only()
        lce = LocalCollectiveEmbeddings(data_model)
        lce.build(train[['artistid', 'albumid']], verbose=False)
        
        data_model.set_test_data(testset=test_sampled, holdout=test_holdout, warm_start=True)
        lce.switch_positive = 1
        lce.topk = topk
        relevance = lce.evaluate('relevance')
        ranking = lce.evaluate('ranking')
        
        scores['precision'].append(relevance.precision)
        scores['recall'].append(relevance.recall)
        scores['miss_rate'].append(relevance.miss_rate)
        scores['nDCG'].append(ranking.nDCG)
        
    result = pd.DataFrame(scores)
    return result

In [15]:
cross_validation_lce(full_dataset, test_size=0.009, topk=10)

Preparing data...
Done.




2 unique trackid's within 2 testset interactions were filtered. Reason: not in the training data.
137 pid's were filtered out from testset. Reason: inconsistent with holdout.
Preparing data...
Done.
2 unique trackid's within 3 testset interactions were filtered. Reason: not in the training data.
2 pid's were filtered out from holdout. Reason: inconsistent with testset.
128 pid's were filtered out from testset. Reason: inconsistent with holdout.
Preparing data...
Done.
3 unique trackid's within 4 testset interactions were filtered. Reason: not in the training data.
131 pid's were filtered out from testset. Reason: inconsistent with holdout.
Preparing data...
Done.
1 unique trackid's within 1 testset interactions were filtered. Reason: not in the training data.
131 pid's were filtered out from testset. Reason: inconsistent with holdout.
Preparing data...
Done.
4 unique trackid's within 6 testset interactions were filtered. Reason: not in the training data.
137 pid's were filtered out fro

Unnamed: 0,miss_rate,nDCG,precision,recall
0,0.0,0.18439,1.0,1.0
1,0.0,0.125873,1.0,1.0
2,0.0,0.188345,1.0,1.0
3,0.0,0.208832,1.0,1.0
4,0.0,0.193643,1.0,1.0
