In [2]:
import lightfm
from lightfm.data import Dataset

import json
from itertools import islice

import pandas as pd
import numpy as np
from collections import Counter

from scipy.sparse import csr_matrix



## Data Transform

In [3]:
from lightfm.cross_validation import random_train_test_split

def transform_interaction(df, test_percent):
    
    '''
        Params:
            df: dataframe which must contain 'user_id','book_id','rating' columns
            test_percent: percent of data to be put into test set
            
        Return: 
            train_matrix: a scipy sparse matrix containing interactions for training
            test_matrix: a scipy sparse matrix containing interactions for testing
            
            return type: a tuple (train_matrix, test_matrix) of scipy.sparse.COOMatrix
    '''
    
    interaction = pd.pivot_table(df, index='user_id', columns='book_id', values='rating')
    interaction = interaction.fillna(0)
    
    all_csr = csr_matrix(interaction.values)
    
    (train_matrix, test_matrix) = random_train_test_split(all_csr, test_percentage=test_percent)
    
    return (train_matrix, test_matrix)

## Train and test

In [4]:
from lightfm import LightFM

def lightfm_train(train, rank, regParam, maxIter, model_type='warp'):
    
    '''
        Params:
            train: training csr matrix in form of scipy.sparse.COOMatrix
            rank: dimensionality of the feature latent embeddings
            regParam: L2 penalty on user features
            maxIter: number of epochs to run
            model_type: 'warp' - Weighted Approximate-Rank Pairwise Loss 
                        'bpr' - Bayesian Personalised Ranking
            
        Return: 
            model: lightfm model trained on training set
            
            return type: lightfm instance
    '''
    if model_type == 'bpr':
        model = LightFM(loss='bpr',
                no_components=rank,
                user_alpha=regParam)
        
    else:    
        model = LightFM(loss='warp',
                no_components=rank,
                user_alpha=regParam)

    model = model.fit(train, epochs=maxIter,verbose=False)
    
    return model

In [5]:
from lightfm.evaluation import precision_at_k
from time import time

def train_and_test(train, test, rank, regParam, maxIter, top=500, model_type='warp'):
    
    '''
        Params:
            train: training csr matrix in form of scipy.sparse.COOMatrix
            test: test csr matrix in form of scipy_sparse_COOMatrix
            rank: dimensionality of the feature latent embeddings
            regParam: L2 penalty on user features
            maxIter: number of epochs to run
            top: number of top recommendations to evaluate on
            model_type: 'warp' - Weighted Approximate-Rank Pairwise Loss 
                        'bpr' - Bayesian Personalised Ranking
            
        Return: 
            p_at_k: precision at k
            time: time for train and evaluation
    '''
    
    st = time()
    
    model = lightfm_train(train, rank, regParam, maxIter, model_type='warp')
    p_at_k = precision_at_k(model, test, k=top).mean()
    
    t = round(time()-st, 5)
    
    print('Model with maxIter = {}, reg = {}, rank = {} complete'.format(maxIter,regParam,rank))
    print('Precision at K:', p_at_k)
    print('Time used:', t)
    
    return p_at_k, t

## Experiment on 1% data

In [6]:
train1 = pd.read_csv('train1.csv')
val1 = pd.read_csv('valid1.csv')
test1 = pd.read_csv('test1.csv')

raw1 = pd.concat([train1, val1, test1])
raw1 = raw1[['0','1','3']].rename(columns={"0": "user_id", "1": "book_id", "3":"rating"})

In [7]:
raw1.shape

(198323, 3)

In [12]:
train_1, test_1 = transform_interaction(raw1, 0.2)

In [13]:
for rank in [10,20,30,40,50,60,70,80,90,100,120,140,160,180]:
    pk_1, time_1 = train_and_test(train_1, test_1, rank, 0.01, 10, top=500, model_type='warp')

Model with maxIter = 10, reg = 0.01, rank = 10 complete
('Precision at K:', 2.190369e-05)
('Time used:', 59.92625)
Model with maxIter = 10, reg = 0.01, rank = 20 complete
('Precision at K:', 2.142052e-05)
('Time used:', 64.53654)
Model with maxIter = 10, reg = 0.01, rank = 30 complete
('Precision at K:', 1.9326784e-05)
('Time used:', 73.01607)
Model with maxIter = 10, reg = 0.01, rank = 40 complete
('Precision at K:', 2.1259462e-05)
('Time used:', 67.46433)
Model with maxIter = 10, reg = 0.01, rank = 50 complete
('Precision at K:', 1.997101e-05)
('Time used:', 78.93602)
Model with maxIter = 10, reg = 0.01, rank = 60 complete
('Precision at K:', 1.8038332e-05)
('Time used:', 83.20069)
Model with maxIter = 10, reg = 0.01, rank = 70 complete
('Precision at K:', 2.0293122e-05)
('Time used:', 96.41673)
Model with maxIter = 10, reg = 0.01, rank = 80 complete
('Precision at K:', 1.9326784e-05)
('Time used:', 89.76761)
Model with maxIter = 10, reg = 0.01, rank = 90 complete
('Precision at K:',

In [14]:
for rank in [10,20,30,40,50,60,70,80,90,100,120,140,160,180]:
    pk_1, time_1 = train_and_test(train_1, test_1, rank, 0.01, 10, top=500, model_type='bpr')

Model with maxIter = 10, reg = 0.01, rank = 10 complete
('Precision at K:', 2.22258e-05)
('Time used:', 64.43484)
Model with maxIter = 10, reg = 0.01, rank = 20 complete
('Precision at K:', 1.8199387e-05)
('Time used:', 65.37152)
Model with maxIter = 10, reg = 0.01, rank = 30 complete
('Precision at K:', 1.9326784e-05)
('Time used:', 74.37135)
Model with maxIter = 10, reg = 0.01, rank = 40 complete
('Precision at K:', 2.029312e-05)
('Time used:', 68.85651)
Model with maxIter = 10, reg = 0.01, rank = 50 complete
('Precision at K:', 1.9648896e-05)
('Time used:', 78.62235)
Model with maxIter = 10, reg = 0.01, rank = 60 complete
('Precision at K:', 1.6588823e-05)
('Time used:', 83.94908)
Model with maxIter = 10, reg = 0.01, rank = 70 complete
('Precision at K:', 2.045418e-05)
('Time used:', 96.65876)
Model with maxIter = 10, reg = 0.01, rank = 80 complete
('Precision at K:', 2.2064747e-05)
('Time used:', 95.79974)
Model with maxIter = 10, reg = 0.01, rank = 90 complete
('Precision at K:', 

In [10]:
for regParam in [0.001, 0.01, 0.05, 0.1, 0.5]:
    pk_2, time_2 = train_and_test(train_1, test_1, 160, regParam, 10, top=500, model_type='warp')

Model with maxIter = 10, reg = 0.001, rank = 160 complete
('Precision at K:', 1.9033792e-05)
('Time used:', 153.94538)
Model with maxIter = 10, reg = 0.01, rank = 160 complete
('Precision at K:', 1.9679006e-05)
('Time used:', 142.97807)
Model with maxIter = 10, reg = 0.05, rank = 160 complete
('Precision at K:', 2.6776353e-05)
('Time used:', 147.91705)
Model with maxIter = 10, reg = 0.1, rank = 160 complete
('Precision at K:', 0.00011871926)
('Time used:', 151.39157)
Model with maxIter = 10, reg = 0.5, rank = 160 complete
('Precision at K:', 0.00042696993)
('Time used:', 420.49977)


In [15]:
for regParam in [0.001, 0.01, 0.05, 0.1, 0.5]:
    pk_2, time_2 = train_and_test(train_1, test_1, 160, regParam, 10, top=500, model_type='bpr')

Model with maxIter = 10, reg = 0.001, rank = 160 complete
('Precision at K:', 2.1742633e-05)
('Time used:', 151.32254)
Model with maxIter = 10, reg = 0.01, rank = 160 complete
('Precision at K:', 2.4319535e-05)
('Time used:', 158.97416)
Model with maxIter = 10, reg = 0.05, rank = 160 complete
('Precision at K:', 2.5930101e-05)
('Time used:', 153.76198)
Model with maxIter = 10, reg = 0.1, rank = 160 complete
('Precision at K:', 0.00015928493)
('Time used:', 160.20987)
Model with maxIter = 10, reg = 0.5, rank = 160 complete
('Precision at K:', 0.00041198262)
('Time used:', 434.86954)
