In [1]:
import os
import zipfile
import csv

import requests


def _download(url: str, dest_path: str):

    req = requests.get(url, stream=True)
    req.raise_for_status()

    with open(dest_path, "wb") as fd:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)


def get_data():

    ratings_url = ("http://www2.informatik.uni-freiburg.de/" "~cziegler/BX/BX-CSV-Dump.zip")

    if not os.path.exists("data"):
        os.makedirs("data")

        _download(ratings_url, "data/data.zip")

    with zipfile.ZipFile("data/data.zip") as archive:
        return (
            csv.DictReader(
                (x.decode("utf-8", "ignore") for x in archive.open("BX-Book-Ratings.csv")),
                delimiter=";",
            ),
            csv.DictReader(
                (x.decode("utf-8", "ignore") for x in archive.open("BX-Books.csv")), delimiter=";"
            ),
        )


def get_ratings():

    return get_data()[0]


def get_book_features():

    return get_data()[1]

In [2]:
import json
from itertools import islice

ratings, book_features = get_data()

In [3]:
for line in islice(ratings, 2):
    print(json.dumps(line, indent=4))

{
    "User-ID": "276725",
    "ISBN": "034545104X",
    "Book-Rating": "0"
}
{
    "User-ID": "276726",
    "ISBN": "0155061224",
    "Book-Rating": "5"
}


In [4]:
for line in islice(book_features, 1):
    print(json.dumps(line, indent=4))

{
    "ISBN": "0195153448",
    "Book-Title": "Classical Mythology",
    "Book-Author": "Mark P. O. Morford",
    "Year-Of-Publication": "2002",
    "Publisher": "Oxford University Press",
    "Image-URL-S": "http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg",
    "Image-URL-M": "http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg",
    "Image-URL-L": "http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg"
}


In [5]:
from lightfm.data import Dataset

dataset = Dataset()
dataset.fit((x['User-ID'] for x in get_ratings()),
            (x['ISBN'] for x in get_ratings()))

In [6]:
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 105283, num_items 340553.


In [7]:
dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()),
                    item_features=(x['Book-Author'] for x in get_book_features()))

In [8]:
(interactions, weights) = dataset.build_interactions(((x['User-ID'], x['ISBN'])
                                                      for x in get_ratings()))

print(repr(interactions))

<105283x341762 sparse matrix of type '<class 'numpy.int32'>'
	with 1149780 stored elements in COOrdinate format>


In [9]:
item_features = dataset.build_item_features(((x['ISBN'], [x['Book-Author']])
                                              for x in get_book_features()))
print(repr(item_features))

<341762x443805 sparse matrix of type '<class 'numpy.float32'>'
	with 613141 stored elements in Compressed Sparse Row format>


In [10]:
from lightfm import LightFM

model = LightFM(loss='bpr')
model.fit(interactions, item_features=item_features)

<lightfm.lightfm.LightFM at 0x7fea4011c970>

In [18]:
import numpy as np
def sample_recommendation(model, data, user_ids):
    n_users, n_items = data.interactions_shape()
    
    for user_id in user_ids:
        #known_positives = data['item_labels'][data.tocsr()[user_id].indices]
        
        scores = model.predict(user_id, np.arange(n_items))
        sort_scores = np.argsort(-scores)
        #items = get_ratings()
        print("User %s" % user_id)
        print("scores:")
        for x in sort_scores[:3]:
            print("          %s" % x)
        
        
        
        
        #print("User %s" % user_id)
        #print("    Known positivies:")
        
        #for x in known_positives[:3]:
            #print("      %s" % x)
        
        print("    recommended:")
        
        #for x in top_items[:3]:
            #print("          %s" % x)

In [19]:
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()
id = user_id_map['276725']
sample_recommendation(model, dataset, [id])

User 0
scores:
          452
          421
          1632
    recommended:


In [27]:
from lightfm.evaluation import auc_score
train_auc = auc_score(model,
                      interactions,
                      None,
                      item_features
                     ).mean()
print('Hybrid training set AUC: %s' % train_auc)

ValueError: The user feature matrix specifies more features than there are estimated feature embeddings: 105283 vs 443805.

In [47]:
#for line in islice(book_features, 3):
    #print(json.dumps(line, indent=4))

{
    "ISBN": "0345416848",
    "Book-Title": "Last Sword of Power (Stones of Power)",
    "Book-Author": "David Gemmell",
    "Year-Of-Publication": "1997",
    "Publisher": "Del Rey Books",
    "Image-URL-S": "http://images.amazon.com/images/P/0345416848.01.THUMBZZZ.jpg",
    "Image-URL-M": "http://images.amazon.com/images/P/0345416848.01.MZZZZZZZ.jpg",
    "Image-URL-L": "http://images.amazon.com/images/P/0345416848.01.LZZZZZZZ.jpg"
}
{
    "ISBN": "0385333404",
    "Book-Title": "Dating Big Bird",
    "Book-Author": "Laura Zigman",
    "Year-Of-Publication": "2000",
    "Publisher": "Dial Books",
    "Image-URL-S": "http://images.amazon.com/images/P/0385333404.01.THUMBZZZ.jpg",
    "Image-URL-M": "http://images.amazon.com/images/P/0385333404.01.MZZZZZZZ.jpg",
    "Image-URL-L": "http://images.amazon.com/images/P/0385333404.01.LZZZZZZZ.jpg"
}
{
    "ISBN": "044021145X",
    "Book-Title": "The Firm",
    "Book-Author": "John Grisham",
    "Year-Of-Publication": "1992",
    "Publisher