In [None]:
import pandas as pd
import numpy as np
from scipy import sparse as spsp

In [None]:
# The books file is split on the three-character sequence `";"`, so the opening
# quote of the first field is not consumed: the ISBN column is addressed as
# `"ISBN` in later cells, which also drop the first character of every ISBN value.
# NOTE(review): presumably the last column keeps a trailing quote as well — confirm.
books = pd.read_csv('BX-Books.csv.gz', sep='\";\"', encoding = 'cp1252')
# The ratings file is read as a regular `;`-separated table.
ratings = pd.read_csv('BX-Book-Ratings.csv.gz', sep=';', encoding = 'cp1252')

In [None]:
# Preview the loaded ratings table.
ratings

In [None]:
# Tab-separated file read without a header row, so columns are addressed by position:
# later cells use column 0 as the ISBN and column 3 as the abstract.
book_content = pd.read_csv('compiled_books_content.txt.gz', sep='\t', encoding = 'cp1252', header=None)

In [None]:
# Preview the loaded book content table.
book_content

First, we collect all books and assign each of them a sequence number.

We filter the books: a book is kept only if it has both metadata and content, and only if its abstract is detected as English.

In [None]:
from langdetect import DetectorFactory, detect

# langdetect is non-deterministic by default; fixing the seed makes the set of
# books detected as English (and therefore book_map) reproducible across runs.
DetectorFactory.seed = 0

# the books with metadata.
# The first character of every ISBN value is dropped (it belongs to the `"ISBN` column).
isbn_set = {raw_isbn[1:] for raw_isbn in books['\"ISBN']}

# the books with more detailed content information.
# A book receives the next sequence number the first time one of its rows has
# an abstract that is detected as English.
book_map = {}
num_books = 0
for book, abstract in zip(book_content[0], book_content[3]):
    if book in book_map or book not in isbn_set:
        continue
    try:
        is_english = detect(abstract) == 'en'
    except Exception:
        # Best-effort: skip abstracts the detector cannot handle
        # (presumably missing or feature-less text). Unlike a bare `except:`,
        # this still lets KeyboardInterrupt / SystemExit stop the long loop.
        continue
    if is_english:
        book_map[book] = num_books
        num_books += 1
assert len(book_map) == num_books
print('#books:', num_books)

In [None]:
# Preview the loaded books metadata table.
books

Collect all of the metadata of the books.

In [None]:
# Per-book metadata, keyed by the sequence number stored in book_map.
book_titles, book_authors, book_publishers, book_abstracts = {}, {}, {}, {}
book_years = np.zeros(shape=(num_books))

metadata_rows = zip(books['\"ISBN'], books['Book-Title'], books['Book-Author'],
                    books['Year-Of-Publication'], books['Publisher'])
for raw_isbn, title, author, year, publisher in metadata_rows:
    # Drop the first character of the ISBN value before looking it up.
    book_idx = book_map.get(raw_isbn[1:])
    if book_idx is None:
        # This book was filtered out earlier.
        continue
    book_titles[book_idx] = title
    book_authors[book_idx] = author
    book_years[book_idx] = year
    book_publishers[book_idx] = publisher
print(len(book_titles))

# Attach the abstract of every kept book.
# NOTE(review): if an ISBN appears on several content rows, the last row wins
# here, which may differ from the row that passed the language check — confirm.
for content_isbn, _, abstract in zip(book_content[0], book_content[2], book_content[3]):
    if content_isbn not in book_map:
        continue
    book_abstracts[book_map[content_isbn]] = abstract

Collect the ratings on the books with metadata.

In [None]:
# Keep only the (user, isbn, rating) triples whose book survived the filtering above.
rating_triples = zip(ratings['User-ID'], ratings['ISBN'], ratings['Book-Rating'])
filter_ratings = [triple for triple in rating_triples if triple[1] in book_map]

print(len(filter_ratings))

Collect all users who rated at least one of the kept books, and assign each user a sequence number.

In [None]:
# Number the users in order of first appearance in the filtered ratings.
user_map = {}
for user_id, _, _ in filter_ratings:
    # setdefault only assigns a new number the first time a user is seen.
    user_map.setdefault(user_id, len(user_map))
num_users = len(user_map)
assert len(user_map) == num_users
print('#users:', num_users)

Construct a sparse matrix for the user-book interactions.

In [None]:
# Row (user), column (book) and rating value of every kept rating.
user_arr = np.array([user_map[user] for user, _, _ in filter_ratings], dtype=np.int64)
book_arr = np.array([book_map[isbn] for _, isbn, _ in filter_ratings], dtype=np.int64)
rate_arr = np.array([rate for _, _, rate in filter_ratings], dtype=np.int64)

# The shape is given explicitly. Without it SciPy infers the shape from the
# largest index present, so books at the end of book_map that received no
# rating would be missing as columns and the later products with
# np.ones(num_books) would fail on a dimension mismatch.
spm_shape = (num_users, num_books)
user_book_spm = spsp.coo_matrix((np.ones((len(user_arr))), (user_arr, book_arr)), shape=spm_shape)
user_book_ratings = spsp.coo_matrix((rate_arr, (user_arr, book_arr)), shape=spm_shape)
print(user_book_spm.shape)


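Some users have rated two or fewer books. We cannot use them for validation or testing: two interactions per user are held out later (one for validation, one for testing), and at least one more must remain for training.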

In [None]:
# Number of interactions per user (row sums) and how many users have at most two.
ones_per_book = np.ones((num_books))
user_deg = user_book_spm.dot(ones_per_book)
print(np.sum(user_deg <= 2))
# Number of interactions per book (column sums) and how many books have at most two.
ones_per_user = np.ones((num_users))
book_deg = user_book_spm.transpose().dot(ones_per_user)
print(np.sum(book_deg <= 2))

Create a new mapping from the original user IDs to new sequence numbers, keeping only the users with more than two interactions.

In [None]:
# Renumber the users that have more than two interactions, again in order of
# first appearance in the filtered ratings.
user_map1 = {}
for user_id, _, _ in filter_ratings:
    old_idx = user_map[user_id]
    if user_id in user_map1:
        continue
    if user_deg[old_idx] > 2:
        user_map1[user_id] = len(user_map1)
num_users = len(user_map1)
assert len(user_map1) == num_users
print('#users:', num_users)

In [None]:
# Row mask of the users that are kept (more than two interactions).
# It is computed once, before user_deg is overwritten further down.
kept_users = user_deg > 2
user_book_spm = user_book_spm.tocsr()[kept_users]
user_book_ratings = user_book_ratings.tocsr()[kept_users]
print(user_book_spm.shape)

# Recompute the degree statistics on the filtered matrix.
user_deg = user_book_spm.dot(np.ones((num_books)))
print(np.sum(user_deg <= 2))
book_deg = user_book_spm.transpose().dot(np.ones((num_users)))
print(np.sum(book_deg <= 2))

In [None]:
def counts(book_attributes):
    """Print how many distinct attribute values exist and the size of the largest group.

    book_attributes maps a book index to one attribute value (e.g. an author).
    The first printed line is the number of distinct values, the second the
    number of books sharing the most frequent value.
    """
    tally = {}
    for value in book_attributes.values():
        tally[value] = tally.get(value, 0) + 1
    print('#attributes:', len(tally))
    print(np.max(list(tally.values())))
    
# Distinct-value statistics for the authors, then for the publishers.
for attribute_map in (book_authors, book_publishers):
    counts(attribute_map)

# Same statistics for the publication years, which are stored in an array.
uniq_years, year_cnts = np.unique(book_years, return_counts=True)
print('#years:', len(uniq_years))
print('max #books a year:', year_cnts.max())

In [None]:
import numpy as np
import mxnet as mx
from mxnet import gluon
from mxnet.gluon import nn, Block
import gluonnlp as nlp
import time
import random
from gluonnlp.data import BERTTokenizer

# Seed the random number generators of Python, NumPy and MXNet.
random.seed(123)
np.random.seed(123)
mx.random.seed(123)

# Dropout rate handed to the BERT model below (as both `dropout` and `embed_dropout`).
dropout_prob = 0.1
# Device context handed to the model loader: the first GPU.
ctx = mx.gpu(0)

# Load the pretrained BERT model and its vocabulary through GluonNLP.
# NOTE(review): `use_pooler=True` presumably makes the model return a pooled
# sentence vector as its second output, which the embedding cells unpack — confirm.
bert_model, bert_vocab = nlp.model.get_model(name='bert_12_768_12',
                                             dataset_name='book_corpus_wiki_en_uncased',
                                             pretrained=True,
                                             ctx=ctx,
                                             use_pooler=True,
                                             use_decoder=False,
                                             use_classifier=False,
                                             dropout=dropout_prob,
                                             embed_dropout=dropout_prob)
# Tokenizer built on the model's vocabulary, with lower-casing enabled.
tokenizer = BERTTokenizer(bert_vocab, lower=True)

In [None]:
# One 768-wide row per book, allocated on the GPU.
abstract_emb = mx.nd.zeros(shape=(num_books, 768), ctx=mx.gpu(0))
for i in range(num_books):
    # Tokenize the abstract, map the tokens to vocabulary ids and add a leading batch axis.
    # NOTE(review): no [CLS]/[SEP] tokens are added and the sequence is not truncated —
    # confirm this is intended and that long abstracts fit the model's maximum length.
    token_ids = mx.nd.expand_dims(mx.nd.array(bert_vocab[tokenizer(book_abstracts[i])],
                                              dtype=np.int32, ctx=mx.gpu(0)), axis=0)
    # Every token gets segment id 1.
    # NOTE(review): single-sentence BERT inputs conventionally use segment id 0 — confirm.
    token_types = mx.nd.ones_like(token_ids, ctx=mx.gpu(0))
    # Only the second model output is kept.
    _, sent_embedding = bert_model(token_ids, token_types)
    # Drop the singleton batch axis and store the vector as row i.
    abstract_emb[i] = sent_embedding.transpose().squeeze()

In [None]:
# One 768-wide row per book, allocated on the GPU.
title_emb = mx.nd.zeros(shape=(num_books, 768), ctx=mx.gpu(0))
for i in range(num_books):
    # Tokenize the title, map the tokens to vocabulary ids and add a leading batch axis.
    # NOTE(review): no [CLS]/[SEP] tokens are added — confirm this is intended.
    token_ids = mx.nd.expand_dims(mx.nd.array(bert_vocab[tokenizer(book_titles[i])],
                                              dtype=np.int32, ctx=mx.gpu(0)), axis=0)
    # Every token gets segment id 1.
    # NOTE(review): single-sentence BERT inputs conventionally use segment id 0 — confirm.
    token_types = mx.nd.ones_like(token_ids, ctx=mx.gpu(0))
    # Only the second model output is kept.
    _, sent_embedding = bert_model(token_ids, token_types)
    # Drop the singleton batch axis and store the vector as row i.
    title_emb[i] = sent_embedding.transpose().squeeze()

In [None]:
# Both embedding matrices were allocated as (num_books, 768).
print(abstract_emb.shape)
print(title_emb.shape)

In [None]:
# Convert the abstract embeddings to a NumPy array and save them in .npy format.
np.save('bx_book_abstract.npy', abstract_emb.asnumpy())

In [None]:
# Convert the title embeddings to a NumPy array and save them in .npy format.
np.save('bx_book_title.npy', title_emb.asnumpy())

In [None]:
def pick_test(user_book_spm):
    """Hold out two random interactions per user: one for validation, one for testing.

    Parameters
    ----------
    user_book_spm : scipy sparse matrix
        User-by-book interaction matrix. It is converted to CSR internally, so
        the entries may come in any order.

    Returns
    -------
    train : scipy.sparse.coo_matrix
        Matrix with the same shape as the input that contains a 1 for every
        interaction that was not held out.
    valid_set : numpy.ndarray
        Array of length num_users; entry u is the book held out for validation
        for user u (stored as float, as before).
    test_set : numpy.ndarray
        Array of length num_users; entry u is the book held out for testing.

    Raises
    ------
    ValueError
        If some user has fewer than two interactions.
    """
    csr = user_book_spm.tocsr()
    n_users = csr.shape[0]
    indptr = csr.indptr
    # Take rows and columns from the CSR layout itself, so that the positions
    # derived from indptr always refer to the right entries, whatever the
    # order of the entries in the input matrix was.
    items = csr.indices
    users = np.repeat(np.arange(n_users), np.diff(indptr))

    held_out = np.zeros(len(items), dtype=bool)
    valid_set = np.zeros(shape=(n_users))
    test_set = np.zeros(shape=(n_users))
    for i in range(n_users):
        start_idx = indptr[i]
        end_idx = indptr[i + 1]
        if end_idx - start_idx < 2:
            raise ValueError(
                'user %d has fewer than two interactions; cannot hold out '
                'a validation and a test item' % i)
        # Two distinct positions inside this user's slice of the CSR arrays.
        idx = np.random.choice(np.arange(start_idx, end_idx), 2, replace=False)
        valid_set[i] = items[idx[0]]
        test_set[i] = items[idx[1]]
        held_out[idx] = True

    keep = ~held_out
    # Keep the input shape explicitly: otherwise books whose only interactions
    # were held out at the high end of the index range would shrink the matrix.
    train = spsp.coo_matrix((np.ones((int(keep.sum()),)), (users[keep], items[keep])),
                            shape=csr.shape)
    return train, valid_set, test_set

# Keep the full interaction matrix in CSR form before user_book_spm is rebound
# to the training split.
orig_user_book_spm = user_book_spm.tocsr()
user_book_spm, valid_set, test_set = pick_test(user_book_spm.tocoo())
print('#training size:', user_book_spm.nnz)

# One held-out book per user in each evaluation set.
users_valid, items_valid = np.arange(num_users), valid_set
users_test, items_test = np.arange(num_users), test_set
valid_size, test_size = len(users_valid), len(users_test)
print('valid set:', valid_size)
print('test set:', test_size)

In [None]:
# Number of stored interactions in the full matrix, i.e. before the validation/test hold-out.
print(orig_user_book_spm.nnz)

In [None]:
def gen_neg_set(user_item_spm, neg_sample_size):
    """Sample, for every user, distinct items the user has not interacted with.

    Parameters
    ----------
    user_item_spm : scipy sparse matrix
        User-by-item interaction matrix; every stored non-zero entry marks an
        interaction. It is converted to CSR internally.
    neg_sample_size : int
        Number of negative items to draw for each user.

    Returns
    -------
    numpy.ndarray
        Array of shape (num_users, neg_sample_size); row u holds the sampled
        item indices for user u (stored as float, as before).

    Raises
    ------
    ValueError
        If some user has fewer than `neg_sample_size` items without an
        interaction (the sampling loop could never terminate in that case).
    """
    csr = user_item_spm.tocsr()
    num_users, num_items = csr.shape
    indptr, indices, data = csr.indptr, csr.indices, csr.data
    neg_mat = np.zeros(shape=(num_users, neg_sample_size))
    for user in range(num_users):
        # Items of this user, read once from the CSR arrays. This replaces one
        # sparse scalar lookup per candidate and avoids indexing the matrix
        # with float item ids, which SciPy rejects as inexact indices.
        row = slice(indptr[user], indptr[user + 1])
        positives = set(indices[row][data[row] != 0].tolist())
        if num_items - len(positives) < neg_sample_size:
            raise ValueError(
                'user %d has only %d items without an interaction, cannot sample %d negatives'
                % (user, num_items - len(positives), neg_sample_size))

        item_set = set()
        while len(item_set) < neg_sample_size:
            candidates = np.random.choice(num_items, neg_sample_size, replace=False)
            for item in candidates.tolist():
                if item not in positives:
                    item_set.add(item)
                if len(item_set) == neg_sample_size:
                    break
        # Sanity check: no sampled item is an interaction of this user.
        assert positives.isdisjoint(item_set)
        neg_mat[user] = np.array(list(item_set))

    return neg_mat

# Negatives are drawn against the full interaction matrix, so held-out
# validation/test items are never sampled as negatives.
full_interactions = orig_user_book_spm.tocsr()
neg_valid = gen_neg_set(full_interactions, 99)
neg_test = gen_neg_set(full_interactions, 99)

In [None]:
import pickle
# Persist the training matrix, the embeddings, the id mappings and the
# evaluation sets. Every file is opened in a `with` block so its handle is
# flushed and closed deterministically instead of being left to the garbage
# collector.
artifacts = [
    (user_book_spm, 'bx_train.pkl'),
    (abstract_emb, 'bx_book_abstract.pkl'),
    (title_emb, 'bx_book_title.pkl'),
    (user_map1, 'bx_user_map.pkl'),
    (book_map, 'bx_book_map.pkl'),
    ((valid_set, test_set), 'bx_eval.pkl'),
    ((neg_valid, neg_test), 'bx_neg.pkl'),
]
for artifact, path in artifacts:
    with open(path, 'wb') as out_file:
        pickle.dump(artifact, out_file)