In [1]:
import pandas as pd
import numpy as np
from scipy import sparse as spsp

In [2]:
# The Book-Crossing dump separates fields with `";"`. A separator longer than
# one character is interpreted as a regex, which only the Python parser
# supports, so ask for it explicitly instead of relying on pandas' fallback
# (which emits a ParserWarning).
# Side effect of splitting on `";"`: the outermost quotes of each row are kept,
# so the first column is named `"ISBN` and its values start with a `"`.
books = pd.read_csv('BX-Books.csv', sep='\";\"', encoding = 'cp1252', engine='python')
ratings = pd.read_csv('BX-Book-Ratings.csv', sep=';', encoding = 'cp1252')

  """Entry point for launching an IPython kernel.


In [3]:
# Preview the raw ratings table (columns: User-ID, ISBN, Book-Rating).
ratings

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [4]:
# Tab-separated dump with no header row, so pandas numbers the columns 0..3.
book_content = pd.read_csv(
    'compiled_books_content.txt.gz',
    sep='\t',
    encoding='cp1252',
    header=None,
)

In [5]:
# Preview the content table. Later cells use column 0 as the ISBN, column 2 as
# the title text and column 3 as the abstract.
book_content

Unnamed: 0,0,1,2,3
0,0877853134,"June 1, 2004",The Arrivals Naomi Gladish Smith,"Flight 785 is bound for London and Brussels, b..."
1,3426620685,"March 1, 2002",Süden und der Straßenbahntrinker. Friedrich An...,"Tabor S&#xFC;den hat Urlaub, baut &#xDC;berstu..."
2,0006513905,"December 15, 1999",Dice Man Luke Rhinehart,The cult classic that can still change your li...
3,0062506838,,The Enneagram Understanding Yourself and the O...,It would be impossible for most of us to spend...
4,0099435446,"July 3, 2003",Babes in the Wood Ruth Rendell,A woman phoned to say she and her husband went...
...,...,...,...,...
36523,0312195516,"September 15, 1998",The Red Tent Anita Diamant,"Her name is Dinah. In the Bible, her life is o..."
36524,0060928336,"May 7, 1997",Divine Secrets of the Ya-Ya Sisterhood A Novel...,"When Siddalee Walker, oldest daughter of Vivi ..."
36525,0385504209,,The Da Vinci Code Dan Brown,"While in Paris on business, Harvard symbologis..."
36526,0316666343,"June 1, 2002",The Lovely Bones Alice Sebold,This deluxe trade paperback edition of Alice S...


First, we need to collect all books and assign each of them a sequence number.

We then filter the books: a book is kept only if it has both metadata and content, and only if its abstract is detected as English.

In [6]:
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic by default; pinning its seed makes the set of
# books classified as English (and therefore every index in `book_map`)
# reproducible across runs.
DetectorFactory.seed = 0

# ISBNs of the books with metadata. Every value in the `"ISBN` column starts
# with a stray double quote, so drop the first character.
isbn_set = {raw_isbn[1:] for raw_isbn in books['\"ISBN']}

# Map each English book that has both metadata and an abstract to a sequence
# number. Only the first occurrence of an ISBN in `book_content` is considered
# for books that end up in the map.
book_map = {}
num_books = 0
for book, abstract in zip(book_content[0], book_content[3]):
    if book in book_map or book not in isbn_set:
        continue
    # A missing abstract is not a string (presumably NaN from pandas) and
    # cannot be language-detected; skip it explicitly.
    if not isinstance(abstract, str):
        continue
    try:
        language = detect(abstract)
    except LangDetectException:
        # langdetect could not classify this text; treat the book as
        # non-English. Other exceptions (e.g. KeyboardInterrupt) now propagate
        # instead of being silently swallowed.
        continue
    if language == 'en':
        book_map[book] = num_books
        num_books += 1
assert len(book_map) == num_books
print('#books:', num_books)

#books: 34446


In [7]:
# Preview the metadata table. Note the stray quote in the first column name
# (`"ISBN`) and in its values, left over from splitting on `";"`.
books

Unnamed: 0,"""ISBN",Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,"Image-URL-L"""
0,"""0195153448",Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,"""0002005018",Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,"""0060973129",Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,"""0374157065",Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,"""0393045218",The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
...,...,...,...,...,...,...,...,...
271374,"""0440400988",There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...
271375,"""0525447644",From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...
271376,"""006008667X",Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...
271377,"""0192126040",Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...


Collect all of the metadata of the books.

In [8]:
# Per-book metadata, keyed by the sequence number stored in `book_map`.
book_titles = {}
book_authors = {}
book_years = np.zeros(shape=(num_books))
book_publishers = {}
book_abstracts = {}

# Walk the metadata columns in lockstep instead of looking every cell up by
# row label: it is much faster and does not depend on the frame's index.
metadata_rows = zip(books['\"ISBN'], books['Book-Title'], books['Book-Author'],
                    books['Year-Of-Publication'], books['Publisher'])
for raw_isbn, title, author, year, publisher in metadata_rows:
    # Drop the stray leading quote before looking the ISBN up.
    book_idx = book_map.get(raw_isbn[1:])
    if book_idx is None:
        # Not one of the selected English books with content.
        continue
    book_titles[book_idx] = title
    book_authors[book_idx] = author
    book_years[book_idx] = year
    book_publishers[book_idx] = publisher
print(len(book_titles))

# If an ISBN occurs more than once in `book_content`, the last abstract wins.
for isbn, abstract in zip(book_content[0], book_content[3]):
    if isbn in book_map:
        book_abstracts[book_map[isbn]] = abstract

34446


Collect the ratings on the books with metadata.

In [9]:
# Keep only the (user, isbn, rating) triples whose book was selected above.
filter_ratings = [
    (user, isbn, rate)
    for user, isbn, rate in zip(ratings['User-ID'], ratings['ISBN'], ratings['Book-Rating'])
    if isbn in book_map
]

print(len(filter_ratings))

623439


Collect all users who rated at least one book that has metadata. Each user is assigned a sequence number.

In [26]:
# Number the users in order of first appearance in the filtered ratings.
user_map = {}
for rating_user, _, _ in filter_ratings:
    # setdefault leaves known users untouched; len() is evaluated before the
    # insert, so every new user receives the next free index.
    user_map.setdefault(rating_user, len(user_map))
num_users = len(user_map)
assert len(user_map) == num_users
print('#users:', num_users)

66485
#users: 66485


Construct a sparse matrix for the user-book interactions.

In [33]:
# Row/column indices and rating values of every kept interaction.
user_arr = np.array([user_map[user] for user, _, _ in filter_ratings], dtype=np.int64)
book_arr = np.array([book_map[isbn] for _, isbn, _ in filter_ratings], dtype=np.int64)
rate_arr = np.array([rate for _, _, rate in filter_ratings], dtype=np.int64)

# Pass the shape explicitly. Without it scipy infers the dimensions from the
# largest index present, so a trailing book without any rating would silently
# shrink the matrix and break later products with vectors of length num_books.
# The sizes come from the maps rather than `num_users`, because a later cell
# reassigns `num_users`.
spm_shape = (len(user_map), len(book_map))
user_book_spm = spsp.coo_matrix((np.ones((len(user_arr))), (user_arr, book_arr)), shape=spm_shape)
user_book_ratings = spsp.coo_matrix((rate_arr, (user_arr, book_arr)), shape=spm_shape)
print(user_book_spm.shape)


(66485, 34446)


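Some users rated too few books. A user needs more than two interactions — one is held out for validation, one for testing, and at least one must remain for training — so we count the users (and books) that have two or fewer.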

In [47]:
# Multiplying by a vector of ones sums the stored values of each row (user)
# and, on the transpose, of each column (book).
user_deg = user_book_spm.dot(np.ones(num_books))
print((user_deg <= 2).sum())
book_deg = user_book_spm.T.dot(np.ones(num_users))
print((book_deg <= 2).sum())

0
12


Create a new mapping from the original user IDs to new consecutive IDs, keeping only the users with more than two interactions.

In [42]:
# Renumber the users that have more than two interactions, again in order of
# first appearance. NOTE: this reassigns `num_users`.
user_map1 = {}
for rating_user, _, _ in filter_ratings:
    old_idx = user_map[rating_user]
    if rating_user in user_map1:
        continue
    if user_deg[old_idx] > 2:
        user_map1[rating_user] = len(user_map1)
num_users = len(user_map1)
assert len(user_map1) == num_users
print('#users:', num_users)

#users: 21890


In [43]:
# Drop the rows of users with two or fewer interactions from both matrices.
active_rows = user_deg > 2
user_book_spm = user_book_spm.tocsr()[active_rows]
user_book_ratings = user_book_ratings.tocsr()[active_rows]
print(user_book_spm.shape)


(21890, 34446)


In [12]:
def counts(book_attributes):
    """Print how many distinct attribute values exist and the largest group size.

    Parameters
    ----------
    book_attributes : dict
        Maps a book index to one attribute value (e.g. author or publisher).

    Prints the number of distinct values, then the number of books sharing the
    most frequent value. Returns nothing.
    """
    popularity = {}
    for value in book_attributes.values():
        popularity[value] = popularity.get(value, 0) + 1
    print('#attributes:', len(popularity))
    print(np.max(list(popularity.values())))
    
# Distinct-value statistics for authors and publishers, then for years.
for attribute_by_book in (book_authors, book_publishers):
    counts(attribute_by_book)
uniq_years, year_cnts = np.unique(book_years, return_counts=True)
print('#years:', uniq_years.shape[0])
print('max #books a year:', year_cnts.max())

#attributes: 11103
196
#attributes: 1653
1238
#years: 80
max #books a year: 2834


In [13]:
import numpy as np
import mxnet as mx
from mxnet import gluon
from mxnet.gluon import nn, Block
import gluonnlp as nlp
import time
import random
from gluonnlp.data import BERTTokenizer

# Runtime settings.
ctx = mx.cpu(0)
dropout_prob = 0.1

# Seed the Python, NumPy and MXNet random generators with the same value.
for seed_rng in (random.seed, np.random.seed, mx.random.seed):
    seed_rng(123)

# Pretrained uncased BERT-base with the pooler enabled; the decoder and
# classifier heads are switched off.
bert_model, bert_vocab = nlp.model.get_model(
    name='bert_12_768_12',
    dataset_name='book_corpus_wiki_en_uncased',
    pretrained=True,
    ctx=ctx,
    use_pooler=True,
    use_decoder=False,
    use_classifier=False,
    dropout=dropout_prob,
    embed_dropout=dropout_prob,
)
tokenizer = BERTTokenizer(bert_vocab, lower=True)

In [14]:
# Encode every abstract with BERT and keep its pooled sentence vector.
# NOTE: unlike the earlier version, the input now follows BERT's
# single-sentence format ([CLS] tokens [SEP], segment id 0), so the resulting
# embeddings differ from previously saved ones.
max_seq_len = 512  # bert_12_768_12 provides 512 position embeddings
abstract_emb = mx.nd.zeros(shape=(num_books, 768))
for i in range(num_books):
    # Truncate, leaving room for the two special tokens added below.
    tokens = tokenizer(book_abstracts[i])[:max_seq_len - 2]
    # The pooled output is computed from the first position, which BERT
    # expects to hold [CLS]; a single sentence is terminated by [SEP].
    tokens = [bert_vocab.cls_token] + tokens + [bert_vocab.sep_token]
    token_ids = mx.nd.expand_dims(mx.nd.array(bert_vocab[tokens], dtype=np.int32), axis=0)
    # A single-sentence input belongs entirely to the first segment (id 0).
    token_types = mx.nd.zeros_like(token_ids)
    _, sent_embedding = bert_model(token_ids, token_types)
    # Batch of one: flatten the pooled output into a single row.
    abstract_emb[i] = sent_embedding.reshape((-1,))

In [15]:
# Encode every title with BERT and keep its pooled sentence vector.
# NOTE: unlike the earlier version, the input now follows BERT's
# single-sentence format ([CLS] tokens [SEP], segment id 0), so the resulting
# embeddings differ from previously saved ones.
max_seq_len = 512  # bert_12_768_12 provides 512 position embeddings
title_emb = mx.nd.zeros(shape=(num_books, 768))
for i in range(num_books):
    # Truncate, leaving room for the two special tokens added below.
    tokens = tokenizer(book_titles[i])[:max_seq_len - 2]
    # The pooled output is computed from the first position, which BERT
    # expects to hold [CLS]; a single sentence is terminated by [SEP].
    tokens = [bert_vocab.cls_token] + tokens + [bert_vocab.sep_token]
    token_ids = mx.nd.expand_dims(mx.nd.array(bert_vocab[tokens], dtype=np.int32), axis=0)
    # A single-sentence input belongs entirely to the first segment (id 0).
    token_types = mx.nd.zeros_like(token_ids)
    _, sent_embedding = bert_model(token_ids, token_types)
    # Batch of one: flatten the pooled output into a single row.
    title_emb[i] = sent_embedding.reshape((-1,))

In [16]:
# Sanity check: both embedding tables should have one row per book.
print(abstract_emb.shape, title_emb.shape, sep='\n')

(34446, 768)
(34446, 768)


In [49]:
def pick_test(user_book_spm):
    """Hold out one validation and one test item per user.

    Parameters
    ----------
    user_book_spm : scipy sparse matrix
        User-by-book interaction matrix. Any format convertible with
        ``tocsr()`` is accepted; every user (row) must have at least two
        stored interactions.

    Returns
    -------
    train : scipy.sparse.coo_matrix
        Matrix of ones for the remaining interactions. It has the same shape
        as the input even if trailing rows or columns end up empty.
    valid_set : numpy.ndarray of int64, shape (n_users,)
        Column index of the held-out validation item of every user.
    test_set : numpy.ndarray of int64, shape (n_users,)
        Column index of the held-out test item of every user.

    Raises
    ------
    ValueError
        If a user has fewer than two interactions.
    """
    csr = user_book_spm.tocsr()
    n_users = csr.shape[0]
    indptr = csr.indptr
    # Take the entries from the CSR arrays themselves, so that the offsets in
    # `indptr` always address the right entry regardless of how the input
    # happened to be ordered.
    items = csr.indices
    users = np.repeat(np.arange(n_users), np.diff(indptr))
    held_out = np.zeros(len(items), dtype=bool)
    # Item ids are indices, so store them as integers.
    valid_set = np.zeros(n_users, dtype=np.int64)
    test_set = np.zeros(n_users, dtype=np.int64)
    for i in range(n_users):
        start_idx = indptr[i]
        end_idx = indptr[i + 1]
        if end_idx - start_idx < 2:
            raise ValueError(
                'user %d has fewer than two interactions; cannot hold out '
                'a validation and a test item' % i)
        idx = np.random.choice(np.arange(start_idx, end_idx), 2, replace=False)
        valid_set[i] = items[idx[0]]
        test_set[i] = items[idx[1]]
        held_out[idx] = True
    keep = ~held_out
    # An explicit shape keeps the training matrix aligned with the input even
    # when the last user or the last book loses all of its interactions.
    train = spsp.coo_matrix((np.ones(int(keep.sum())), (users[keep], items[keep])),
                            shape=csr.shape)
    return train, valid_set, test_set

# Keep the full interaction matrix, then replace `user_book_spm` with the
# training part. NOTE: this reassigns `user_book_spm`, so re-running the cell
# would split the already reduced matrix again.
orig_user_book_spm = user_book_spm.tocsr()
user_book_spm, valid_set, test_set = pick_test(user_book_spm.tocoo())
print('#training size:', user_book_spm.nnz)

# One validation item and one test item per user: entry i belongs to user i.
users_valid, items_valid = np.arange(num_users), valid_set
users_test, items_test = np.arange(num_users), test_set
valid_size, test_size = len(users_valid), len(users_test)
print('valid set:', valid_size)
print('test set:', test_size)

#training size: 526851
valid set: 21890
test set: 21890


In [None]:
import pickle


def _dump_pickle(obj, path):
    """Pickle `obj` to `path`, closing the file even if serialization fails."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


# Persist the training matrix, the book embeddings, the id maps and the
# held-out items.
_dump_pickle(user_book_spm, 'bx_train.pkl')
_dump_pickle(abstract_emb, 'bx_book_abstract.pkl')
_dump_pickle(title_emb, 'bx_book_title.pkl')
_dump_pickle(user_map1, 'bx_user_map.pkl')
_dump_pickle(book_map, 'bx_book_map.pkl')
_dump_pickle((valid_set, test_set), 'bx_eval.pkl')
# NOTE(review): `neg_valid` and `neg_test` are not defined in any cell shown
# here, so on a fresh "Restart & Run All" this line raises NameError unless
# they are created elsewhere. Confirm where the negative samples come from.
_dump_pickle((neg_valid, neg_test), 'bx_neg.pkl')