## File for training book vector embeddings
###  Contrastive Learning -> No SoftMax -> Yay!, 
### A full softmax would be fine for 10000 books, but scalability is always important
 - There was some sort of lit2vec repo linked in the download, but I wanted to build my own to make sure it worked with the query embed

### Main Idea:
- Generate "positive" pairs, which are books that the same user likes or books with the same tag, as well as negative pairs to prevent mode collapse. Feed each pair into the model and have the model learn which are positive and which are negative. The model will learn useful representations of our books.

- This is essentially word2vec where the contexts are generated a bit differently.

- Much of this is taken from the TensorFlow tutorial page on Word2Vec

### Improvements that should be made:
Better Target Embedding:
- Take into account more than a single book. I think this would be done well by a transformer. The transformer part could then also be used to find an embedding of a user based on the books they have interacted with, which would be good for personalization.

Better Sampling: 
- Sampling is uniform, as opposed to being drawn from a distribution parameterized by occurrence frequencies

In [1]:

import tensorflow as tf
from tensorflow.keras import layers

import pandas as pd
import numpy
import random
import torch
from torch.nn import functional as F
import numpy as np
import collections
import string
import io
import json
import codecs


In [2]:
def gen_dict(df,keys,values):
    """Group the entries of one column by the entries of another.

    For every row, the entry of ``df[values]`` is appended to the list stored
    under the entry of ``df[keys]`` (row order and duplicates are kept).
    Groups that end up with fewer than two members are removed, and the
    number of remaining groups is printed.

    Returns:
        A ``collections.defaultdict(list)`` mapping key -> list of values.
        Because it is a defaultdict, indexing it with a missing key inserts
        and returns an empty list instead of raising ``KeyError``.
    """
    grouped = collections.defaultdict(list)
    for group_key, member in zip(df[keys], df[values]):
        grouped[group_key].append(member)

    # Collect the undersized groups first so the dict is not resized while
    # it is being iterated.
    undersized = [k for k, members in grouped.items() if len(members) < 2]
    for k in undersized:
        del grouped[k]

    print(len(grouped),'c length')
    return grouped

### Load Data

In [99]:
# Raw inputs, read from CSV files in the current working directory.
b = pd.read_csv( 'books.csv' )
t = pd.read_csv( 'tags.csv' )
bt = pd.read_csv( 'book_tags.csv')
r = pd.read_csv( 'ratings.csv' )
# Attach the book's original_title to every rating row (join on book_id).
r = r.merge(b[['book_id','original_title']], on = 'book_id')
# Attach the tag columns to every book/tag row, then the book's title and
# book_id (book_tags is keyed by goodreads_book_id, not book_id).
bt = bt.merge( t, on = 'tag_id' )
bt = bt.merge( b[[ 'goodreads_book_id', 'title','book_id']], on = 'goodreads_book_id' )

# Ratings strictly greater than 4 are treated as "liked".
# NOTE(review): presumably this means 5-star ratings only - confirm the scale.
r_high = r[r['rating']>4]
r_high_by_user = r_high.sort_values('user_id')
r_high_by_user = r_high_by_user[['user_id','book_id','original_title']]
# Discard rows with any missing value in either frame.
bt.dropna(inplace = True)
r_high_by_user.dropna(inplace = True)


### Sanity check: I was worried about the titles being different, but the books with different titles also had different book_ids
Better safe than sorry!

In [101]:
# book_id -> original_title, taken from the ratings frame. If a book_id
# occurs with several titles, the last row wins (plain dict semantics).
book_id_to_correct_title = dict(zip(r.book_id,r.original_title))
# title as spelled in the book/tag frame -> book_id.
incorrect_title_to_book_id = dict(zip(bt.title, bt.book_id))

# NOTE: despite its name, this maps original_title -> book_id (the reverse
# of book_id_to_correct_title).
book_id_to_correct_title_r = dict(zip(r.original_title, r.book_id))
# Make the title -> book_id table also cover titles that only appear in the
# ratings frame; entries already present are left untouched.
for key,value in book_id_to_correct_title_r.items():
    if key not in incorrect_title_to_book_id.keys():
        incorrect_title_to_book_id[key] = value

# Rewrite every title column to one canonical spelling by going
# title -> book_id -> original_title.
bt.title = bt.title.apply(lambda x: book_id_to_correct_title[incorrect_title_to_book_id[x]]) #fix naming
r.original_title = r.original_title.apply(lambda x: book_id_to_correct_title[incorrect_title_to_book_id[x]])
r_high_by_user.original_title = r_high_by_user.original_title.apply(lambda x: book_id_to_correct_title[incorrect_title_to_book_id[x]])
# user_id -> list of highly rated titles; gen_dict drops users with fewer
# than two entries and prints the number of users kept.
user_books_dict = gen_dict(r_high_by_user,'user_id','original_title')

52723 c length


In [102]:
# Remove incomplete rows before the vocabulary is built.
r.dropna(inplace=True)
bt.dropna(inplace=True)

# Every distinct title seen in either the ratings or the book/tag frame.
book_list = list(set(list(r.original_title.unique())+list(bt.title.unique())))

# Two-way mapping between titles and integer ids. Ids start at 1, so
# index 0 is never assigned to a book.
book_to_int = {title: idx for idx, title in enumerate(book_list, start=1)}
int_to_book = {idx: title for idx, title in enumerate(book_list, start=1)}

### Many of the tags were uninformative or too broad to turn into useful labels, especially the ones with lots of instances such as to-read

In [49]:
# Number of rows of `bt` carrying each tag name.
temp = dict(collections.Counter(bt.tag_name))
# A tag is kept only when its count lies strictly between 2 and 700:
# rarer tags link almost nothing, more frequent ones link almost everything.
legal_tags = [tag for tag, n_rows in temp.items() if 2 < n_rows < 700]
    


### Generate dictionary for books with similar tags as well as useful dictionary for users and the titles they liked

In [97]:
# NOTE(review): `title_to_tags` is not defined anywhere in the visible part
# of this notebook; it presumably comes from a cell that was removed (a
# title -> list-of-tag-names mapping). Confirm before running top to bottom.

# Set copy of legal_tags so each membership test below is O(1) instead of a
# scan over the whole list.
_legal_tag_set = set(legal_tags)

# Strip the non-legal tags from every title's tag list. Only values of
# existing keys are replaced, so mutating during iteration is safe.
for key, value in title_to_tags.items():
    title_to_tags[key] = [x for x in value if x in _legal_tag_set]

# tag_name -> list of titles carrying it, restricted to the legal tags.
# gen_dict has already dropped tags with fewer than two titles.
temp = gen_dict(bt,'tag_name','title')
tags_to_titles = {key: value for key, value in temp.items() if key in _legal_tag_set}


r_high_by_user.dropna(inplace=True)
# user_id -> list of titles the user rated.
# NOTE(review): this is built from `r` (all ratings), not from the
# high-rating subset `r_high_by_user` - confirm this is intended.
users_to_titles = gen_dict(r,'user_id','original_title')

def GenerateContextDict(bookt_to_xs,x_to_books):
    """Count, for every title, how often each book is reachable through its xs.

    Args:
        bookt_to_xs: mapping title -> iterable of linking items ``x``
            (for example tag names).
        x_to_books: mapping ``x`` -> iterable of books linked to ``x``.

    Returns:
        dict mapping every title of ``bookt_to_xs`` to a
        ``collections.Counter`` whose keys are books and whose values are the
        number of times the book appears across the title's ``x`` entries.
        A title can appear in its own counter.

    Raises:
        KeyError: if some ``x`` is missing from ``x_to_books`` (unless that
            mapping supplies defaults itself).
    """
    context_counts = {}
    for title, linking_items in bookt_to_xs.items():
        tally = collections.Counter()
        # Accumulate one linked list at a time instead of flattening first.
        for item in linking_items:
            tally.update(x_to_books[item])
        context_counts[title] = tally
    return context_counts
    
    
# title -> Counter of titles sharing at least one (legal) tag with it; each
# count is the number of tags the two titles have in common.
title_to_title_tags = GenerateContextDict(title_to_tags, tags_to_titles)  

16332 c length
53424 c length


In [75]:
# Keep, for every title, only the neighbours whose count is at least 4.
# The result is a plain dict per title (no longer a Counter).
temp = {
    title: {neighbour: n for neighbour, n in counter.items() if n >= 4}
    for title, counter in title_to_title_tags.items()
}
title_to_title_tags = temp

### Here is where we generate our positive pairs of books (the "words" of this model), i.e. books that should be embedded close together
 - We say books liked by the same user are similar and books which share a tag are similar

In [261]:
def GeneratePosSample(user, ex_per_user = 5, flatten = True):
    """Build positive (target_id, context_id) training pairs for one user.

    Reads the module-level mappings ``users_to_titles``,
    ``title_to_title_tags`` and ``book_to_int``.

    Two kinds of pairs are produced:

    * user pairs: ``ex_per_user`` titles are drawn with replacement from the
      user's title list (a title listed several times is drawn more often);
      each draw is paired with an entry of a shuffled copy of the same draws,
      so a title may be paired with itself.
    * tag pairs: every drawn title gets two attempts at being paired with a
      title picked uniformly from the keys of ``title_to_title_tags[title]``
      (the stored counts are not used as weights).

    Every failed tag attempt (title has no entry, entry is empty, partner is
    not a string or has no id) is compensated by one extra user draw, so the
    result always holds ``3 * ex_per_user`` pairs.

    Args:
        user: key into ``users_to_titles``.
        ex_per_user: number of initial draws from the user's titles.
        flatten: unused; kept so existing callers keep working.

    Returns:
        list of ``(int, int)`` tuples, user pairs first, then tag pairs.

    Raises:
        KeyError: if a title drawn for the user has no id in ``book_to_int``.
    """
    books_user = random.choices(users_to_titles[user], k = ex_per_user)
    books_tag = []

    bad_reads = 0
    # Iterating the draws twice gives every drawn title two tag attempts.
    for book in (books_user+books_user):
        try:
            tag_counter = title_to_title_tags[book]
            # IndexError here means the title has no neighbours left.
            tag = random.choices(list(tag_counter.keys()), k=1)[0]
            if isinstance(tag, str):
                books_tag.append((book_to_int[book],book_to_int[tag]))
            else:
                bad_reads += 1
        except (KeyError, IndexError):
            # Only the expected lookup failures count as a bad read; anything
            # else (including KeyboardInterrupt) now propagates.
            bad_reads += 1

    # Top up with extra user draws so the number of pairs stays constant.
    books_user += random.choices(users_to_titles[user], k=bad_reads)

    book_ints_for_user_list = [book_to_int[book] for book in books_user]

    # Pair every drawn id with an entry of a random permutation of the draws.
    shuffled = book_ints_for_user_list[:]
    random.shuffle(shuffled)

    return list(zip(book_ints_for_user_list, shuffled)) + books_tag
    
    

### Generate Training Data:
Here is where we generate our full training data. For every user, we generate num_ps positive pairs. For every positive pair, we generate num_ns negative pairs. Finally we bundle everything together and shuffle it.

In [133]:
# One past the largest book id; used as the exclusive upper bound when
# negative book ids are drawn.
MAX_VAL = max(book_to_int.values())+1

In [264]:
def generate_training_data(users_to_titles,num_ps, num_ns, vocab_size, book_to_int):
    """Build (target, context, label) training examples for every user.

    For each user, ``GeneratePosSample(user, num_ps)`` supplies positive
    ``(target_id, context_id)`` pairs. Each pair becomes one example whose
    context holds the positive id followed by ``num_ns`` negative ids drawn
    uniformly from ``[1, max(book_to_int.values())]``. Negatives are not
    checked against the positive id, so collisions are possible.

    Args:
        users_to_titles: mapping whose keys are the users to iterate.
            ``GeneratePosSample`` reads the module-level ``users_to_titles``
            itself, so this should be that same mapping.
        num_ps: passed to ``GeneratePosSample`` as ``ex_per_user``.
        num_ns: number of negative ids per positive pair.
        vocab_size: unused; kept so existing callers keep working.
        book_to_int: title -> id mapping; its largest id bounds the
            negative sampling range.

    Returns:
        ``(targets, contexts, labels)``, three lists of equal length:
        targets are ints, contexts are int32 tensors of shape
        ``(num_ns + 1, 1)`` with the positive id in row 0, and labels are
        int32 tensors of shape ``(num_ns + 1,)`` equal to ``[1, 0, ..., 0]``.
    """
    targets, contexts, labels = [], [], []

    # Exclusive upper bound for negative ids, taken from the mapping that was
    # passed in rather than from a module-level global.
    max_id_exclusive = max(book_to_int.values()) + 1

    # Every example has the same label row, so build the tensor once.
    label = tf.constant([1] + [0]*num_ns, dtype="int32")

    for i, user in enumerate(users_to_titles):
        if i % 10000 == 0:
            print(i)  # progress indicator

        positive_skip_grams = GeneratePosSample(user,num_ps)

        for target_word, context_word in positive_skip_grams:

            # Positive id as a (1, 1) column.
            context_class = tf.expand_dims(
              tf.constant([context_word], dtype="int32"), 1)

            # num_ns negative ids as a (num_ns, 1) column; id 0 is never
            # drawn because ids start at 1.
            negative_sampling_candidates = tf.random.uniform(
              shape=(num_ns,), minval=1, maxval=max_id_exclusive, dtype=tf.int32)
            negative_sampling_candidates = tf.expand_dims(
              negative_sampling_candidates, 1)

            # Positive first, negatives after - matches the label row.
            context = tf.concat([context_class, negative_sampling_candidates], 0)

            targets.append(target_word)
            contexts.append(context)
            labels.append(label)

    return targets, contexts, labels

In [265]:
print('start')    
# 14 positive draws per user and 14 negatives per positive pair.
# NOTE(review): the 10001 passed as vocab_size is not read by
# generate_training_data.
targets, contexts, labels = generate_training_data(users_to_titles,14,14,10001, book_to_int)


# Stack the per-example results into arrays. Each context has a trailing
# axis of length 1, which the [:,:,0] index removes.
targets = np.array(targets)
contexts = np.array(contexts)[:,:,0]
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")


start
0
10000
20000
30000
40000
50000


targets.shape: (2243808,)
contexts.shape: (2243808, 15)
labels.shape: (2243808, 15)


In [266]:
BATCH_SIZE = 1000
BUFFER_SIZE = 10000
# Each dataset element is ((target, context_row), label_row).
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
# Shuffle through a BUFFER_SIZE-element buffer, then batch; drop_remainder
# discards the final incomplete batch so every batch has BATCH_SIZE rows.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)



### Define Models

In [188]:
class Word2Vec(tf.keras.Model):
    """Skip-gram style model with separate target and context embeddings.

    The model scores a target id against several candidate context ids by
    taking the dot product of their embedding vectors.

    Args:
        vocab_size: number of rows in both embedding tables.
        embedding_dim: length of every embedding vector.
        num_ns: number of negative ids per example; the context input then
            holds ``num_ns + 1`` ids. Defaults to 14, the value that used to
            be hard-coded.
    """

    def __init__(self, vocab_size, embedding_dim, num_ns=14):
        super().__init__()
        # Named so the trained table can be fetched with get_layer().
        self.target_embedding = layers.Embedding(vocab_size,
                                          embedding_dim,
                                          input_length=1,
                                          name="w2v_embedding")
        self.context_embedding = layers.Embedding(vocab_size,
                                           embedding_dim,
                                           input_length=num_ns+1)

    def call(self, pair):
        """Return one dot-product score per (target, context id) combination.

        Args:
            pair: ``(target, context)``. ``target`` holds one id per example,
                optionally with a trailing axis of length 1; ``context``
                holds several ids per example.

        Returns:
            Tensor with one row per example and one column per context id.
        """
        target, context = pair
        # Accept targets given as a column by dropping the extra axis.
        if len(target.shape) == 2:
            target = tf.squeeze(target, axis=1)
        word_emb = self.target_embedding(target)
        context_emb = self.context_embedding(context)
        # b = example, c = context position, e = embedding dimension.
        dots = tf.einsum('be,bce->bc', word_emb, context_emb)
        return dots

### Finally it's time to train!
Although accuracy and loss are shown, they are not very interpretable. For example, a simple estimator $f(b_1, b_2; \theta)$ parameterizing $P(b_1 \mid b_2)$ as $f(b_1, b_2) = 1/\mathrm{num\_ns}$ will achieve high accuracy for a reasonably large number of negative examples. CategoricalCrossentropy is also opaque. All we care about are the final embeddings and that the metrics are moving in the right direction.

In [276]:
#train model + run longer
embedding_dim = 100
# Book ids start at 1, so the largest id equals len(book_to_int); the table
# needs one extra row to make that index valid.
vocab_size = len(book_to_int) + 1
word2vec = Word2Vec(vocab_size, embedding_dim)
# The model returns raw dot products, hence from_logits=True. Each label row
# is one-hot with the 1 in the first position (the positive context).
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

word2vec.fit(dataset, epochs=7)


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<tensorflow.python.keras.callbacks.History at 0x7f1ae3677a10>

In [274]:
#save model embeddings
#have to add line to file to get embed visualizer to work

# Trained target-embedding table: one row per integer id.
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]

# vectors.tsv gets one tab-separated embedding per line; metadata.tsv gets the
# matching title on the same line number. The context manager closes both
# files even if a write fails part-way.
# int_to_book keys start at 1, so row 0 of `weights` is never written.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
    for idx, book in int_to_book.items():
        vec = weights[idx]
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
        out_m.write(book + "\n")



In [277]:
# model = word2vec.target_embedding
# model.save_weights('./model')

# Persist every lookup table as JSON, in the same order as before.
for _path, _table in (
    ("book_to_int.json", book_to_int),
    ("int_to_book.json", int_to_book),
    ("book_id_to_correct_title.json", book_id_to_correct_title),
    ("incorrect_title_to_book_id.json", incorrect_title_to_book_id),
):
    with open(_path, "w") as outfile:
        json.dump(_table, outfile)


In [299]:
#For using in actual recommendations
# Re-read the book table and drop every row with a missing value.
# NOTE(review): the output keeps the row order of books.csv, which is
# presumably sorted by popularity - confirm.
books_by_popularity = pd.read_csv( 'books.csv' )
books_by_popularity.dropna(inplace=True)

# Canonical title (title -> book_id -> original_title) of every row whose
# original_title is known; unknown titles are skipped.
most_popular_in_order = [
    book_id_to_correct_title[incorrect_title_to_book_id[book_title]]
    for book_title in books_by_popularity.original_title.values
    if book_title in incorrect_title_to_book_id
]
with open("most_popular_title_in_order", "w") as outfile:
    json.dump(most_popular_in_order, outfile)

In [278]:
# book id -> embedding vector as a plain Python list (JSON-serialisable).
embed_dict = {book_id: weights[book_id].tolist() for book_id in int_to_book}

In [281]:
# Write the id -> embedding table to disk. JSON object keys are always
# strings, so the integer ids come back as strings when the file is loaded.
with open("int_to_weight.json", "w") as outfile:
    json.dump(embed_dict, outfile)