In [1]:
import os
import sys
import argparse
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from google.cloud import bigquery

In [2]:
# Put the parent directory on the import path (presumably where the `utils`
# package imported below lives -- confirm against the repo layout).
# The membership check keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
_parent_dir = os.path.abspath("..")
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [3]:
from utils.bq_queries.user_anime_data_queries import user_anime_retrieval_query

In [4]:
def load_big_query_data(query, project="anime-rec-dev"):
    """Run a SQL query on BigQuery and return the result as a DataFrame.

    Args:
        query: SQL text to execute.
        project: Google Cloud project the client is created for. Defaults to
            the project this notebook was written against.

    Returns:
        The query result, as produced by ``QueryJob.to_dataframe()``.
    """
    client = bigquery.Client(project=project)
    # The previous version also built an unused dataset reference through the
    # deprecated ``Client.dataset`` helper; the queries passed in carry fully
    # qualified table names, so no dataset reference is needed here.
    query_job = client.query(query, job_config=bigquery.QueryJobConfig())
    return query_job.to_dataframe()

# Anime Genre

In [5]:
# Genre vocabulary used for the multi-hot encoding; the position of a genre in
# this list is the position of its column in the encoded vector.
anime_genre_vocab = [
    'Action', 'Adventure', 'Avant Garde', 'Comedy', 'Demons',
    'Drama', 'Ecchi', 'Fantasy', 'Game', 'Gourmet',
    'Harem', 'Historical', 'Horror', 'Kids', 'Martial Arts',
    'Mecha', 'Military', 'Mystery', 'Parody', 'Police',
    'Psychological', 'Romance', 'Samurai', 'School', 'Sci-Fi',
    'Seinen', 'Shoujo', 'Shounen', 'Slice of Life', 'Space',
    'Sports', 'Super Power', 'Supernatural', 'Suspense', 'Vampire',
]

In [6]:
# Fetch every anime id together with its genre list from the processed table.
anime_data = load_big_query_data(
    "SELECT anime_id, genres FROM `anime-rec-dev.processed_area.anime`"
)

In [7]:
class AnimeMultiHotModel(tf.keras.Model):
    """Frozen lookup model that maps an anime id to a multi-hot feature vector.

    At construction time every anime's list of entities (e.g. genres) is
    multi-hot encoded against ``vocabulary`` and stored as the constant,
    non-trainable weights of an Embedding layer. Calling the model is then a
    pure table lookup: anime id -> row index -> stored encoding.

    Row ``i`` of the stored table is built from the ``i``-th entry of
    ``multi_hot_feature`` and is looked up through the ``i``-th entry of
    ``unique_anime_ids``, so the two inputs must be aligned and equally long.
    """

    def __init__(self,
                unique_anime_ids,
                multi_hot_feature,
                vocabulary):
        """Build the id lookup and the frozen multi-hot table.

        Args:
            unique_anime_ids: Anime ids used as the StringLookup vocabulary.
                With ``num_oov_indices = 0`` there is no slot for unknown ids.
            multi_hot_feature: Iterable with one sequence of entity strings per
                anime, in the same order as ``unique_anime_ids``.
            vocabulary: Entity strings that each get one output dimension.
        """
        super().__init__()

        self.anime_id_lookup_layer = tf.keras.layers.StringLookup(
            vocabulary = unique_anime_ids,
            num_oov_indices = 0,
            name = 'anime_multihot_model_id_lookup'
        )

        # The lookup layer is fed one rectangular batch, so every entity list
        # is padded/truncated to a common length first. Iterating directly
        # (instead of Series.apply) accepts any iterable, not only a Series.
        padded_features = [
            self.multi_hot_same_shape(entities, max_len=len(vocabulary))
            for entities in multi_hot_feature
        ]

        multi_hot_layer = tf.keras.layers.StringLookup(vocabulary = vocabulary,
                                                    output_mode = "multi_hot",
                                                    num_oov_indices=1
                                                    )
        multi_hot_encodings = multi_hot_layer(padded_features)
        # Column 0 is the single out-of-vocabulary bucket, which also collects
        # the "[UNK]" padding; drop it so only vocabulary columns remain.
        multi_hot_encodings = multi_hot_encodings[:, 1:]

        num_animes = multi_hot_encodings.shape[0]
        num_multi_hot_dims = multi_hot_encodings.shape[1]

        # Store the encodings as constant, frozen embedding weights.
        self.multi_hot_encoding_layer = tf.keras.layers.Embedding(
            num_animes,
            num_multi_hot_dims,
            embeddings_initializer=tf.keras.initializers.Constant(multi_hot_encodings),
            trainable = False,
            name = 'multi_hot_enconding_layer'
        )

    def call(self, anime_id):
        """Return the stored multi-hot encoding for each anime id in the input."""
        anime_idx = self.anime_id_lookup_layer(anime_id)
        return self.multi_hot_encoding_layer(anime_idx)

    @staticmethod
    def multi_hot_same_shape(list_entities, max_len = 30):
        """Truncate or pad a sequence of entity strings to exactly ``max_len``.

        Args:
            list_entities: Sequence of strings (numpy array, list, tuple, ...).
                ``None`` is treated as an empty sequence.
            max_len: Length of the returned array.

        Returns:
            1-D object array of length ``max_len``: the first ``max_len``
            entities followed by as many "[UNK]" padding tokens as needed.
        """
        # list() + len() instead of ndarray slicing + .shape, so that plain
        # Python sequences work as well as numpy arrays.
        entities = [] if list_entities is None else list(list_entities)[:max_len]
        padding = ["[UNK]"] * (max_len - len(entities))
        # Explicit object dtype keeps the result a string-like array even when
        # it is empty.
        return np.array(entities + padding, dtype=object)


In [8]:
# Anime id -> multi-hot genre vector, built from the anime table loaded above.
anime_genre_model = AnimeMultiHotModel(
    unique_anime_ids=anime_data['anime_id'],
    multi_hot_feature=anime_data['genres'],
    vocabulary=anime_genre_vocab,
)

# User Genres

In [9]:
# Load the TRAIN split of the user/anime interactions.
train_query = user_anime_retrieval_query('TRAIN')
train_data = load_big_query_data(train_query)

In [10]:
# Preview the first rows of the training interactions.
train_data.head()

Unnamed: 0,user_id,anime_id
0,gnoel,15771
1,gnoel,1216
2,gnoel,199
3,gnoel,6553
4,gnoel,10155


In [11]:
# (rows, columns) of the training interactions.
train_data.shape

(36715669, 2)

In [22]:
# Stream the anime id of every training interaction in batches of 1024.
train_anime_ids = tf.data.Dataset.from_tensor_slices(train_data['anime_id'])
anime_id_ds = train_anime_ids.batch(1024)

In [23]:
# Encode each batch of anime ids with the genre model, then join the batches
# into a single array with one genre vector per training interaction.
anime_genres = np.concatenate(
    [anime_genre_model(id_batch) for id_batch in anime_id_ds]
)

In [29]:
# Build one L2-normalised genre profile per user.
#
# The genre rows are summed per user with a vectorised groupby on a numeric
# frame, instead of round-tripping the whole matrix through tf.constant, a
# Python list of per-row tensors and an object-dtype `agg(np.sum)`.
# `train_data` itself is left untouched (no extra column is added to it).
# Keys are sorted and missing user ids are dropped, as groupby does by default.
genre_sums_by_user = (
    pd.DataFrame(anime_genres)
    .groupby(train_data['user_id'].to_numpy(), sort=True)
    .sum()
)
user_genre_totals = genre_sums_by_user.to_numpy()

# Normalise each row to unit length. A user whose summed vector is all zeros
# keeps a zero vector instead of becoming NaN through a 0/0 division.
user_genre_norms = np.linalg.norm(user_genre_totals, axis=1, keepdims=True)
user_genre_matrix = np.divide(
    user_genre_totals,
    user_genre_norms,
    out=np.zeros_like(user_genre_totals),
    where=user_genre_norms > 0,
)

# Same layout as before: one row per user with its normalised genre vector.
user_data = pd.DataFrame({
    'user_id': genre_sums_by_user.index.to_numpy(),
    'anime_genre': list(user_genre_matrix),
})

In [30]:
# User id -> row index into the genre-profile matrix (no OOV slot).
user_id_lookup = tf.keras.layers.StringLookup(
    vocabulary=user_data['user_id'],
    num_oov_indices=0,
)

# Frozen embedding table whose weights are the precomputed user profiles.
num_users, num_genre_dims = user_genre_matrix.shape[0], user_genre_matrix.shape[1]
user_profile_init = tf.keras.initializers.Constant(user_genre_matrix)
user_genre_layer = tf.keras.layers.Embedding(
    num_users,
    num_genre_dims,
    embeddings_initializer=user_profile_init,
    trainable=False,
)

# Chain both layers: user id string -> genre profile vector.
user_genre_model = tf.keras.models.Sequential([user_id_lookup, user_genre_layer])

In [31]:
# Spot-check: look up the genre profile stored for a single user id.
user_genre_model(tf.constant(['GFotaku']))

<tf.Tensor: shape=(1, 35), dtype=float32, numpy=
array([[0.42945725, 0.22254124, 0.00757587, 0.50000757, 0.05823952,
        0.28314823, 0.08475507, 0.25805315, 0.02651555, 0.01231079,
        0.06108047, 0.09422491, 0.04403476, 0.01373127, 0.02509508,
        0.07291777, 0.05681904, 0.11837301, 0.03929984, 0.02178063,
        0.04545523, 0.25473872, 0.03172396, 0.2017076 , 0.2012341 ,
        0.12405491, 0.06865634, 0.25095078, 0.13825966, 0.02935651,
        0.06060698, 0.08144063, 0.23295808, 0.01893968, 0.02651555]],
      dtype=float32)>

# Retrieval Data

In [32]:
# Load the VAL split of the user/anime interactions.
val_query = user_anime_retrieval_query('VAL')
val_data = load_big_query_data(val_query)

In [33]:
# Preview the first rows of the validation interactions.
val_data.head()

Unnamed: 0,user_id,anime_id
0,-Ackerman,35203
1,-Ackerman,34933
2,-Ackerman,34599
3,-Ackerman,34934
4,-Ackerman,34383


In [34]:
# (rows, columns) of the validation interactions.
val_data.shape

(1719410, 2)

In [35]:
# Validation features as string tensors, keyed the way the retrieval model
# reads them in compute_loss.
val_features = {
    'user_id' : tf.cast(val_data['user_id'], tf.string),
    'anime_id' : tf.cast(val_data['anime_id'], tf.string),
}

# Slice per interaction, batch by 2048 and cache the batched dataset.
val_ds = tf.data.Dataset.from_tensor_slices(val_features).batch(2048).cache()

# Retrieval Model

In [36]:
class UserAnimeRetrievalModel(tfrs.Model):
    '''
        Retrieval model pairing a user tower with an anime tower.

        Both towers are supplied by the caller; this class wires them into a
        TFRS retrieval task whose top-K metrics are computed against the
        embeddings of every id in `unique_anime_ids`.
    '''
    def __init__(self, user_model, anime_model, unique_anime_ids):
        '''
            Store the two towers and set up the retrieval task.

            user_model: callable mapping user ids to embeddings.
            anime_model: callable mapping anime ids to embeddings.
            unique_anime_ids: ids of the candidates used by the top-K metrics.
        '''
        super().__init__()
        self.user_model = user_model
        self.anime_model = anime_model

        # Embed the candidate ids in batches of 128 with the anime tower.
        candidate_embeddings = (
            tf.data.Dataset.from_tensor_slices(unique_anime_ids)
            .batch(128)
            .map(self.anime_model)
        )
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)
        )

    def compute_loss(self, features, training=False):
        '''
            Embed the batch with both towers and run the retrieval task.

            `features` must provide "user_id" and "anime_id". Metrics are
            only computed when `training` is False.
        '''
        query_embeddings = self.user_model(features["user_id"])
        candidate_embeddings = self.anime_model(features["anime_id"])

        return self.task(
            query_embeddings,
            candidate_embeddings,
            compute_metrics=not training,
        )


In [37]:
# Wire the frozen user and anime genre towers into the retrieval model.
genre_retrieval_model = UserAnimeRetrievalModel(
    user_model=user_genre_model,
    anime_model=anime_genre_model,
    unique_anime_ids=anime_data['anime_id'],
)
genre_retrieval_model.compile()

# Evaluate on the validation interactions and print the metric dict.
genre_eval_metrics = genre_retrieval_model.evaluate(val_ds, return_dict=True)
print(genre_eval_metrics)

{'factorized_top_k/top_1_categorical_accuracy': 0.0008200487354770303, 'factorized_top_k/top_5_categorical_accuracy': 0.0021809807512909174, 'factorized_top_k/top_10_categorical_accuracy': 0.004038013052195311, 'factorized_top_k/top_50_categorical_accuracy': 0.019764918833971024, 'factorized_top_k/top_100_categorical_accuracy': 0.038769111037254333, 'loss': 8032.95361328125, 'regularization_loss': 0, 'total_loss': 8032.95361328125}
