In [1]:
import os
import sys
import argparse
import numpy as np
import pandas as pd
import tensorflow as tf
from google.cloud import bigquery

In [2]:
import tensorflow_recommenders as tfrs

In [3]:
# Make the parent directory importable so the project-local `utils` package resolves.
# Guarded so that re-running this cell does not append the same entry again.
if os.path.abspath("..") not in sys.path:
    sys.path.append(os.path.abspath(".."))

In [4]:
from utils.bq_queries.anime_anime_data_queries import anime_anime_retrieval_query

In [5]:
def load_big_query_data(query, project="anime-rec-dev"):
    """Run a SQL query on BigQuery and return the result as a pandas DataFrame.

    Args:
        query: SQL text to execute.
        project: Google Cloud project the BigQuery client is created for.
            Defaults to "anime-rec-dev".

    Returns:
        The DataFrame produced by ``QueryJob.to_dataframe()``.
    """
    client = bigquery.Client(project=project)
    # The query is submitted with the client's default job configuration;
    # no dataset reference is needed to run it.
    query_job = client.query(query)
    return query_job.to_dataframe()

# Retrieval Data

In [6]:
# Anime pairs (animeA, animeB) used for evaluation — presumably the validation split, given the 'VAL' argument.
val_data = load_big_query_data(query=anime_anime_retrieval_query('VAL'))

In [7]:
# Preview the first rows of the loaded pairs.
val_data.head()

Unnamed: 0,animeA,animeB
0,1020,1042
1,10793,10905
2,10794,10905
3,10536,10905
4,10604,10905


In [8]:
# (rows, columns) of the pairs frame.
val_data.shape

(48690, 2)

In [9]:
# One row per anime: its id and its list of genres.
anime_data = load_big_query_data(
    query="SELECT anime_id, genres FROM `anime-rec-dev.processed_area.anime`"
)

In [10]:
# Preview the first rows of the anime/genres table.
anime_data.head()

Unnamed: 0,anime_id,genres
0,10507,"[Sports, Super Power, Shounen]"
1,4075,"[Action, Comedy, Mecha]"
2,36444,"[Action, Comedy, Drama, Fantasy, Slice of Life..."
3,6172,"[Adventure, Mystery]"
4,4794,"[Adventure, Comedy, Fantasy, Kids]"


In [11]:
# (rows, columns) of the anime/genres table.
anime_data.shape

(9192, 2)

In [12]:
# tf.data pipeline over the evaluation pairs: each element is a dict with the
# 'animeA' / 'animeB' ids cast to tf.string, served in cached batches of 2048.
val_ds = (
    tf.data.Dataset.from_tensor_slices(
        {
            pair_column: tf.cast(val_data[pair_column], tf.string)
            for pair_column in ('animeA', 'animeB')
        }
    )
    .batch(2048)
    .cache()
)

# Genres

In [13]:
# Genre tokens passed as the vocabulary of the genre multi-hot encoder.
# Order matters: it determines the position of each genre in that vocabulary.
anime_genre_vocab = [
    'Action', 'Adventure', 'Avant Garde', 'Comedy', 'Demons',
    'Drama', 'Ecchi', 'Fantasy', 'Game', 'Gourmet',
    'Harem', 'Historical', 'Horror', 'Kids', 'Martial Arts',
    'Mecha', 'Military', 'Mystery', 'Parody', 'Police',
    'Psychological', 'Romance', 'Samurai', 'School', 'Sci-Fi',
    'Seinen', 'Shoujo', 'Shounen', 'Slice of Life', 'Space',
    'Sports', 'Super Power', 'Supernatural', 'Suspense', 'Vampire',
]

# Models

In [14]:
class AnimeMultiHotModel(tf.keras.Model):
    """Maps anime ids to fixed, non-trainable multi-hot vectors of a list-valued feature.

    The multi-hot matrix is computed once in ``__init__`` and stored as the
    constant weights of a frozen ``Embedding`` layer. ``call`` converts each id
    to its position in ``unique_anime_ids`` and returns the matrix row at that
    position.
    """

    def __init__(self,
                unique_anime_ids,
                multi_hot_feature,
                vocabulary):
        """Build the id lookup and the frozen multi-hot table.

        Args:
            unique_anime_ids: Anime ids used as the vocabulary of the id lookup
                layer. Entry ``i`` must belong to the same anime as entry ``i``
                of ``multi_hot_feature``, because rows are matched by position.
            multi_hot_feature: Iterable with one sequence of string tokens per
                anime (e.g. a pandas Series of genre lists). ``None`` entries
                are treated as empty sequences.
            vocabulary: Tokens that receive a column in the encoding.
        """
        super().__init__()

        # num_oov_indices=0: ids map to their vocabulary position with no OOV slot.
        self.anime_id_lookup_layer = tf.keras.layers.StringLookup(
            vocabulary = unique_anime_ids, 
            num_oov_indices = 0,
            name = 'anime_multihot_model_id_lookup'
        )

        # Pad/truncate every token sequence to the same length so they stack into
        # one rectangular batch. Plain iteration (instead of Series.apply) accepts
        # any iterable of sequences, not only a pandas Series.
        padded_features = [
            self.multi_hot_same_shape(tokens, max_len=len(vocabulary))
            for tokens in multi_hot_feature
        ]

        multi_hot_layer = tf.keras.layers.StringLookup(vocabulary = vocabulary,
                                                    output_mode = "multi_hot",
                                                    num_oov_indices=1
                                                    )
        multi_hot_encodings = multi_hot_layer(padded_features)
        # Drop column 0, which StringLookup reserves for out-of-vocabulary tokens;
        # the "[UNK]" padding lands there as long as it is not in `vocabulary`.
        multi_hot_encodings = multi_hot_encodings[:, 1:]
        
        num_animes = multi_hot_encodings.shape[0]
        num_multi_hot_dims = multi_hot_encodings.shape[1]

        # A frozen Embedding acts as a row lookup into the precomputed matrix.
        self.multi_hot_encoding_layer = tf.keras.layers.Embedding(
            num_animes,
            num_multi_hot_dims,
            embeddings_initializer=tf.keras.initializers.Constant(multi_hot_encodings),
            trainable = False,
            name = 'multi_hot_enconding_layer'
        )
    
    def call(self, anime_id):
        """Return the multi-hot rows for the given anime ids."""
        anime_idx = self.anime_id_lookup_layer(anime_id)
        anime_multihot_encoding = self.multi_hot_encoding_layer(anime_idx)
        return anime_multihot_encoding
    
    @staticmethod
    def multi_hot_same_shape(list_entities, max_len = 30):
        """Truncate or right-pad a token sequence to exactly ``max_len`` entries.

        Args:
            list_entities: 1-D sequence of string tokens (list, tuple or NumPy
                array). ``None`` is treated as an empty sequence.
            max_len: Length of the returned array.

        Returns:
            A 1-D NumPy array of length ``max_len``: the first ``max_len``
            tokens, followed by "[UNK]" in every missing position.
        """
        entities = np.asarray([] if list_entities is None else list_entities)[:max_len]
        if entities.size == 0 and entities.dtype != object:
            # An empty sequence is inferred as float64; make it a string array
            # so the concatenation below never mixes numbers with strings.
            entities = entities.astype(str)
        padding = np.full(max_len - entities.shape[0], "[UNK]")
        return np.concatenate([entities, padding])


In [15]:
class RetrievalModel(tfrs.Model):
    """TFRS retrieval model that embeds both sides of an anime pair with one shared model."""

    def __init__(self, anime_model, unique_anime_ids):
        """Store the embedding model and set up the retrieval task.

        Args:
            anime_model: Callable model mapping anime ids to embeddings; used
                for queries, positives and the metric candidates.
            unique_anime_ids: Ids of all candidate animes for the top-K metrics.
        """
        super().__init__()

        self.anime_model = anime_model

        # Candidate embeddings for the FactorizedTopK metrics, produced in batches of 128 ids.
        candidate_embeddings = (
            tf.data.Dataset.from_tensor_slices(unique_anime_ids)
            .batch(128)
            .map(self.anime_model)
        )
        top_k_metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)
        self.task = tfrs.tasks.Retrieval(metrics=top_k_metrics)

    def compute_loss(self, features, training=False):
        """Return the retrieval task result for a batch of (animeA, animeB) pairs.

        Metrics are computed only when ``training`` is False.
        """
        query_embeddings = self.anime_model(features["animeA"])
        positive_embeddings = self.anime_model(features["animeB"])
        with_metrics = not training

        return self.task(query_embeddings, positive_embeddings, compute_metrics=with_metrics)

In [16]:
# Build the frozen genre encoder, wrap it in the retrieval model and score it on the evaluation pairs.
anime_genre_model = AnimeMultiHotModel(
    unique_anime_ids=anime_data['anime_id'],
    multi_hot_feature=anime_data['genres'],
    vocabulary=anime_genre_vocab,
)
genre_retrieval_model = RetrievalModel(
    anime_model=anime_genre_model,
    unique_anime_ids=anime_data['anime_id'],
)
# compile() is called without an optimizer; this cell only evaluates the model.
genre_retrieval_model.compile()
print(genre_retrieval_model.evaluate(val_ds, return_dict=True))

{'factorized_top_k/top_1_categorical_accuracy': 0.12370096892118454, 'factorized_top_k/top_5_categorical_accuracy': 0.13039638102054596, 'factorized_top_k/top_10_categorical_accuracy': 0.13795439898967743, 'factorized_top_k/top_50_categorical_accuracy': 0.1781885325908661, 'factorized_top_k/top_100_categorical_accuracy': 0.2116040289402008, 'loss': 11721.8037109375, 'regularization_loss': 0, 'total_loss': 11721.8037109375}
