In [2]:
import os
import sys
import argparse
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from google.cloud import bigquery

In [3]:
# Make the parent directory importable so the local `anime_rec_pkg` package resolves.
# The membership check keeps the cell idempotent: re-running it no longer appends
# a duplicate entry to sys.path each time.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [4]:
from anime_rec_pkg.anime_rec.data.bq_queries.user_anime_ml_data_queries import user_anime_retrieval_query

In [5]:
def load_big_query_data(query, project="anime-rec-dev"):
    """Run a SQL query on BigQuery and return the result as a pandas DataFrame.

    Args:
        query: SQL text to execute.
        project: Google Cloud project id the client is created for.
            Defaults to "anime-rec-dev", the value that used to be hard-coded.

    Returns:
        The DataFrame produced by ``to_dataframe()`` on the finished query job.
    """
    client = bigquery.Client(project=project)
    # The former `client.dataset("processed_area")` reference was never used by
    # the query (and that helper is deprecated), so it has been removed.
    job_config = bigquery.QueryJobConfig()
    query_job = client.query(query, job_config=job_config)
    return query_job.to_dataframe()

# Anime Genre

In [6]:
# Genre names used as the vocabulary of the multi-hot genre encoder (35 entries).
anime_genre_vocab = [
    "Action", "Adventure", "Avant Garde", "Comedy", "Demons",
    "Drama", "Ecchi", "Fantasy", "Game", "Gourmet",
    "Harem", "Historical", "Horror", "Kids", "Martial Arts",
    "Mecha", "Military", "Mystery", "Parody", "Police",
    "Psychological", "Romance", "Samurai", "School", "Sci-Fi",
    "Seinen", "Shoujo", "Shounen", "Slice of Life", "Space",
    "Sports", "Super Power", "Supernatural", "Suspense", "Vampire",
]

In [7]:
# Fetch the `anime_id` and `genres` columns of the processed anime table.
anime_data = load_big_query_data(
    "SELECT anime_id, genres FROM `anime-rec-dev.processed_area.anime`"
)

In [8]:
class AnimeMultiHotModel(tf.keras.Model):
    """Frozen lookup model that maps an anime id to its multi-hot feature vector.

    The multi-hot encodings are computed once in ``__init__`` and stored as the
    weights of a non-trainable ``Embedding`` layer; ``call`` only performs an
    id -> row-index -> row lookup.
    """

    def __init__(self,
                 unique_anime_ids,
                 multi_hot_feature,
                 vocabulary):
        """Precompute the multi-hot table.

        Args:
            unique_anime_ids: anime ids used as the vocabulary of the id lookup.
                Row ``i`` of the table belongs to the ``i``-th id, so this must be
                aligned with ``multi_hot_feature``.
            multi_hot_feature: iterable with one collection of tokens per anime
                (e.g. a pandas Series of arrays, or a list of lists).
            vocabulary: tokens that each get one column in the encoding.
        """
        super().__init__()

        # num_oov_indices=0: there is no out-of-vocabulary slot for anime ids.
        self.anime_id_lookup_layer = tf.keras.layers.StringLookup(
            vocabulary = unique_anime_ids,
            num_oov_indices = 0,
            name = 'anime_multihot_model_id_lookup'
        )

        # Pad (or truncate) every token collection to the same length so the
        # whole feature can be encoded as one rectangular batch.
        max_len = len(vocabulary)
        padded_features = [
            self.__class__.multi_hot_same_shape(entities, max_len=max_len)
            for entities in multi_hot_feature
        ]

        multi_hot_layer = tf.keras.layers.StringLookup(vocabulary = vocabulary,
                                                    output_mode = "multi_hot",
                                                    num_oov_indices=1
                                                    )
        multi_hot_encodings = multi_hot_layer(padded_features)
        # Per the Keras StringLookup docs, with num_oov_indices=1 the first output
        # column is the OOV slot; it also absorbs the "[UNK]" padding, so drop it.
        multi_hot_encodings = multi_hot_encodings[:, 1:]

        num_animes = multi_hot_encodings.shape[0]
        num_multi_hot_dims = multi_hot_encodings.shape[1]

        # Store the encodings as constant, frozen embedding weights.
        self.multi_hot_encoding_layer = tf.keras.layers.Embedding(
            num_animes,
            num_multi_hot_dims,
            embeddings_initializer=tf.keras.initializers.Constant(multi_hot_encodings),
            trainable = False,
            name = 'multi_hot_enconding_layer'
        )

    def call(self, anime_id):
        """Return the multi-hot encoding row(s) for the given anime id(s)."""
        anime_idx = self.anime_id_lookup_layer(anime_id)
        anime_multihot_encoding = self.multi_hot_encoding_layer(anime_idx)
        return anime_multihot_encoding

    @staticmethod
    def multi_hot_same_shape(list_entities, max_len = 30):
        """Truncate/pad a token collection to exactly ``max_len`` entries.

        Args:
            list_entities: array, list or single token; ``None`` or a float NaN
                (a missing value) is treated as an empty collection.
            max_len: length of the returned array.

        Returns:
            A 1-D numpy array of length ``max_len``: the first ``max_len`` tokens
            followed by as many "[UNK]" padding tokens as needed.
        """
        is_missing = list_entities is None or (
            isinstance(list_entities, float) and np.isnan(list_entities)
        )
        if is_missing:
            entities = np.array([], dtype=object)
        else:
            # asarray/atleast_1d accept plain lists and scalars, not only ndarrays.
            entities = np.atleast_1d(np.asarray(list_entities))[:max_len]

        num_add = max_len - entities.shape[0]
        if num_add == 0:
            return entities
        return np.concatenate([entities, num_add * ["[UNK]"]])


In [9]:
# Build the genre lookup model from the anime table and the genre vocabulary.
anime_genre_model = AnimeMultiHotModel(
    unique_anime_ids=anime_data['anime_id'],
    multi_hot_feature=anime_data['genres'],
    vocabulary=anime_genre_vocab,
)

# User Genres

In [11]:
# Load the 'TRAIN' split returned by the user/anime retrieval query.
train_query = user_anime_retrieval_query('TRAIN')
train_data = load_big_query_data(train_query)

In [12]:
# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,user_id,anime_id
0,guestkun,35073
1,guestkun,34403
2,guestkun,33926
3,guestkun,33206
4,guestkun,33731


In [13]:
# (rows, columns) of the training split.
train_data.shape

(69421928, 2)

In [14]:
# Stream the training anime ids in batches of 1024 for the genre lookup.
anime_id_ds = (
    tf.data.Dataset
    .from_tensor_slices(train_data['anime_id'])
    .batch(1024)
)

In [15]:
# Encode every batch of anime ids with the genre model, then join the
# per-batch results into a single array (one row per training interaction).
anime_genres = np.concatenate(
    [anime_genre_model(anime_id_batch) for anime_id_batch in anime_id_ds]
)

In [16]:
# Sum the genre rows of all animes each user interacted with.
# Grouping the 2-D array directly avoids creating one Python tensor object per
# training row, and leaves `train_data` untouched so the cell can be re-run safely.
# Groups come out sorted by user id, so row i of the matrix belongs to the i-th
# sorted user id.
genre_counts_by_user = (
    pd.DataFrame(np.asarray(anime_genres))
    .groupby(train_data['user_id'].to_numpy(), sort=True)
    .sum()
)
genre_counts = genre_counts_by_user.to_numpy()

# L2-normalise each user's genre profile. A user whose animes carry no known
# genre has a zero vector; keep it as zeros instead of dividing by zero (NaN).
genre_norms = np.linalg.norm(genre_counts, axis=1, keepdims=True)
user_genre_matrix = np.divide(
    genre_counts,
    genre_norms,
    out=np.zeros_like(genre_counts),
    where=genre_norms > 0,
)

# Same layout as before: one row per user with its id and its genre profile.
user_data = pd.DataFrame({
    'user_id': genre_counts_by_user.index.to_numpy(),
    'anime_genre': list(user_genre_matrix),
})

In [17]:
# User id -> row index into `user_genre_matrix` (no out-of-vocabulary slot).
user_id_lookup = tf.keras.layers.StringLookup(
    vocabulary=user_data['user_id'],
    num_oov_indices=0,
)

# Frozen embedding table whose weights are the precomputed user genre profiles.
num_users, num_genre_dims = user_genre_matrix.shape
user_genre_layer = tf.keras.layers.Embedding(
    input_dim=num_users,
    output_dim=num_genre_dims,
    embeddings_initializer=tf.keras.initializers.Constant(user_genre_matrix),
    trainable=False,
)

# Chain both steps: user id -> index -> genre profile.
user_genre_model = tf.keras.models.Sequential([user_id_lookup, user_genre_layer])

In [18]:
# Sanity check: look up the genre profile stored for a single user id.
user_genre_model(tf.constant(['GFotaku']))

<tf.Tensor: shape=(1, 35), dtype=float32, numpy=
array([[0.4169987 , 0.21260652, 0.00724795, 0.48464623, 0.0584668 ,
        0.29909873, 0.08359302, 0.2575438 , 0.02319344, 0.01256311,
        0.06233237, 0.09035777, 0.04590368, 0.00869754, 0.02512623,
        0.06523155, 0.05460122, 0.12756391, 0.03962212, 0.02464303,
        0.05460122, 0.26237577, 0.03092458, 0.2145393 , 0.18892989,
        0.12466474, 0.06378195, 0.25802702, 0.1594549 , 0.02077746,
        0.06136597, 0.08310983, 0.24498071, 0.02271024, 0.02802541]],
      dtype=float32)>

# Retrieval Data

In [19]:
# Load the 'TEST' split returned by the user/anime retrieval query; it is used as validation data.
val_query = user_anime_retrieval_query('TEST')
val_data = load_big_query_data(val_query)

In [20]:
# Preview the first rows of the validation split.
val_data.head()

Unnamed: 0,user_id,anime_id
0,-Aine-,33486
1,-Aine-,35382
2,-Aine-,32949
3,-Aine-,34152
4,-Aine-,32995


In [21]:
# (rows, columns) of the validation split.
val_data.shape

(3646260, 2)

In [22]:
# Validation features as string tensors, keyed the way the retrieval model
# reads them, then batched (2048 rows) and cached.
val_features = {
    column: tf.cast(val_data[column], tf.string)
    for column in ('user_id', 'anime_id')
}
val_ds = tf.data.Dataset.from_tensor_slices(val_features).batch(2048).cache()

# Retrieval Model

In [23]:
class UserAnimeRetrievalModel(tfrs.Model):
    '''
        Retrieval model pairing a user tower with an anime tower.

        The top-K metrics are computed against the embeddings that the anime
        tower produces for every id in `unique_anime_ids`.
    '''
    def __init__(self, user_model, anime_model, unique_anime_ids):
        super().__init__()
        self.user_model = user_model
        self.anime_model = anime_model

        # Candidate set for the top-K metrics: every anime id, embedded by the
        # anime tower in batches of 128.
        candidate_embeddings = (
            tf.data.Dataset.from_tensor_slices(unique_anime_ids)
            .batch(128)
            .map(self.anime_model)
        )
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)
        )

    def compute_loss(self, features, training=False):
        '''
            Embed the users and their positive animes and run the retrieval task.
            Metrics are only computed when not training.
        '''
        should_compute_metrics = not training
        query_vectors = self.user_model(features["user_id"])
        candidate_vectors = self.anime_model(features["anime_id"])

        return self.task(
            query_vectors,
            candidate_vectors,
            compute_metrics=should_compute_metrics,
        )


In [24]:
# Wire the two frozen genre towers into the retrieval model and score the
# validation set; nothing is trained here, the model is only evaluated.
genre_retrieval_model = UserAnimeRetrievalModel(
    user_model=user_genre_model,
    anime_model=anime_genre_model,
    unique_anime_ids=anime_data['anime_id'],
)
genre_retrieval_model.compile()
val_metrics = genre_retrieval_model.evaluate(val_ds, return_dict=True)
print(val_metrics)

{'factorized_top_k/top_1_categorical_accuracy': 0.00016427792434114963, 'factorized_top_k/top_5_categorical_accuracy': 0.0005098374676890671, 'factorized_top_k/top_10_categorical_accuracy': 0.001158721512183547, 'factorized_top_k/top_50_categorical_accuracy': 0.008761031553149223, 'factorized_top_k/top_100_categorical_accuracy': 0.02243421971797943, 'loss': 5501.626953125, 'regularization_loss': 0, 'total_loss': 5501.626953125}
