In [1]:
import os
import sys
import argparse
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_ranking as tfr
from google.cloud import bigquery

In [2]:
# Make the parent directory importable so project-local packages (such as the
# `utils` package imported in the next cell) can be found.
# The membership check keeps the cell idempotent: re-running it does not pile
# duplicate entries onto sys.path.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
from utils.bq_queries.user_anime_data_queries import user_anime_list_ranking_query

In [4]:
def load_big_query_data(query, project="anime-rec-dev"):
    """Run a SQL query on BigQuery and return the result as a DataFrame.

    Args:
        query: SQL text to execute.
        project: GCP project id the BigQuery client is created for.
            Defaults to "anime-rec-dev".

    Returns:
        The query result as produced by ``QueryJob.to_dataframe()``.
    """
    client = bigquery.Client(project=project)
    # The query text carries fully qualified table names, so neither a dataset
    # reference nor a custom job configuration is needed here.
    return client.query(query).to_dataframe()

# Ranking Data

In [5]:
# Fetch the user/anime ranking data for the 'VAL' partition from BigQuery into a
# DataFrame ('VAL' is presumably the validation split — confirm against
# user_anime_list_ranking_query).
val_data = load_big_query_data(user_anime_list_ranking_query('VAL'))

In [6]:
# Preview the first rows to check the columns and the list-valued cells.
val_data.head()

Unnamed: 0,user_id,anime_id,score
0,-AnimeManya-,"[14719, 24135, 11933, 22101, 15315, 6205, 5112...","[9, 8, 8, 8, 8, 4, 6, 8, 7, 8]"
1,-Azure-,"[27989, 33743, 33487, 33988, 33206, 32937, 332...","[7, 4, 4, 5, 5, 6, 3, 4, 5, 3]"
2,-Clandestine-,"[2924, 4789, 587, 5978, 3958, 5713, 5162, 3225...","[9, 9, 9, 7, 7, 8, 9, 8, 9, 8]"
3,-Lyka-,"[32900, 33783, 23249, 12029, 34451, 30778, 320...","[8, 7, 8, 8, 8, 7, 8, 7, 7, 8]"
4,-Shanigami-,"[34662, 35788, 35079, 36027, 34451, 35076, 345...","[6, 7, 7, 3, 8, 6, 8, 4, 6, 6]"


In [7]:
# (rows, columns) of the loaded frame.
val_data.shape

(171941, 3)

In [8]:
# Pull the id and score of every row in the processed `anime` table; these
# scores are what the lookup-based model further down returns as predictions.
anime_data = load_big_query_data("SELECT anime_id, score FROM `anime-rec-dev.processed_area.anime`")

In [9]:
# Replace missing values with 0.0 before the frame is turned into a lookup table.
# NOTE(review): fillna runs on the whole frame, so it fills every column, not
# only `score` — confirm that is intended.
anime_data = anime_data.fillna(0.0)

In [10]:
# Validation dataset: one element per row of `val_data`, with the user's anime
# list as features and the matching scores (cast to float32) as labels.
# Batched in groups of 2048 rows and cached after the first pass.
val_ds = tf.data.Dataset.from_tensor_slices(
    (
        dict(
            # Repeat each user id 10 times so it lines up slot-by-slot with the
            # anime list (the score lists in the preview above hold 10 entries).
            user_id=np.stack(val_data['user_id'].apply(lambda uid: [uid] * 10), axis=0),
            anime_id=np.stack(val_data['anime_id'], axis=0),
        ),
        tf.cast(np.stack(val_data['score'], axis=0), tf.float32),
    )
).batch(2048).cache()

# Models

In [11]:
def mse_and_list_mle_loss(y_true, y_pred):
    """Return TF-Ranking's mean-squared loss plus its ListMLE loss.

    Both terms are computed on the same (y_true, y_pred) pair and added
    without any weighting.
    """
    pointwise_loss = tfr.keras.losses.MeanSquaredLoss()
    listwise_loss = tfr.keras.losses.ListMLELoss()
    return pointwise_loss(y_true, y_pred) + listwise_loss(y_true, y_pred)

In [12]:
class AnimeAverageRatingModel(tf.keras.Model):
    """Model whose prediction is a table lookup of a stored score per anime id.

    `call` maps every id found in `features['anime_id']` to the score that
    was supplied for it at construction time.
    """

    def __init__(self, anime_ids, anime_scores):
        """Build the id -> score lookup table.

        Args:
            anime_ids: keys of the table.
            anime_scores: values of the table, aligned with `anime_ids`.
        """
        super().__init__()

        id_to_score = tf.lookup.KeyValueTensorInitializer(anime_ids, anime_scores)
        # Ids that are not present in the table resolve to -1.
        self.ratings = tf.lookup.StaticHashTable(id_to_score, default_value=-1)

    def call(self, features):
        """Return the stored score for each entry of `features['anime_id']`."""
        return self.ratings.lookup(features['anime_id'])

In [13]:
class UserAnimeListRankingAverageScoreModel(tf.keras.Model):
    """Evaluation wrapper that scores anime lists with a supplied score model.

    Predictions are delegated to `anime_score_model`. During evaluation the
    model tracks the compiled loss/metrics plus two extra groups of metrics:

    * `classification_metrics` (Precision, Recall, MAP, MRR at `topn`), fed
      with labels binarised against `positive_threshold`;
    * `non_classification_metrics` (NDCG at `topn`, RMSE), fed with the raw
      labels.
    """

    def __init__(self, anime_score_model, topn=5, positive_threshold=8.0):
        """Store the score model and create the metric objects.

        Args:
            anime_score_model: callable model producing a score per list item.
            topn: cut-off passed to every TF-Ranking metric.
            positive_threshold: labels greater than or equal to this value
                count as positives for the classification metrics.
        """
        super().__init__()

        self.positive_threshold = positive_threshold

        # Metrics that consume 0/1 relevance labels.
        self.classification_metrics = [
            tfr.keras.metrics.PrecisionMetric(topn=topn, name=f'Precision@{topn}'),
            tfr.keras.metrics.RecallMetric(topn=topn, name=f'Recall@{topn}'),
            tfr.keras.metrics.MeanAveragePrecisionMetric(topn=topn, name=f'MAP@{topn}'),
            tfr.keras.metrics.MRRMetric(topn=topn, name=f'MRR@{topn}'),
        ]

        # Metrics that consume the original (graded) labels.
        self.non_classification_metrics = [
            tfr.keras.metrics.NDCGMetric(topn=topn, name=f'ndcg@{topn}'),
            tf.keras.metrics.RootMeanSquaredError(),
        ]

        self.anime_score_model = anime_score_model

    def test_step(self, data):
        """Run one evaluation step and return the current value of every metric.

        Args:
            data: a `(features, labels)` pair.

        Returns:
            Dict mapping each metric name to its current result.
        """
        features, labels = data

        predictions = tf.cast(self(features, training=False), dtype=tf.float32)
        # 1 where the label reaches the positive threshold, 0 otherwise.
        binary_labels = tf.cast(labels >= self.positive_threshold, tf.int32)

        self.compiled_loss(labels, predictions, regularization_losses=self.losses)
        self.compiled_metrics.update_state(labels, predictions)

        for metric in self.classification_metrics:
            metric.update_state(binary_labels, predictions)
        for metric in self.non_classification_metrics:
            metric.update_state(labels, predictions)

        return {metric.name: metric.result() for metric in self.metrics}

    @property
    def metrics(self):
        """All tracked metrics: compiled loss, compiled metrics, then both custom groups."""
        return [
            *self.compiled_loss.metrics,
            *self.compiled_metrics.metrics,
            *self.classification_metrics,
            *self.non_classification_metrics,
        ]

    def call(self, inputs):
        """Forward `inputs` to the wrapped score model and return its output."""
        return self.anime_score_model(inputs)

In [14]:
# Evaluate the lookup-based score model on the validation dataset, using a
# top-3 cut-off and treating scores of 9.0 and above as positives.
anime_score_model = AnimeAverageRatingModel(
    anime_ids=anime_data['anime_id'],
    anime_scores=anime_data['score'],
)
ranking_average_score_model = UserAnimeListRankingAverageScoreModel(
    anime_score_model=anime_score_model,
    topn=3,
    positive_threshold=9.0,
)
ranking_average_score_model.compile(loss=mse_and_list_mle_loss)
print(ranking_average_score_model.evaluate(val_ds, return_dict=True))

{'loss': 16.26943588256836, 'Precision@3': 0.5168496370315552, 'Recall@3': 0.4541417360305786, 'MAP@3': 0.38899102807044983, 'MRR@3': 0.6819556951522827, 'ndcg@3': 0.7614755034446716, 'root_mean_squared_error': 1.444114089012146}
