In [1]:
import os
import sys
import argparse
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_ranking as tfr
from google.cloud import bigquery

In [2]:
# Put the parent of the current working directory on the import path
# (presumably so the `anime_rec_pkg` import in the next cell resolves — confirm
# the notebook is always launched from its own folder).
# Guarded so that re-running this cell does not keep appending duplicates.
_parent_dir = os.path.abspath("..")
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [3]:
from anime_rec_pkg.anime_rec.data.bq_queries.user_anime_ml_data_queries import user_anime_list_ranking_query

In [4]:
def load_big_query_data(query, project="anime-rec-dev"):
    """Run a SQL query on BigQuery and return the rows as a pandas DataFrame.

    Args:
        query: SQL text to execute.
        project: Google Cloud project id used to create the BigQuery client.
            Defaults to "anime-rec-dev", the value that was previously
            hard-coded, so existing single-argument calls behave the same.

    Returns:
        The query result converted with ``QueryJob.to_dataframe()``.
    """
    client = bigquery.Client(project=project)
    # The original built an unused dataset reference via the deprecated
    # `client.dataset(...)` helper and passed an empty QueryJobConfig; neither
    # influenced the query, so both are dropped.
    query_job = client.query(query)
    return query_job.to_dataframe()

# Ranking Data

In [5]:
# Build the ranking query for the 'TEST' argument (presumably the held-out
# split — confirm against the query module) and pull the result into pandas.
validation_query = user_anime_list_ranking_query('TEST')
val_data = load_big_query_data(validation_query)

In [6]:
# Preview the first rows. In the rendered output, each row is one user, with
# `anime_id` and `score` stored as '|'-delimited strings.
# NOTE(review): the rendered output shows user_id values; consider clearing or
# redacting it before sharing the notebook.
val_data.head()

Unnamed: 0,user_id,anime_id,score
0,-Bibs-,47778|19815|30370|25013|33050|48483|41487|4224...,10|8|9|9|9|8|9|9|10|10
1,-Cap,44942|39617|40060|39195|38000|38816|38826|3413...,3|1|8|10|8|1|6|5|8|10
2,-Eeco,46102|39547|40591|40716|38000|38753|38680|3804...,9|9|9|8|8|6|8|8|9|8
3,-Heartthrob,47778|50696|49926|45576|41587|42249|40938|1475...,9|8|8|8|6|8|7|7|7|7
4,-SHAWON-,20899|269|42249|40028|40748|40456|41353|36862|...,9|8|8|10|9|8|8|9|10|9


In [7]:
# (rows, columns) of the validation frame — a quick sanity check on the load.
val_data.shape

(364626, 3)

In [8]:
# Fetch the per-anime score column that the lookup model further down is built from.
anime_score_query = "SELECT anime_id, score FROM `anime-rec-dev.processed_area.anime`"
anime_data = load_big_query_data(anime_score_query)

In [9]:
# Replace every missing value in the frame with 0.0, so anime without a score
# are treated as having score 0.0 downstream.
anime_data = anime_data.fillna(value=0.0)

In [13]:
def _row_to_listwise_example(row):
    """Turn one user row into a `(features, labels)` pair for list-wise evaluation.

    `row['anime_id']` and `row['score']` are '|'-delimited strings; they are
    split into 1-D tensors, the scores are parsed to numbers, and the user id
    is repeated once per anime so it lines up with the item list.
    """
    anime_ids = tf.strings.split(row['anime_id'], sep='|')
    scores = tf.strings.to_number(tf.strings.split(row['score'], sep='|'))
    user_ids = tf.repeat(row['user_id'], tf.shape(anime_ids)[0], axis = 0)

    features = {
        'user_id' : user_ids,
        'anime_id' : anime_ids
    }
    return features, scores


# One dataset element per user row of `val_data`.
raw_rows_ds = tf.data.Dataset.from_tensor_slices(
    {
        'user_id' : val_data['user_id'],
        'anime_id' : val_data['anime_id'],
        'score' : val_data['score']
    }
)

# Parse each row, then batch and cache the parsed batches for repeated use.
val_ds = raw_rows_ds.map(_row_to_listwise_example)
val_ds = val_ds.batch(2048).cache()

# Models

In [15]:
class AnimeAverageRatingModel(tf.keras.Model):
    """Model that predicts a score by looking the anime id up in a static table.

    The table maps each entry of `anime_ids` to the matching entry of
    `anime_scores`; ids that are not in the table yield the default value -1.
    """

    def __init__(self, anime_ids, anime_scores):
        """Build the id -> score lookup table.

        Args:
            anime_ids: keys of the table.
            anime_scores: values of the table, aligned with `anime_ids`.
        """
        super().__init__()

        # NOTE(review): the key dtype must match the dtype of
        # `features['anime_id']` passed to `call` — confirm at the call site.
        table_initializer = tf.lookup.KeyValueTensorInitializer(
            keys=anime_ids,
            values=anime_scores
        )
        self.ratings = tf.lookup.StaticHashTable(
            table_initializer,
            default_value=-1
        )

    def call(self, features):
        """Return the table value for every id in `features['anime_id']`."""
        return self.ratings.lookup(features['anime_id'])

In [18]:
class UserAnimeListRankingAverageScoreModel(tf.keras.Model):
    """Evaluation wrapper that scores items with `anime_score_model` and tracks ranking metrics.

    Predictions are delegated unchanged to the wrapped model. `test_step`
    updates two groups of metrics:

    * classification-style ranking metrics (Precision, Recall, MAP, MRR), fed
      with labels binarised by `positive_threshold`;
    * metrics fed with the raw labels (NDCG and RMSE).

    Args:
        anime_score_model: callable model mapping the input features to a
            predicted score per item.
        topn: cut-off used for Precision, Recall and MAP (MRR and NDCG are
            created without a cut-off).
        positive_threshold: labels greater than or equal to this value count
            as positive for the classification-style metrics.
    """

    def __init__(
            self,
            anime_score_model,
            topn=5,
            positive_threshold=8.0
    ):
        super().__init__()

        self.positive_threshold = positive_threshold

        # Metrics updated with binarised labels (see `test_step`).
        self.classification_metrics = [
            tfr.keras.metrics.PrecisionMetric(topn=topn, name=f'Precision@{topn}'),
            tfr.keras.metrics.RecallMetric(topn=topn, name=f'Recall@{topn}'),
            tfr.keras.metrics.MeanAveragePrecisionMetric(topn=topn, name=f'MAP@{topn}'),
            tfr.keras.metrics.MRRMetric(name='MRR')
        ]

        # Metrics updated with the raw (graded) labels.
        self.non_classification_metrics = [
            tfr.keras.metrics.NDCGMetric(name='NDCG'),
            tf.keras.metrics.RootMeanSquaredError()
        ]

        self.anime_score_model = anime_score_model

    def test_step(self, data):
        """Run one evaluation step and return the current value of every metric.

        Args:
            data: `(features, labels)` pair as produced by the evaluation dataset.

        Returns:
            Dict mapping each metric name to its current result.
        """
        x, y = data

        y_true = y
        # 1 where the label reaches the positive threshold, else 0.
        y_true_binary = tf.cast(y_true >= self.positive_threshold, tf.int32)

        y_pred = self(x, training=False)
        # The wrapped model may return another numeric dtype; cast once here.
        y_pred = tf.cast(y_pred, dtype=tf.float32)

        self.compiled_loss(y_true, y_pred, regularization_losses=self.losses)
        self.compiled_metrics.update_state(y_true, y_pred)

        for classification_metric in self.classification_metrics:
            classification_metric.update_state(y_true_binary, y_pred)

        for non_classification_metric in self.non_classification_metrics:
            non_classification_metric.update_state(y_true, y_pred)

        return {m.name: m.result() for m in self.metrics}

    @property
    def metrics(self):
        """All tracked metrics: compiled loss/metrics first, then the custom ones.

        The compiled containers do not exist until `compile()` has been called,
        so they are skipped when missing instead of raising `AttributeError`.
        """
        tracked = []
        for container in (
                getattr(self, 'compiled_loss', None),
                getattr(self, 'compiled_metrics', None)
        ):
            if container is not None:
                tracked.extend(container.metrics)

        tracked.extend(self.classification_metrics)
        tracked.extend(self.non_classification_metrics)
        return tracked

    def call(self, inputs):
        """Delegate scoring to the wrapped `anime_score_model`."""
        return self.anime_score_model(inputs)

In [19]:
# Baseline: predict each anime's stored score, then evaluate the ranking
# with top-3 metrics and a positive threshold of 9.0.
anime_score_model = AnimeAverageRatingModel(
    anime_ids=anime_data['anime_id'],
    anime_scores=anime_data['score']
)
ranking_average_score_model = UserAnimeListRankingAverageScoreModel(
    anime_score_model,
    topn=3,
    positive_threshold=9.0
)
ranking_average_score_model.compile(loss = tfr.keras.losses.ListMLELoss())

evaluation_results = ranking_average_score_model.evaluate(val_ds, return_dict=True)
print(evaluation_results)

{'loss': 13.912481307983398, 'Precision@3': 0.5478235483169556, 'Recall@3': 0.48742496967315674, 'MAP@3': 0.42185670137405396, 'MRR': 0.7307184934616089, 'NDCG': 0.898320198059082, 'root_mean_squared_error': 1.4263333082199097}
