In [1]:
import os
import sys
import argparse
import numpy as np
import pandas as pd
import tensorflow as tf
from google.cloud import bigquery

In [2]:
# Make the parent directory importable (presumably where anime_rec_pkg lives — confirm).
# Guarded so that re-running this cell does not append duplicate sys.path entries.
PARENT_DIR = os.path.abspath("..")
if PARENT_DIR not in sys.path:
    sys.path.append(PARENT_DIR)

In [3]:
from anime_rec_pkg.anime_rec.data.bq_queries.anime_anime_ml_data_queries import anime_anime_pair_ranking_query

In [4]:
def load_big_query_data(query, project="anime-rec-dev"):
    """Run a SQL query on BigQuery and return the result as a DataFrame.

    Args:
        query: SQL text to execute.
        project: Google Cloud project the client is created for. Defaults to
            "anime-rec-dev", the value that used to be hard-coded here.

    Returns:
        The result of ``to_dataframe()`` on the submitted query job.
    """
    client = bigquery.Client(project=project)
    # The previously built dataset reference was never used (and relied on the
    # deprecated Client.dataset helper); the empty QueryJobConfig was equivalent
    # to the default job configuration, so the query is submitted without one.
    return client.query(query).to_dataframe()

# Pair ranking data

In [6]:
# Build the pair-ranking query for the 'TEST' split, then pull the result into a DataFrame.
test_pair_query = anime_anime_pair_ranking_query('TEST')
val_data = load_big_query_data(test_pair_query)

In [7]:
# Preview the first rows of the pair-ranking frame.
val_data.head()

Unnamed: 0,anime_id,retrieved_anime_id_1,retrieved_anime_id_2,label
0,8074,200,32843,1
1,8098,8740,10578,1
2,8101,523,6675,1
3,8101,6280,6007,1
4,812,3701,33902,1


In [8]:
# (rows, columns) of the pair-ranking frame.
val_data.shape

(135840257, 4)

In [9]:
# Fetch the id and score columns of the anime table.
anime_score_query = "SELECT anime_id, score FROM `anime-rec-dev.processed_area.anime`"
anime_data = load_big_query_data(anime_score_query)

In [10]:
# Replace every missing value in the frame with 0.0 (fillna applies to all columns,
# not only `score`).
# NOTE(review): a missing score therefore becomes 0.0 — confirm this is the intended
# rating for unscored anime.
anime_data = anime_data.fillna(0.0)

In [11]:
# Preview the first rows of the anime score frame.
anime_data.head()

Unnamed: 0,anime_id,score
0,42447,0.0
1,49507,0.0
2,31699,0.0
3,37904,0.0
4,10797,0.0


In [12]:
# (rows, columns) of the anime score frame.
anime_data.shape

(9792, 2)

In [13]:
# Build (features, label) examples for the pair model. Each feature is a column of
# val_data cast to tf.string, keyed by the feature name the pair model reads.
val_ds = tf.data.Dataset.from_tensor_slices((
    {
        feature_name: tf.cast(val_data[source_column], tf.string)
        for feature_name, source_column in [
            ('anchor_anime', 'anime_id'),
            ('rel_anime_1', 'retrieved_anime_id_1'),
            ('rel_anime_2', 'retrieved_anime_id_2'),
        ]
    },
    val_data['label'],
))
# Group into batches of 2048 examples and cache the batched dataset.
val_ds = val_ds.batch(2048).cache()

## Models

In [14]:
class AnimeAverageRatingModel(tf.keras.Model):
    '''
        Scoring model backed by a static anime_id -> score lookup table.
        The table is built once from anime_ids (keys) and anime_scores (values).
        The prediction depends only on features['rel_anime']; ids that are
        missing from the table get the default value -1.
    '''
    def __init__(self, anime_ids, anime_scores):
        super().__init__()

        id_to_score = tf.lookup.KeyValueTensorInitializer(anime_ids, anime_scores)
        self.ratings = tf.lookup.StaticHashTable(id_to_score, default_value=-1)

    def call(self, features):
        # Only 'rel_anime' is read; any other entry of features is ignored.
        return self.ratings.lookup(features['rel_anime'])

In [15]:
class AnimeAnimePairClassificationModel(tf.keras.Model):
    '''
        Pairwise classification wrapper around a scoring model.
        The input is a dict holding three anime ids:
            anchor_anime : the initial anime
            rel_anime_1 : first anime to be scored
            rel_anime_2 : second anime to be scored
        The matching label is
            1 if rel_anime_1 is more relevant to anchor_anime than rel_anime_2
            0 otherwise
        The wrapped scoring model is called once per candidate and the model
        returns sigmoid(score1 - score2) as the binary classification prediction.
    '''
    def __init__(self, anime_scoring_model):
        super().__init__()

        self.anime_scoring_model = anime_scoring_model

    def call(self, data):
        anchor = data["anchor_anime"]

        # Score both candidates against the same anchor, first candidate first.
        candidate_scores = [
            self.anime_scoring_model({
                'anchor_anime' : anchor,
                'rel_anime' : data[candidate_key]
            })
            for candidate_key in ("rel_anime_1", "rel_anime_2")
        ]

        return tf.math.sigmoid(candidate_scores[0] - candidate_scores[1])

In [16]:
# Baseline scorer: looks up each anime's stored score by its id.
average_anime_scoring_model = AnimeAverageRatingModel(
    anime_ids=anime_data['anime_id'],
    anime_scores=anime_data['score'],
)

In [17]:
# Smoke check: look up the stored score for a single anime id.
average_anime_scoring_model({'anchor_anime' : tf.constant(['7016']), 'rel_anime' : tf.constant(['7016'])})

<tf.Tensor: shape=(1,), dtype=float64, numpy=array([6.78])>

In [18]:
# Wrap the average-rating scorer in the pairwise classifier.
anime_anime_pair_classification_model = AnimeAnimePairClassificationModel(average_anime_scoring_model)

# Metrics reported by evaluate(), in this order after the loss.
pair_metrics = [
    tf.keras.metrics.BinaryAccuracy(),
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall(),
    tf.keras.metrics.AUC(),
]
anime_anime_pair_classification_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=pair_metrics,
)

In [19]:
# Smoke check: both candidates are the same anime, so score1 - score2 is 0
# and the prediction is sigmoid(0) = 0.5.
anime_anime_pair_classification_model(
    {
        'anchor_anime' : tf.constant(['7016']), 
        'rel_anime_1' : tf.constant(['7016']),
        'rel_anime_2' : tf.constant(['7016']),
    }
)

<tf.Tensor: shape=(1,), dtype=float64, numpy=array([0.5])>

In [20]:
# Evaluate on the batched pair dataset. The returned list is the loss followed by
# the compiled metrics (binary accuracy, precision, recall, AUC).
# NOTE(review): precision 1.0 together with AUC 0.0 is what one would expect if the
# evaluated labels contain no negatives — confirm the label distribution of val_data.
anime_anime_pair_classification_model.evaluate(val_ds)



[0.8085850477218628, 0.5010027289390564, 1.0, 0.501002848148346, 0.0]