In [None]:
# Remove any existing tensorflow / tensorflow-io installs, then install them again.
# tensorflow-io is installed with --no-deps so pip does not install its dependencies.
# NOTE(review): no versions are pinned here — consider pinning for reproducibility.
! pip uninstall --yes tensorflow
! pip uninstall --yes tensorflow-io
! pip install tensorflow
! pip install --no-deps tensorflow-io

In [1]:
from google.cloud import bigquery

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import numpy as np 
import cv2
from urllib.request import urlopen

In [3]:
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input

In [98]:
def get_anime_data():
    """Load the anime metadata table from BigQuery into a DataFrame.

    Runs one query against ``prod_area_us.anime`` using a client bound to the
    ``anime-rec-dev`` project and waits for the result.

    Returns:
        The query result converted with ``to_dataframe()``; the selected
        columns are anime_id, title, synopsis, main_pic, type, source_type,
        studios and genres.
    """
    query = """
        SELECT anime_id, title, synopsis, main_pic, type, source_type, studios, genres
        FROM prod_area_us.anime
    """
    client = bigquery.Client(project="anime-rec-dev")
    # No dataset reference or custom job config is needed: the table is named
    # in the SQL itself and the default job configuration is used.
    return client.query(query).to_dataframe()

In [110]:
def download_image(img_url, num_retries=10):
    """Download an image and return it as an RGB array resized to (HEIGHT, WIDTH).

    Each attempt fetches the URL, decodes the bytes with OpenCV, converts
    BGR to RGB and resizes to the module-level WIDTH x HEIGHT. Any failure is
    printed and the attempt is repeated.

    Args:
        img_url: URL of the image to fetch.
        num_retries: Maximum number of attempts before giving up (default 10).

    Returns:
        The resized RGB image produced by ``cv2.resize``.

    Raises:
        Exception: If every attempt failed; the last underlying error is
            attached as the cause.
    """
    last_error = None
    for _ in range(num_retries):
        try:
            with urlopen(img_url) as request:
                img_array = np.asarray(bytearray(request.read()), dtype=np.uint8)
            img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
            if img is None:
                # imdecode signals an undecodable payload by returning None;
                # fail with a clear message instead of an opaque cvtColor error.
                raise ValueError("Could not decode image data from " + str(img_url))
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = cv2.resize(img, (WIDTH, HEIGHT), interpolation = cv2.INTER_AREA)
            return img
        except Exception as e:
            print(e)
            last_error = e
    # Chain the last failure so the traceback shows why the download failed.
    raise Exception("Could not download image") from last_error

In [99]:
# Load the anime metadata table by calling get_anime_data() (runs a BigQuery query).
anime_data = get_anime_data()

In [100]:
# Display the (rows, columns) shape of the loaded table.
anime_data.shape

(3937, 8)

In [101]:
# Preview the first rows of the loaded table.
anime_data.head()

Unnamed: 0,anime_id,title,synopsis,main_pic,type,source_type,studios,genres
0,37268,Z/X: Code Reunion,The signing of a peace treaty has secured a te...,https://cdn.myanimelist.net/images/anime/1271/...,TV,Card game,[Passione],"[Fantasy, Sci-Fi, School, Shounen]"
1,37739,Future Card Shin Buddyfight,Set 23 years after the end of the original Fut...,https://cdn.myanimelist.net/images/anime/1367/...,TV,Card game,"[Xebec, OLM]",[Game]
2,19067,Future Card Buddyfight,An adaptation of the Future Card Buddyfight ca...,https://cdn.myanimelist.net/images/anime/9/787...,TV,Card game,"[Xebec, OLM]",[Game]
3,43735,Cue!,No synopsis information has been added to this...,https://cdn.myanimelist.net/images/anime/1516/...,TV,Game,"[Yumeta Company, Graphinica]",[Music]
4,46654,IDOLiSH7 3rd Season Part 2,Second cour of the third season of .,https://cdn.myanimelist.net/images/anime/1166/...,TV,Game,[TROYCA],[Music]


In [102]:
# Target image size (used by download_image when resizing) and the number of
# rows in the anime table (used to size the embedding tables).
HEIGHT, WIDTH = 192, 128
NUM_ANIMES = len(anime_data)

In [10]:
# Image encoder: MobileNetV2 with ImageNet weights, without the classification
# head, using global max pooling; preprocess_input is its matching preprocessing.
IMG_EMB_PREPROCESS = preprocess_input
IMG_EMB_MODEL = MobileNetV2(weights='imagenet', include_top=False, pooling = 'max')
print("Downloaded Image embeddinng model")


# Title encoder: BERT preprocessing layer plus a small BERT encoder loaded from
# TF Hub, with trainable=False on the encoder.
TITLE_EMB_PREPROCESS = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
TITLE_EMB_MODEL = hub.KerasLayer("https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/2", 
                                 trainable=False)
print("Downloaded Title embeddinng model")

# Synopsis encoder: same TF Hub handles as the title encoder, loaded as
# separate layer instances.
SYNOPSIS_EMB_PREPROCESS = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
SYNOPSIS_EMB_MODEL = hub.KerasLayer("https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/2", 
                                    trainable=False)
print("Downloaded Synopsis embeddinng model")

Downloaded Image embeddinng model
Downloaded Title embeddinng model
Downloaded Synopsis embeddinng model


# anime_id to integer lookup

In [103]:
anime_ids = anime_data['anime_id'].unique()
# The feature tables built later have one row per anime_data row, so every row
# must own a distinct vocabulary slot; fail early with a clear message otherwise.
assert len(anime_ids) == len(anime_data), "anime_data contains duplicate anime_id values"
anime_id_lookup = tf.keras.layers.StringLookup()
anime_id_lookup.adapt(anime_ids)
# Vocabulary index of each anime in anime_data row order, with a leading 0 for
# the out-of-vocabulary slot. This is a permutation of 0..len(anime_ids).
anime_vocab_index = tf.concat([[0], anime_id_lookup(anime_ids)], axis = 0)
# The feature cells build `table = tf.gather(padded_features, anime_emb_order)`,
# i.e. table[k] = padded_features[anime_emb_order[k]], where padded_features has
# a zero row followed by the features in anime_data row order. For
# table[vocab_index_of(anime)] to hold that anime's own features, the gather
# indices must be the INVERSE of anime_vocab_index; the argsort of a permutation
# is its inverse. Using anime_vocab_index directly would assign each anime the
# features of a different anime whenever the vocabulary order differs from the
# row order.
anime_emb_order = tf.argsort(anime_vocab_index)

# anime image embedding layer

In [111]:
import time

In [None]:
t0 = time.time()
# Number of images sent through the image model per forward pass. Encoding the
# whole catalogue in a single call keeps every intermediate activation for all
# images alive at once; batching bounds the peak memory.
IMAGE_EMB_BATCH_SIZE = 64
# Download every poster; download_image returns arrays of identical size.
anime_images = anime_data['main_pic'].apply(download_image)
# Stack from an explicit list so the result does not depend on how a pandas
# Series is interpreted by tf.stack.
anime_images = tf.stack(anime_images.tolist())
anime_images = tf.cast(anime_images, dtype = tf.float32)
anime_image_preprocess = IMG_EMB_PREPROCESS(anime_images)
# predict() runs inference batch by batch and returns a NumPy array.
anime_image_embeddings = IMG_EMB_MODEL.predict(
    anime_image_preprocess,
    batch_size = IMAGE_EMB_BATCH_SIZE,
    verbose = 0
)
# Row 0 is an all-zero vector; the rows are then rearranged with anime_emb_order.
anime_image_embeddings = tf.concat([tf.zeros((1, 1280)), anime_image_embeddings], axis = 0)
anime_image_embeddings = tf.gather(anime_image_embeddings, indices = anime_emb_order)
image_embedding_layer = tf.keras.layers.Embedding(
    NUM_ANIMES + 1,
    1280,
    weights=[anime_image_embeddings],
    trainable=True
)
# Elapsed wall-clock seconds for download + encoding.
print(time.time() - t0)

<urlopen error [Errno 54] Connection reset by peer>
<urlopen error [Errno 54] Connection reset by peer>
[Errno 54] Connection reset by peer


In [109]:
# Debugging check: select the entries of anime_images that are null.
# NOTE(review): the image-embedding cell rebinds anime_images to a tensor
# (tf.stack / tf.cast), while .isnull() is a pandas method — this cell presumably
# only works while anime_images is still the Series returned by
# .apply(download_image). Confirm whether this cell is still needed.
anime_images[anime_images.isnull()]

3000    None
3653    None
Name: main_pic, dtype: object

# anime title embedding layer

In [13]:
# Encode every title and keep the encoder's pooled output (128 values per title).
anime_title = anime_data['title']
anime_title_process = TITLE_EMB_PREPROCESS(anime_title)
anime_title_embeddings = TITLE_EMB_MODEL(anime_title_process)["pooled_output"]
# Put an all-zero row in front, then rearrange the rows with anime_emb_order.
anime_title_embeddings = tf.gather(
    tf.concat([tf.zeros((1, 128)), anime_title_embeddings], axis = 0),
    indices = anime_emb_order,
)
# Embedding table initialised from the encoded titles; it stays trainable.
title_embedding_layer = tf.keras.layers.Embedding(
    input_dim = NUM_ANIMES + 1,
    output_dim = 128,
    weights = [anime_title_embeddings],
    trainable = True,
)

# anime synopsis embedding layer

In [14]:
# Encode every synopsis and keep the encoder's pooled output (128 values each).
anime_synopsis = anime_data['synopsis']
anime_synopsis_process = SYNOPSIS_EMB_PREPROCESS(anime_synopsis)
anime_synopsis_embeddings = SYNOPSIS_EMB_MODEL(anime_synopsis_process)["pooled_output"]
# Put an all-zero row in front, then rearrange the rows with anime_emb_order.
anime_synopsis_embeddings = tf.gather(
    tf.concat([tf.zeros((1, 128)), anime_synopsis_embeddings], axis = 0),
    indices = anime_emb_order,
)
# Embedding table initialised from the encoded synopses; it stays trainable.
synopsis_embedding_layer = tf.keras.layers.Embedding(
    input_dim = NUM_ANIMES + 1,
    output_dim = 128,
    weights = [anime_synopsis_embeddings],
    trainable = True,
)

# anime type one-hot-encoder

In [26]:
# Learn the vocabulary of the "type" column and one-hot encode every row.
anime_type_one_hot_layer = tf.keras.layers.StringLookup(output_mode = "one_hot")
anime_type_one_hot_layer.adapt(anime_data["type"])
anime_type_one_hot_encodings = anime_type_one_hot_layer(list(anime_data["type"]))
# Width of the one-hot vectors, taken from the encoded result.
NUM_TYPE_ONE_HOT_DIMS = anime_type_one_hot_encodings.shape[1]
# Put an all-zero row in front, then rearrange the rows with anime_emb_order.
anime_type_one_hot_encodings = tf.gather(
    tf.concat([tf.zeros((1, NUM_TYPE_ONE_HOT_DIMS)), anime_type_one_hot_encodings], axis = 0),
    indices = anime_emb_order,
)
# Embedding table initialised from the one-hot rows; it stays trainable.
type_one_hot_layer = tf.keras.layers.Embedding(
    input_dim = NUM_ANIMES + 1,
    output_dim = NUM_TYPE_ONE_HOT_DIMS,
    weights = [anime_type_one_hot_encodings],
    trainable = True,
)

# anime source type one-hot-encoder

In [27]:
# Learn the vocabulary of the "source_type" column and one-hot encode every row.
anime_source_type_one_hot_layer = tf.keras.layers.StringLookup(output_mode = "one_hot")
anime_source_type_one_hot_layer.adapt(anime_data["source_type"])
anime_source_type_one_hot_encodings = anime_source_type_one_hot_layer(list(anime_data["source_type"]))
# Width of the one-hot vectors, taken from the encoded result.
NUM_SOURCE_TYPE_ONE_HOT_DIMS = anime_source_type_one_hot_encodings.shape[1]
# Put an all-zero row in front, then rearrange the rows with anime_emb_order.
anime_source_type_one_hot_encodings = tf.gather(
    tf.concat(
        [tf.zeros((1, NUM_SOURCE_TYPE_ONE_HOT_DIMS)), anime_source_type_one_hot_encodings],
        axis = 0,
    ),
    indices = anime_emb_order,
)
# Embedding table initialised from the one-hot rows; it stays trainable.
source_type_one_hot_layer = tf.keras.layers.Embedding(
    input_dim = NUM_ANIMES + 1,
    output_dim = NUM_SOURCE_TYPE_ONE_HOT_DIMS,
    weights = [anime_source_type_one_hot_encodings],
    trainable = True,
)

# anime studios multi-hot-encoder

In [28]:
def pad_list(list_entities, max_len, pad_token="[UNK]"):
    """Truncate or right-pad a sequence of strings to exactly ``max_len`` items.

    Args:
        list_entities: Sequence of entity names (NumPy array, list or tuple).
            ``None`` is treated as an empty sequence.
        max_len: Length of the returned array.
        pad_token: Filler appended when the input is shorter than ``max_len``.

    Returns:
        A 1-D NumPy object array of length ``max_len``: the first ``max_len``
        input items followed by as many ``pad_token`` entries as needed.
    """
    if list_entities is None:
        list_entities = []
    # Converting up front lets lists and tuples work, not only arrays with .shape.
    entities = np.asarray(list_entities, dtype=object)[:max_len]
    num_add = max_len - entities.shape[0]
    # Build the padding with the same dtype so a zero-length pad never drags an
    # empty float array into the concatenation.
    padding = np.asarray([pad_token] * num_add, dtype=object)
    return np.concatenate([entities, padding])

In [42]:
# Pad/truncate every studio list to 10 entries once and reuse the result for
# both fitting the vocabulary and encoding.
anime_studios_padded = list(anime_data["studios"].apply(lambda studios : pad_list(studios, 10)))
anime_studios_multi_hot_layer = tf.keras.layers.StringLookup(output_mode = "multi_hot")
anime_studios_multi_hot_layer.adapt(anime_studios_padded)
# Drop column 0 of the multi-hot output before measuring its width.
anime_studios_multi_hot_encodings = anime_studios_multi_hot_layer(anime_studios_padded)[:, 1:]
NUM_STUDIOS_MULTI_HOT_DIMS = anime_studios_multi_hot_encodings.shape[1]
# Put an all-zero row in front, then rearrange the rows with anime_emb_order.
anime_studios_multi_hot_encodings = tf.gather(
    tf.concat(
        [tf.zeros((1, NUM_STUDIOS_MULTI_HOT_DIMS)), anime_studios_multi_hot_encodings],
        axis = 0,
    ),
    indices = anime_emb_order,
)
# Embedding table initialised from the multi-hot rows; it stays trainable.
studios_one_hot_layer = tf.keras.layers.Embedding(
    input_dim = NUM_ANIMES + 1,
    output_dim = NUM_STUDIOS_MULTI_HOT_DIMS,
    weights = [anime_studios_multi_hot_encodings],
    trainable = True,
)

# anime genres multi-hot-encoder

In [43]:
# Pad/truncate every genre list to 20 entries once and reuse the result for
# both fitting the vocabulary and encoding.
anime_genres_padded = list(anime_data["genres"].apply(lambda genres : pad_list(genres, 20)))
anime_genres_multi_hot_layer = tf.keras.layers.StringLookup(output_mode = "multi_hot")
anime_genres_multi_hot_layer.adapt(anime_genres_padded)
# Drop column 0 of the multi-hot output before measuring its width.
anime_genres_multi_hot_encodings = anime_genres_multi_hot_layer(anime_genres_padded)[:, 1:]
NUM_GENRES_MULTI_HOT_DIMS = anime_genres_multi_hot_encodings.shape[1]
# Put an all-zero row in front, then rearrange the rows with anime_emb_order.
anime_genres_multi_hot_encodings = tf.gather(
    tf.concat(
        [tf.zeros((1, NUM_GENRES_MULTI_HOT_DIMS)), anime_genres_multi_hot_encodings],
        axis = 0,
    ),
    indices = anime_emb_order,
)
# Embedding table initialised from the multi-hot rows; it stays trainable.
genres_one_hot_layer = tf.keras.layers.Embedding(
    input_dim = NUM_ANIMES + 1,
    output_dim = NUM_GENRES_MULTI_HOT_DIMS,
    weights = [anime_genres_multi_hot_encodings],
    trainable = True,
)

# Keras siamese model

In [60]:
class AnimeModel(tf.keras.Model):
    """Maps anime id strings to a 128-dimensional anime embedding.

    The ids are converted to integer indices with the module-level
    ``anime_id_lookup``. Each feature table defined at module level (image,
    title, synopsis, type, source type, studios, genres) is looked up with
    those indices and projected by its own Dense layer; the projections are
    concatenated and passed through a final Dense(128) layer.
    """

    def __init__(self):
        super().__init__()

        def tower(feature_table, units):
            # A per-anime feature lookup followed by a Dense projection.
            return tf.keras.Sequential([feature_table, tf.keras.layers.Dense(units)])

        self.anime_id_lookup_layer = anime_id_lookup

        self.image_embedding_layer = tower(image_embedding_layer, 64)
        self.title_embedding_layer = tower(title_embedding_layer, 64)
        self.synopsis_embedding_layer = tower(synopsis_embedding_layer, 64)
        self.type_embedding_layer = tower(type_one_hot_layer, 2)
        self.source_type_embedding_layer = tower(source_type_one_hot_layer, 2)
        self.studios_embedding_layer = tower(studios_one_hot_layer, 8)
        self.genres_embedding_layer = tower(genres_one_hot_layer, 16)

        self.anime_embedding_layer = tf.keras.layers.Dense(128)

    def call(self, inputs):
        """Return the final embedding for a batch of anime id strings."""
        anime_ids = self.anime_id_lookup_layer(inputs)
        # The order here fixes the layout of the concatenated feature vector.
        towers = (
            self.image_embedding_layer,
            self.title_embedding_layer,
            self.synopsis_embedding_layer,
            self.type_embedding_layer,
            self.source_type_embedding_layer,
            self.studios_embedding_layer,
            self.genres_embedding_layer,
        )
        features = [feature_tower(anime_ids) for feature_tower in towers]
        return self.anime_embedding_layer(tf.concat(features, axis = 1))

In [61]:
model = AnimeModel()

In [62]:
# Smoke test: run the model on a single anime id string and display the result.
model(["1268"])

<tf.Tensor: shape=(1, 128), dtype=float32, numpy=
array([[-6.3418555 , -3.9214664 ,  2.6112008 , -7.9402766 ,  2.8465486 ,
         4.5414524 ,  0.12666081, -1.381447  , -3.4304955 , -5.509757  ,
        -0.8354233 , -2.6516316 , -0.13625926,  4.562217  , -2.8226378 ,
         2.8903322 ,  4.2472    ,  0.7027546 , -5.3793845 , -1.2955028 ,
        -1.9881753 ,  4.3203535 , -1.326868  ,  3.007033  ,  4.6753716 ,
        -3.7106133 , -1.0980685 , -1.1932652 ,  1.0654194 , -3.0756147 ,
         3.4580252 ,  4.093649  ,  0.865989  ,  7.6361113 ,  0.3156387 ,
         5.1992064 , -1.2044413 ,  0.02368034,  5.3396263 , -2.6670299 ,
        -4.9677844 , -0.8674688 ,  4.163064  ,  2.1001394 , -5.3449316 ,
         2.4634497 ,  2.182181  ,  2.9887006 ,  3.44288   , -6.2953    ,
        -4.401819  ,  3.9091861 , -4.80464   , -2.8993669 ,  7.5786066 ,
         2.2872233 , -1.7603097 ,  3.468829  ,  0.1263675 , -0.89772534,
         0.6013544 ,  0.70019233,  0.63990295, -3.6379678 , -0.3717012 ,
 

# Siamese data prep

In [63]:
def get_triplets_data():
    """Build (anchor, positive, negative) training triplets in BigQuery.

    The query reads ``anime-rec-dev.prod_area_us.anime_anime`` and:
      * takes pairs with ``recommendation = 1 OR related = 1`` as positives;
      * takes unrelated pairs with low genre/club IOU as easy negatives and
        unrelated pairs with mid-range genre/club IOU as hard negatives;
      * joins positives to negatives on the anchor (``animeA``) and keeps, per
        (anchor, positive) pair, up to 5 easy and up to 10 hard negatives
        chosen by ``ORDER BY RAND()``.

    Because of ``RAND()``, repeated calls can return different triplets.

    Returns:
        The query result converted with ``to_dataframe()``, with columns
        anchor, positive and negative.
    """
    query = """
        WITH 
        positives AS (
            SELECT animeA, animeB
            FROM `anime-rec-dev.prod_area_us.anime_anime`
            WHERE recommendation = 1 OR related = 1
        ),
        easy_negatives AS (
            SELECT animeA, animeB
            FROM `anime-rec-dev.prod_area_us.anime_anime`
            WHERE recommendation = 0 AND related = 0 AND genre_IOU < 0.1 AND club_IOU < 0.1
        ),
        hard_negatives AS (
            SELECT animeA, animeB
            FROM `anime-rec-dev.prod_area_us.anime_anime`
            WHERE recommendation = 0 AND related = 0 AND genre_IOU > 0.3 AND genre_IOU < 0.6 AND club_IOU > 0.3 AND club_IOU < 0.6
        ),
        easy_triplets AS (
            SELECT A.animeA AS anchor, A.animeB AS positive, B.animeB AS negative, ROW_NUMBER() OVER (PARTITION BY A.animeA, A.animeB ORDER BY RAND()) AS row_number
            FROM positives A
            LEFT JOIN easy_negatives B
            ON A.animeA = B.animeA
            WHERE B.animeB IS NOT NULL
        ),
        hard_triplets AS (
            SELECT A.animeA AS anchor, A.animeB AS positive, B.animeB AS negative, ROW_NUMBER() OVER (PARTITION BY A.animeA, A.animeB ORDER BY RAND()) AS row_number
            FROM positives A
            LEFT JOIN hard_negatives B
            ON A.animeA = B.animeA
            WHERE B.animeB IS NOT NULL
        )
        SELECT anchor, positive, negative FROM easy_triplets WHERE row_number <= 5
        UNION DISTINCT 
        SELECT anchor, positive, negative FROM hard_triplets WHERE row_number <= 10
    """
    client = bigquery.Client(project="anime-rec-dev")
    # No dataset reference or custom job config is needed: the tables are fully
    # qualified in the SQL and the default job configuration is used.
    return client.query(query).to_dataframe()

In [83]:
class DistanceLayer(tf.keras.layers.Layer):
    """Computes squared Euclidean distances from the anchor to the other two inputs."""

    def __init__(self):
        super().__init__()

    def call(self, anchor, positive, negative):
        """Return ``(anchor-positive, anchor-negative)`` squared distances.

        Each distance is the sum of squared differences over the last axis.
        """
        def squared_distance(other):
            return tf.reduce_sum(tf.square(anchor - other), -1)

        return (squared_distance(positive), squared_distance(negative))

In [84]:
# One scalar string input per triplet member, named after its role.
anchor_input, positive_input, negative_input = (
    tf.keras.layers.Input(name = role, shape = (), dtype = "string")
    for role in ("anchor", "positive", "negative")
)
# The same `model` instance embeds all three inputs; DistanceLayer then turns
# the embeddings into (anchor-positive, anchor-negative) distances.
distances = DistanceLayer()(
    *[model(role_input) for role_input in (anchor_input, positive_input, negative_input)]
)
siamese_network = tf.keras.Model(
    inputs = [anchor_input, positive_input, negative_input],
    outputs = distances,
)





















































































In [86]:
# Smoke test: the anchor and positive are the same id, the negative is a different id.
siamese_network({"anchor" : tf.constant(["1268"]), 
                 "positive" : tf.constant(["1268"]), 
                 "negative" : tf.constant(["31280"])})

(<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1172.6482], dtype=float32)>)

In [94]:
class SiameseModel(tf.keras.Model):
    """Trains the module-level ``siamese_network`` with a triplet margin loss.

    The wrapped network returns the (anchor-positive, anchor-negative)
    distances; the per-triplet loss is ``max(ap - an + margin, 0)``. The mean
    loss is tracked in ``loss_tracker`` and reported as ``"loss"``.

    Args:
        margin: Margin added to ``ap - an`` before clamping at zero
            (default 0.5).
    """

    def __init__(self, margin=0.5):
        super().__init__()
        self.siamese_network = siamese_network
        self.margin = margin
        self.loss_tracker = tf.keras.metrics.Mean(name = 'loss')

    def call(self, inputs):
        """Forward ``inputs`` to the wrapped siamese network."""
        # Fixed: this used to read `self.network`, an attribute that is never
        # assigned, so calling the model raised AttributeError.
        return self.siamese_network(inputs)

    def train_step(self, data):
        """Run one optimisation step and return the running mean loss."""
        with tf.GradientTape() as tape:
            loss = self._compute_loss(data)

        # Only the wrapped network owns trainable weights.
        gradients = tape.gradient(loss, self.siamese_network.trainable_weights)

        self.optimizer.apply_gradients(
            zip(gradients, self.siamese_network.trainable_weights)
        )

        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

    def test_step(self, data):
        """Evaluate the loss on one batch and return the running mean loss."""
        loss = self._compute_loss(data)

        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

    def _compute_loss(self, data):
        """Return the per-triplet margin loss for one batch."""
        # NOTE(review): depending on the Keras version, inputs passed to fit()
        # without targets may reach train_step wrapped in a 1-tuple. Unwrap it
        # so a dict of inputs is handed to the network as a dict — confirm
        # against the installed Keras version.
        if isinstance(data, (tuple, list)) and len(data) == 1:
            data = data[0]
        ap_distance, an_distance = self.siamese_network(data)

        return tf.maximum(ap_distance - an_distance + self.margin, 0.0)

    @property
    def metrics(self):
        # Listing the tracker here lets Keras reset it between epochs.
        return [self.loss_tracker]

In [64]:
# Fetch the training triplets by calling get_triplets_data() (runs a BigQuery query).
triplets = get_triplets_data()

In [65]:
# Display the (rows, columns) shape of the triplet table.
triplets.shape

(188701, 3)

In [95]:
# Wrap the siamese network in the custom training model. Only an optimizer is
# given to compile() because SiameseModel computes its own loss.
siamese_model = SiameseModel()
siamese_model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 0.0001))

In [96]:
# Split the triplet table into its three id columns.
anchor, positive, negative = (
    triplets[column] for column in ("anchor", "positive", "negative")
)

In [97]:
# Train for 5 epochs; the inputs are keyed by the names of the network's Input layers.
siamese_model.fit(
    x = {"anchor" : anchor, "positive" : positive, "negative" : negative},
    epochs = 5,
)

Epoch 1/5








































































































































































Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f972b9056d8>