<a href="https://colab.research.google.com/github/yvanminyem/whyzzerRec/blob/main/WhyzzerRec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [79]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

Import the required packages.

In [80]:
import os
import tempfile
import pprint
from google.colab import files

%matplotlib inline                                        
import matplotlib.pyplot as plt  
                         
from typing import Dict, Text                                                          
import numpy as np                                        
import tensorflow as tf                                   
import keras                                               
import tensorflow_datasets as tfds      
import pandas as pd                  
                                                          
import tensorflow_recommenders as tfrs                    
                                                          
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so pick whichever spelling this
# installation provides (falling back to the default style if neither exists).
plt.style.use(next(
    (style for style in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if style in plt.style.available),
    'default'))

Access BigQuery data from Colab: upload a credentials JSON file and point `GOOGLE_APPLICATION_CREDENTIALS` at it.

In [81]:
# Ask the user for the credentials JSON file. The result is used below as a
# mapping from each uploaded file name to that file's raw bytes.
uploaded = files.upload()

# Fail early with a clear message: without an upload the environment variable
# below is never set and the BigQuery cells would fail later on instead.
if not uploaded:
  raise RuntimeError('No credentials file was uploaded.')

# NOTE: if several files are uploaded, the last one processed wins.
for fn, content in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(content)))
  credentials_path = '/content/' + fn
  with open(credentials_path, 'wb') as f:
    f.write(content)
  # Point the credentials environment variable at the file just written.
  os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path
  # Everything before the last '-' of the file name
  # (presumably the project name — verify against the key file).
  projectID = fn.rsplit("-", 1)[0]

Saving whyzzerrecommender-dc1a91d72a11.json to whyzzerrecommender-dc1a91d72a11 (1).json
User uploaded file "whyzzerrecommender-dc1a91d72a11.json" with length 2323 bytes


Enable the BigQuery IPython magic.

In [82]:
# Load (or reload, if already loaded) the extension that provides the
# %%bigquery cell magic used in the query cells of this notebook.
%reload_ext google.cloud.bigquery

Load every column of the `whyzzer.kaggledata` table into a DataFrame.

In [83]:
%%bigquery analytics_test_import
-- Fetch every column of the table; the result is stored in the
-- `analytics_test_import` variable.
SELECT
    *
FROM `whyzzer.kaggledata`


In [None]:
# Show the DataFrame returned by the query above.
analytics_test_import

In [85]:
# List the column names available in the full table.
analytics_test_import.columns

Index(['position', 'channel_id', 'channel_title', 'video_id', 'published_at',
       'video_title', 'video_description', 'video_category_id',
       'video_category_label', 'duration', 'duration_sec', 'dimension',
       'definition', 'caption', 'licensed_content', 'view_count', 'like_count',
       'dislike_count', 'favorite_count', 'comment_count'],
      dtype='object')

Select just the columns we need and store the result in a single DataFrame.

In [86]:
%%bigquery analytics_data_real
-- Only the two columns used as model features; the result is stored in the
-- `analytics_data_real` variable.
SELECT
    video_title,position
FROM `whyzzer.kaggledata`

In [None]:
# Preview the first rows of the two-column query result.
analytics_data_real.head()

Unnamed: 0,video_title,position
0,Getting Curious: What it takes to build a TPU ...,4
1,How to use Kernels and Forums to Win Kaggle Co...,34
2,Kaggle Live Coding: Fuzzy Matching for Spellin...,40
3,Kaggle Reading Group: On NMT Search Errors and...,44
4,Kaggle Live Coding: Automating report generati...,63


In [87]:
# `analytics` is a second name for the same DataFrame object (no copy is
# made), so in-place changes made through either name affect both.
analytics = analytics_data_real

Rename the columns to match the feature names the model expects.

In [88]:
# Rename to the feature names used by the model cells of this notebook.
# Assigning the result (instead of `inplace=True`) leaves `analytics_data_real`
# untouched, so the raw query result keeps its original column names, and
# re-running this cell is harmless.
analytics = analytics.rename(
    columns={'video_title': 'movie_title', 'position': 'user_id'})


In [None]:
# Preview the first rows after the rename.
analytics.head()

Unnamed: 0,movie_title,user_id
0,Getting Curious: What it takes to build a TPU ...,4
1,How to use Kernels and Forums to Win Kaggle Co...,34
2,Kaggle Live Coding: Fuzzy Matching for Spellin...,40
3,Kaggle Reading Group: On NMT Search Errors and...,44
4,Kaggle Live Coding: Automating report generati...,63


Now we load our saved MovieLens model.

In [120]:
# Location of a SavedModel directory under Google Drive.
# NOTE(review): no cell in this notebook mounts Drive, so this path presumably
# relies on Drive having been mounted by other means — confirm before Run All.
MODEL_PATH ='/content/drive/MyDrive/Colab Notebooks/1630417522'
# NOTE(review): the name `model` is rebound later, when MovielensModel is
# instantiated; the object loaded here is not passed to any later cell.
model = tf.saved_model.load(MODEL_PATH )

In [None]:
# Inspect the trainable variables exposed by the loaded SavedModel.
model.trainable_variables


##### Copyright 2020 The TensorFlow Authors.

In [None]:
# Show the DataFrame that is converted into a tf.data.Dataset further down.
analytics

In [None]:
#ratings = analytics[['movie_title', 'user_id']]

#movies = analytics[['movie_title']]


In [None]:
#tf.random.set_seed(42)
#shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

#train = shuffled.take(80_000)
#test = shuffled.skip(80_000).take(20_000)

Convert the DataFrame into a `tf.data.Dataset`.

In [93]:
# dict(analytics) maps each column name to its column, so every dataset
# element is a dict holding one scalar per column.
dataset = tf.data.Dataset.from_tensor_slices(dict(analytics))

In [None]:
# Each element is a dict keyed by column name. Unpacking it directly
# (`for user_id, movie_title in dataset`) binds the two *key names* rather
# than the values, so index by key instead. Only a handful of rows are shown
# to avoid printing the whole dataset.
for row in dataset.take(5).as_numpy_iterator():
    print(f'user_id:{row["user_id"]} movie_title:{row["movie_title"]}')

In [94]:
# Print the dataset summary: field names with their shapes and dtypes.
print(dataset)

<TensorSliceDataset shapes: {movie_title: (), user_id: ()}, types: {movie_title: tf.string, user_id: tf.int64}>


In [95]:
# Interaction records: keep exactly the two features the model consumes.
ratings = dataset.map(
    lambda row: {key: row[key] for key in ("movie_title", "user_id")})

# Candidate corpus: the bare title of every row.
movies = dataset.map(lambda row: row["movie_title"])

In [None]:
# Pretty-print a single example, converted to NumPy values, as a sanity check.
for x in ratings.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'movie_title': b'Getting Curious: What it takes to build a TPU | Kaggle',
 'user_id': 4}


In [116]:
# Batch the data so the values can be gathered from a few large arrays
# rather than one element per row.
movie_titles = movies.batch(1_000)
user_ids = ratings.batch(1_000_000).map(lambda x: x["user_id"])

unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

# The user tower looks ids up with a StringLookup layer and is queried with
# string ids, so the integer ids need a string form for its vocabulary.
# This name is read by the user-tower cell; leaving the conversion commented
# out made that cell fail with NameError on a fresh kernel.
new_unique_user_ids = tf.strings.as_string(unique_user_ids).numpy()

unique_user_ids[:10]


array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

### The query tower

Let's start with the query tower.

The first step is to decide on the dimensionality of the query and candidate representations:

In [97]:
# Output width shared by the user and movie embedding layers.
embedding_dimension = 32

In [117]:
# Query tower: user id string -> vocabulary index -> embedding vector.
user_model = tf.keras.Sequential()
user_model.add(
    tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=new_unique_user_ids, mask_token=None))
# The embedding table gets one extra row to account for unknown tokens.
user_model.add(
    tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension))

A simple model like this corresponds exactly to a classic [matrix factorization](https://ieeexplore.ieee.org/abstract/document/4781121) approach. While defining a subclass of `tf.keras.Model` for this simple model might be overkill, we can easily extend it to an arbitrarily complex model using standard Keras components, as long as we return an `embedding_dimension`-wide output at the end.

### The candidate tower

We can do the same with the candidate tower.

In [118]:
# Candidate tower: title string -> vocabulary index -> embedding vector.
movie_model = tf.keras.Sequential()
movie_model.add(
    tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=unique_movie_titles, mask_token=None))
# The embedding table has one row more than there are known titles.
movie_model.add(
    tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension))

In [100]:
# Top-K retrieval metrics; the candidates are the embeddings of every movie,
# computed by the candidate tower in batches of 128.
metrics = tfrs.metrics.FactorizedTopK(
  candidates=movies.batch(128).map(movie_model)
)

In [101]:
# Retrieval task configured with the FactorizedTopK metrics object.
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

The task itself is a Keras layer that takes the query and candidate embeddings as arguments, and returns the computed loss: we'll use that to implement the model's training loop.

In [102]:
class MovielensModel(tfrs.Model):
  """Two-tower retrieval model: a user tower, a movie tower and a task."""

  def __init__(self, user_model, movie_model):
    """Stores the two towers and the retrieval task.

    Args:
      user_model: model applied to the "user_id" feature.
      movie_model: model applied to the "movie_title" feature.
    """
    super().__init__()
    self.movie_model: tf.keras.Model = movie_model
    self.user_model: tf.keras.Model = user_model
    # `task` is not a constructor argument; it is read from the enclosing
    # (notebook-level) scope at construction time.
    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Runs both towers on a batch and returns the task's result.

    Args:
      features: dict with "user_id" and "movie_title" entries.
      training: accepted for interface compatibility; not used here.

    Returns:
      Whatever the task returns for the (user, movie) embedding pair.
    """
    # Query side first, then the positive candidates for those queries.
    query_embeddings = self.user_model(features["user_id"])
    candidate_embeddings = self.movie_model(features["movie_title"])

    # The task computes the loss and the metrics.
    return self.task(query_embeddings, candidate_embeddings)

In [122]:
# Build the two-tower model. This rebinds `model`, replacing the SavedModel
# object loaded earlier under the same name.
model = MovielensModel(user_model, movie_model)


In [104]:
# Configure the optimizer: Adagrad with a learning rate of 0.1.
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

Then shuffle, batch, and cache the training and evaluation data.

In [None]:
#cached_train = train.shuffle(100_000).batch(8192).cache()
#cached_test = test.batch(4096).cache()

Then train the  model:

In [None]:
#model.fit(cached_train, epochs=3)

**We don't need to train: we have to retrieve the weights from the MovieLens model.**

Finally, we can evaluate our model on the test set:

In [None]:
#model.evaluate(cached_test, return_dict=True)

## Making predictions

Now that we have a model, we would like to be able to make predictions. We can use the `tfrs.layers.factorized_top_k.BruteForce` layer to do this.

In [123]:
# Create a model that takes in raw query features, and
# Build a brute-force retrieval layer whose query side is the user tower.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
# Register every movie as a candidate: pairs of (titles, title embeddings),
# both produced from the same batches of 100.
index.index_from_dataset(
  tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(model.movie_model)))
)

# Get recommendations.
# NOTE(review): no cell in this notebook trains `model` or loads weights into
# it (the fit and load_weights calls are commented out), so these results
# presumably come from freshly initialised embeddings — confirm.
_, titles = index(tf.constant(["42"]))
print(f"Recommendations for user 42: {titles[0, :3]}")

Recommendations for user 42: [b'Kaggle Live-Coding: Code Reviews! Class imbalanced in Python | Kaggle'
 b'How to download data files from Kaggle Notebooks | Kaggle'
 b'Kaggle Days Tokyo: Highlights | Kaggle Days']


In [None]:
# index.load_weights('/content/drive/MyDrive/Colab Notebooks/1630417522/checkpoint')

Of course, the `BruteForce` layer is going to be too slow to serve a model with many possible candidates. The following sections show how to speed this up by using an approximate retrieval index.

In [126]:
import time

# Use the current Unix time (whole seconds) as the export directory name.
t = time.time()

export_path_sm = "./{}".format(int(t))
print(export_path_sm)


# Save the retrieval index (not the MovielensModel itself) as a SavedModel.
tf.saved_model.save(index, export_path_sm)

./1631268957




INFO:tensorflow:Assets written to: ./1631268957/assets


INFO:tensorflow:Assets written to: ./1631268957/assets


In [127]:
# Archive the export directory; {export_path_sm} is filled in from the
# Python variable of that name.
!zip -r model.zip {export_path_sm}

  adding: 1631268957/ (stored 0%)
  adding: 1631268957/assets/ (stored 0%)
  adding: 1631268957/saved_model.pb (deflated 87%)
  adding: 1631268957/variables/ (stored 0%)
  adding: 1631268957/variables/variables.data-00000-of-00001 (deflated 22%)
  adding: 1631268957/variables/variables.index (deflated 33%)


In [128]:
# Offer the archive as a browser download.
try:
  from google.colab import files
  files.download('./model.zip')
except ImportError:
  # google.colab is unavailable (presumably not running on Colab): skip.
  pass

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from datetime import datetime

In [None]:
# NOTE(review): `analytics` comes from a query that selects only two columns
# (video_title, position), so positional column 2 is out of bounds and this
# line is expected to raise IndexError. The intended column cannot be
# determined from this notebook — confirm and select it by name.
dt = analytics.iloc[:,2]


In [None]:
# First column of `analytics` as a pandas Series.
# NOTE(review): this rebinds `movies`, which earlier cells use as a
# tf.data.Dataset (e.g. `movies.batch(...)`); re-running those cells after
# this one will fail. Consider a distinct name.
movies=analytics.iloc[:,0]

In [None]:
# Preview the first entries of the Series created in the previous cell.
movies.head()