<a href="https://colab.research.google.com/github/wesleyklhk/tensorflow_notes/blob/main/Listwise_Toy_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets
!pip install -q tensorflow-ranking

[K     |████████████████████████████████| 88 kB 3.3 MB/s 
[K     |████████████████████████████████| 511.7 MB 5.8 kB/s 
[K     |████████████████████████████████| 438 kB 72.1 MB/s 
[K     |████████████████████████████████| 5.8 MB 41.2 MB/s 
[K     |████████████████████████████████| 1.6 MB 48.8 MB/s 
[K     |████████████████████████████████| 4.3 MB 4.2 MB/s 
[K     |████████████████████████████████| 141 kB 4.1 MB/s 
[?25h

#Packages Version

In [None]:
!pip freeze

absl-py==1.2.0
aiohttp==3.8.1
aiosignal==1.2.0
alabaster==0.7.12
albumentations==0.1.12
altair==4.2.0
appdirs==1.4.4
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
arviz==0.12.1
astor==0.8.1
astropy==4.3.1
astunparse==1.6.3
async-timeout==4.0.2
asynctest==0.13.0
atari-py==0.2.9
atomicwrites==1.4.1
attrs==21.4.0
audioread==2.1.9
autograd==1.4
Babel==2.10.3
backcall==0.2.0
beautifulsoup4==4.6.3
bleach==5.0.1
blis==0.7.8
bokeh==2.3.3
branca==0.5.0
bs4==0.0.1
CacheControl==0.12.11
cached-property==1.5.2
cachetools==4.2.4
catalogue==2.0.8
certifi==2022.6.15
cffi==1.15.1
cftime==1.6.1
chardet==3.0.4
charset-normalizer==2.1.0
click==7.1.2
clikit==0.6.2
cloudpickle==1.3.0
cmake==3.22.5
cmdstanpy==1.0.4
colorcet==3.0.0
colorlover==0.3.0
community==1.0.0b1
contextlib2==0.5.5
convertdate==2.4.0
coverage==3.7.1
coveralls==0.5
crashtest==0.3.1
crcmod==1.7
cufflinks==0.17.3
cvxopt==1.2.7
cvxpy==1.0.31
cycler==0.11.0
cymem==2.0.6
Cython==0.29.30
daft==0.0.4
dask==2.12.0
datascience==0.10.6
debugpy=

# Import Packages

In [120]:
import tensorflow as tf
import pprint

import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs
from tensorflow_ranking import utils
tf.random.set_seed(1123)


# Generate Synthetic Dataset

Consider having a dataset with the following structure:
<table>
  <thead>
    <tr>
      <td>x0</td>
      <td>x1</td>
      <td>...</td>
      <td>x9</td>
    </tr>
  </thead>
      <tr>
        <td>0.11</td>
        <td>0.56</td>
        <td>...</td>
        <td>0.39</td>
    </tr>
</table>
where
<br>x0,x1 is related to Product A
<br>x2,x3 is related to Product B
<br>x4,x5 is related to Product C
<br>x6,x7 is related to Product D
<br>x8,x9 is related to Product E

let $ tanh((x_{i} - x_{i-1})^{2}) $ be the product score function<br>
E.g. <br>
Product A Score Function = $ tanh((x_{1} - x_{0})^{2}) $<br>
Now that we have defined an arbitrary score function for each product. We now use this to generate the order of preference. <br>
E.g.<br>
<table>
  <thead>
    <tr>
      <td>A</td>
      <td>B</td>
      <td>C</td>
      <td>D</td>
      <td>E</td>
    </tr>
  </thead>
      <tr>
        <td>0.81</td>
        <td>0.17</td>
        <td>0.62</td>
        <td>0.64</td>
        <td>0.25</td>
      </tr>
      <tr>
        <td>...</td>
        <td>...</td>
        <td>...</td>
        <td>...</td>
        <td>...</td>
      </tr>    
</table>

Row 1 Order of Preference would be: A>D>C>E>B

In [3]:
Xtrain_df = pd.DataFrame(np.random.rand(10000,10),columns=[f'x{i}' for i in range(10)])
Xtrain_df.index.name = 'user_id'
Xval_df = pd.DataFrame(np.random.rand(10000,10),columns=[f'x{i}' for i in range(10)])
Xval_df.index.name = 'user_id'
Xtrain_df.head()

Unnamed: 0_level_0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.854716,0.690092,0.029551,0.202737,0.635251,0.227461,0.638354,0.112027,0.75262,0.238631
1,0.369388,0.126473,0.742065,0.789462,0.27354,0.765324,0.607484,0.012529,0.638205,0.069444
2,0.660032,0.477653,0.29319,0.602123,0.664522,0.612226,0.361328,0.124869,0.228796,0.300909
3,0.061146,0.72145,0.140084,0.043767,0.088932,0.254212,0.728351,0.026874,0.05614,0.311474
4,0.612052,0.964349,0.136497,0.877306,0.851079,0.254706,0.214313,0.576485,0.115037,0.381863


In [4]:
def product_score_func(_df):
  _sc = pd.DataFrame({
    p:np.tanh((_df[f'x{i+1}'] - _df[f'x{i}'])**2) 
    for i,p in enumerate(['A','B','C','D','E'])      
  })
  _sc.index = _df.index
  return _sc
Xtrain_score_df = product_score_func(Xtrain_df)
Xval_score_df = product_score_func(Xval_df)    

Xtrain_score_df.head()

Unnamed: 0_level_0,A,B,C,D,E
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.027095,0.410585,0.029984,0.184917,0.164777
1,0.058939,0.361799,0.002246,0.260063,0.237244
2,0.03325,0.034013,0.09515,0.003894,0.002735
3,0.410324,0.325678,0.009277,0.00204,0.027311
4,0.12348,0.59498,0.499619,0.000688,0.341386


In [5]:
def rank_score(_df):
  return _df.rank(method='first',ascending=True,axis=1)
train_rank_df = rank_score(Xtrain_score_df)
val_rank_df = rank_score(Xval_score_df)
train_rank_df.head()

Unnamed: 0_level_0,A,B,C,D,E
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1.0,5.0,2.0,4.0,3.0
1,2.0,5.0,1.0,4.0,3.0
2,3.0,4.0,5.0,2.0,1.0
3,5.0,4.0,2.0,1.0,3.0
4,2.0,5.0,4.0,1.0,3.0


In [6]:
def rank_v2(_df):
  _min_rank = int(_df.min().min())
  _max_rank = int(_df.max().max())

  _res = pd.DataFrame({f'rank{i}':np.select(
      condlist=[(_df==i).sum(axis=1)==1],
      choicelist=[(_df==i).idxmax(axis=1)],
      default=np.NaN
  ) for i in range(_min_rank,_max_rank+1)})
  _res.index = _df.index
  return _res

train_rank2_df = rank_v2(train_rank_df)
val_rank2_df = rank_v2(val_rank_df)
train_rank2_df.head()

Unnamed: 0_level_0,rank1,rank2,rank3,rank4,rank5
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,A,C,E,D,B
1,C,A,E,D,B
2,E,D,A,B,C
3,D,C,E,B,A
4,D,A,E,C,B


In [171]:
def denom(_df):
  _res = pd.concat([
      _df[[col]].rename(columns={col:'product'}).assign(_rank=col) 
      for col in _df.columns
  ])
  _res['_rank'] = _res._rank.str.replace('rank','').astype('float')
  return _res
denom_train_rank_df = denom(train_rank2_df).reset_index().sort_values(['user_id','product']).set_index('user_id')#.sort_index()
denom_val_rank_df = denom(val_rank2_df).reset_index().sort_values(['user_id','product']).set_index('user_id')#.sort_index()
denom_train_rank_df.head(10)

Unnamed: 0_level_0,product,_rank
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,A,1.0
0,B,5.0
0,C,2.0
0,D,4.0
0,E,3.0
1,A,2.0
1,B,5.0
1,C,1.0
1,D,4.0
1,E,3.0


In [172]:
Xtrain_df.sort_index(inplace=True)
Xval_df.sort_index(inplace=True)

Up to this point, we have prepared 2 major types of dataset:
1. Xtrain_df (which contains the numeric features) <strong>[1 row = 1 user_id = 1 observation]</strong>
2. denom_train_rank_df (this dataset contains the rank across products for each user_id) <strong>[5 rows = 1 user_id = 1 observation]</strong>

Note: type1 has fewer rows than type2<br>
We need to somehow group every 5 rows in type2 into meaning 1 observation. To do this, we can rely on tensorflow dataset magic

In [173]:
from pprint import pprint
######tensorflow dataset magic groups every 5 rows into a single observation
tfd_denom_train_rank_df = tf.data.Dataset.from_tensor_slices(dict(denom_train_rank_df.reset_index('user_id'))).batch(5)
tfd_denom_val_rank_df = tf.data.Dataset.from_tensor_slices(dict(denom_val_rank_df.reset_index('user_id'))).batch(5)

####sanity checking only 3 observations
for ind,i in enumerate(tfd_denom_train_rank_df.take(3)):
  print('*'*10 + f'Observation {1+ind} [START]' + '*'*10)
  pprint(i)
  print('*'*10 + f'Observation {1+ind} [END]' + '*'*10)


**********Observation 1 [START]**********
{'_rank': <tf.Tensor: shape=(5,), dtype=float64, numpy=array([1., 5., 2., 4., 3.])>,
 'product': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'A', b'B', b'C', b'D', b'E'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(5,), dtype=int64, numpy=array([0, 0, 0, 0, 0])>}
**********Observation 1 [END]**********
**********Observation 2 [START]**********
{'_rank': <tf.Tensor: shape=(5,), dtype=float64, numpy=array([2., 5., 1., 4., 3.])>,
 'product': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'A', b'B', b'C', b'D', b'E'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 1, 1, 1, 1])>}
**********Observation 2 [END]**********
**********Observation 3 [START]**********
{'_rank': <tf.Tensor: shape=(5,), dtype=float64, numpy=array([3., 4., 5., 2., 1.])>,
 'product': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'A', b'B', b'C', b'D', b'E'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(5,), dtype=int64, 

For type1, we dont have much to change. We just need to convert to tensorflow dataset. However, there are 2 ways to convert dataframe into tensorflow dataset:
1. tf.data.Dataset.from_tensor_slices(dict(Xtrain_df))
2. tf.data.Dataset.from_tensor_slices(Xtrain_df.values)

If we use method1, then we will have a dictionary structured tensorflow dataset into our model which means each element is a dictionary of tensors. We will have to refer to each column by its column. Hence, it is useful when we need to have some differential treatment to each column. <br>

If we use method2, then each element will be a normal tensor. This gets useful when we need just the same variable treatment for all columns. E.g. features normalization.

For our purpose, we will use method2 to process type1 because they are all continuous features and only need to apply normalization for all columns.

In [174]:
#####no change to Xtrain_df,Xval_df; just need to convert to tensorflow dataset for training
tfd_Xtrain_df = tf.data.Dataset.from_tensor_slices(Xtrain_df.values)
tfd_Xval_df = tf.data.Dataset.from_tensor_slices(Xval_df.values)

####sanity checking only 3 observations
for ind,i in enumerate(tfd_Xtrain_df.take(3)):
  print('*'*10 + f'Observation {1+ind} [START]' + '*'*10)
  pprint(i)
  print('*'*10 + f'Observation {1+ind} [END]' + '*'*10)


**********Observation 1 [START]**********
<tf.Tensor: shape=(10,), dtype=float64, numpy=
array([0.85471643, 0.69009183, 0.0295508 , 0.20273663, 0.63525129,
       0.22746102, 0.63835403, 0.11202659, 0.75261977, 0.23863053])>
**********Observation 1 [END]**********
**********Observation 2 [START]**********
<tf.Tensor: shape=(10,), dtype=float64, numpy=
array([0.36938765, 0.12647271, 0.74206513, 0.78946227, 0.27353964,
       0.76532375, 0.60748418, 0.0125288 , 0.63820458, 0.069444  ])>
**********Observation 2 [END]**********
**********Observation 3 [START]**********
<tf.Tensor: shape=(10,), dtype=float64, numpy=
array([0.66003202, 0.47765289, 0.29319027, 0.60212263, 0.66452154,
       0.61222604, 0.36132765, 0.12486857, 0.22879641, 0.30090873])>
**********Observation 3 [END]**********


Now both type1 and type2 dataset are ready. We can combine them together into a single dataset. There are many advantages to combining these dataset into one but the major reason is to allow it to shuffle. 

In [175]:
from pprint import pprint
#####combining into 1 dataset
def map_fn(cf,prdf):
  res = prdf.copy()
  res['conf'] = cf
  return res

tfd_train_df = tf.data.Dataset.zip( (tfd_Xtrain_df,tfd_denom_train_rank_df) ).map(map_fn)#.shuffle(32)
tfd_val_df = tf.data.Dataset.zip( (tfd_Xval_df,tfd_denom_val_rank_df) ).map(map_fn)#.shuffle(32)

####sanity checking only 3 observations
for ind,i in enumerate(tfd_train_df.take(3)):
  print('*'*10 + f'Observation {1+ind} [START]' + '*'*10)
  pprint(i)
  print('*'*10 + f'Observation {1+ind} [END]' + '*'*10)


**********Observation 1 [START]**********
{'_rank': <tf.Tensor: shape=(5,), dtype=float64, numpy=array([1., 5., 2., 4., 3.])>,
 'conf': <tf.Tensor: shape=(10,), dtype=float64, numpy=
array([0.85471643, 0.69009183, 0.0295508 , 0.20273663, 0.63525129,
       0.22746102, 0.63835403, 0.11202659, 0.75261977, 0.23863053])>,
 'product': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'A', b'B', b'C', b'D', b'E'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(5,), dtype=int64, numpy=array([0, 0, 0, 0, 0])>}
**********Observation 1 [END]**********
**********Observation 2 [START]**********
{'_rank': <tf.Tensor: shape=(5,), dtype=float64, numpy=array([2., 5., 1., 4., 3.])>,
 'conf': <tf.Tensor: shape=(10,), dtype=float64, numpy=
array([0.36938765, 0.12647271, 0.74206513, 0.78946227, 0.27353964,
       0.76532375, 0.60748418, 0.0125288 , 0.63820458, 0.069444  ])>,
 'product': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'A', b'B', b'C', b'D', b'E'], dtype=object)>,
 'user_id': <tf.Tens

FINALLY, all tensorflow datasets are ready. Lets move on to model building

# Model Building
We will modify the RankingModel architecture provided in https://www.tensorflow.org/recommenders/examples/listwise_ranking

In [49]:
unique_products = ['A','B','C','D','E']

In [176]:
class RankingModel(tfrs.Model):

  def __init__(self, loss):
    super().__init__()


    # Compute embeddings for movies.
    self.product_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_products),
      tf.keras.layers.Embedding(len(unique_products) + 2, 4)
    ])

    self.norm_layer = tf.keras.layers.Normalization(axis=-1)

    # Compute predictions.
    self.score_model = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
    ])

    self.task = tfrs.tasks.Ranking(
      loss=loss,
      metrics=[
        tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
        tf.keras.metrics.RootMeanSquaredError()
      ]
    )

  def call(self, features):
    print(features)

    
    
    # We first convert the id features into embeddings.
    # User embeddings are a [batch_size, embedding_dim] tensor.
    product_embeddings = self.product_embeddings(features["product"])

    # Movie embeddings are a [batch_size, num_movies_in_list, embedding_dim]
    # tensor.
    norm_con_feats = self.norm_layer(features['conf'])

    # We want to concatenate user embeddings with movie emebeddings to pass
    # them into the ranking model. To do so, we need to reshape the user
    # embeddings to match the shape of movie embeddings.
    list_length = tf.shape(features["product"])[1]

    
    print('product_embeddings',product_embeddings.shape)
    print('product',features["product"].shape)
    print('norm_con_feats',norm_con_feats.shape)
    print('tf.expand_dims(norm_con_feats, 1)',tf.expand_dims(norm_con_feats, 1).shape)
    


    norm_con_feats_repeated = tf.repeat(
        tf.expand_dims(norm_con_feats, 1), [list_length], axis=1)

    # Once reshaped, we concatenate and pass into the dense layers to generate
    # predictions.
    concatenated_embeddings = tf.concat(
        [norm_con_feats_repeated, product_embeddings], 2)

    return self.score_model(concatenated_embeddings)#norm_con_feats_repeated#

  def compute_loss(self, features, training=False):
    labels = features.pop("_rank")

    scores = self(features)

    return self.task(
        labels=labels,
        predictions=tf.squeeze(scores, axis=-1),
    )

In [177]:
listwise_model = RankingModel(tfr.keras.losses.ListMLELoss())
listwise_model.norm_layer.adapt(tfd_Xtrain_df.batch(128))
listwise_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.01))
listwise_model.fit(tfd_train_df.batch(128),epochs=100,verbose=True)

# listwise_model.predict(tfd_train_df.batch(3).take(1))

Epoch 1/100
{'user_id': <tf.Tensor 'IteratorGetNext:3' shape=(None, None) dtype=int64>, 'product': <tf.Tensor 'IteratorGetNext:2' shape=(None, None) dtype=string>, 'conf': <tf.Tensor 'ranking_model_31/Cast:0' shape=(None, 10) dtype=float32>}
product_embeddings (None, None, 4)
product (None, None)
norm_con_feats (None, 10)
tf.expand_dims(norm_con_feats, 1) (None, 1, 10)
{'user_id': <tf.Tensor 'IteratorGetNext:3' shape=(None, None) dtype=int64>, 'product': <tf.Tensor 'IteratorGetNext:2' shape=(None, None) dtype=string>, 'conf': <tf.Tensor 'ranking_model_31/Cast:0' shape=(None, 10) dtype=float32>}
product_embeddings (None, None, 4)
product (None, None)
norm_con_feats (None, 10)
tf.expand_dims(norm_con_feats, 1) (None, 1, 10)
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epo

<keras.callbacks.History at 0x7f2d062dcd90>

In [178]:
listwise_model.evaluate(tfd_val_df.batch(32),verbose=True)

{'user_id': <tf.Tensor 'IteratorGetNext:3' shape=(None, None) dtype=int64>, 'product': <tf.Tensor 'IteratorGetNext:2' shape=(None, None) dtype=string>, 'conf': <tf.Tensor 'ranking_model_31/Cast:0' shape=(None, 10) dtype=float32>}
product_embeddings (None, None, 4)
product (None, None)
norm_con_feats (None, 10)
tf.expand_dims(norm_con_feats, 1) (None, 1, 10)


[0.9946333169937134,
 12.254427909851074,
 0.6180744171142578,
 0,
 0.6180744171142578]

In [179]:
val_res = listwise_model.predict(tfd_val_df.batch(128),verbose=True)

val_res_df = pd.DataFrame(val_res.reshape((10000,5)),columns=unique_products)
val_res_df_rank = rank_score(val_res_df)#.rank(method='first',axis=1)
val_res_df_rank

{'user_id': <tf.Tensor 'IteratorGetNext:3' shape=(None, None) dtype=int64>, 'product': <tf.Tensor 'IteratorGetNext:2' shape=(None, None) dtype=string>, '_rank': <tf.Tensor 'ranking_model_31/Cast:0' shape=(None, None) dtype=float32>, 'conf': <tf.Tensor 'ranking_model_31/Cast_1:0' shape=(None, 10) dtype=float32>}
product_embeddings (None, None, 4)
product (None, None)
norm_con_feats (None, 10)
tf.expand_dims(norm_con_feats, 1) (None, 1, 10)


Unnamed: 0,A,B,C,D,E
0,3.0,4.0,1.0,2.0,5.0
1,3.0,1.0,5.0,4.0,2.0
2,5.0,3.0,4.0,1.0,2.0
3,5.0,3.0,1.0,2.0,4.0
4,1.0,2.0,4.0,5.0,3.0
...,...,...,...,...,...
9995,4.0,3.0,1.0,5.0,2.0
9996,4.0,2.0,1.0,5.0,3.0
9997,5.0,1.0,4.0,2.0,3.0
9998,3.0,5.0,1.0,2.0,4.0


In [193]:
1- (((val_rank_df.values == val_res_df_rank.values)==False).sum(axis=1) > 0).mean() #/ 50000

0.8345

In [186]:
val_rank_df.shape

(10000, 5)