In [1]:
# Mount Google Drive at /content/drive; the data and model paths used later live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install datasets

In [None]:
!pip install transformers

In [4]:
from glob import glob

import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm

# huggingface
from datasets import Dataset
from transformers import BertForMaskedLM, TFBertForMaskedLM, BertConfig, BertTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import DefaultDataCollator
from transformers import create_optimizer, AdamWeightDecay

# Sequential Recommendation Engine

## Data

### Construct Tensorflow Dataset

In [None]:
!unzip /content/drive/MyDrive/Bert4Rec/recommendation.zip -d /content/drive/MyDrive/Bert4Rec/

In [5]:
# Parsing schema of one serialized record: a scalar int64 `userIndex` and two
# variable-length int64 features, `movieIndices` and `timestamps`.
feature_description = dict(
    userIndex=tf.io.FixedLenFeature([], tf.int64),
    movieIndices=tf.io.RaggedFeature(tf.int64, row_splits_dtype=tf.int64),
    timestamps=tf.io.RaggedFeature(tf.int64, row_splits_dtype=tf.int64),
)


In [6]:
def parse_tfrecord_fn(example):
    """Parse one serialized record into its three session features.

    Arguments:
      example: one serialized record, as yielded by `tf.data.TFRecordDataset`.

    Returns:
      Tuple `(userIndex, timestamps, movieIndices)`, in that order.
    """
    # Reuse the module-level schema instead of redefining an identical copy here.
    parsed = tf.io.parse_single_example(example, feature_description)
    return parsed['userIndex'], parsed['timestamps'], parsed['movieIndices']


# glob() returns paths in arbitrary order; sorting makes the record order (and
# hence every list later built by iterating the dataset) reproducible across runs.
tfrecord_files = sorted(glob("/content/drive/MyDrive/Bert4Rec/recommendation/dataset/*"))
raw_dataset = tf.data.TFRecordDataset(tfrecord_files)
parsed_dataset = raw_dataset.map(parse_tfrecord_fn)

### Transform to pandas.DataFrame for exploration purposes

In [32]:
# Collect every parsed session into per-column lists of NumPy values.
# Each dataset element is the tuple (userIndex, timestamps, movieIndices).
df_dict = {}
column_names = ('userIndex', 'timestamps', 'movieIndices')
for record in tqdm(parsed_dataset):
    for column, tensor in zip(column_names, record):
        df_dict.setdefault(column, []).append(tensor.numpy())

324849it [01:29, 3622.46it/s]


In [33]:
# Build the frame directly from the per-column lists. Going through
# `from_dict(orient='index').transpose()` creates a (3 x n_sessions) intermediate
# and leaves every column, including `userIndex`, with object dtype.
df = pd.DataFrame(df_dict)

In [34]:
# One row per session
Nsessions = len(df)
print(Nsessions)

324849


In [35]:
# Preview the first rows of the sessions DataFrame.
df.head()

Unnamed: 0,userIndex,timestamps,movieIndices
0,49744,"[946265827, 946265827, 946266411, 946266496, 9...","[2612, 190, 7780, 1279, 2593, 934]"
1,49745,"[941372629, 941372708, 941372708, 941372708, 9...","[813, 171, 1234, 1580, 43, 57, 1013, 175, 178,..."
2,49746,"[837170891, 837170944, 837170987, 837170987, 8...","[40, 46, 1191, 44, 158, 49, 1190, 528, 574, 39..."
3,49747,"[1546520504, 1546520514, 1546520519, 154652056...","[162, 51, 175, 109, 1016, 169, 994]"
4,49748,"[1476643481, 1476643487, 1476643874, 147664402...","[4957, 29, 10070, 8257, 2160, 18, 432, 9133, 7..."


In [36]:
# Length of the shortest session
df['movieIndices'].map(len).min()

4

In [37]:
# Length of the longest session
df['movieIndices'].map(len).max()

49

In [38]:
# Largest movie index over all sessions (per-session maximum, then global maximum)
per_session_max = df['movieIndices'].map(np.max)
maxMovieIndex = per_session_max.max()
print(maxMovieIndex)

40850


In [39]:
# Smallest movie index over all sessions (per-session minimum, then global minimum)
per_session_min = df['movieIndices'].map(np.min)
minMovieIndex = per_session_min.min()
print(minMovieIndex)

0


In [40]:
# Size of the movie index range [minMovieIndex, maxMovieIndex], both bounds included.
# NOTE(review): equals the number of distinct movies only if every index in the range occurs — confirm.
Nmovies = maxMovieIndex - minMovieIndex + 1
print(Nmovies)

40851


### Train/Val/Test split

As we do not encode text but directly use "movieIndices" as `input_ids`, several things should be taken into account:


- the "movieIndices" should be shifted by 104 so that they are not mistaken for BERT special tokens


- The DataFrame shows that the longest session has 49 movies, so we choose `max_length=50` and not `200` as defined in the paper. Thus, all sessions will be left-padded to reach a length of `50`.

- As the paper does, we get rid of `token_type_ids`

raw | train | label |
 --- | --- | --- |
[v1, v2, v3, v4, v5, v6, v7] | ["[PAD]", "[PAD]", v1, v2, v3, v4, v5, v6, v7] | ["[PAD]", "[PAD]", v1, v2, "[MASK]", v4, "[MASK]", v6, v7]   

raw | val | label |
 --- | --- | --- |
[v1, v2, v3, v4, v5, v6, v7] | ["[PAD]", "[PAD]", v1, v2, v3, v4, v5, "[MASK]"] | v6


raw | test | label |
 --- | --- | --- |
[v1, v2, v3, v4, v5, v6, v7] | ["[PAD]", "[PAD]", v1, v2, v3, v4, v5, v6, "[MASK]"] | v7


In [7]:
max_length = 50
pad_token_id = 0
mask_token_id = 103
# Ids 0..103 are kept for the BERT special tokens, so movie index i is stored as i + 104.
movie_id_offset = 104


def pad(u, t, m):
    """Left-pad one session to `max_length + 2` items and build its attention mask.

    The two extra positions exist because `get_train`, `get_val` and `get_test`
    each take a different window of length `max_length` from the padded sequence.

    Arguments:
      u: user index of the session (unused).
      t: timestamps of the session (unused).
      m: 1-D int64 tensor of movie indices of the session.

    Returns:
      Tuple `(att, m)` of 1-D int64 tensors of length `max_length + 2`. `att` is 0 on
      padding and 1 on real items; `m` holds `pad_token_id` on padding and the movie
      indices shifted by `movie_id_offset` elsewhere.
    """
    total_length = max_length + 2
    # A session longer than the window would make the padding size negative;
    # in that case keep only its trailing items.
    m = m[-total_length:]
    n_pad = total_length - tf.shape(m)[0]
    padding = tf.ones((n_pad,), dtype=tf.int64)
    # The attention mask is 0 on padding whatever the value of pad_token_id is.
    att = tf.concat([tf.zeros_like(padding), tf.ones_like(m)], axis=0)
    # Shift movie indices so that they do not collide with special token ids.
    m = tf.concat([pad_token_id * padding, m + movie_id_offset], axis=0)
    return att, m


def get_train(att, m):
    """Training view: the padded sequence without its last two items (held out for val/test)."""
    return {"attention_mask": att[:-2], "input_ids": m[:-2]}


def get_val(att, m):
    """Validation view: the second-to-last item is replaced by [MASK] and returned as the label."""
    return {"attention_mask": att[1:-1], "input_ids": tf.experimental.numpy.append(m[1:-2], mask_token_id), "labels": m[-2]}


def get_test(att, m):
    """Test view: the last item is replaced by [MASK] and returned as the label."""
    return {"attention_mask": att[2:], "input_ids": tf.experimental.numpy.append(m[2:-1], mask_token_id), "labels": m[-1]}


train_dataset = parsed_dataset.map(pad).map(get_train)
val_dataset = parsed_dataset.map(pad).map(get_val)
test_dataset = parsed_dataset.map(pad).map(get_test)

In [8]:
# Show one validation sample (attention mask, masked input ids and label)
for sample in val_dataset.take(1):
    print(sample)

{'attention_mask': <tf.Tensor: shape=(50,), dtype=int64, numpy=
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1])>, 'input_ids': <tf.Tensor: shape=(50,), dtype=int64, numpy=
array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0, 2716,  294, 7884, 1383,  103])>, 'labels': <tf.Tensor: shape=(), dtype=int64, numpy=2697>}


### Create TensorFlow tf.data.Dataset

In [9]:
# Materialise the training view as NumPy arrays, then wrap it in a HuggingFace Dataset
train_input_ids, train_attention_mask = [], []
for sample in tqdm(train_dataset, "loop over train set"):
    train_attention_mask.append(sample['attention_mask'].numpy())
    train_input_ids.append(sample['input_ids'].numpy())

train_columns = {"input_ids": train_input_ids, "attention_mask": train_attention_mask}
ds_train_ = Dataset.from_dict(train_columns)
ds_train = ds_train_.with_format("tf")
ds_train[0]

loop over train set: 324849it [02:16, 2371.30it/s]


{'input_ids': <tf.Tensor: shape=(50,), dtype=int64, numpy=
 array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0, 2716,  294, 7884, 1383])>,
 'attention_mask': <tf.Tensor: shape=(50,), dtype=int64, numpy=
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1])>}

In [10]:
# Materialise the validation view; the labels are kept in a separate list,
# in the same order as the rows of the HuggingFace Dataset
val_input_ids, val_attention_mask, val_labels = [], [], []
for sample in tqdm(val_dataset, "loop over validation set"):
    val_labels.append(sample['labels'].numpy())
    val_attention_mask.append(sample['attention_mask'].numpy())
    val_input_ids.append(sample['input_ids'].numpy())

val_columns = {"input_ids": val_input_ids, "attention_mask": val_attention_mask}
ds_val_ = Dataset.from_dict(val_columns)
ds_val = ds_val_.with_format("tf")
ds_val[0]

loop over validation set: 324849it [02:21, 2288.56it/s]


{'input_ids': <tf.Tensor: shape=(50,), dtype=int64, numpy=
 array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0, 2716,  294, 7884, 1383,  103])>,
 'attention_mask': <tf.Tensor: shape=(50,), dtype=int64, numpy=
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 1, 1, 1])>}

In [11]:
# Materialise the test view; the labels are kept in a separate list,
# in the same order as the rows of the HuggingFace Dataset
test_input_ids, test_attention_mask, test_labels = [], [], []
for sample in tqdm(test_dataset, "loop over test set"):
    test_labels.append(sample['labels'].numpy())
    test_attention_mask.append(sample['attention_mask'].numpy())
    test_input_ids.append(sample['input_ids'].numpy())

test_columns = {"input_ids": test_input_ids, "attention_mask": test_attention_mask}
ds_test_ = Dataset.from_dict(test_columns)
ds_test = ds_test_.with_format("tf")
ds_test[0]

loop over test set: 324849it [02:21, 2288.56it/s]


{'input_ids': <tf.Tensor: shape=(50,), dtype=int64, numpy=
 array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
        2716,  294, 7884, 1383, 2697,  103])>,
 'attention_mask': <tf.Tensor: shape=(50,), dtype=int64, numpy=
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1])>}

In [12]:
# A tokenizer is only needed so that the MLM data collator knows the special token ids
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
# Print the ids of [PAD], [UNK], [CLS], [SEP] and [MASK], in that order
for special_token in ("pad", "unk", "cls", "sep", "mask"):
    print(getattr(tokenizer, f"{special_token}_token_id"))

Downloading vocab.txt:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

0
100
101
102
103


In [13]:
# Collator that applies masked-language-model masking with probability 0.4
# and returns TensorFlow tensors
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.4,
    return_tensors="tf",
)
# The returned value is the number of tokens added to the vocabulary
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

0

In [14]:
batch_size = 256
eval_batch_size = 32

# Training: the MLM collator randomly masks tokens and creates the labels;
# batches are shuffled.
tf_train_set = ds_train.to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# Evaluation: each sequence already ends with the single [MASK] to predict, so it
# must not be randomly masked again, and the row order must be preserved so that
# predictions stay aligned with `val_labels` / `test_labels`.
eval_collator = DefaultDataCollator(return_tensors="tf")

tf_val_set = ds_val.to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    shuffle=False,
    batch_size=eval_batch_size,
    collate_fn=eval_collator,
)

tf_test_set = ds_test.to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    shuffle=False,
    batch_size=eval_batch_size,
    collate_fn=eval_collator,
)

In [15]:
# Sanity check: draw two batches from the tf.data.Dataset and show the shape of their input ids
for batch in tf_test_set.take(2):
    print(batch['input_ids'].shape)

(32, 50)
(32, 50)


## Model

In [16]:
# Small BERT configuration whose vocabulary is the set of movie ids
bert_config = BertConfig(
    vocab_size=40851+104,  # number of movies after shifting
    max_position_embeddings=max_length,
    hidden_size=32,
    intermediate_size=32,
    num_hidden_layers=2,
    num_attention_heads=2,
)

In [17]:
# Build the masked-LM model from `bert_config` (no checkpoint is loaded here).
bert_for_maskedlm = TFBertForMaskedLM(bert_config)

In [18]:
# Adam with decoupled weight decay; gradients are clipped by norm
optimizer = AdamWeightDecay(
    learning_rate=1e-4,
    beta_1=0.9,
    beta_2=0.999,
    weight_decay_rate=0.01,
    clipnorm=np.sqrt(5),
)

In [None]:
# No `loss` argument is passed: as reported by Transformers on compile, the model's internal loss is used.
bert_for_maskedlm.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [None]:
# Train for 10 epochs on the batched training set; no validation data is passed to fit().
bert_for_maskedlm.fit(x=tf_train_set, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0ef36bffd0>

In [None]:
# Persist the trained model on Google Drive so that it can be reloaded without retraining.
bert_for_maskedlm.save_pretrained('/content/drive/MyDrive/Bert4Rec/Bert4Rec')

## Evaluation

In [19]:
# Reload the trained model from the directory written by `save_pretrained`.
model = TFBertForMaskedLM.from_pretrained('/content/drive/MyDrive/Bert4Rec/Bert4Rec')

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at /content/drive/MyDrive/Bert4Rec/Bert4Rec.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [None]:
val_top10_movies = []
for batch in tqdm(tf_val_set, "predict on val set"):
  # Call the model directly: `predict` is meant for whole datasets and adds a large
  # fixed overhead when it is invoked once per batch.
  # Keep only the logits of the last position (the masked one).
  logits = model(batch, training=False).logits[:, -1, :]
  val_top10_movies.append(tf.math.top_k(logits, 10).indices)
# (n_sessions, 10) tensor with the 10 highest-scoring token ids of each session
val_top10_movies = tf.concat(val_top10_movies, axis=0)
  

predict on val set:   4%|▍         | 396/10152 [07:33<2:37:32,  1.03it/s]

In [None]:
test_top10_movies = []
for batch in tqdm(tf_test_set, "predict on test set"):
  # Call the model directly: `predict` is meant for whole datasets and adds a large
  # fixed overhead when it is invoked once per batch.
  # Keep only the logits of the last position (the masked one).
  logits = model(batch, training=False).logits[:, -1, :]
  test_top10_movies.append(tf.math.top_k(logits, 10).indices)
# (n_sessions, 10) tensor with the 10 highest-scoring token ids of each session
test_top10_movies = tf.concat(test_top10_movies, axis=0)

In [63]:
def compute_HRk(gt_movies, top10movies, k):
  """Compute the Hit Ratio@k metric.

     A session counts as a hit when its ground-truth movie is among the first
     `k` predicted movies.

     Arguments:
       gt_movies (array-like): 1-D sequence of the movie id to predict for each session (dim=(nbSessions,))
       top10movies (array-like): 2-D array of the best movie ids for each session, best first (dim=(nbSessions, 10))
       k (int): number of predictions taken into account (between 1 and 10)

     Returns:
       `numpy.float64`: fraction of sessions that are hits (NaN when there is no session).

     Raises:
       ValueError: if `gt_movies` and `top10movies` do not have the same number of sessions.
  """
  # Work in NumPy: it accepts lists, arrays and eager tensors alike and, unlike a
  # TensorFlow subtraction, compares int64 labels with int32 `top_k` indices.
  gt_column = np.asarray(gt_movies).reshape(-1, 1)   # (nbSessions, 1)
  first_k = np.asarray(top10movies)[:, :k]           # (nbSessions, <=k)
  if gt_column.shape[0] != first_k.shape[0]:
    raise ValueError(
        f"got {gt_column.shape[0]} ground-truth movies for {first_k.shape[0]} prediction rows")
  if gt_column.shape[0] == 0:
    return np.float64(np.nan)
  # Explicit equality test: still correct when fewer than k predictions are available.
  hits = (first_k == gt_column).any(axis=1)
  return hits.mean()

In [None]:
# Hit Ratio on the validation set at k = 1, 5 and 10
for k in (1, 5, 10):
    print(compute_HRk(val_labels, val_top10_movies, k))

In [None]:
# Hit Ratio on the test set at k = 1, 5 and 10
for k in (1, 5, 10):
    print(compute_HRk(test_labels, test_top10_movies, k))