In [1]:
import json
import numpy as np
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForMaskedLM
from tqdm import tqdm
import pandas as pd
from scipy import stats
from collections import Counter
from matplotlib import pyplot as plt

# Zero-shot LMRecSys

In [2]:
def convert(ids, n_mask=10):
    """Render a watch history as a cloze-style prompt for the masked LM.

    Example: ids whose names are A, B, C produce
    'A user watched A, B, C. Now the user may want to watch [MASK]...[MASK].'
    where the mask token is repeated `n_mask` times with no separator.

    Args:
        ids: iterable of item ids; each one is used to index the module-level
            `id2name`.
        n_mask: how many copies of `tokenizer.mask_token` to place in the
            slot for the item to be predicted.

    Returns:
        The prompt as a single string.
    """
    # Join the names instead of appending ', ' and trimming afterwards: the
    # trim-based version cut the final 'd' off "watched" when `ids` was empty.
    # For a non-empty history the resulting text is unchanged.
    watched = ', '.join(id2name[item_id] for item_id in ids)
    return (
        'A user watched ' + watched + '. '
        + 'Now the user may want to watch '
        + tokenizer.mask_token * n_mask
        + '.'
    )

def score(ids, gpu=False):
    """Score every catalogue item as the continuation of a watch history.

    For each mask count from 1 to 10 the prompt built by `convert` is run
    through the module-level `model` once, and the log-probabilities at the
    mask positions are kept. An item whose entry in the module-level
    `id2tokens` has t tokens is then scored as the mean, over its tokens, of
    the log-probability of token i at mask position i in the t-mask prompt.

    Args:
        ids: item ids forming the history (passed to `convert`).
        gpu: if True, move the encoded inputs to CUDA before the forward pass.

    Returns:
        A list with one score per entry of `id2tokens`, in the same order.
        Entries with an empty token list get -inf so that they sort last
        instead of producing NaN.
    """
    logits_all = []
    # Inference only: skip autograd bookkeeping for the forward passes.
    with torch.no_grad():
        for n_mask in range(1, 11): # leave 1~10 masks (O(10) inferences)
            # truncation=True makes explicit what the tokenizer already did
            # implicitly (with a warning) when only max_length was given.
            # NOTE(review): truncation presumably cuts from the end of the
            # prompt, where the masks are - confirm prompts stay under 512 tokens.
            encoded = tokenizer([convert(ids, n_mask=n_mask)], max_length=512, truncation=True, return_tensors='pt')
            if gpu: encoded = {key: encoded[key].cuda() for key in encoded}
            output = model(**encoded).logits # shape = (1 x seq_length x vocab_size)
            # Boolean mask over the sequence, True at the mask-token positions.
            is_mask = encoded['input_ids'][0] == tokenizer.mask_token_id
            # log_softmax is the numerically stable form of softmax().log().
            log_probs = output[0, is_mask].log_softmax(-1).cpu().numpy() # shape = (n_mask x vocab_size)
            logits_all.append(log_probs)

    id2score = [
        # logits_all[t - 1] holds the log-probs of the prompt with t masks.
        np.mean([logits_all[len(tokens) - 1][i, token] for i, token in enumerate(tokens)])
        if tokens else float('-inf')
        for tokens in id2tokens
    ]
    return id2score

In [3]:
gpu = True # Set True to use GPU for inference!

# Pretrained masked LM and its tokenizer; eval() switches the model to inference mode.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
model = AutoModelForMaskedLM.from_pretrained('bert-base-cased').eval()
if gpu: model = model.cuda()

# Read both input files through context managers so the handles are closed
# promptly; blank lines in the JSONL file are skipped instead of crashing json.loads.
with open('data.jsonl') as data_file:
    data = [json.loads(line) for line in data_file if line.strip()]
with open('id2name.json') as names_file:
    id2name = json.load(names_file)
# Token ids of every item name, without [CLS]/[SEP]; keep at most 10 tokens per
# item because score() only evaluates prompts with 1 to 10 masks.
id2tokens = [tokenizer(name, add_special_tokens=False)['input_ids'][:10] for name in id2name] # max 10 tokens for each item

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Qualitative check on one record: use the first 7 ids of the record, hold out
# the second-to-last of those as the label and rank every item against the rest.
idx = 0
history = data[idx]['ids'][:7]
ids, label = history[:-2], history[-2]
id2score = score(ids, gpu=gpu)
# Item ids ordered from highest to lowest score.
top_ids = np.argsort(np.array(id2score))[::-1].tolist()

print(convert(ids))
print()
# Five best-scored items, then the five worst-scored ones.
for ranked in (top_ids[:5], top_ids[-5:]):
    print('\n'.join(str((id2name[item], id2score[item])) for item in ranked))
    print()
# Name of the held-out item and its 0-based position in the ranking.
print(id2name[label], top_ids.index(label))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


A user watched Ben-Hur, Dumbo, Schindler's List, Beauty and the Beast, Toy Story. Now the user may want to watch [MASK][MASK][MASK][MASK][MASK][MASK][MASK][MASK][MASK][MASK].

('If....', -5.2231994)
('Twilight', -5.2773)
('The Specials', -5.667444)
('Casablanca', -5.688287)
('Aliens', -5.6935673)

('Sparrows', -14.944679)
('Volunteers', -15.050343)
('Endurance', -15.124055)
('Tainted', -15.305075)
('Lauderdale', -16.802883)

A Bug's Life 155


In [5]:
def compute_recall_at_k(top_preds, labels, k=20):
    """Average number of label matches within the first k predictions per row.

    Args:
        top_preds: 2-D array, one row of ranked predictions per example.
        labels: 1-D array with one target per example.
        k: number of leading columns of `top_preds` to inspect.

    Returns:
        The per-row match count averaged over all rows, as a Python float.
    """
    assert len(top_preds) == len(labels)
    head = top_preds[:, :k]
    # Column vector so each label is compared against its own row.
    targets = labels.reshape(-1, 1)
    matches_per_row = (head == targets).sum(-1)
    return float(matches_per_row.mean(-1))

def compute_mrr_at_k(top_preds, labels, k=20):
    """Mean reciprocal rank of the label within the first k predictions.

    Args:
        top_preds: 2-D array, one row of ranked predictions per example.
        labels: 1-D array with one target per example.
        k: number of leading columns of `top_preds` to inspect.

    Returns:
        The mean over examples of 1 / (position + 1) for the first occurrence
        of the label in the row's first k entries, or 0 when it is absent,
        as a Python float.
    """
    assert len(top_preds) == len(labels)
    reciprocal_ranks = []
    for ranking, target in zip(top_preds[:, :k].tolist(), labels.tolist()):
        try:
            position = ranking.index(target)
        except ValueError:
            # Label not among the first k predictions.
            reciprocal_ranks.append(0.)
        else:
            reciprocal_ranks.append(1 / (position + 1))
    return float(np.mean(reciprocal_ranks))

# Rank all items for every record in `data`: the history is everything except
# the last two ids, and the target is the second-to-last id.
top_preds = []
labels = []
for item in tqdm(data): # iterates over the whole dataset
    id2score = score(item['ids'][:-2], gpu=gpu)
    # Item ids ordered from highest to lowest score.
    top_pred = np.argsort(np.array(id2score))[::-1].tolist()
    top_preds.append(top_pred)
    labels.append(item['ids'][-2])
top_preds = np.array(top_preds)
labels = np.array(labels)

# Both metrics use their default cut-off of k=20.
metrics = dict()
metrics['r@20'] = compute_recall_at_k(top_preds, labels)
metrics['mrr@20'] = compute_mrr_at_k(top_preds, labels)
print(metrics)

100%|██████████| 5337/5337 [22:45<00:00,  3.91it/s]


{'r@20': 0.011617013303353943, 'mrr@20': 0.0018845515385167014}
