In [1]:
import json
import numpy as np
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForMaskedLM
from tqdm import tqdm
import pandas as pd
from scipy import stats

In [2]:
# MovieLens names are wrong, but use it to initialize
id2name = {}
for line in open('movies.dat'): 
    id, name, type = line.strip().split('::')
    id = int(id)
    assert(name[-1] == ')' and name[-6] == '(')
    id2name[id] = name[:-6].strip()

# Update new name from Hao's metadata, some ids are missing
for item in pd.read_csv('ML1M_Meta_3670.csv').values.tolist():
    id2name[item[0]] = item[3]

In [3]:
oldid2newid = {} # trim empty ids
for new_id, old_id in enumerate(sorted(id2name.keys())): oldid2newid[old_id] = new_id
id2name = [id2name[id] for id in sorted(id2name.keys())]

In [4]:
user2ids = {}
for line in open('ratings.dat'):
    user, id, rating, time = line.strip().split('::')
    user, id, rating, time = int(user), oldid2newid[int(id)], int(rating), int(time)
    if user not in user2ids: user2ids[user] = []
    user2ids[user].append((id, time))
for user in user2ids:
    user2ids[user] = sorted(user2ids[user], key=lambda x: x[1])
    user2ids[user], _ = zip(*user2ids[user])

In [5]:
print('# user:', len(user2ids), ',', '# movie:', len(id2name), ',', '# interaction:', sum([len(user2ids[user]) for user in user2ids]))
print('History:', stats.describe([len(user2ids[user]) for user in user2ids]))
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
print('# words in movie name:', stats.describe([len(name.split()) for name in id2name]))
print('# tokens in movie name:', stats.describe([len(tokenizer(name, add_special_tokens=False)['input_ids']) for name in id2name]))

# user: 6040 , # movie: 3883 , # interaction: 1000209
History: DescribeResult(nobs=6040, minmax=(20, 2314), mean=165.5975165562914, variance=37151.41721522576, skewness=2.743966112411747, kurtosis=11.1920071429534)
# words in movie name: DescribeResult(nobs=3883, minmax=(1, 15), mean=2.8532062838011845, variance=2.672933431676114, skewness=1.5587508576280893, kurtosis=4.413184077254987)
# tokens in movie name: DescribeResult(nobs=3883, minmax=(1, 25), mean=4.110739119237703, variance=5.747651256756257, skewness=2.024892296904092, kurtosis=7.619923800325971)


In [15]:
def save_jsonl(data, filename):
    with open(filename, 'w') as fout:
        for item in data:
            fout.write(json.dumps(item) + '\n')

data = [{'user': user, 'ids': user2ids[user]} for user in user2ids]
save_jsonl(data, 'data.jsonl')
json.dump(id2name, open('id2name.json', 'w'))

In [16]:
user = 1
print(data[user - 1]['ids'], len(data[user - 1]['ids']))
print(user2ids[user], len(user2ids[user]))

(3117, 1250, 1672, 1009, 2271, 1768, 3339, 2735, 1189, 1176, 711, 257, 907, 604, 2623, 1892, 1959, 3036, 926, 1022, 1893, 1949, 148, 1015, 1081, 902, 1267, 2728, 2693, 1226, 655, 2849, 527, 3045, 2722, 2252, 1016, 1179, 590, 2329, 1506, 523, 591, 2618, 735, 584, 0, 2286, 2225, 773, 1526, 1838, 47) 53
(3117, 1250, 1672, 1009, 2271, 1768, 3339, 2735, 1189, 1176, 711, 257, 907, 604, 2623, 1892, 1959, 3036, 926, 1022, 1893, 1949, 148, 1015, 1081, 902, 1267, 2728, 2693, 1226, 655, 2849, 527, 3045, 2722, 2252, 1016, 1179, 590, 2329, 1506, 523, 591, 2618, 735, 584, 0, 2286, 2225, 773, 1526, 1838, 47) 53


# Zero-shot PET

In [3]:
def convert(ids, n_pad=10):
    s = 'A user watched '
    for id in ids:
        s += id2name[id] + ', '
    s = s.strip()[:-1] + '. '
    s += 'Now the user may want to watch '
    s += tokenizer.mask_token * n_pad
    s += '.'
    return s

In [4]:
data = [json.loads(line) for line in open('data.jsonl')]
id2name = json.load(open('id2name.json'))
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForMaskedLM.from_pretrained('bert-base-uncased')
model = model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [34]:
model(**tokenizer(['hello world'], return_tensors='pt')).logits[0].softmax(-1).log().exp().sum(-1)

tensor([1.0000, 1.0000, 1.0000, 1.0000], grad_fn=<SumBackward1>)

In [18]:
tokenizer(convert(data[0]['ids'], n_pad=5))['input_ids'][-12:]

[5310, 2089, 2215, 2000, 3422, 103, 103, 103, 103, 103, 1012, 102]

In [6]:
id2tokens = []
for name in id2name:
    id2tokens.append(tokenizer(name, add_special_tokens=False)['input_ids'][:10])
print(max([len(tokens) for tokens in id2tokens]))

10


In [35]:
def score(ids):
    input = tokenizer([convert(ids)], max_length=500, return_tensors='pt')
    # input = {key: input[key].cuda() for key in input}
    output = model(**input)
    logits = output.logits[0].softmax(-1).log().detach().cpu().numpy()
    id2score = []
    for tokens in id2tokens:
        id2score.append([])
        for i, tok_id in enumerate(tokens):
            id2score[-1].append(logits[-12 + i, tok_id].item())
        # id2score[id] = np.mean(id2score[id])
    return id2score

In [8]:
def evaluate(data):
    scores = [score(item['ids'][:5]) for item in tqdm(data)]
    return scores

scores = evaluate(data[:50])

In [9]:
def compute_metrics(data, scores):
    assert(len(data) == len(scores))
    r_20 = []
    for item, id2score in tqdm(zip(data, scores)):
        id2score = {id: np.max(id2score[id]) for id in id2score}
        top_ids = sorted(id2score.items(), key=lambda x:x[1], reverse=True)
        top_ids, _ = zip(*top_ids)
        r_20.append(item['val_id'] in top_ids[:20])
    print(np.mean(r_20))

In [10]:
compute_metrics(data, scores)

NameError: name 'scores' is not defined

In [11]:
id2name.index('Contempt')

1532

In [67]:
idx += 1
ids, label = data[idx]['ids'][:7][:-2], data[idx]['ids'][:7][-2]
id2score = score(ids)
id2score = {i: np.mean(score) for i, score in enumerate(id2score)} # TODO: np.mean
top_ids = sorted(id2score.items(), key=lambda x:x[1], reverse=True)
top_ids, _ = zip(*top_ids)

print(convert(ids))
print()
print('\n'.join([str((id2name[id], id2score[id])) for id in top_ids[:5]]))
print()
print('\n'.join([str((id2name[id], id2score[id])) for id in top_ids[-5:]]))
print()
print(id2name[label], top_ids.index(label))


A user watched Raiders of the Lost Ark, Ghost, The Perfect Storm, The Shawshank Redemption, The Silence of the Lambs. Now the user may want to watch [MASK][MASK][MASK][MASK][MASK][MASK][MASK][MASK][MASK][MASK].

('If....', -3.693471097946167)
('The Show', -4.037599563598633)
('Them!', -4.9892332553863525)
('The Program', -5.034773111343384)
('The Fly', -5.152930498123169)

('Duets', -17.51946258544922)
('Persuasion', -17.645301818847656)
('Sprung', -18.321044921875)
('Nowhere', -18.484943389892578)
('Trois', -19.61321449279785)

The Usual Suspects 1500


# Simpler Case

In [22]:
def convert(ids, n_pad=10):
    s = 'A user has bought '
    for id in ids:
        s += id2name[id] + '; '
    s = s.strip()[:-1] + '. '
    s += 'Now the user wants to buy'
    s += '[MASK]' * n_pad
    return s

In [23]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
model = AutoModelForMaskedLM.from_pretrained('bert-base-cased').cuda()
model = model.eval()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
id2name = {1: 'apple', 2: 'banana', 3: 'mango', 4: 'MP3', 5: 'computer', 6: 'laptop', 7: 'mobile phone', 8: 'pen'}
id2tokens = {}
for id in id2name:
    id2tokens[id] = tokenizer(id2name[id], add_special_tokens=False)['input_ids'][:10]
print(id2tokens)
print(max([len(id2tokens[id]) for id in id2tokens]))

{1: [12075], 2: [21806], 3: [1299, 2758], 4: [5478, 1495], 5: [2775], 6: [12574], 7: [5093, 2179], 8: [8228]}
2


In [26]:
ids = [1,2]
id2score = score(ids)
print(convert(ids))
id2score = {id: np.mean(id2score[id]) for id in id2score}
top_ids = sorted(id2score.items(), key=lambda x:x[1], reverse=True)
print(top_ids)

A user has bought apple; banana. Now the user wants to buy[MASK][MASK][MASK][MASK][MASK][MASK][MASK][MASK][MASK][MASK]
[(2, 9.848603248596191), (1, 9.236679077148438), (5, 1.58921480178833), (3, 0.9985607117414474), (6, 0.48988187313079834), (8, 0.07451194524765015), (7, -0.29580698907375336), (4, -0.3808416575193405)]


In [467]:
input = tokenizer('hello world', max_length=500, return_tensors='pt')
input = {key: input[key].cuda() for key in input}
output = model(**input)
logits = output.logits[0].detach().cpu().numpy()
print(logits)

[[ -7.5906496  -7.4921894  -7.673322  ...  -6.4238377  -6.172981
   -6.539128 ]
 [ -7.8396683  -8.055583   -7.957813  ...  -6.5347376  -6.4125085
   -6.7794714]
 [ -9.92513    -9.655717   -9.891573  ...  -5.8455057  -6.6210732
   -7.4404745]
 [-13.494137  -13.658618  -13.19817   ... -10.935408  -10.908022
  -11.860087 ]]


In [11]:
idx = torch.tensor([[0,1], [1,2], [2,3]])
# expected output:
# tensor([[[ 0,  1,  2,  3,  4],
#          [ 5,  6,  7,  8,  9]],

#         [[25, 26, 27, 28, 29],
#          [30, 31, 32, 33, 34]],

#         [[50, 51, 52, 53, 54],
#          [55, 56, 57, 58, 59]]])

In [12]:
a[torch.arange(3).unsqueeze(-1), idx]

tensor([[[ 0,  1,  2,  3,  4],
         [ 5,  6,  7,  8,  9]],

        [[25, 26, 27, 28, 29],
         [30, 31, 32, 33, 34]],

        [[50, 51, 52, 53, 54],
         [55, 56, 57, 58, 59]]])

In [24]:
b = torch.arange(60).reshape(4,3,5)
b

tensor([[[ 0,  1,  2,  3,  4],
         [ 5,  6,  7,  8,  9],
         [10, 11, 12, 13, 14]],

        [[15, 16, 17, 18, 19],
         [20, 21, 22, 23, 24],
         [25, 26, 27, 28, 29]],

        [[30, 31, 32, 33, 34],
         [35, 36, 37, 38, 39],
         [40, 41, 42, 43, 44]],

        [[45, 46, 47, 48, 49],
         [50, 51, 52, 53, 54],
         [55, 56, 57, 58, 59]]])

In [25]:
b[:, [[0,1,2]], [[1,2,3], [2,3,4], [0,1,2]]]

tensor([[[ 1,  7, 13],
         [ 2,  8, 14],
         [ 0,  6, 12]],

        [[16, 22, 28],
         [17, 23, 29],
         [15, 21, 27]],

        [[31, 37, 43],
         [32, 38, 44],
         [30, 36, 42]],

        [[46, 52, 58],
         [47, 53, 59],
         [45, 51, 57]]])

In [13]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
tokenizer('hello hello hello hello hello hello hello hello hello hello hello hello hello hello hello hello hello hello hello hello hello hello hello hello hello hello hello hello hello ', add_special_tokens=False, padding='max_length', max_length=10, truncation=True)['input_ids']

[19082, 19082, 19082, 19082, 19082, 19082, 19082, 19082, 19082, 19082]

In [36]:
np.array([1,3,2]).argsort().reshape(-1)

array([0, 2, 1])

In [38]:
import torch.nn as nn

In [41]:
softmax = nn.Softmax()

In [44]:
a = torch.arange(15).reshape(3,5)
a

tensor([[ 0,  1,  2,  3,  4],
        [ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14]])

In [45]:
a[[0,1,2],[0,2,1]]

tensor([ 0,  7, 11])