In [0]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/cd/38/c9527aa055241c66c4d785381eaf6f80a28c224cae97daa1f8b183b5fabb/transformers-2.9.0-py3-none-any.whl (635kB)
[K     |████████████████████████████████| 645kB 8.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 22.9MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/98/2c/8df20f3ac6c22ac224fff307ebc102818206c53fc454ecd37d8ac2060df5/sentencepiece-0.1.86-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 45.6MB/s 
[?25hCollecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K     |████

In [0]:
import transformers as trf
import torch as pt
import numpy as np

import matplotlib.pyplot as plt
from collections import OrderedDict
import seaborn as sns
import json
import tqdm

import pandas as pd

from scipy.stats import spearmanr as Sr

  import pandas.util.testing as tm


In [0]:
data = pd.read_csv('misconception_data.txt', sep=';', header=None)

In [0]:
data

Unnamed: 0,0,1,2,3,4,5
0,There are [MASK] taste groups a human tongue c...,four,two,three,many,many
1,Mental abilities are [MASK] separated into the...,absolutely,clearly,generally,loosely,loosely
2,Drinking eight glasses of water a day is [MASK...,necessary,mandatory,must,recommended,recommended
3,Chewing gum is [MASK] indigestible and passes ...,not,never,no,mostly,mostly
4,Vaccines [MASK] cause autism.,can,often,mostly,cannot,cannot
...,...,...,...,...,...,...
104,There is [MASK] universal sign language.,one,only,deaf,no,no
105,The light bulb was [MASK] invented by Thomas E...,first,original,science,not,not
106,It is [MASK] that Henry Ford invented the auto...,true,certain,possible,false,false
107,"As 30% of people don't wash their hands, door ...",more,most,ill,fewer,fewer


In [0]:
def check_if_singles(first_ans, second_ans, third_ans, fourth_ans, true_ans):
  i = 1
  good = True
  for t in zip(first_ans, second_ans, third_ans, fourth_ans, true_ans):
    if any(len(x) > 1 for x in t):
      print(i)
      print(t)
      good = False
    
    i += 1
  return good

In [0]:
def encode_answers(data, tkr):
  first_ans = list(map(lambda s: tkr.encode(s)[1:-1], data[1]))
  second_ans = list(map(lambda s: tkr.encode(s)[1:-1], data[2]))
  third_ans = list(map(lambda s: tkr.encode(s)[1:-1], data[3]))
  fourth_ans = list(map(lambda s: tkr.encode(s)[1:-1], data[4]))
  true_ans = list(map(lambda s: tkr.encode(s)[1:-1], data[5]))
  good = check_if_singles(first_ans, second_ans, 
                          third_ans, fourth_ans, true_ans)
  if not good:
    print("Some answers are not single tokens")
  return first_ans, second_ans, third_ans, fourth_ans, true_ans

In [0]:
#first_ans = list(map(lambda s: tkr.encode(s)[1:-1], data[1]))
"""for i, t in enumerate(first_ans):
  print(i, "  ", t)"""
#second_ans = list(map(lambda s: tkr.encode(s)[1:-1], data[2]))
#for i, t in enumerate(second_ans):
#  print(i, "  ", t)
#third_ans = list(map(lambda s: tkr.encode(s)[1:-1], data[3]))
#for i, t in enumerate(third_ans):
#  print(i, "  ", t)
#fourth_ans = list(map(lambda s: tkr.encode(s)[1:-1], data[4]))
#for i, t in enumerate(fourth_ans):
#  print(i+1, "  ", t)
#true_ans = list(map(lambda s: tkr.encode(s)[1:-1], data[5]))
#for i, t in enumerate(true_ans):
#  print(i+1, "  ", t)


'for i, t in enumerate(first_ans):\n  print(i, "  ", t)'

In [0]:
def evaluate(model, tks, data_tks_targets, 
             true_ans, first_ans, second_ans, third_ans, fourth_ans):
    
    correct = 0
    incorrect = 0
    fails = []
    most_common_misconceptions = 0
    h1 = 0
    h2 = 0
    h5 = 0
    h10 = 0
    h20 = 0
    h50 = 0
    h100 = 0
    total = 0
    with pt.no_grad():
        for tk, target, ans, fa, sa, ta, ftha in tqdm.tqdm(zip(tks, data_tks_targets, true_ans, first_ans, second_ans, third_ans, fourth_ans)):

            tkm = tk.copy()
            
            total += 1
            ranks = ((-model(pt.tensor(tkm).unsqueeze(0))[0][0, target]).argsort())
            idx = list(ranks).index(ans[0])
            idx_first = list(ranks).index(fa[0])
            idx_second = list(ranks).index(sa[0]) 
            idx_third = list(ranks).index(ta[0])
            idx_fourth = list(ranks).index(ftha[0])
            pred = min(idx, idx_first, idx_second, idx_third, idx_fourth)
            if pred == idx:
              correct += 1
            else: 
              incorrect += 1
              fails.append((total, list(ranks)[pred]))
              if pred == idx_first:
                most_common_misconceptions += 1
            top_ranks = ranks[:100]
            try:
              idx = list(top_ranks).index(ans[0])
            except:
                continue
            if 1 > idx:
                h1 += 1
            if 2 > idx:
                h2 += 1
            if 5 > idx:
                h5 += 1
            if 10 > idx:
                h10 += 1
            if 20 > idx:
                h20 += 1
            if 50 > idx:
                h50 += 1
            h100 += 1
    print()
    print("Correct: ", correct)
    print("Incorrect: ", incorrect)
    print("Misconceptions: ", most_common_misconceptions)

    print(f"""H@1: {h1}
H@2: {h2}
H@5: {h5}
H@10: {h10}
H@20: {h20}
H@50: {h50}
H@100: {h100}
Total: {total}""")
    return fails

In [0]:
def eval_helper(model, tkr, data, roberta=False):
  if roberta:
    tks = list(map(lambda s: tkr.encode(s.replace('[MASK]', '<mask>')), data[0]))
  else:
    tks = list(map(lambda s: tkr.encode(s), data[0]))
  first_ans, second_ans, third_ans, fourth_ans, true_ans = encode_answers(data, 
                                                                          tkr)
  data_tks_targets = list(map(lambda x: x.index(tkr.mask_token_id), tks))
  fails = evaluate(model, tks, data_tks_targets,
                   true_ans, first_ans, second_ans, third_ans, fourth_ans)
  for t in fails:
    print(t[0], tkr.convert_ids_to_tokens([t[1]]))
  local_fails = [(t[0], tkr.convert_ids_to_tokens([t[1]])) for t in fails]
  return local_fails


In [0]:
print("############ BERT ############")
tkr_bert = trf.BertTokenizer.from_pretrained('bert-base-uncased')
bert = trf.BertForMaskedLM.from_pretrained('bert-base-uncased', output_attentions=True)
bert_fails = eval_helper(bert, tkr_bert, data)

############ BERT ############


109it [01:16,  1.43it/s]


Correct:  38
Incorrect:  71
Misconceptions:  51
H@1: 14
H@2: 22
H@5: 33
H@10: 57
H@20: 66
H@50: 85
H@100: 91
Total: 109
2 ['generally']
3 ['necessary']
4 ['not']
5 ['can']
6 ['may']
7 ['enough']
8 ['can']
9 ['different']
11 ['fact']
12 ['can']
13 ['can']
14 ['can']
15 ['five']
16 ['often']
17 ['can']
18 ['will']
20 ['sure']
23 ['never']
24 ['true']
25 ['true']
26 ['can']
27 ['hide']
29 ['an']
31 ['will']
32 ['years']
33 ['fact']
34 ['always']
35 ['every']
36 ['china']
38 ['corn']
39 ['true']
41 ['quicker']
42 ['capital']
44 ['vegetable']
45 ['blind']
46 ['minutes']
49 ['can']
50 ['four']
51 ['more']
52 ['can']
53 ['meat']
57 ['can']
60 ['sugar']
61 ['can']
64 ['never']
67 ['hot']
68 ['can']
69 ['must']
70 ['can']
72 ['one']
73 ['can']
75 ['quicker']
76 ['equivalent']
77 ['can']
79 ['first']
80 ['tree']
81 ['only']
83 ['about']
85 ['first']
87 ['the']
93 ['true']
94 ['july']
96 ['very']
97 ['often']
99 ['true']
100 ['directly']
102 ['also']
103 ['very']
106 ['first']
107 ['possible']
1




In [0]:
print("############ RoBERTa ############")
tkr_roberta = trf.RobertaTokenizer.from_pretrained('roberta-base')
roberta = trf.RobertaForMaskedLM.from_pretrained('roberta-base')
roberta_fails = eval_helper(roberta, tkr_roberta, data, roberta=True)

############ RoBERTa ############


109it [02:05,  1.15s/it]


Correct:  34
Incorrect:  75
Misconceptions:  55
H@1: 15
H@2: 23
H@5: 37
H@10: 53
H@20: 64
H@50: 76
H@100: 85
Total: 109
1 ['Ġthree']
2 ['Ġgenerally']
4 ['Ġnot']
5 ['Ġcan']
6 ['Ġmay']
7 ['Ġenough']
8 ['Ġcan']
9 ['Ġdifferent']
12 ['Ġcan']
13 ['Ġcan']
14 ['Ġcan']
15 ['Ġfive']
16 ['Ġoften']
17 ['Ġcan']
18 ['Ġcan']
19 ['Ġequal']
20 ['Ġsure']
23 ['Ġnever']
24 ['Ġtrue']
25 ['Ġtrue']
26 ['Ġcan']
27 ['Ġhide']
29 ['Ġan']
31 ['Ġwill']
32 ['Ġyears']
33 ['Ġfact']
34 ['Ġcan']
35 ['Ġevery']
36 ['ĠGermany']
39 ['Ġtrue']
41 ['Ġquicker']
42 ['Ġcapital']
44 ['Ġvegetable']
45 ['Ġpink']
46 ['Ġseconds']
49 ['Ġcan']
50 ['Ġfour']
51 ['Ġmore']
52 ['Ġcan']
53 ['Ġcheese']
55 ['Ġhealthy']
56 ['Ġdinosaurs']
57 ['Ġcan']
58 ['Ġapple']
60 ['Ġbeans']
61 ['Ġcan']
62 ['Ġyellow']
63 ['Ġalways']
64 ['Ġnever']
67 ['Ġhot']
68 ['Ġcan']
69 ['Ġmust']
70 ['Ġcan']
72 ['Ġone']
73 ['Ġcan']
76 ['Ġequivalent']
77 ['Ġcan']
78 ['Ġlight']
79 ['Ġfirst']
80 ['Ġtree']
81 ['Ġonly']
83 ['Ġabout']
85 ['Ġfirst']
87 ['Ġthe']
93 ['Ġtrue']
94 [




In [0]:
print("############ ALBERT ############")
tkr_albert = trf.AlbertTokenizer.from_pretrained('albert-base-v2')
albert = trf.AlbertForMaskedLM.from_pretrained('albert-base-v2')
albert_fails = eval_helper(albert, tkr_albert, data)

############ ALBERT ############


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760289.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=684.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=47376696.0, style=ProgressStyle(descrip…




109it [01:13,  1.48it/s]


Correct:  37
Incorrect:  72
Misconceptions:  51
H@1: 8
H@2: 16
H@5: 28
H@10: 46
H@20: 59
H@50: 72
H@100: 82
Total: 109
4 ['▁not']
6 ['▁may']
7 ['▁enough']
8 ['▁can']
9 ['▁different']
10 ['▁cannot']
11 ['▁fact']
12 ['▁can']
13 ['▁can']
14 ['▁can']
16 ['▁often']
17 ['▁can']
18 ['▁will']
19 ['▁same']
20 ['▁sure']
23 ['▁never']
24 ['▁true']
25 ['▁true']
27 ['▁hide']
28 ['▁slowly']
29 ['▁an']
31 ['▁often']
32 ['▁years']
33 ['▁fact']
34 ['▁always']
35 ['▁every']
36 ['▁china']
39 ['▁true']
41 ['▁thicker']
42 ['▁capital']
43 ['▁perth']
44 ['▁vegetable']
46 ['▁seconds']
47 ['▁cannot']
49 ['▁can']
50 ['▁four']
52 ['▁can']
53 ['▁garlic']
54 ['▁yellow']
55 ['▁sober']
58 ['▁carrot']
60 ['▁beans']
61 ['▁can']
62 ['▁yellow']
64 ['▁never']
66 ['▁directly']
67 ['▁hot']
68 ['▁can']
69 ['▁must']
70 ['▁can']
71 ['▁true']
72 ['▁one']
73 ['▁can']
75 ['▁quicker']
76 ['▁equivalent']
77 ['▁can']
79 ['▁first']
80 ['▁tree']
83 ['▁about']
87 ['▁the']
90 ['▁true']
91 ['▁drowned']
93 ['▁true']
94 ['▁july']
96 ['▁v




In [0]:
print("############ ELECTRA ############")
tkr_electra = trf.ElectraTokenizer.from_pretrained('google/electra-small-generator')
electra = trf.ElectraForMaskedLM.from_pretrained('google/electra-small-generator')
electra_fails = eval_helper(electra, tkr_electra, data)

############ ELECTRA ############


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=463.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=54236116.0, style=ProgressStyle(descrip…




109it [01:06,  1.65it/s]


Correct:  33
Incorrect:  76
Misconceptions:  51
H@1: 11
H@2: 21
H@5: 31
H@10: 51
H@20: 60
H@50: 81
H@100: 89
Total: 109
2 ['generally']
3 ['necessary']
4 ['not']
5 ['can']
6 ['may']
7 ['enough']
8 ['can']
9 ['different']
11 ['fact']
12 ['can']
13 ['can']
14 ['can']
16 ['often']
17 ['can']
18 ['will']
19 ['equal']
20 ['sure']
24 ['true']
25 ['true']
26 ['can']
27 ['hide']
28 ['long']
29 ['an']
31 ['will']
32 ['years']
33 ['fact']
34 ['can']
35 ['every']
36 ['china']
38 ['nuts']
39 ['true']
41 ['quicker']
42 ['capital']
43 ['sydney']
44 ['vegetable']
45 ['dogs']
46 ['minutes']
49 ['can']
50 ['four']
52 ['can']
53 ['meat']
54 ['green']
55 ['healthy']
57 ['can']
59 ['losses']
60 ['sugar']
61 ['can']
63 ['always']
64 ['never']
67 ['hot']
68 ['can']
69 ['must']
70 ['can']
72 ['some']
73 ['can']
75 ['good']
76 ['equivalent']
77 ['can']
78 ['light']
79 ['first']
80 ['tree']
83 ['about']
87 ['the']
90 ['true']
93 ['true']
94 ['may']
96 ['very']
99 ['true']
100 ['directly']
101 ['only']
102 ['a




In [0]:
tkr_roberta.mask_token

'<mask>'

In [0]:
tkr_bert.encode('[MASK]')

[101, 103, 102]

In [0]:
for t in fails:
  print(t[0], tkr.convert_ids_to_tokens([t[1]]))

1 ['china']
2 ['mozart']
3 ['water']
4 ['nuts']
5 ['apple']
6 ['10']
8 ['quicker']
9 ['good']
10 ['sydney']
11 ['vegetable']
13 ['dogs']
14 ['minutes']
17 ['meat']
20 ['can']
21 ['losses']
22 ['can']
24 ['can']
25 ['four']
27 ['can']


In [0]:
# BERT 3, 4, 6, 17, are not common misconceptions just wrong answers

In [0]:
# RoBERTa 3, 8, 10, 15, 16, 17, 18, 20, 25

In [0]:
# ALbert 2, 10, 12, 17, 20, 23

In [0]:
# Electra 3, 9, 13, 14, 17, 21, 