In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET

data = dict()
tree = ET.parse('./training_set/data_homo_train.xml')
root = tree.getroot()

for child in root:
    # print(child.tag, child.attrib)
    data[ child.attrib['id'] ] =  [ i.text for i in child.findall('word') ]


benchmark = pd.read_csv("./training_set/benchmark_homo_train.csv").set_index("text_id").T.to_dict('records')[0]
# print(benchmark)

data_array = list()

for i in data.keys() :
    id = benchmark[i].split("_")[-1]
    pun = data[i][int(id)-1]
    sent = " ".join(data[i])
    # print(sent)

    label = []
    for w in data[i] :
        if w == pun :
            label.append(1)
        else :
            label.append(0)
    data_array.append([i, id, pun, sent, label])

df = pd.DataFrame(data_array, columns=['id', 'id_pun', 'pun', 'sent', 'label' ] )

In [14]:
data_array

[['hom_1',
  '12',
  'sweat',
  'They hid from the gunman in a sauna where they could sweat it out .',
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]],
 ['hom_2',
  '9',
  'saving',
  "Wal - Mart isn ' t the only saving place !",
  [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]],
 ['hom_3',
  '7',
  'sting',
  'Can honeybee abuse lead to a sting operation ?',
  [0, 0, 0, 0, 0, 0, 1, 0, 0]],
 ['hom_4',
  '5',
  'entrenched',
  'A ditch digger was entrenched in his career .',
  [0, 0, 0, 0, 1, 0, 0, 0, 0]],
 ['hom_5',
  '15',
  'forge',
  "She was only a Blacksmith ' s daughter , but she knew how to forge ahead .",
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]],
 ['hom_7',
  '14',
  'hit',
  "Did you hear about the new pinata ? It ' s a huge hit .",
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]],
 ['hom_8',
  '17',
  'vault',
  'A bank manager who was also a high jumper spent most of his time in the vault .',
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]],
 ['hom_9',
  '11',
  '

In [15]:
df

Unnamed: 0,id,id_pun,pun,sent,label
0,hom_1,12,sweat,They hid from the gunman in a sauna where they...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"
1,hom_2,9,saving,Wal - Mart isn ' t the only saving place !,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
2,hom_3,7,sting,Can honeybee abuse lead to a sting operation ?,"[0, 0, 0, 0, 0, 0, 1, 0, 0]"
3,hom_4,5,entrenched,A ditch digger was entrenched in his career .,"[0, 0, 0, 0, 1, 0, 0, 0, 0]"
4,hom_5,15,forge,"She was only a Blacksmith ' s daughter , but s...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
...,...,...,...,...,...
1281,hom_1801,7,chops,A karate school restaurant served mainly chops .,"[0, 0, 0, 0, 0, 0, 1, 0]"
1282,hom_1803,15,drop,For insomnia move to the edge of the bed and y...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
1283,hom_1804,14,saw,He tried to sell me glass with imperfections i...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
1284,hom_1805,15,unwind,Those who make balls of wool for a living like...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"


In [16]:
df.insert(3, "question", ['what is the pun word from the sentence?'] * 1286, True)

In [17]:
df

Unnamed: 0,id,id_pun,pun,question,sent,label
0,hom_1,12,sweat,what is the pun word from the sentence?,They hid from the gunman in a sauna where they...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"
1,hom_2,9,saving,what is the pun word from the sentence?,Wal - Mart isn ' t the only saving place !,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
2,hom_3,7,sting,what is the pun word from the sentence?,Can honeybee abuse lead to a sting operation ?,"[0, 0, 0, 0, 0, 0, 1, 0, 0]"
3,hom_4,5,entrenched,what is the pun word from the sentence?,A ditch digger was entrenched in his career .,"[0, 0, 0, 0, 1, 0, 0, 0, 0]"
4,hom_5,15,forge,what is the pun word from the sentence?,"She was only a Blacksmith ' s daughter , but s...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
...,...,...,...,...,...,...
1281,hom_1801,7,chops,what is the pun word from the sentence?,A karate school restaurant served mainly chops .,"[0, 0, 0, 0, 0, 0, 1, 0]"
1282,hom_1803,15,drop,what is the pun word from the sentence?,For insomnia move to the edge of the bed and y...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
1283,hom_1804,14,saw,what is the pun word from the sentence?,He tried to sell me glass with imperfections i...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
1284,hom_1805,15,unwind,what is the pun word from the sentence?,Those who make balls of wool for a living like...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"


In [18]:
rand_num = np.random.randint(0, len(df))
question = df["question"][rand_num]
sent = df["sent"][rand_num]

In [19]:
input_ids = tokenizer.encode(question, sent)

In [20]:
token_ids = tokenizer.convert_ids_to_tokens(input_ids)
for token, id in zip(token_ids, input_ids):
    print('{:8}{:8,}'.format(token, id))

[CLS]        101
what       2,054
is         2,003
the        1,996
pun       26,136
word       2,773
from       2,013
the        1,996
sentence   6,251
?          1,029
[SEP]        102
what       2,054
do         2,079
they       2,027
call       2,655
all        2,035
that       2,008
lumber    13,891
on         2,006
the        1,996
star       2,732
trek      10,313
'          1,005
s          1,055
enterprise   6,960
?          1,029
captain    2,952
'          1,005
s          1,055
log        8,833
.          1,012
[SEP]        102


In [21]:
from torch.utils.data import Dataset, DataLoader

class QADataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        question = sample['question']
        text = sample['text']
        target_indices = sample['label']

        # Perform any additional preprocessing if necessary

        return (question, text, target_indices)

dataset = QADataset(df)

In [22]:
dataset

<__main__.QADataset at 0x138d8bd00>

In [23]:
# Create dataloader
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [24]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x138d8b070>

In [25]:
print(dataloader.__len__)

<bound method DataLoader.__len__ of <torch.utils.data.dataloader.DataLoader object at 0x138d8b070>>


In [26]:
data = next(iter(dataloader))

KeyError: 225

In [None]:
# Now you can iterate over the dataloader in your training loop
for batch in dataloader:
    print(batch)
    # questions = batch['question']
    # texts = batch['text']
    # target_indices = batch['target_indices']

    # Now you can use these batches for training your BERT model

