In [2]:
import json

In [3]:
from collections.abc import Iterable

def flatten(xs):
    for x in xs:
        if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
            yield from flatten(x)
        else:
            yield x

In [4]:
def read_data(path):
    with open(path, 'r') as f:
      data = json.load(f)
    
    contexts = []
    questions = []
    answers = []
    
    for group in data:
        # Removing yes/no questions not found in the context
        if "yes" in group['answer'] or "no" in group['answer']:
            continue
        contexts.append(''.join(flatten(group['context'])))
        questions.append(group['question'])
        answers.append(group['answer'])
        
    return contexts, questions, answers

In [5]:
train_contexts, train_questions, train_answers = read_data('train_set.json')
val_contexts, val_questions, val_answers = read_data('dev_set.json')

In [14]:
print(len(train_answers))
print(len(val_answers))

83159
6768


In [15]:
print(val_answers[0])
print(val_questions[0])
print(val_contexts[0])

Chief of Protocol
What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?
Meet Corliss ArcherMeet Corliss Archer, a program from radio's Golden Age, ran from January 7, 1943 to September 30, 1956. Although it was CBS's answer to NBC's popular "A Date with Judy", it was also broadcast by NBC in 1948 as a summer replacement for "The Bob Hope Show". From October 3, 1952 to June 26, 1953, it aired on ABC, finally returning to CBS. Despite the program's long run, fewer than 24 episodes are known to exist.Shirley TempleShirley Temple Black (April 23, 1928 – February 10, 2014) was an American actress, singer, dancer, businesswoman, and diplomat who was Hollywood's number one box-office draw as a child actress from 1935 to 1938. As an adult, she was named United States ambassador to Ghana and to Czechoslovakia and also served as Chief of Protocol of the United States.Janet WaldoJanet Marie Waldo (February 4, 1920 – June 12, 2016) was an American r

In [16]:
train_answers

["Arthur's Magazine",
 'Delhi',
 'President Richard Nixon',
 'American',
 'alcohol',
 'Jonathan Stark',
 'Crambidae',
 'Badr Hari',
 '2006',
 '6.213 km long',
 'Jaime Meline',
 'Walter Darwin Coy',
 'United States',
 'Super Bowl XLVIII',
 'US 60',
 '2006',
 'Hetfield and Ulrich, longtime lead guitarist Kirk Hammett, and bassist Robert Trujillo.',
 'Fox',
 '2017',
 'Nevada',
 'Hawaii',
 'Kelli Ward',
 'The Wolfhounds',
 '16-year-old',
 'World War II',
 'Todd Phillips',
 'Carol Lawrence',
 'New York City',
 'Amy Jo Johnson',
 'Aleksander Ford',
 'director',
 'Roseau, Minnesota, USA',
 'The Saimaa Gesture',
 'David Lee Roth',
 'Nassau County',
 'Australia',
 'California',
 'Dessau',
 'Roman Catholic',
 'The Joshua Tree',
 'Ulster County',
 'Tammy Wynette',
 'Sir Francis Nethersole',
 'ingredients in beer',
 'Dennis Howard Marks',
 'Robert Sheehan',
 'Glenn Hughes',
 'March 28, 1941',
 'standard gauge track',
 'Sex and the City',
 'Kato',
 'England',
 'Robert Zemeckis',
 '1932',
 'nine',
 

In [15]:
def update_train_answers(answers, contexts):
    temp = []
    for answer, context in zip(answers,contexts):
        gold_text = answer
        start_idx = context.find(answer)
        # There are some yes/no answers not found in the context
        if start_idx == -1:
            print(answer)
        end_idx = start_idx + len(gold_text)
        if context[start_idx:end_idx] == gold_text:
            temp.append({'text':answer, 'answer_start':start_idx, 'answer_end':end_idx})
        else:
            for n in [1,2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                    temp.append({'text':answer, 'answer_start':start_idx, 'answer_end':end_idx})
    return temp

train_answers_n = update_train_answers(train_answers,train_contexts)
val_answers_n = update_train_answers(val_answers,val_contexts)

In [18]:
print(len(val_answers_n))
print(val_answers_n[-5:])

6768
[{'text': 'Southwest Oregon Regional Airport', 'answer_start': 1575, 'answer_end': 1608}, {'text': 'President John F. Kennedy', 'answer_start': 6830, 'answer_end': 6855}, {'text': 'Ouse and Foss', 'answer_start': 1930, 'answer_end': 1943}, {'text': 'Yasir Hussain', 'answer_start': 278, 'answer_end': 291}, {'text': 'Norwood, Massachusetts', 'answer_start': 2255, 'answer_end': 2277}]


In [6]:
ARCHITECTURE_NAME = "models/distilbert-custom/"

In [7]:
from transformers import AutoTokenizer

# Tokenise input using DistilBERT tokeniser from pretrained 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(ARCHITECTURE_NAME)

In [8]:
# train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [9]:
def add_token_answers(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i,answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i,answers[i]['answer_end']))
        
        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        # end position cannot be found, char_to_token found space, so shift one token forward
        go_back = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end']-go_back)
            go_back +=1
    encodings.update({
        'start_positions':start_positions,
        'end_positions':end_positions
                     })
    print(None in start_positions)
    print(None in end_positions)

In [16]:
# add_token_answers(train_encodings,train_answers_n)
add_token_answers(val_encodings,val_answers_n)

False
False


In [17]:
# train_encodings.keys()
val_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [18]:
# print(train_encodings['start_positions'][1])
# print(train_encodings['end_positions'][1])

In [19]:
from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained(ARCHITECTURE_NAME)

In [20]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm
import torch

In [21]:
import torch

class NLDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

# train_dataset = NLDataset(train_encodings)
val_dataset = NLDataset(val_encodings)

In [None]:
# # switch model out of training mode
# model.eval()

# #val_sampler = SequentialSampler(val_dataset)
# val_loader = DataLoader(val_dataset, batch_size=16)

# acc = []

# # initialize loop for progress bar
# loop = tqdm(val_loader)
# # loop through batches
# for batch in loop:
#     # we don't need to calculate gradients as we're not training
#     total_tp = 0
#     total_fp = 0
#     total_fn = 0
#     with torch.no_grad():
#         # pull batched items from loader
#         input_ids = batch['input_ids']
#         attention_mask = batch['attention_mask']
#         start_true = batch['start_positions']
#         end_true = batch['end_positions']
#         # make predictions
#         outputs = model(input_ids, attention_mask=attention_mask)
#         # pull preds out
#         start_pred = torch.argmax(outputs['start_logits'], dim=1)
#         end_pred = torch.argmax(outputs['end_logits'], dim=1)
        
#         # Calculate ACCURACY
#         # calculate accuracy for both and append to accuracy list
#         acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
#         acc.append(((end_pred == end_true).sum()/len(end_pred)).item())
        
#         # Calculate F1
#         # calculate True Positive, False Negative and False Positive
#         for i in range(len(start_pred)):
#             x = range(start_pred[i],end_pred[i])
#             y = range(start_true[i],end_true[i])

#             xs = set(x)
#             ys = set(y)
#             tp = len(xs&ys)
#             fp = len(xs-ys)
#             fn = len(ys-xs)
#             total_tp += tp
#             total_fp += fp
#             total_fn += fn
        
# # calculate average accuracy in total
# acc = sum(acc)/len(acc)
# precision = total_tp/(total_tp+total_fp)
# recall=total_tp/(total_tp+total_fn)

In [35]:
# print("T/F\tstart\tend\n")
# for i in range(len(start_true)):
#     print(f"true\t{start_true[i]}\t{end_true[i]}\n"
#           f"pred\t{start_pred[i]}\t{end_pred[i]}\n")

T/F	start	end

true	512	494
pred	23	494

true	37	43
pred	37	126

true	323	327
pred	323	327

true	512	471
pred	316	471

true	512	489
pred	356	489

true	512	496
pred	279	281

true	512	493
pred	283	493

true	47	49
pred	43	49

true	512	494
pred	207	494

true	267	269
pred	364	4

true	512	490
pred	37	490

true	344	347
pred	344	347

true	512	491
pred	465	491

true	401	405
pred	401	402

true	71	74
pred	43	45

true	485	488
pred	152	156



In [22]:
# switch model out of training mode
model.eval()

#val_sampler = SequentialSampler(val_dataset)
val_loader = DataLoader(val_dataset, batch_size=16)

acc2 = []

# initialize loop for progress bar
loop = tqdm(val_loader)
# loop through batches
for batch in loop:
    # we don't need to calculate gradients as we're not training
    total_tp = 0
    total_fp = 0
    total_fn = 0
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        start_true = batch['start_positions']
        end_true = batch['end_positions']
        # make predictions
        outputs = model(input_ids, attention_mask=attention_mask)
        # pull preds out
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        
        # Calculate ACCURACY
        # calculate accuracy for both and append to accuracy list
        acc2.append(((start_pred == start_true).sum()/len(start_pred)).item())
        acc2.append(((end_pred == end_true).sum()/len(end_pred)).item())
        
        # Calculate F1
        # calculate True Positive, False Negative and False Positive
        for i in range(len(start_pred)):
            x = range(start_pred[i],end_pred[i])
            y = range(start_true[i],end_true[i])

            xs = set(x)
            ys = set(y)
            tp = len(xs&ys)
            fp = len(xs-ys)
            fn = len(ys-xs)
            total_tp += tp
            total_fp += fp
            total_fn += fn
    break

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/423 [00:11<?, ?it/s]


In [28]:
print(len(start_true))
print(start_true[0],end_true[0])
print(start_pred[0],end_pred[0])
print(val_answers_n[0])
print(batch["input_ids"][0][181:183])
print(tokenizer.decode(batch["input_ids"][0][349:489]))
val_questions[0]

16
tensor(181) tensor(183)
tensor(349) tensor(488)
{'text': 'Chief of Protocol', 'answer_start': 759, 'answer_end': 776}
tensor([2708, 1997])
lord high treasurer or lord treasurer was an english government position and has been a british government position since the acts of union of 1707. a holder of the post would be the third - highest - ranked great officer of state, below the lord high steward and the lord high chancellor. a kiss for corlissa kiss for corliss is a 1949 american comedy film directed by richard wallace and written by howard dimsdale. it stars shirley temple in her final starring role as well as her final film appearance. it is a sequel to the 1945 film " kiss and tell ". " a kiss for corliss " was retitled " almost a bride " before release and this title appears in the title sequence


'What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?'

In [31]:
print(len(start_true))
print(start_true[1],end_true[1])
print(start_pred[1],end_pred[1])
print(val_answers_n[1])
print(tokenizer.decode(batch["input_ids"][1][328:332]))
print(tokenizer.decode(batch["input_ids"][1][328:479]))
val_questions[1]

16
tensor(328) tensor(332)
tensor(328) tensor(479)
{'text': 'Animorphs', 'answer_start': 1542, 'answer_end': 1551}
animorphs
animorphs " series, written by k. a. applegate. with respect to continuity within the series, it takes place before book # 23, " the pretender ", although the events told in the story occur between the time of " the ellimist chronicles " and " the andalite chronicles ". the book is introduced by tobias, who flies to the valley of the free hork - bajir, where jara hamee tells him the story of how the yeerks enslaved the hork - bajir, and how aldrea, an andalite, and her companion, dak hamee, a hork - bajir, tried to save their world from the invasion. jara


'What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?'

In [33]:
print(len(start_true))
print(start_true[2],end_true[2])
print(start_pred[2],end_pred[2])
print(val_answers_n[2])
print(tokenizer.decode(batch["input_ids"][2][250:256]))
print(tokenizer.decode(batch["input_ids"][2][250:490]))
val_questions[2]

16
tensor(250) tensor(256)
tensor(250) tensor(490)
{'text': 'Greenwich Village, New York City', 'answer_start': 1191, 'answer_end': 1223}
greenwich village, new york city
greenwich village, new york city. trigiani has published a novel a year since 2000. great eastern conventionsgreat eastern conventions, inc. was an entertainment company which produced comic book conventions, most actively during the years 1987 - 1996. in new york city, the great eastern shows filled the gap between the mid - 1980s demise of the annual comic art convention and creation conventions, and the establishment of promoter michael carbonaro's annual big apple comic con in 1996. from 1993 – 1995, great eastern hosted two new york city shows annually at the jacob k. javits convention center. great eastern also ran shows in new jersey, pennsylvania, massachusetts, oregon, minnesota, and texas. new york society of model engineersthe new york society of model engineers ( nysme ) was originally incorporated in 1926

'The director of the romantic comedy "Big Stone Gap" is based in what New York city?'

'What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?'

In [53]:
print(len(batch["input_ids"]))


16


In [34]:
for x in batch["input_ids"]:
    print(tokenizer.decode(x))
    break

[CLS] meet corliss archermeet corliss archer, a program from radio's golden age, ran from january 7, 1943 to september 30, 1956. although it was cbs's answer to nbc's popular " a date with judy ", it was also broadcast by nbc in 1948 as a summer replacement for " the bob hope show ". from october 3, 1952 to june 26, 1953, it aired on abc, finally returning to cbs. despite the program's long run, fewer than 24 episodes are known to exist. shirley templeshirley temple black ( april 23, 1928 – february 10, 2014 ) was an american actress, singer, dancer, businesswoman, and diplomat who was hollywood's number one box - office draw as a child actress from 1935 to 1938. as an adult, she was named united states ambassador to ghana and to czechoslovakia and also served as chief of protocol of the united states. janet waldojanet marie waldo ( february 4, 1920 – june 12, 2016 ) was an american radio and voice actress. she is best known in animation for voicing judy jetson, nancy in " shazzan ", p

In [25]:
# print(len(outputs['start_logits']))
# for x in outputs['start_logits']:
#     print(tokenizer.decode(x))

In [26]:
print(len(outputs['start_logits']))
for x in start_pred:
    print(x)

16
tensor(349)
tensor(328)
tensor(250)
tensor(454)
tensor(375)
tensor(102)
tensor(83)
tensor(448)
tensor(296)
tensor(440)
tensor(213)
tensor(186)
tensor(427)
tensor(298)
tensor(29)
tensor(472)
