In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2"

import evaluate
import transformers
import argparse
import torch
import json
import datasets

from numpy import mean
from tqdm import tqdm

TOKEN_QUESTION = "### Question:"
TOKEN_END_QUESTION = ""
TOKEN_CONTEXT = "### Context:"
TOKEN_END_CONTEXT = ""
TOKEN_ANSWER = "### Answer:"
TOKEN_END_ANSWER = ""
HIGHLIGHT_ANSWER = ""
SPLIT_SEED = 42
NPROC = 32
HIGHLIGHT = True
LLAMA_PATH = "./results/llama-7B/checkpoint-1000"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
bertscore = evaluate.load("bertscore")
rouge = evaluate.load("rouge")

In [4]:
tokenizer = transformers.AutoTokenizer.from_pretrained(LLAMA_PATH)
model = transformers.AutoModelForCausalLM.from_pretrained(LLAMA_PATH, device_map="auto")

Loading checkpoint shards: 100%|██████████| 2/2 [00:19<00:00,  9.75s/it]


In [5]:
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

In [3]:

def get_inputs_target(e):
    answer_start = e["answers"]["answer_start"][0]
    # add highlight token to context
    ans_len = len(e["answers"]["text"][0])

    e["context"] = (
        e["context"][:answer_start]
        + " "
        + HIGHLIGHT_ANSWER
        + " "
        + e["context"][answer_start : answer_start + ans_len]
        + " "
        + HIGHLIGHT_ANSWER
        + " "
        + e["context"][answer_start + ans_len :]
        + " "
        + TOKEN_ANSWER
        + " "
        + e["answers"]["text"][0]
        + " "
        + TOKEN_END_ANSWER
    )

    return {
        # answer + context + question for causal language modeling
        "text": f'<s> {e["context"]} {TOKEN_QUESTION}',
    }



def preprocess_squad_dataset(dataset_name="squad", split="train"):
    dataset = datasets.load_dataset(dataset_name, split=split).select(range(1000,2000))
    dataset = dataset.map(get_inputs_target, num_proc=NPROC)
    return dataset


# load dataset
valid_dataset = preprocess_squad_dataset(dataset_name="squad", split="validation")

Map (num_proc=32): 100%|██████████| 1000/1000 [00:00<00:00, 1787.43 examples/s]


In [4]:
shuffled = valid_dataset.shuffle(SPLIT_SEED).select(range(100))


In [12]:
# put all contexts, questions, and references answers in a json file
with open("llama.json", "w") as f:
    for e in shuffled:
        f.write(
            json.dumps(
                {
                    "context": e["context"],
                    "question": e["question"],
                    "answers": e["answers"]["text"][0],
                }
            )
            + ",\n"
        )

In [21]:
example = valid_dataset[2]
sequences = pipeline(
            example['text'],
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
            max_length=512
        )
prediction = str([s["generated_text"] for s in sequences])
print("> Prediction:", prediction.split(TOKEN_QUESTION)[1].split('?')[0], "\n", "> Original:", example["question"], "\n") 

> Prediction:  What is the name of the big park in the northern Mokotów 
 Original: Where was the first horse racetrack located? 



In [30]:
predictions = []
references = []
contexes = []

for examples in tqdm(valid_dataset.shuffle(SPLIT_SEED).select(range(100))):
    sequences = pipeline(
            examples['text'],
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
            max_length=512
        )
    prediction = str([s["generated_text"] for s in sequences])
    predictions.append(prediction.split(TOKEN_QUESTION)[1].split('?')[0] + "?")
    references.append(examples["question"])
    contexes.append(examples["context"])
    print("> Prediction:", prediction.split(TOKEN_QUESTION)[1].split('?')[0], "\n" + "> Original:", examples["question"], "\n") 

  0%|          | 0/100 [00:00<?, ?it/s]

  1%|          | 1/100 [00:27<45:57, 27.85s/it]

> Prediction:  What is the main problem with modern schooling 
> Original: What do critics think the cause of problems with modern schooling is? 



  2%|▏         | 2/100 [00:55<45:12, 27.68s/it]

> Prediction:  What is the most significant difference between primary school and secondary school teaching 
> Original: What's the biggest difference in the teaching relationship for primary and secondary school? 



  3%|▎         | 3/100 [01:25<46:51, 28.99s/it]

> Prediction:  Who is credited with the invention of the Turing machine 
> Original: Who was the most influential researcher among those grappling with the deficit of work surrounding the complexity posed by algorithmic problems? 



  4%|▍         | 4/100 [01:58<48:44, 30.47s/it]

> Prediction:  What is another name for a school 
> Original: Where is another indoor location for a teacher other than a school? 



  5%|▌         | 5/100 [02:26<46:33, 29.41s/it]

> Prediction:  When did Roger de Tosny travel to the Iberian Peninsula 
> Original: What year did Roger de Tosny fail to accomplish what he set out to do? 



  6%|▌         | 6/100 [02:54<45:18, 28.92s/it]

> Prediction:  Where did Tesla bring injured pigeons to nurse back to health 
> Original: To what place did he bring the injured pigeons to take care of them? 



  7%|▋         | 7/100 [03:19<43:12, 27.88s/it]

> Prediction:  What is the time complexity of the knapsack problem 
> Original: How quickly can an algorithm solve an NP-complete knapsack problem? 



  8%|▊         | 8/100 [03:44<41:16, 26.91s/it]

> Prediction:  What did Tesla do for 60 florins a month 
> Original: What did Tesla work as after dropping out?  



  9%|▉         | 9/100 [04:13<41:54, 27.63s/it]

> Prediction:  Who bought the rights to the islands from Jean's nephew 
> Original: Who bought the rights? 



 10%|█         | 10/100 [04:39<40:32, 27.03s/it]

> Prediction:  What motivates the concept of a problem being hard for a complexity class 
> Original: What would create a conflict between a problem X and problem C within the context of reduction?  



 11%|█         | 11/100 [05:09<41:13, 27.79s/it]

> Prediction:  What position did Tesla fire because of weight 
> Original: Who did he fire? 



 12%|█▏        | 12/100 [05:35<40:07, 27.35s/it]

> Prediction:  What is balanced with positive reinforcement 
> Original: What is the balance for positive reinforcement? 



 13%|█▎        | 13/100 [05:58<37:47, 26.07s/it]

> Prediction:  What time did Tesla have dinner 
> Original: At what time did Tesla get dinner? 



 14%|█▍        | 14/100 [06:21<36:02, 25.14s/it]

> Prediction:  What does a teacher need to show to have a positive learning experience 
> Original: What must a teacher show towards the course materials for increase learning? 



 15%|█▌        | 15/100 [06:46<35:39, 25.17s/it]

> Prediction:  In what city did Tesla work for the Budapest Telephone Exchange 
> Original: Where did Tesla move in 1881? 



 16%|█▌        | 16/100 [07:16<36:58, 26.41s/it]

> Prediction:  What type of military personnel could be targeted by the "peace ray" 
> Original: What ground based group did Tesla think the weapon could be used on? 



 17%|█▋        | 17/100 [07:36<33:49, 24.45s/it]

> Prediction:  What was the date of Tesla's first experiment at his Colorado Springs lab 
> Original: What date did Tesla begin his Colorado Springs experiments? 



 18%|█▊        | 18/100 [08:02<34:21, 25.14s/it]

> Prediction:  What caused Milutin Tesla's death 
> Original: What was one of theories as to what caused Tesla's father's unspecified illness? 



 19%|█▉        | 19/100 [08:24<32:33, 24.12s/it]

> Prediction:  What is the population of the greater metropolitan area of Warsaw 
> Original: How many residents does the greater metropolitan area have? 



 20%|██        | 20/100 [08:46<31:06, 23.33s/it]

> Prediction:  When did the Normans begin to develop a distinct cultural and ethnic identity 
> Original: What century did the Normans first gain their separate identity? 



 21%|██        | 21/100 [09:10<31:14, 23.72s/it]

> Prediction:  What is the name of the class of complexity classes between P and PSPACE 
> Original: Where can the complexity classes RP, BPP, PP, BQP, MA, and PH be located? 



 22%|██▏       | 22/100 [09:26<27:40, 21.28s/it]

> Prediction:  Who was illuminated by a Geissler tube 
> Original: Who was Tesla trying to photograph when he accidentally took the X-Ray image? 



 23%|██▎       | 23/100 [09:45<26:40, 20.78s/it]

> Prediction:  What was the name of the geographical treatise written by al-Idrisi for King Roger II of Sicily 
> Original: What is another name for the Tabula Rogeriana? 



 24%|██▍       | 24/100 [10:05<25:52, 20.42s/it]

> Prediction:  During his second year of study at what city did Tesla develop a passion for billiards, chess and card-playing 
> Original: Where was Tesla studying when he started playing cards and billiards? 



 25%|██▌       | 25/100 [10:33<28:16, 22.62s/it]

> Prediction:  What is the class of problems defined using Boolean circuits 
> Original: AC and NC are complexity classes typically associated with what type of circuit? 



 26%|██▌       | 26/100 [10:51<26:24, 21.42s/it]

> Prediction:  What is the name of the park close to the Sejm and John Lennon street 
> Original: What park is close to John Lennon street? 



 27%|██▋       | 27/100 [11:12<25:41, 21.12s/it]

> Prediction:  What is the time used to solve a problem 
> Original: What is the most critical resource measured to in assessing the determination of a Turing machine's ability to solve any given set of problems? 



 28%|██▊       | 28/100 [11:32<24:51, 20.71s/it]

> Prediction:  What is the name of the forest located on the southern border of Warsaw 
> Original: What forest is by Warsaw's southern border? 



 29%|██▉       | 29/100 [12:01<27:30, 23.25s/it]

> Prediction:  Who may accompany students on field trips 
> Original: Who might be responsible for student discipline? 



 30%|███       | 30/100 [12:22<26:36, 22.80s/it]

> Prediction:  Who wrote that Tesla was a "gentlemanly" person 
> Original: Who was Tesla's secretary?  



 31%|███       | 31/100 [12:44<25:55, 22.54s/it]

> Prediction:  What is the population of Warsaw 
> Original: Where does Warsaw rank in terms of population in the EU? 



 32%|███▏      | 32/100 [13:12<27:08, 23.95s/it]

> Prediction:  When did Tesla state that he had built the death ray 
> Original: When did Tesla claim to have built the weapon? 



 33%|███▎      | 33/100 [13:38<27:40, 24.78s/it]

> Prediction:  Who partnered with Tesla to finance an electric lighting company 
> Original: Who did Tesla partner with in 1886? 



 34%|███▍      | 34/100 [14:06<28:18, 25.73s/it]

> Prediction:  What city was invaded by the Normans in 1185 
> Original: What was the naval base called? 



 35%|███▌      | 35/100 [14:35<28:59, 26.76s/it]

> Prediction:  What complexity classes are between P and L 
> Original: What are two complexity classes between L and P? 



 36%|███▌      | 36/100 [15:00<27:45, 26.02s/it]

> Prediction:  What has begun to shape the way teachers approach their roles in the classroom 
> Original: What has started to change the way teachers teach in the classroom, generally? 



 37%|███▋      | 37/100 [15:19<25:06, 23.92s/it]

> Prediction:  Who purchased the FSO Car Factory in 2005 
> Original: Who bought the factory in 2005? 



 38%|███▊      | 38/100 [15:45<25:28, 24.65s/it]

> Prediction:  What groups advocate a more assertive and confrontational style of discipline 
> Original: Who wants a more confrontational type of discipline? 



 39%|███▉      | 39/100 [16:08<24:29, 24.08s/it]

> Prediction:  Who was the despot of Cyprus 
> Original: Who ruled Cyprus in 1191? 



 40%|████      | 40/100 [16:34<24:33, 24.57s/it]

> Prediction:  Who led the papal army in the War of Barbastro 
> Original: Who was in charge of the papal army in the War of Barbastro? 



 41%|████      | 41/100 [17:00<24:48, 25.23s/it]

> Prediction:  How much did Tesla spend on the injured pigeon 
> Original: How much did Tesla spend on the injured pigeon? 



 42%|████▏     | 42/100 [17:28<25:11, 26.05s/it]

> Prediction:  What does co-teaching focus on 
> Original: What does co-teaching get the students to focus on? 



 43%|████▎     | 43/100 [17:56<25:16, 26.61s/it]

> Prediction:  What is the big O notation for T(n) 
> Original: How would one write T(n) = 7n2 + 15n + 40 in big O notation?  



 44%|████▍     | 44/100 [18:23<24:51, 26.63s/it]

> Prediction:  What is the advantage of a platoon system 
> Original: The teacher's in a platoon style teaching are usually more _____? 



 45%|████▌     | 45/100 [18:41<21:55, 23.93s/it]

> Prediction:  How many languages did Tesla speak 
> Original: How many languages did Tesla know? 



 46%|████▌     | 46/100 [19:02<20:47, 23.10s/it]

> Prediction:  Where did Bohemond sign a peace treaty with the Byzantines 
> Original: What river was Petrela located by? 



 47%|████▋     | 47/100 [19:32<22:18, 25.25s/it]

> Prediction:  What are complexity measures 
> Original: Decision tree is an example of what type of measure? 



 48%|████▊     | 48/100 [19:56<21:26, 24.74s/it]

> Prediction:  What time did Tesla often work until 
> Original: On some nights how late did Tesla work until? 



 49%|████▉     | 49/100 [20:26<22:25, 26.39s/it]

> Prediction:  What type of waves did Tesla observe during this time 
> Original: What sort of waves did he claim to observe? 



 50%|█████     | 50/100 [20:52<21:51, 26.22s/it]

> Prediction:  What are the City Council's committees 
> Original: What does the City Council divide itself into? 



 51%|█████     | 51/100 [21:16<20:58, 25.69s/it]

> Prediction:  How did Tesla plan to make dull students bright 
> Original: What was Tesla's plan to make students "bright"? 



 52%|█████▏    | 52/100 [21:43<20:56, 26.17s/it]

> Prediction:  Where did Tesla run away to avoid being drafted into the Austro-Hungarian Army 
> Original: Where did Tesla run to avoid the army draft? 



 53%|█████▎    | 53/100 [22:11<20:49, 26.58s/it]

> Prediction:  In which country is classroom behavior problematic 
> Original: What country has higher scores on standardized tests than the U.S.? 



 54%|█████▍    | 54/100 [22:39<20:50, 27.18s/it]

> Prediction:  What did Tesla say helped his scientific abilities 
> Original: What characteristic did Tesla say helped his scientific abilities? 



 55%|█████▌    | 55/100 [22:48<16:16, 21.71s/it]

> Prediction:  Did Tesla work on Sundays or holidays 
> Original: Did Tesla graduate from the university? 



 56%|█████▌    | 56/100 [23:18<17:40, 24.10s/it]

> Prediction:  What religions did Tesla have a profound respect for 
> Original: Which two religions did Tesla express respect for? 



 57%|█████▋    | 57/100 [23:46<18:08, 25.31s/it]

> Prediction:  When did Nikola Tesla die 
> Original: In what year did Tesla die?  



 58%|█████▊    | 58/100 [24:13<18:05, 25.84s/it]

> Prediction:  What did Tesla patent in 1891 
> Original: What did Tesla patent in 1891? 



 59%|█████▉    | 59/100 [24:34<16:35, 24.28s/it]

> Prediction:  What do proponents of Sudbury model democratic schools argue that a school with has 
> Original: What must be passed using democratic means by the entire school community? 



 60%|██████    | 60/100 [24:59<16:20, 24.52s/it]

> Prediction:  What have sources discovered that have lain hidden in what 
> Original:  Where were missing patents later found? 



 61%|██████    | 61/100 [25:26<16:25, 25.28s/it]

> Prediction:  What is the most common alphabet used in computational problems 
> Original: What is the name of the alphabet is most commonly used in a problem instance? 



 62%|██████▏   | 62/100 [25:49<15:31, 24.50s/it]

> Prediction:  When did Tesla begin his experiments with X-rays 
> Original: when did tesla begin researching x-ray imaging? 



 63%|██████▎   | 63/100 [26:10<14:25, 23.39s/it]

> Prediction:  Where did Tesla demonstrate his radio-controlled boat 
> Original: Where was the boat demonstration given? 



 64%|██████▍   | 64/100 [26:37<14:46, 24.63s/it]

> Prediction:  Where did Tesla demonstrate his radio wave theories 
> Original: What Philadelphia institution did Tesla give a demonstration to? 



 65%|██████▌   | 65/100 [27:05<14:59, 25.69s/it]

> Prediction:  What architectural style did Norman architecture typically stand out as a new stage in the architectural history of the regions they subdued 
> Original: What is the Norman architecture idiom? 



 66%|██████▌   | 66/100 [27:34<15:04, 26.61s/it]

> Prediction:  What defines a complexity measure 
> Original: What is typically used to broadly define complexity measures? 



 67%|██████▋   | 67/100 [27:43<11:41, 21.25s/it]

> Prediction:  When did Tesla\'s father die 
> Original: In what year did Tesla's father die? 



 68%|██████▊   | 68/100 [28:12<12:36, 23.65s/it]

> Prediction:  What are the bodies designed to instill, preserve and update the knowledge and professional standing of teachers called 
> Original: What do government's run that affects teachers? 



 69%|██████▉   | 69/100 [28:35<12:11, 23.60s/it]

> Prediction:  What did Tesla do to the patents he had generated 
> Original: What did Tesla do with his patents causing him to lose them? 



 70%|███████   | 70/100 [29:00<11:58, 23.96s/it]

> Prediction:  What did Tesla believe caused damage to the skin 
> Original: At first what did Tesla think was the main cause of damage to skin cells when they were exposed to X-rays? 



 71%|███████   | 71/100 [29:27<11:56, 24.72s/it]

> Prediction:  What did Tesla believe humans\' "pity" had interfered with 
> Original: What was his belief as to what nature was supposed to be? 



 72%|███████▏  | 72/100 [29:54<11:52, 25.45s/it]

> Prediction:  What did Tesla's father promise to do if he recovered from the illness 
> Original: What bargain did his father make with him if Tesla recovered? 



 73%|███████▎  | 73/100 [30:18<11:16, 25.05s/it]

> Prediction:  What event did Tesla blame for his financial troubles 
> Original: On what did Tesla blame for the loss of the initial money? 



 74%|███████▍  | 74/100 [30:45<11:06, 25.63s/it]

> Prediction:  What is the most common reduction 
> Original: What measurement of time is used in polynomial time reduction? 



 75%|███████▌  | 75/100 [31:07<10:12, 24.49s/it]

> Prediction:  What was Tesla\'s personality like 
> Original: With what word was Tesla's sociability described? 



 76%|███████▌  | 76/100 [31:27<09:17, 23.24s/it]

> Prediction:  What was the length of Tesla's first spark 
> Original: What was the recorded length of the first spark? 



 77%|███████▋  | 77/100 [31:51<08:58, 23.41s/it]

> Prediction:  What did commutators require that induction motors did not 
> Original: What high maintenance part did Tesla's AC motor not require? 



 78%|███████▊  | 78/100 [32:22<09:24, 25.67s/it]

> Prediction:  What is the general consensus on Tesla\'s religious views 
> Original: Because of certain statements what was the believed state of his religious views? 



 79%|███████▉  | 79/100 [32:49<09:08, 26.12s/it]

> Prediction:  What book did Tesla publish in 1900 
> Original: What was one of Tesla's books where articles can be read? 



 80%|████████  | 80/100 [33:16<08:44, 26.25s/it]

> Prediction:  In what year did Tesla state that he believed in eugenics 
> Original: When did he talk about his beliefs in an interview? 



 81%|████████  | 81/100 [33:38<07:55, 25.00s/it]

> Prediction:  What organization did Tesla demonstrate his alternating current system to 
> Original: What is the IEEE? 



 82%|████████▏ | 82/100 [34:00<07:13, 24.09s/it]

> Prediction:  What banner was used to announce the demonstration of the Tesla Polyphase System 
> Original: What did Tesla call his electrical effects in 1893? 



 83%|████████▎ | 83/100 [34:30<07:21, 25.99s/it]

> Prediction:  What happened to the contents of Tesla's lab in 1906 
> Original: What happened to the things inside the lab after it was torn down? 



 84%|████████▍ | 84/100 [34:55<06:49, 25.59s/it]

> Prediction:  What is the answer to the question 
> Original: What will the output be for a member of the language of a decision problem? 



 85%|████████▌ | 85/100 [35:19<06:17, 25.17s/it]

> Prediction:  What did Tesla theorize would enhance intelligence 
> Original: What did Tesla think could improve the brain's intelligence? 



 86%|████████▌ | 86/100 [35:41<05:38, 24.20s/it]

> Prediction:  Where is the treatise currently located 
> Original: Where can the Treatise be found? 



 87%|████████▋ | 87/100 [36:06<05:18, 24.49s/it]

> Prediction:  What is the name of the problem that is NP-complete 
> Original: What is the example of another problem characterized by large instances that is routinely solved by SAT handlers employing efficient algorithms? 



 88%|████████▊ | 88/100 [36:28<04:45, 23.79s/it]

> Prediction:  What is the basis for the complexity class P 
> Original: What thesis specifies that a polynomial relationship exists within time complexities in a computational model?  



 89%|████████▉ | 89/100 [36:51<04:18, 23.52s/it]

> Prediction:  When did Richard the Lion-hearted leave Messina with a large fleet 
> Original: What year did the storm hit Richard's fleet? 



 90%|█████████ | 90/100 [37:18<04:06, 24.66s/it]

> Prediction:  Tesla's father originally wanted him to do what 
> Original: What did Tesla's father originally want him to do? 



 91%|█████████ | 91/100 [37:36<03:21, 22.44s/it]

> Prediction:  What does the teacher do to the cocky 
> Original: What would a teacher do for someone who is cocky? 



 92%|█████████▏| 92/100 [37:58<03:00, 22.54s/it]

> Prediction:  What did Tesla call his X-ray imaging experiments 
> Original: What did Tesla begin to research in March 1896? 



 93%|█████████▎| 93/100 [38:31<02:58, 25.45s/it]

> Prediction:  What religion was Tesla raised in 
> Original: What religion did Tesla grow up in? 



 94%|█████████▍| 94/100 [38:54<02:29, 24.91s/it]

> Prediction:  What does the word "passion" mean 
> Original: What gets transferred to students who are receptive to the teacher? 



 95%|█████████▌| 95/100 [39:03<01:39, 19.92s/it]

> Prediction:  What was the total nominal GDP of the city in 2010 
> Original: What was the total nominal GDP of Warsaw in 2010? 



 96%|█████████▌| 96/100 [39:33<01:32, 23.00s/it]

> Prediction:  What was Tesla's method of accurately determining the location of underground mineral deposits 
> Original: What did he hope to locate underground? 



 97%|█████████▋| 97/100 [39:46<01:00, 20.09s/it]

> Prediction:  What culture did the Normans have a profound effect on 
> Original: What culture did the Normans combine with in Ireland? 



 98%|█████████▊| 98/100 [40:13<00:44, 22.10s/it]

> Prediction:  What did Tesla believe X-rays to be 
> Original: What did tesla incorrectly believe about x-rays? 



 99%|█████████▉| 99/100 [40:32<00:21, 21.19s/it]

> Prediction:  Who was awarded the contract to build the AC distribution system at the Niagara Falls 
> Original: What company was chosen to build an AC distribution system at Niagara Falls? 



100%|██████████| 100/100 [40:54<00:00, 24.54s/it]

> Prediction:  What is the main focus of complexity theory 
> Original: Complexity theory classifies problems based on what primary attribute? 






In [39]:
import numpy as np
np.mean(bertscore.compute(predictions=predictions, references=references, lang="en")['f1'])

0.9142226606607438

In [33]:
# save scores
metrics = {
    "bertscore": bertscore.compute(predictions=predictions, references=references, lang="en"),
    "rouge": rouge.compute(predictions=predictions, references=references),
    "predictions": predictions,
    "references": references,
    "contexes": contexes
}

with open("results/llama-7B/eval.json", "w") as f:
    json.dump(metrics, f)