In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2"

import argparse
import json
import gc
import datasets
import transformers
import torch
import evaluate
from tqdm import tqdm
import json
import numpy as np
from trl import PPOConfig, PPOTrainer, AutoModelForSeq2SeqLMWithValueHead
from peft import LoraConfig, TaskType, get_peft_model, PeftModelForSeq2SeqLM, PeftModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

"""
Train fine tuned T5 model with Proximal Policy Optimization (PPO) algorithm.
"""
parser = argparse.ArgumentParser()
#parser.add_argument("--model_name", type=str, default="t5-small")
#parser.add_argument("--highlight", type=bool, default=True)
#args = parser.parse_args()


bertscore = evaluate.load("bertscore")
average_question_length = 10.0

HIGHLIGHT = True
TOKEN_QUESTION = "<question>"
TOKEN_END_QUESTION = "<question>"
TOKEN_CONTEXT = "<context>"
TOKEN_END_CONTEXT = "<context>"
TOKEN_ANSWER = "<answer>"
TOKEN_END_ANSWER = "<answer>"
HIGHLIGHT_ANSWER = "<hl>"
SPLIT_SEED = 42
NPROC = 32

model_name = "t5-small"
HIGHLIGHT = True
if HIGHLIGHT:
    model_name = f"{model_name}-hl"

  from .autonotebook import tqdm as notebook_tqdm


[2023-12-01 15:27:37,687] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)




In [3]:
peft_config = LoraConfig(
    r=128,
    lora_alpha=64,
    lora_dropout=0.1,
    target_modules=["q", "v"],
)


#model = AutoModelForSeq2SeqLM.from_pretrained(f"./models/{model_name}/", device_map="auto")
#peft_model = PeftModelForSeq2SeqLM.from_pretrained(model, model_id=f"./models/{model_name}/", config=peft_config, device_map='auto', is_trainable=True)

ppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(f"./models/{model_name}/", device_map="cuda:0")
ref_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(f"./models/{model_name}/", device_map="cuda:0")
tokenizer = transformers.AutoTokenizer.from_pretrained(f"./models/{model_name}", model_max_length=512)
torch.cuda.empty_cache()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [4]:
tokenize_query = lambda e : tokenizer(e["query"], return_tensors='pt', padding=True, truncation=True).input_ids.squeeze().to('cuda:0')

def tokenize_query(e):
    return tokenizer(e["query"], return_tensors='pt', padding=True, truncation=True).input_ids.squeeze().to('cuda:0')

def get_inputs_target(e):
    answer_start = e["answers"]["answer_start"][0]
    # add highlight token to context
    ans_len = len(e["answers"]["text"][0])

    if HIGHLIGHT:
        e["context"] = (
            e["context"][:answer_start]
            + " "
            + HIGHLIGHT_ANSWER
            + " "
            + e["context"][answer_start : answer_start + ans_len]
            + " "
            + HIGHLIGHT_ANSWER
            + " "
            + e["context"][answer_start + ans_len :]
        )

    e['query'] = e.pop('context')

    return {
        # answer + context
        "query": f'generate question: {TOKEN_ANSWER} {e["answers"]["text"][0]} {TOKEN_END_ANSWER} {TOKEN_CONTEXT} {e["query"]} {TOKEN_END_CONTEXT}',
        # question
        "target": f'{TOKEN_QUESTION} {e["question"]} {TOKEN_END_QUESTION}',
        "answer": e["answers"]["text"][0],
    }


def preprocess_squad_dataset(dataset_name="squad", split="train"):
    dataset = datasets.load_dataset(dataset_name, split=split).shuffle(42).select(range(10000))  
    # Add question, answer and context tokens to dataset in a new column named text
    dataset = dataset.map(
        get_inputs_target,
        num_proc=16
    )
    return dataset

# Need to have training dataset aligned with PPO input format

train_dataset = preprocess_squad_dataset(dataset_name="squad", split="train") 

Map (num_proc=16): 100%|██████████| 10000/10000 [00:00<00:00, 12804.01 examples/s]


In [5]:
def preprocess_prediction(example):
    """
    this function will preprocess the prediction
    """
    return example.replace("<pad>", "").replace("<unk>", "").replace("</s>", "").replace("question>", "").replace("<question>", "").replace('<', "").strip()

def reward_model(example):
    """
    this function will return a reward function for PPO
    """
    try:
        context = example["query"]
        target = example["target"]
        answer = example["answer"]
        prediction = example["prediction"]
        prediction = [preprocess_prediction(pred) for pred in prediction] if isinstance(prediction, list) else preprocess_prediction(prediction)
        

        if isinstance(target, list):
            target = [preprocess_prediction(ans) for ans in answer]
        else:
            target = preprocess_prediction(target)

        
        

        reward = bertscore.compute(
            predictions=[prediction] if isinstance(prediction, str) else prediction,
            references=[target] if isinstance(target, str) else target,
            lang="en",
            model_type="bert-base-uncased",
        )["f1"][0]

        prediction = prediction[0] if isinstance(prediction, list) else prediction
        target = target[0] if isinstance(target, list) else target
        

        repetition_penalty = -5.0 if answer.lower() in prediction.lower() else 1.0
        question_word_penalty = -0.5 if target.split()[0].lower() != prediction.split()[0].lower() else 0.5
        question_length_penalty = -0.5 if len(target.split()) > 20 else 0.5

        reward = reward + (repetition_penalty + question_word_penalty  + question_length_penalty)
        # make it between 0 and 1
        reward = torch.nn.Sigmoid()(torch.tensor(reward))
        print(f"> Ans: {answer}, Q: {prediction}, P: {target}, R: {reward}")
        return reward

    except Exception as e:
        print("WARNING", e)
        return torch.tensor(0.0)

In [6]:
BATCH_SIZE = 16
config = PPOConfig(
    learning_rate=1e-5,
    log_with='tensorboard',
    project_kwargs={'logging_dir': f'./logs/{model_name}-ppo'},
    batch_size=BATCH_SIZE,
)

ppo_trainer = PPOTrainer(
    model=ppo_model,
    ref_model=ref_model,
    config=config,
    tokenizer=tokenizer,
)


batched_dataset = [[train_dataset[i] for i in range(j, min(j+BATCH_SIZE, len(train_dataset)))] for j in range(0, len(train_dataset), BATCH_SIZE)]
# remove last batch if it is not full
if len(batched_dataset[-1]) != BATCH_SIZE:
    batched_dataset.pop(-1)

for batch in tqdm(batched_dataset):
    query_tensors = [tokenizer(e["query"], return_tensors='pt', padding=True, truncation=True).input_ids.squeeze().to('cuda:0') for e in batch]

    response_tensors = ppo_trainer.generate(query_tensors, max_length=32, early_stopping=True)

    # put response_tensors into batch
    for i in range(len(batch)):
        batch[i]["prediction"] = tokenizer.decode(response_tensors[i], skip_special_tokens=True)

    pipe_outputs = [reward_model(e) for e in batch] 
    rewards = pipe_outputs

    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    
    
    print(f'objective/kl: {stats["objective/kl"]}')
    print(f'ppo/returns/mean: {stats["ppo/returns/mean"]}')
    print(f'ppo/policy/advantages_mean: {stats["ppo/policy/advantages_mean"]}')

    print('-'.join('' for x in range(100)))

# save model
ppo_trainer.save_pretrained(f"./models/{model_name}-ppo/")



  0%|          | 0/625 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


> Ans: 84%, Q: How many Egyptians polled support the death penalty for those who leave Islam?, P: What percentage of Egyptians polled support death penalty for those leaving Islam?, R: 0.8697241544723511
> Ans: books, Q: What is the number of books sold per capita in Ann Arbor?, P: Ann Arbor ranks 1st among what goods sold?, R: 0.011874043382704258
> Ans: the executive, Q: Who controls the judiciary?, P: In developing countries, who makes most of the spending decisions?, R: 0.8336333632469177
> Ans: Anjiro, Q: What was the name of Francis's samurai?, P: Who impressed Xavier by taking notes in church?, R: 0.8101040720939636
> Ans: loops, Q: What are elements of the fundamental group represented by?, P: What represents elements of the fundamental group?, R: 0.9454767107963562
> Ans: 2.2 billion, Q: How many people are the population of the Commonwealth of Nations?, P: What is the population of the Commonwealth?, R: 0.8545026183128357
> Ans: Military Governor of the U.S. Occupation Zone, 

  0%|          | 1/625 [00:12<2:10:32, 12.55s/it]

objective/kl: 45.34385299682617
ppo/returns/mean: -2.016042470932007
ppo/policy/advantages_mean: -0.03571716696023941
---------------------------------------------------------------------------------------------------
> Ans: echiurans and sipunculan, Q: What are the polychaetes', P: What offshoots of polychaetes are unsegmented?, R: 0.9337579607963562
> Ans: Religion, Q: What is the important to the Tibetans?, P: What has a strong influence over all aspect of Tibetans lives?, R: 0.9346240758895874
> Ans: paralyzes muscles, Q: What does Clostridium tetani release a toxin?, P: What does the of toxin Clostridium tetani releases do?, R: 0.9425755143165588
> Ans: tituli, Q: What diocese of Rome is the term, P: The Church of England uses what term that is held by two senior members of the College of Minor Canons of St. Pauls Catherdral?, R: 0.6144264340400696
> Ans: CBS Television City, Q: Who is the title of the finals?, P: Where studio hosts the live final rounds on American Idol?, R: 0.82

  0%|          | 2/625 [00:22<1:52:49, 10.87s/it]

objective/kl: 48.285545349121094
ppo/returns/mean: -2.303199291229248
ppo/policy/advantages_mean: -5.0502829253673553e-05
---------------------------------------------------------------------------------------------------
> Ans: exceeds 2%, Q: What is the prevalence of HIV prevalence among the 15–49 group?, P: How prevalent is HIV among the 15-49 age group in Eritrea?, R: 0.8509702086448669
> Ans: 1763, Q: What year did the Treaty of Paris sign?, P: When was the Treaty of Paris?, R: 0.8581595420837402
> Ans: Mark Dybul, Q: Who is director of the Global Fund to Fight AIDS?, P: What member of leadership at the Global Fund to Fight AIDS, Tuberculosis and Malaria called TB a "pandemic"?, R: 0.8438253402709961
> Ans: From at least the late nineteenth century, Q: From what century did Europe speculate that the range of human sexual response looked more like a continuum?, P: When did the questioning of human sexual responses begin?, R: 0.836018979549408
> Ans: NASCAR (headquartered in Daytona

  0%|          | 3/625 [00:32<1:49:04, 10.52s/it]

objective/kl: 49.49913024902344
ppo/returns/mean: -2.5499267578125
ppo/policy/advantages_mean: -0.02233893796801567
---------------------------------------------------------------------------------------------------
> Ans: Angevin, Q: Under what dynasty did France expand their authority over the nobility?, P: To what dynasty did Henry II belong?, R: 0.8318046927452087
> Ans: leaving Field Marshal Hans von Lehwaldt in East Prussia to guard against Russian invasion from the east, Q: How did Frederick leave Field Marshal Kurt von Schwerin in East Prussia to guard against Russian invasion from the east P, P: How did Frederick protect East Prussia when he went to invade Saxony?, R: 0.9329959750175476
> Ans: those educated, considered their ethnicity (genos) to be Hellenic, Q: What did the Greek speakers consider Romioi?, P: What did the literate segment of Greeks consider to be a part of their ethic lines ?, R: 0.933443009853363
> Ans: 136, Q: How many high-tech enterprises in the zone?, P:

  1%|          | 4/625 [00:42<1:49:03, 10.54s/it]

objective/kl: 50.710777282714844
ppo/returns/mean: -2.351426124572754
ppo/policy/advantages_mean: -0.013105562888085842
---------------------------------------------------------------------------------------------------
> Ans: connections between different electrical services, Q: What are issues of, P: What is the other issue that comes to sight when using electrification system?, R: 0.9284484386444092
> Ans: May, Q: What is the hottest month in Hyderabad?, P: What is generally the hottest month in Hyderabad?, R: 0.9498719573020935
> Ans: in a single record, Q: What would all of this data be placed in a navigational approach?, P: How is information stored in a navigational system?, R: 0.8410745859146118
> Ans: Western, Q: What did the Great powers of 1914 do?, P: Which societal values had become world-wide by 1914?, R: 0.8219700455665588
> Ans: India, Q: Where is Sanskrit in Greater?, P: Where is Sanskrit usually found?, R: 0.941259503364563
> Ans: Sahel, Q: What is the sahara belt of 

  1%|          | 5/625 [00:53<1:47:55, 10.44s/it]

objective/kl: 46.12009811401367
ppo/returns/mean: -2.59531831741333
ppo/policy/advantages_mean: -0.013819878920912743
---------------------------------------------------------------------------------------------------
> Ans: convert attempts, Q: What does the clock do not run during?, P: Which plays do not cause time to run off the clock during the final minutes of a half?, R: 0.830646276473999
> Ans: Sassanian architecture, Q: What was the point arch used in?, P: What other type of architecture also made use of the pointed arch?, R: 0.9307811260223389
> Ans: ring-porous, Q: What does the vessels of the wood appear on a finished surface appear on a surface?, P: What type of woods might sometimes appear to have darker earlywood on a finished surface?, R: 0.9331985116004944
> Ans: absolute terms, Q: What terms should the number of individuals targeted be evaluated?, P: The number of people targeted in a genocide should not be solely evaluated by what?, R: 0.8431704044342041
> Ans: have b

  1%|          | 6/625 [01:03<1:47:32, 10.42s/it]

objective/kl: 49.962738037109375
ppo/returns/mean: -2.508425712585449
ppo/policy/advantages_mean: -0.00360055360943079
---------------------------------------------------------------------------------------------------
> Ans: more than 60, Q: How many Asian-American organizations did the coalition of?, P: How many Asian organizations were involved in filing the federal complaints?, R: 0.9362801909446716
> Ans: Vladimir the Great, Q: What did the Rus' name in the book of the Rus'?, P: Which ruler introduced Christianity in Kievan Rus?, R: 0.8240792751312256
> Ans: Murad I, Q: Who was the sultanate of Ottoman Empire?, P: Conquests by who began the transformation of the Ottoman sultanate into an Empire?, R: 0.8370700478553772
> Ans: 53%, Q: How many adolescents have had a romantic relationship?, P: How many percent of adolescents have had a romantic relationship lasting one month or longer by age 15?, R: 0.9405272603034973
> Ans: the Alps, Q: What are the most popular tourist destinations

  1%|          | 7/625 [01:13<1:45:35, 10.25s/it]

objective/kl: 44.89136505126953
ppo/returns/mean: -2.359884738922119
ppo/policy/advantages_mean: -0.004864310845732689
---------------------------------------------------------------------------------------------------
> Ans: Hong Kong, Q: Where was LaserDisc in 1990?, P: In what city was LaserDisk used as a popular rental medium in the 1990s?, R: 0.8421430587768555
> Ans: 2001, Q: Inducted into the Rock and Roll Hall of Fame in 2001, the band is the only group in which the band has composed more, P: When was Queen inducted into the Hall of Fame?, R: 0.011103290133178234
> Ans: 1920, Q: What year was the Tribhuvan Museum dedicated to king Mahendra?, P: What was the birth year of King Mahendra?, R: 0.9376464486122131
> Ans: 2014, Q: When did Hodgson et al. study the Afro-Asiatic languages? question, P: When did Hodgson publish his DNA study?, R: 0.9314708113670349
> Ans: MDNA, Q: What was the album Madonna's second album in 2012?, P: What was the name of Madonna's twelfth album?, R: 0.9

  1%|▏         | 8/625 [01:23<1:43:37, 10.08s/it]

objective/kl: 45.19919967651367
ppo/returns/mean: -2.3741812705993652
ppo/policy/advantages_mean: -0.026690345257520676
---------------------------------------------------------------------------------------------------
> Ans: using American-supplied equipment during the Turkish invasion of Cyprus in 1974, Q: What was a military embargo on Turkey in 1974?, P: Why did the US impose an arms embargo on Turkey?, R: 0.8507280945777893
> Ans: Gasparinus de Bergamo, Q: What was the name of the first book in France, Epistolae?, P: Who wrote Epistolae?, R: 0.8419656157493591
> Ans: fined 200 guilders (a year's wage for a skilled tradesman) and banned from the city, Q: In 17th-century Leiden, how could people opening their homes to services be?, P: What was the punishment for people who opened their homes to Catholic services during 17th-century Leiden?, R: 0.8519103527069092
> Ans: complexity, Q: What does classical music achieve?, P: Since it is written down, classical music can attain a high 

  1%|▏         | 9/625 [01:32<1:40:27,  9.79s/it]

objective/kl: 46.113616943359375
ppo/returns/mean: -2.273925304412842
ppo/policy/advantages_mean: 0.012690378352999687
---------------------------------------------------------------------------------------------------
> Ans: Gigantopterids, Q: What is a group of extinct seed plants that share many morphological traits with flowering plants? question, P: What group of now extinct seed plants had many of the traits of what are now flowering plants?, R: 0.9394246339797974
> Ans: the treatment of the windows and wall surfaces, Q: In the interior of the building, the verticals are usually repeated in what?, P: Where are the verticals found repeated in the interior of Gothic buildings?, R: 0.844828188419342
> Ans: forces in the field, Q: What forces in the field can deploy their own air defence capability?, P: Who will deploy their own air defence if their is an air threat?, R: 0.013707729056477547
> Ans: subject–verb–object, Q: What pattern does a basic sentence follow?, P: Being topic pro

  2%|▏         | 10/625 [01:39<1:32:44,  9.05s/it]

objective/kl: 45.88789749145508
ppo/returns/mean: -2.6724367141723633
ppo/policy/advantages_mean: -0.007313806563615799
---------------------------------------------------------------------------------------------------
> Ans: explicitly prohibited false therapeutic claims, Q: What did the Federal Food, Drug, and Cosmetic Act of 1938 prohibit false therapeutic claims?, P: What was one of the things the Federal Food, Drug and Cosmetic Act do?, R: 0.936554491519928
> Ans: None of the summits reaches the region of perpetual snow., Q: What does the Appalachian belt mean?, P: What is the climate like on the summits?, R: 0.9308024048805237
> Ans: Melbourne Shuffle, Q: What does the name of the contemporary dance in Melbourne?, P: What is one example of Australian contemporary dance?, R: 0.9353888034820557
> Ans: fourth, Q: What is the world's smallest film studio?, P: Among major film studies, where does Universal Studios rank in terms of age?, R: 0.8194790482521057
> Ans: 92%, Q: How many E

  2%|▏         | 11/625 [01:47<1:28:53,  8.69s/it]

objective/kl: 37.59285354614258
ppo/returns/mean: -1.76035475730896
ppo/policy/advantages_mean: -0.04554015025496483
---------------------------------------------------------------------------------------------------
> Ans: Carnivals, Q: What does some Belgian cities hold during Lent?, P: What do some Belgian cities hold during Lent?, R: 0.9518266320228577
> Ans: plasmids, Q: What type of DNA does Prokaryotes encode?, P: Small circles of DNA that encode only a few genes and are transferable between individuals are called what?, R: 0.818358838558197
> Ans: March 1, 1969, Q: What was a press release written on?, P: When was a press release put out about Kerry earning the Silver Star?, R: 0.838189423084259
> Ans: 1971, Q: What year did Meinhard die in a car accident?, P: What year did Schwarzenegger's brother die?, R: 0.9352932572364807
> Ans: before the turn of the 20th century, Q: What did the railway tracks as the Benguela railway in Angola started to be built?, P: When were railroad t

  2%|▏         | 12/625 [01:54<1:23:55,  8.22s/it]

objective/kl: 46.17695617675781
ppo/returns/mean: -2.4369006156921387
ppo/policy/advantages_mean: -0.005503034219145775
---------------------------------------------------------------------------------------------------
> Ans: the monarchy, Q: What was the name of Caesar's monarchy?, P: What did some members of the conspiracy believe Caesar would bring back?, R: 0.930492639541626
> Ans: King Edward's heavy redecorations, Q: What did the palace feel like King Edward VII's heavy redecorations?, P: What was at odds with Nash's original work in the palace?, R: 0.9279318451881409
> Ans: represent the nearest approximation in every respect of the author's final intentions, Q: What did the eds of Greg's work, P: What did Bower's say about Greg's method?, R: 0.9310576915740967
> Ans: Farnsworth image dissector, Q: What was the Baird system used for live programming?, P: What kind of camera was used to broadcast live shows under the Baird system?, R: 0.9409307241439819
> Ans: quiet instruments,

  2%|▏         | 13/625 [02:03<1:24:13,  8.26s/it]

objective/kl: 42.985836029052734
ppo/returns/mean: -2.1224071979522705
ppo/policy/advantages_mean: 0.0011252202093601227
---------------------------------------------------------------------------------------------------
> Ans: 1949, Q: When did the People's Republic of China begin?, P: When was the People's Republic of China founded?, R: 0.9489282965660095
> Ans: 1917, Q: When did the Commonwealth Liberal Party merged with several Labor dissidents?, P: In what year was the Nationalist Party of Australia formed?, R: 0.8352047204971313
> Ans: her second son Alfred ("Affie") died, Q: How did the Queen die in July?, P: What tragedy did Victoria face in July of 1900?, R: 0.8404062986373901
> Ans: Liberians and the Lebanese, Q: What are the differences between ethnic and Lebanese?, P: There is high percentage of interracial marriage between what two groups?, R: 0.8178682327270508
> Ans: fight against the town's racism, Q: What does Atticus need to do without help from other white citizens?,

  2%|▏         | 14/625 [02:10<1:20:21,  7.89s/it]

objective/kl: 43.773521423339844
ppo/returns/mean: -2.388742446899414
ppo/policy/advantages_mean: 0.016567349433898926
---------------------------------------------------------------------------------------------------
> Ans: Keïta regime, Q: What was overthrown in a bloodless military coup led by Moussa Traoré?, P: What regime was overthrown in 1968?, R: 0.9327109456062317
> Ans: qualitative, Q: What does the method of cultural sociological research do?, P: What is most of the research into sociological culture ?, R: 0.9367489218711853
> Ans: Deitsch, Q: What is Pennsylvania Dutch?, P: What's the native Pennsylvania Dutch word for the language?, R: 0.8459311127662659
> Ans: Professor Skousen, Q: Who took over as editor and head of the FARMS Critical Text of the Book of Mormon Project?, P: Who took over after the preliminary phase?, R: 0.9292077422142029
> Ans: receiver, Q: What type of receiver did Bell work on?, P: Bell needed to fix both the transmitter and what?, R: 0.0111335562542

  2%|▏         | 15/625 [02:17<1:19:21,  7.81s/it]

objective/kl: 45.25752639770508
ppo/returns/mean: -2.400679588317871
ppo/policy/advantages_mean: -0.011275867000222206
---------------------------------------------------------------------------------------------------
> Ans: a century, Q: How did David Hume write?, P: How long before Peirce did Hume write?, R: 0.9405149221420288
> Ans: Adabas, Oracle and DB2, Q: What are some general-purpose DBMSs that are undergoing upgrades since the 1970s?, P: Name three DBMSs that have been used since the 1970s., R: 0.844393789768219
> Ans: half an hour, Q: What was the average weight of Schwarzenegger working out for a year?, P: In 2011, how much time each day did Schwarzenegger say he lifted weights?, R: 0.8268279433250427
> Ans: Nizam VII, Q: What was the name of the hliam VII?, P: Which Nizam defeated by the Indian Army?, R: 0.8097097873687744
> Ans: the Constabulary, Q: What was the name of the Constabulary?, P: What was the Constable's force called, in English?, R: 0.03404450789093971
> Ans:

  3%|▎         | 16/625 [02:25<1:17:46,  7.66s/it]

objective/kl: 47.933841705322266
ppo/returns/mean: -2.7767176628112793
ppo/policy/advantages_mean: -0.017772844061255455
---------------------------------------------------------------------------------------------------
> Ans: Yellow, Q: What was the electric electric chair?, P: What color was Alabama's electric chair?, R: 0.9382452964782715
> Ans: historical or mythological themes, Q: What are the earliest forms of Greek drama?, P: A tragedy typically involved what subject matter?, R: 0.812821090221405
> Ans: Pesticides are substances meant for attracting, seducing, and then destroying any pest, Q: What is the term pesticide?, P: What is the purpose of a pesticide?, R: 0.9418738484382629
> Ans: the 1720s, Q: In what period did French Freemasonry spread to France?, P: When did English Freemasonry arrive in France?, R: 0.8615116477012634
> Ans: TV Parental Guidelines, Q: What is the content of TCM's films rated on?, P: What rating system is often used by TCM?, R: 0.9328557252883911
> A

  3%|▎         | 17/625 [02:32<1:18:09,  7.71s/it]

objective/kl: 44.073875427246094
ppo/returns/mean: -2.3906631469726562
ppo/policy/advantages_mean: 0.013294518925249577
---------------------------------------------------------------------------------------------------
> Ans: Law of Tuvalu, Q: What is the Supreme Court of Tuvalu?, P: What is the High Court's right to determine?, R: 0.9320095777511597
> Ans: Northern Russians, Q: What does the report say that Y Haplogroup N is a common name in Pomors? question, P: Pomors are also known as what?, R: 0.8148460388183594
> Ans: isolate the rebel barons in London, Q: John's strategy was to hl>, P: What was John's strategy?, R: 0.8332576155662537
> Ans: 50, Q: How many acres of storage are available at the site?, P: Approximately how many acres of outside storage does the Port of New Haven offer?, R: 0.845833957195282
> Ans: Vigo, Q: What was the most famous assault on Galicia?, P: Where did Sir Francis Drake attack in 1585 and again in 1589?, R: 0.8155720829963684
> Ans: the 20th century, Q

  3%|▎         | 18/625 [02:40<1:17:33,  7.67s/it]

objective/kl: 43.64839553833008
ppo/returns/mean: -2.269979476928711
ppo/policy/advantages_mean: 0.0012756786309182644
---------------------------------------------------------------------------------------------------
> Ans: Gordon Gund, Q: Who did the Blindness lobbie in 1985 and 2005 for US DST extensions?, P: What chair of the Retinitis Pigmentosa Foundation Fighting Blindness lobbied for an extension to daylight savings in the U.S.?, R: 0.8174535036087036
> Ans: provide teachers, Q: What did Prince Rastislav request the Emperor to, P: What had Prince Rastislav requested?, R: 0.9422250390052795
> Ans: raising the "aerial" wire, Q: What did Marconi find out he had a long wire?, P: What is a way to increase the strength of a radio transmission?, R: 0.9233606457710266
> Ans: Early Middle Ages and the High Middle Ages, Q: What  What, P: Along with the Late Middle Ages, what are the other two period of the Middle Ages?, R: 0.7935548424720764
> Ans: multiple ISPs interconnect at peering 

  3%|▎         | 19/625 [02:47<1:16:49,  7.61s/it]

objective/kl: 49.40911865234375
ppo/returns/mean: -2.7184956073760986
ppo/policy/advantages_mean: -0.00646184291690588
---------------------------------------------------------------------------------------------------
> Ans: Siddhatta Gotama, Q: What did you know the Buddha did exist?, P: What do some say is Buddha's real name?, R: 0.9280003309249878
> Ans: kickback, Q: What is an official's share of misappropriated funds allocated from his or her organization to an organization involved in corrupt bidding, P: What is the public official's share called when involved in corrupt bidding?, R: 0.9325548410415649
> Ans: a kind of perception that can be conscious or unconscious, applying to people as well as electrons, Q: What is the term "prehension" meant to indicate?, P: What is prehension used to define?, R: 0.9412853717803955
> Ans: financial services, Q: What are other important industries?, P: Mutual funds and insurance are what type of industry?, R: 0.8272709846496582
> Ans: Delawar

  3%|▎         | 20/625 [02:55<1:16:45,  7.61s/it]

objective/kl: 43.545562744140625
ppo/returns/mean: -2.2506613731384277
ppo/policy/advantages_mean: -0.03077569231390953
---------------------------------------------------------------------------------------------------
> Ans: Tomás de Torquemada, Q: Who led the Spanish Inquisition?, P: Who led the Spanish Inquisition?, R: 0.9525741338729858
> Ans: He Yingqin, Q: What did the 4th Front Army carry?, P: Who was the Chinese commander-in-chief of the 10th and 27th Army Groups?, R: 0.8297029733657837
> Ans: studied classical music, Q: What have, P: What have classical trained performers done extensively?, R: 0.9214174151420593
> Ans: Ark Royal, Q: What did the, P: What aircraft carrier received maintenance at Devonport Dockyard?, R: 0.914445698261261
> Ans: Stout and porter, Q: What are dark beers made using roasted malts or roast barley?, P: What do you call a dark beer that is brewed with roasted malts or barley?, R: 0.9407511949539185
> Ans: 32 million, Q: How many albums did West sell?,

  3%|▎         | 21/625 [03:03<1:17:34,  7.71s/it]

objective/kl: 43.01519775390625
ppo/returns/mean: -2.3164620399475098
ppo/policy/advantages_mean: -0.016844402998685837
---------------------------------------------------------------------------------------------------
> Ans: al-Farabi's commentary, Q: What did he read about the work of Aristotle?, P: What helped Avicenna understand the Metaphysics of Aristotle?, R: 0.9352065324783325
> Ans: capitalism, Q: What did the Third International Theory reject?, P: What economic philosophy was associated with the West?, R: 0.933594286441803
> Ans: Liberal Democrats, Q: What did the SDP and Liberals form?, P: What was the merger of SDP and the Liberals called?, R: 0.9423277378082275
> Ans: occupied part of their capital, Berlin, for one night, Q: What was the city where, P: What city did Hadik occupy?, R: 0.9297762513160706
> Ans: Britain, Q: What did the Suez Crisis exposed to?, P: A British minister thought which country was becoming an 'American satellite'?, R: 0.8148893117904663
> Ans: pos

  4%|▎         | 22/625 [03:10<1:15:57,  7.56s/it]

objective/kl: 45.17784881591797
ppo/returns/mean: -2.681993007659912
ppo/policy/advantages_mean: -0.052433740347623825
---------------------------------------------------------------------------------------------------
> Ans: 26 square kilometres, Q: How many kilometers does Tuvalu have?, P: What is the total land area of Tuvalu?, R: 0.8449804782867432
> Ans: enormous influence, Q: What did each szlachcic have?, P: How much influence did each szlachcic have over politics?, R: 0.8603295683860779
> Ans: Robert Boyle, Q: Who discovered and described the reaction between iron filings and dilute acids?, P: Who discovered Hydrogen gas?, R: 0.9339531064033508
> Ans: 1,500, Q: How many were reported missing?, P: How many soldiers from Napoleon's army were reported missing?, R: 0.9417231678962708
> Ans: a specific cutoff date, Q: What is a specific cutoff date for reception?, P: Some reception statutes impose what?, R: 0.011168358847498894
> Ans: medium-sized aquatic birds with broad bills, eye

  4%|▎         | 23/625 [03:18<1:17:33,  7.73s/it]

objective/kl: 43.985755920410156
ppo/returns/mean: -2.577760696411133
ppo/policy/advantages_mean: -0.009899545460939407
---------------------------------------------------------------------------------------------------
> Ans: Materialism, Q: What is closely related to physicalism?, P: Some people consider physicalism to be synonymous with what?, R: 0.8358490467071533
> Ans: touchscreen, Q: What is the iPod Touch?, P: The iPod Touch uses what kind of interface?, R: 0.8461365699768066
> Ans: adaptive, restorative and regulatory, Q: What were Augustus' reforms represented?, P: How were Augustus's reforms viewed?, R: 0.8637587428092957
> Ans: calques, Q: What is the name of Indonesian words?, P: What term describes the literally translated Indonesian versions of Dutch terms that have become standard?, R: 0.933756947517395
> Ans: weekly evenings and monthly weekends, Q: context> What do members train on?, P: How often do the TA troops regularly train?, R: 0.827893853187561
> Ans: halva, Q:

  4%|▍         | 24/625 [03:27<1:21:32,  8.14s/it]

objective/kl: 43.43010711669922
ppo/returns/mean: -2.496980667114258
ppo/policy/advantages_mean: 0.021900741383433342
---------------------------------------------------------------------------------------------------
> Ans: $2.1 billion, Q: What did the cost of the bridge in the Detroit–Windsor bridge?, P: How much is the Gordie Howe International Bridge expected to cost?, R: 0.8295533061027527
> Ans: three field armies, Q: How many armies did the Chinese Spring Offensive have?, P: How many armies did the Chinese use in the Fifth Phase Offensive?, R: 0.9437017440795898
> Ans: the American Revolution, Q: What was the major event of Boston Massacre?, P: Boston became one of the wealthiest international ports after what war?, R: 0.8180480003356934
> Ans: imprisoned or martyred, Q: What did some Christians resisted and were resisted?, P: In areas of strict enforcement, what happened to Christians?, R: 0.8218377232551575
> Ans: oil, Q: What did the southwestern China price drop over specul

  4%|▍         | 25/625 [03:37<1:25:24,  8.54s/it]

objective/kl: 47.30708694458008
ppo/returns/mean: -2.697758674621582
ppo/policy/advantages_mean: 0.010769917629659176
---------------------------------------------------------------------------------------------------
> Ans: COSCO, Q: Who owned Pier II?, P: What organization owns Pier II in Piraeus?, R: 0.848210871219635
> Ans: classical civilisation, Q: What is the most enduring period for analysing European history?, P: What is another name for Antiquity?, R: 0.9301806092262268
> Ans: Themba Dlamini, Q: What did the government declare a humanitarian crisis?, P: Who was the prime minister of Swaziland in 2004?, R: 0.8181110620498657
> Ans: /u/, Q: How does the /o / o /, P: What letter remains distinct?, R: 0.8107894062995911
> Ans: mud brick houses, Q: What did the neolithic constructions appear in?, P: What type of homes were built in the Neolithic era?, R: 0.9360272884368896
> Ans: 1895, Q: When was Lancashire inception?, P: What year did Rugby League start?, R: 0.836193323135376
> 

  4%|▍         | 26/625 [03:44<1:22:25,  8.26s/it]

objective/kl: 45.65447235107422
ppo/returns/mean: -2.7148070335388184
ppo/policy/advantages_mean: -0.013831594958901405
---------------------------------------------------------------------------------------------------
> Ans: much greater than their separation d, Q: What does the length and width of the plates do?, P: In an ideal model of a capacitor, what must be assumed about the size of the plates?, R: 0.8355190753936768
> Ans: 1960, Q: In what year did Lee and Capote travel to Kansas?, P: What year did Lee and Capote go to Kansas together?, R: 0.8676168322563171
> Ans: Talaat Harb, Q: What did the company financed the studio Misr in 1936?, P: Who financed Studio Misr?, R: 0.846940279006958
> Ans: 1992, Q: In what year was the UK Polytechnics designated as universities?, P: In what year were polytechnics in the UK given the university designation?, R: 0.9445331692695618
> Ans: nine, Q: How many children did Victoria marry?, P: How many children did Queen Victoria and Prince Albert 

  4%|▍         | 27/625 [03:53<1:24:10,  8.44s/it]

objective/kl: 46.82549285888672
ppo/returns/mean: -2.499849319458008
ppo/policy/advantages_mean: -0.009788726456463337
---------------------------------------------------------------------------------------------------
> Ans: Jan III Sobieski, Q: Who was the king of the Ottomans?, P: Which polish king led the fight against the Ottoman empire in the Battle of Vienna?, R: 0.8330919146537781
> Ans: Zen, Q: What did Dwight Goddard focus on in his Buddhist scriptures?, P: Goddard collected mainly what type of Buddhist scripture?, R: 0.8347311615943909
> Ans: 18th century, Q: In what century did the Manchus dominate Tibet?, P: When was Tibet dominated by the Manchus?, R: 0.8578073382377625
> Ans: the Defense Clandestine Service, Q: What is the name of the global clandestine service?, P: What intelligence service did the Department of Defense recently create?, R: 0.9313071966171265
> Ans: mild and also wet, Q: What is the summer of the islands?, P: What is the weather like in the summer time 

  4%|▍         | 28/625 [04:02<1:24:38,  8.51s/it]

objective/kl: 48.8499641418457
ppo/returns/mean: -2.7073721885681152
ppo/policy/advantages_mean: -0.0047171451151371
---------------------------------------------------------------------------------------------------
> Ans: Viktor Suvorov, Q: Who was the writer of The Black Book of Communism?, P: Who was skeptical of Stalin’s policies?, R: 0.9321053624153137
> Ans: thundershowers, Q: What does the average annual rainfall occur in Charleston?, P: Half of Charleston's annual rainfall occurs in what form?, R: 0.8429701924324036
> Ans: forest, Q: What is the east coast of Appalachia covered?, P: What originally covered the east coast of the Appalachians?, R: 0.9370855689048767
> Ans: World Broadcasting System and Associated Music Publishers, Q: What were the dominant radio transcription producers?, P: What were the top licensees of the Western Electric system in the 1930s?, R: 0.9354323148727417
> Ans: 1948, Prime Minister U Nu embarked upon a policy of nationalisation and the state was de

  5%|▍         | 29/625 [04:09<1:20:32,  8.11s/it]

objective/kl: 46.11994934082031
ppo/returns/mean: -2.556302547454834
ppo/policy/advantages_mean: -0.026333559304475784
---------------------------------------------------------------------------------------------------
> Ans: Around the second century BC, Q: Around the second century BC, what were the first-known city-states in central Myanmar?, P: What period was the first known city states established in Myanmar?, R: 0.013850994408130646
> Ans: Louis Miriani, Q: What was the last mayor of Detroit?, P: Who was Detroit's last Republican mayor?, R: 0.8531922698020935
> Ans: 1948, Q: What did the National Security Council issue Directive 10/2 calling for covert action against USSR?, P: In what year was directive 10/2 issued?, R: 0.8359910845756531
> Ans: World Wealth Report, Q: What did the 2011, P: What organization ranked New Delhi 39th in economic activity in 2011?, R: 0.9202055931091309
> Ans: Latin Church, Q: What was the name of the earliest universities in Europe?, P: What church 

  5%|▍         | 30/625 [04:16<1:16:48,  7.74s/it]

objective/kl: 45.505855560302734
ppo/returns/mean: -2.624690532684326
ppo/policy/advantages_mean: -0.006543579511344433
---------------------------------------------------------------------------------------------------
> Ans: ASPEC, Q: What was the joint proposal of AT&T Bell Laboratories, Thomson Consumer Electronics, Fraunhofer Society and CNET, P: What was the name given to the proposal?, R: 0.9173706769943237
> Ans: five, Q: How many FA Cups did Arsenal win between 1988 and 2005?, P: Over a 17 year period which began in the late 80s, stretching five years into the new millenia, how many FA Cups did Arsenal win?, R: 0.6475770473480225
> Ans: City of Man, Q: What was the name of the city of God?, P: What does Civitas terrena mean?, R: 0.9235515594482422
> Ans: one of the world's poorest countries, Q: Liberia is what kind of country?, P: What is Liberia considered around the world economically?, R: 0.8291416764259338
> Ans: Shaye J. D. Cohen, Q: Who was the historian of mixed marriag

  5%|▍         | 31/625 [04:23<1:14:26,  7.52s/it]

objective/kl: 43.683555603027344
ppo/returns/mean: -2.618239402770996
ppo/policy/advantages_mean: -0.030731061473488808
---------------------------------------------------------------------------------------------------


In [None]:
# save model 
