### Hemonc

In [1]:
import os
import re
import time
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.error import HTTPError

import numpy as np
import openai
import pandas as pd
from Bio import Entrez
from tqdm import tqdm

In [2]:
# Columns kept from each raw HemOnc table.
columns_ref = ['study', 'condition', 'pmid', 'pub.date']
columns_result = ['study', 'condition', 'regimen', 'comparator', 'efficacy']
columns_indication = ['study', 'condition', 'stage_or_status']


def preprocess(df, col):
    """Keep only columns `col`, drop incomplete and duplicate rows, sort by `col`, renumber."""
    selected = df[col]
    selected = selected.dropna()
    selected = selected.drop_duplicates()
    selected = selected.sort_values(col)
    return selected.reset_index(drop=True)


ref = preprocess(pd.read_csv("Data/Raw/Hemonc/ref.table.csv"), columns_ref)
result = preprocess(pd.read_csv("Data/Raw/Hemonc/study_results.csv"), columns_result)
indication = preprocess(pd.read_csv("Data/Raw/Hemonc/indications.csv"), columns_indication)

# Raw efficacy wording -> standardised label. Whenever the standardised label
# contains 'Might Be ', its first len('Might Be ') characters are cut off.
efficacy = pd.read_excel("Data/Raw/Hemonc/efficacy.xlsx")
efficacy2label = {}
for raw_label, std_label in zip(efficacy['efficacy_raw'], efficacy['efficacy_std']):
    if 'Might Be ' in std_label:
        std_label = std_label[len('Might Be '):]
    efficacy2label[raw_label] = std_label

# Raw stage/status wording -> standardised label.
stage = pd.read_excel("Data/Raw/Hemonc/stage.xlsx")
stage2label = dict(zip(stage['stage_raw'], stage['stage_std']))

In [3]:
# (study, condition) -> set of PMIDs referenced for that study.
study2pmids = defaultdict(set)
for study, condition, pmid in ref[['study', 'condition', 'pmid']].itertuples(index=False, name=None):
    study2pmids[(study, condition)].add(pmid)

# (study, condition) -> set of (regimen, comparator, label) triples. Every
# comparison is stored in both directions, with Superior/Inferior swapped for
# the reversed pair.
study2result = defaultdict(set)
result_columns = ['study', 'condition', 'regimen', 'comparator', 'efficacy']
for study, condition, regimen, comparator, raw_efficacy in result[result_columns].itertuples(index=False, name=None):
    key = (study, condition)
    if key not in study2pmids:
        continue
    label = efficacy2label[raw_efficacy]
    if label == 'Other' or regimen == comparator:
        continue
    if 'Inferior' in label:
        switched = label.replace('Inferior', 'Superior')
    else:
        switched = label.replace('Superior', 'Inferior')
    study2result[key].add((regimen, comparator, label))
    study2result[key].add((comparator, regimen, switched))

# Forget the references of studies that ended up without any usable comparison.
for orphan in [key for key in study2pmids if key not in study2result]:
    del study2pmids[orphan]

# (study, condition) -> set of standardised stage/status labels.
study2stage = defaultdict(set)
for study, condition, raw_stage in indication[['study', 'condition', 'stage_or_status']].itertuples(index=False, name=None):
    key = (study, condition)
    if key in study2result:
        study2stage[key].add(stage2label[raw_stage])

In [22]:
# Contact address for NCBI E-utilities: fill in your own email before running (left blank here).
Entrez.email = "" # your email address
def get_text(pmid, retries=50):
    """Fetch the title (plus abstract, when present) of one PubMed record.

    Parameters
    ----------
    pmid
        PubMed identifier, passed to ``Entrez.efetch`` as ``id``.
    retries : int
        Maximum number of attempts. Only HTTP 429 responses are retried.

    Returns
    -------
    tuple
        ``(pmid, doc)`` where ``doc`` is the article title, followed by the
        abstract sections joined with single spaces if the record has an
        ``Abstract`` entry.

    Raises
    ------
    Exception
        On any HTTP error other than 429 (raised immediately), or when every
        attempt was answered with 429. Previously the exhausted-retries case
        fell off the end of the loop and returned ``None``.
    """
    for attempt in range(retries):
        try:
            handle = Entrez.efetch(db="pubmed", id=pmid, rettype="xml", retmode="text")
            try:
                record = Entrez.read(handle)
            finally:
                # Release the HTTP response even when parsing fails.
                handle.close()
            article = record['PubmedArticle'][0]['MedlineCitation']['Article']
            doc = article['ArticleTitle']
            if 'Abstract' in article:
                doc = ' '.join([doc] + article['Abstract']['AbstractText'])
            return pmid, doc
        except HTTPError as e:
            if e.code != 429:
                # Not a rate limit: retrying is pointless, fail right away.
                raise Exception(f"Failed to fetch data for PMID {pmid}: HTTP {e.code}.") from e
            # Linear back-off: sleep 0s, 1s, 2s, ... before the next attempt.
            time.sleep(attempt)
    raise Exception(f"Failed to fetch data for PMID {pmid} after {retries} retries.")

path_doc = "Data/Raw/Hemonc/docs.csv"

# Start from the on-disk cache (pmid -> doc) when one exists.
pmid2doc = {}
if os.path.exists(path_doc):
    cached_docs = pd.read_csv(path_doc)
    pmid2doc = dict(zip(cached_docs['pmid'], cached_docs['doc']))

# A PMID must be fetched when it is absent from the cache or cached as NaN.
pmids_wanted = {pmid for pmids in study2pmids.values() for pmid in pmids}
pmids_missing = pmids_wanted.difference(pmid2doc)
pmids_missing.update(
    pmid for pmid, doc in pmid2doc.items()
    if not isinstance(doc, str) and np.isnan(doc)
)

if pmids_missing:
    pmid2doc_missing = {}
    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = [pool.submit(get_text, pmid) for pmid in pmids_missing]
        for done in tqdm(as_completed(pending), total=len(pending)):
            try:
                fetched_pmid, fetched_doc = done.result()
                pmid2doc_missing[fetched_pmid] = fetched_doc
            except Exception as e:
                # Best effort: a failed fetch is reported and the PMID is left out.
                print(f"Error: {e}")

    # Second, sequential attempt for anything that came back as NaN.
    for fetched_pmid, fetched_doc in list(pmid2doc_missing.items()):
        if not isinstance(fetched_doc, str) and np.isnan(fetched_doc):
            pmid2doc_missing[fetched_pmid] = get_text(fetched_pmid)[1]

    # Merge into the cache and write it back to disk.
    pmid2doc.update(pmid2doc_missing)
    pmid2doc_df = pd.DataFrame(list(pmid2doc.items()), columns=['pmid', 'doc'])
    pmid2doc_df.to_csv(path_doc, index=False)

In [7]:
# Shared OpenAI client; the key is read from the OPENAI_API_KEY environment
# variable (KeyError if it is not set).
openai_client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
# Chat model name used by call_gpt.
openai_model = "gpt-4o"

def call_gpt(message_user, retries=50, max_tokens=4096):
    """Send one user message to the configured chat model and return the reply text.

    Parameters
    ----------
    message_user : str
        Content of the user turn; a fixed system prompt is prepended.
    retries : int
        Maximum number of attempts. Only rate-limit errors are retried.
    max_tokens : int
        Completion token cap forwarded to the API. Defaults to the value that
        used to be hard-coded here.

    Returns
    -------
    The ``message.content`` of the first choice in the response.

    Raises
    ------
    Exception
        Any non-rate-limit error is printed and re-raised unchanged; a generic
        ``Exception`` is raised once all attempts hit the rate limit.
    """
    messages = [
        {"role": "system", "content": 'You are a helpful assistant.'},
        {"role": "user", "content": message_user},
    ]
    for attempt in range(retries):
        try:
            response = openai_client.chat.completions.create(
                model=openai_model, messages=messages, max_tokens=max_tokens
            )
            return response.choices[0].message.content
        except openai.RateLimitError:
            # Linear back-off: 0s, 1s, 2s, ... (the first retry is immediate).
            time.sleep(attempt)
        except Exception as e:
            print(f"An error occurred: {e}")
            raise
    raise Exception(f"Failed to call GPT-4 after {retries} retries.")

In [10]:
# Five hand-written seed templates. The {REGIMEN}/{COMPARATOR}/{CONDITION}
# placeholders are filled per comparison with str.format when the dataset is built.
templates_qn = [
    'Choose an option that best describes the efficacy of {REGIMEN} compared to {COMPARATOR} when used to treat {CONDITION}.',
    'Select the option that most accurately reflects the effectiveness of {REGIMEN} versus {COMPARATOR} in treating {CONDITION}.',
    'Which option best summarizes the comparative efficacy of {REGIMEN} and {COMPARATOR} for managing {CONDITION}?',
    'Identify the option that best summarizes the effectiveness of {REGIMEN} versus {COMPARATOR} in treating {CONDITION}.',
    'Which option most effectively illustrates the efficacy of {REGIMEN} when compared with {COMPARATOR} for {CONDITION}?'
]
# Prompt that shows the seeds as "Version 0" .. "Version 4" and asks the model
# for further paraphrases. The seeds are interpolated verbatim, so their
# placeholders appear literally in the prompt.
rephrase = \
f'''
### Instruction
Do not respond to the question. 
Instead, rephrase the given question template into 20 other versions that are semantically equivalent.

## Version 0: {templates_qn[0]}
## Version 1: {templates_qn[1]}
## Version 2: {templates_qn[2]}
## Version 3: {templates_qn[3]}
## Version 4: {templates_qn[4]}
'''
response = call_gpt(rephrase)
# Only "## Version 5" .. "## Version 19" are parsed (at most 15 new templates),
# even though the prompt asks for 20 other versions.
matches = [re.search(fr"## Version {i}: (.*)", response) for i in range(5, 20)]           
# Missing or empty matches are skipped silently, so templates_qn can end up
# with fewer than 20 entries; nothing here checks the final count.
templates_qn += [each.group(1) for each in matches if each is not None and len(each.group(1))]

In [23]:
option2idx = {'superior': 1, 'inferior': 2, 'no difference': 3}

# One record per directed (regimen, comparator) comparison:
# (evidence, question 1..N, answer index).
records = []
for (study, condition), comparisons in study2result.items():
    key = (study, condition)
    # Evidence is every fetched document of the study, separated by blank lines.
    abstracts = [pmid2doc[pmid] for pmid in study2pmids[key]]
    evidence = '\n\n'.join(abstracts)
    # Append the sorted stage labels in parentheses when the study has any.
    if key in study2stage:
        stage_suffix = ' ({})'.format(', '.join(sorted(study2stage[key])))
    else:
        stage_suffix = ''
    condition_label = condition + stage_suffix
    for regimen, comparator, label in comparisons:
        if regimen == comparator:
            continue
        questions = tuple(
            template.format(REGIMEN=regimen, COMPARATOR=comparator, CONDITION=condition_label)
            for template in templates_qn
        )
        records.append((evidence,) + questions + (option2idx[label.lower()],))

columns = ['evidence', *(f'question {i}' for i in range(1, len(templates_qn) + 1)), 'answer']
# keep=False discards every row whose evidence + questions occur more than once.
dataset = (
    pd.DataFrame(records, columns=columns)
    .dropna()
    .drop_duplicates(subset=columns[:-1], keep=False)
    .reset_index(drop=True)
)
# The option texts are the same for every row.
for option, idx in option2idx.items():
    dataset[f'option {idx}'] = option
dataset.to_csv('Data/Input/Hemonc.csv', index=False)

In [4]:
# Show the Hemonc frame as this cell's output.
dataset

Unnamed: 0,evidence,question 1,question 2,question 3,question 4,question 5,question 6,question 7,question 8,question 9,...,question 15,question 16,question 17,question 18,question 19,question 20,answer,option 1,option 2,option 3
0,"A double-blind, placebo-controlled, randomized...",Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that best depicts how Cisplati...,Select the statement that most clearly describ...,Which choice best compares the treatment effic...,Find the option that best outlines the compara...,...,Point out the choice that best reflects the ef...,Select the option that best indicates the effe...,Choose the answer that most accurately showcas...,Which selection best reflects how Cisplatin an...,Identify the option that most precisely illust...,What is the best choice that outlines the diff...,2,superior,inferior,no difference
1,"A double-blind, placebo-controlled, randomized...",Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that best depicts how Cisplati...,Select the statement that most clearly describ...,Which choice best compares the treatment effic...,Find the option that best outlines the compara...,...,Point out the choice that best reflects the ef...,Select the option that best indicates the effe...,Choose the answer that most accurately showcas...,"Which selection best reflects how Cisplatin, T...",Identify the option that most precisely illust...,What is the best choice that outlines the diff...,1,superior,inferior,no difference
2,Thalidomide maintenance treatment increases pr...,Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that best depicts how Interfer...,Select the statement that most clearly describ...,Which choice best compares the treatment effic...,Find the option that best outlines the compara...,...,Point out the choice that best reflects the ef...,Select the option that best indicates the effe...,Choose the answer that most accurately showcas...,Which selection best reflects how Interferon a...,Identify the option that most precisely illust...,What is the best choice that outlines the diff...,1,superior,inferior,no difference
3,Thalidomide maintenance treatment increases pr...,Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that best depicts how Interfer...,Select the statement that most clearly describ...,Which choice best compares the treatment effic...,Find the option that best outlines the compara...,...,Point out the choice that best reflects the ef...,Select the option that best indicates the effe...,Choose the answer that most accurately showcas...,Which selection best reflects how Interferon a...,Identify the option that most precisely illust...,What is the best choice that outlines the diff...,2,superior,inferior,no difference
4,"Randomized, Double-Blind, Placebo-Controlled P...",Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that best depicts how Placebo ...,Select the statement that most clearly describ...,Which choice best compares the treatment effic...,Find the option that best outlines the compara...,...,Point out the choice that best reflects the ef...,Select the option that best indicates the effe...,Choose the answer that most accurately showcas...,Which selection best reflects how Placebo perf...,Identify the option that most precisely illust...,What is the best choice that outlines the diff...,2,superior,inferior,no difference
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6207,Adjuvant Abemaciclib Plus Endocrine Therapy fo...,Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that best depicts how Abemacic...,Select the statement that most clearly describ...,Which choice best compares the treatment effic...,Find the option that best outlines the compara...,...,Point out the choice that best reflects the ef...,Select the option that best indicates the effe...,Choose the answer that most accurately showcas...,Which selection best reflects how Abemaciclib ...,Identify the option that most precisely illust...,What is the best choice that outlines the diff...,1,superior,inferior,no difference
6208,Topotecan versus paclitaxel for the treatment ...,Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that best depicts how Paclitax...,Select the statement that most clearly describ...,Which choice best compares the treatment effic...,Find the option that best outlines the compara...,...,Point out the choice that best reflects the ef...,Select the option that best indicates the effe...,Choose the answer that most accurately showcas...,Which selection best reflects how Paclitaxel m...,Identify the option that most precisely illust...,What is the best choice that outlines the diff...,2,superior,inferior,no difference
6209,Topotecan versus paclitaxel for the treatment ...,Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that best depicts how Topoteca...,Select the statement that most clearly describ...,Which choice best compares the treatment effic...,Find the option that best outlines the compara...,...,Point out the choice that best reflects the ef...,Select the option that best indicates the effe...,Choose the answer that most accurately showcas...,Which selection best reflects how Topotecan mo...,Identify the option that most precisely illust...,What is the best choice that outlines the diff...,1,superior,inferior,no difference
6210,Bendamustine prolongs progression-free surviva...,Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that best depicts how CMF comp...,Select the statement that most clearly describ...,Which choice best compares the treatment effic...,Find the option that best outlines the compara...,...,Point out the choice that best reflects the ef...,Select the option that best indicates the effe...,Choose the answer that most accurately showcas...,Which selection best reflects how CMF performs...,Identify the option that most precisely illust...,What is the best choice that outlines the diff...,2,superior,inferior,no difference


### PubMedQA

In [6]:
import re
import json
import time
import openai
import random
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# Load the "pqa_labeled" data directory (train split) of qiaojin/PubMedQA via datasets.load_dataset.
raw = load_dataset("qiaojin/PubMedQA", data_dir="pqa_labeled", split="train")
# Dump it to a local .jsonl file; a later cell re-reads this path line by line with json.loads.
raw.to_json('Data/Raw/PubMedQA/PubMedQA.jsonl')

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

2201997

In [9]:
# Shared OpenAI client; the key is read from the OPENAI_API_KEY environment
# variable (KeyError if it is not set).
# NOTE(review): `os` is not imported in this section's import cell; this line
# relies on the `import os` of the first section having run in the same kernel.
openai_client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
# Chat model name used by call_gpt.
openai_model = "gpt-4o"

def call_gpt(message_user, retries=50, max_tokens=4096):
    """Send one user message to the configured chat model and return the reply text.

    Parameters
    ----------
    message_user : str
        Content of the user turn; a fixed system prompt is prepended.
    retries : int
        Maximum number of attempts. Only rate-limit errors are retried.
    max_tokens : int
        Completion token cap forwarded to the API. Defaults to the value that
        used to be hard-coded here.

    Returns
    -------
    The ``message.content`` of the first choice in the response.

    Raises
    ------
    Exception
        Any non-rate-limit error is printed and re-raised unchanged; a generic
        ``Exception`` is raised once all attempts hit the rate limit.
    """
    messages = [
        {"role": "system", "content": 'You are a helpful assistant.'},
        {"role": "user", "content": message_user},
    ]
    for attempt in range(retries):
        try:
            response = openai_client.chat.completions.create(
                model=openai_model, messages=messages, max_tokens=max_tokens
            )
            return response.choices[0].message.content
        except openai.RateLimitError:
            # Linear back-off: 0s, 1s, 2s, ... (the first retry is immediate).
            time.sleep(attempt)
        except Exception as e:
            print(f"An error occurred: {e}")
            raise
    raise Exception(f"Failed to call GPT-4 after {retries} retries.")

In [7]:
# Few-shot prompt; {QUESTION} is the only placeholder.
template_rephrase = \
'''
### Instruction
Do not respond to the question. 
Instead, follow the provided examples and rephrase the given question into 20 other versions that are semantically equivalent.

### Example
## Question: Which option best summarizes the comparative efficacy of regimen and comparator for managing condition?
## Version 1: Choose an option that best describes the efficacy of regimen compared to comparator when used to treat condition.
## Version 2: Select the option that most accurately reflects the effectiveness of regimen versus comparator in treating condition.
## Version 3: Identify the option that best summarizes the effectiveness of regimen versus comparator in treating condition.
## Version 4: Which option most effectively illustrates the efficacy of regimen when compared with comparator for condition?

### Your Task
## Question: {QUESTION}
'''

def rephrase_qn(question, retries=50):
    """Ask the model for paraphrases of `question` and return 20 distinct wordings.

    Parameters
    ----------
    question : str
        The original question; it is always element 0 of the result.
    retries : int
        How many completions to request before giving up.

    Returns
    -------
    list
        ``[question, v1, ..., v19]`` with all 20 entries distinct, or
        ``[question] + [None]*19`` when no completion yielded 19 usable,
        distinct versions.

    Notes
    -----
    Captured versions are stripped of surrounding whitespace (``.`` matches a
    trailing ``\\r``), whitespace-only captures count as missing, and an empty
    or ``None`` completion counts as a failed attempt instead of crashing
    ``re.search``.
    """
    query = template_rephrase.format(QUESTION=question)
    # Versions 1..19 plus the original question make 20 wordings.
    patterns = [re.compile(fr"## Version {i}: (.*)") for i in range(1, 20)]
    for _ in range(retries):
        response = call_gpt(query) or ""
        versions = []
        for pattern in patterns:
            match = pattern.search(response)
            text = match.group(1).strip() if match else ""
            if not text:
                break
            versions.append(text)
        else:
            questions = [question] + versions
            if len(set(questions)) == 20:
                return questions
    return [question] + [None]*19

In [10]:
def prepare_row(idx, row):
    """Turn one raw PubMedQA record into a dataset row.

    Returns ``(idx, (evidence, *questions, answer))``: evidence is the record's
    context passages joined by blank lines, the questions come from
    ``rephrase_qn`` and the answer is ``option2idx[row['final_decision']]``.
    """
    passages = row['context']['contexts']
    evidence = '\n\n'.join(passages)
    variants = rephrase_qn(row['question'])
    label = option2idx[row['final_decision']]
    return idx, (evidence,) + tuple(variants) + (label,)

# Read the JSON Lines dump; the context manager closes the file once it is
# parsed (the bare open() inside the comprehension never closed it).
with open('Data/Raw/PubMedQA/PubMedQA.jsonl') as fin:
    raw = [json.loads(line) for line in fin]
# Answer label -> 1-based option index.
option2idx = {'yes':1, 'no':2, 'maybe':3}

# Paraphrase every question concurrently; results are keyed by input position
# because futures complete out of order.
dataset, max_workers = {}, 50
with ThreadPoolExecutor(max_workers=max_workers) as exe:
    futures = [exe.submit(prepare_row, idx, row) for idx, row in enumerate(raw)]
    for future in tqdm(as_completed(futures), total=len(futures)):
        try:
            idx, row = future.result()
            dataset[idx] = row
        except Exception as e:
            # Best effort: a failed row is reported and left out of `dataset`.
            print(f"Error: {e}")

100%|███████████████████████████████████████████████████████████████████████████████████| 1000/1000 [05:24<00:00,  3.08it/s]


In [11]:
columns = ['evidence', *(f'question {i}' for i in range(1, 21)), 'answer']
# Restore input order (rows were collected as futures completed).
ordered_rows = [dataset[position] for position in sorted(dataset)]
# Drop rows with failed paraphrases (None) and, with keep=False, every row
# whose evidence + questions occur more than once.
dataset = (
    pd.DataFrame(ordered_rows, columns=columns)
    .dropna()
    .drop_duplicates(subset=columns[:-1], keep=False)
    .reset_index(drop=True)
)
# The option texts are the same for every row.
for option, idx in option2idx.items():
    dataset[f'option {idx}'] = option
dataset.to_csv('Data/Input/PubMedQA.csv', index=False)

In [12]:
# Show the PubMedQA frame as this cell's output.
dataset

Unnamed: 0,evidence,question 1,question 2,question 3,question 4,question 5,question 6,question 7,question 8,question 9,...,question 15,question 16,question 17,question 18,question 19,question 20,answer,option 1,option 2,option 3
0,Programmed cell death (PCD) is the regulated d...,Do mitochondria play a role in remodelling lac...,Are mitochondria involved in the remodeling of...,Do mitochondria contribute to the restructurin...,Is there a role for mitochondria in the transf...,Do mitochondria influence how lace plant leave...,What role do mitochondria have in the reformat...,Are mitochondria responsible for changes in la...,Is the restructuring of lace plant leaves duri...,Do mitochondria affect the way lace plant leav...,...,Do mitochondria play a significant part in the...,Is mitochondrial involvement evident in the re...,To what extent do mitochondria contribute to l...,Are mitochondrial roles crucial in shaping lac...,Do mitochondria have an essential function in ...,Is there a mitochondrial role in the construct...,1,yes,no,maybe
1,Assessment of visual acuity depends on the opt...,Landolt C and snellen e acuity: differences in...,How do the Landolt C and Snellen E tests diffe...,What are the distinctions between Landolt C an...,In what ways do Landolt C and Snellen E acuity...,How does acuity measurement compare between La...,What are the key differences between Landolt C...,How do the acuity assessments of Landolt C and...,What differences exist between the Landolt C a...,"When assessing strabismus amblyopia, how do th...",...,"In cases of strabismus amblyopia, how does the...",What differences are there in assessing strabi...,How do Landolt C and Snellen E tests compare i...,How does the acuity assessment with Landolt C ...,How are the Landolt C and Snellen E tests diff...,What is the difference in acuity testing betwe...,2,yes,no,maybe
2,Apparent life-threatening events in infants ar...,"Syncope during bathing in infants, a pediatric...",Could syncope in infants during bathing be a p...,Is the occurrence of syncope in infants while ...,Might fainting episodes in infants during bath...,Are syncope episodes in infants during bathing...,Does syncope in infants when bathing suggest a...,Is fainting in infants during bathing a sign o...,Could fainting episodes during baths in infant...,Is there a possibility that syncope in infants...,...,Might an infant experiencing syncope during a ...,Is there a connection between bathing-induced ...,Can syncope in infants during bathing be attri...,Is syncope during bathing in infants potential...,Might the syncope observance in infants while ...,Could infant syncope during bathing sessions i...,1,yes,no,maybe
3,The transanal endorectal pull-through (TERPT) ...,Are the long-term results of the transanal pul...,Do the long-term outcomes of the transanal pul...,How do the long-term results of the transanal ...,Are the long-term effects of the transanal pul...,Do the long-term results for the transanal pul...,Are the transanal pull-through's long-term out...,How do the transanal pull-through's long-term ...,Is the transanal pull-through's long-term effe...,Do the long-term effects of transanal pull-thr...,...,Does the transanal pull-through provide long-t...,Are the sustained results of the transanal pul...,How do transanal pull-through and transabdomin...,Are the transanal pull-through's results over ...,Are the long-term results from transanal pull-...,Is the long-term effectiveness of the transana...,2,yes,no,maybe
4,Telephone counseling and tailored print commun...,Can tailored interventions increase mammograph...,Are personalized interventions effective in bo...,Do customized interventions lead to increased ...,Can individualized interventions enhance the r...,Are HMO women more likely to undergo mammograp...,Do specific interventions designed for HMO wom...,Can the implementation of tailored interventio...,Are mammography usage rates among HMO women af...,Will personalized programs encourage more HMO ...,...,Are HMO women more inclined to use mammography...,Do tailored strategies for HMO women enhance t...,Can customized interventions boost mammography...,Are personalized efforts successful in raising...,Will tailored intervention efforts lead to a r...,Are interventions specifically designed for HM...,1,yes,no,maybe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,"After 34 weeks gestation, summary measures of ...",Does gestational age misclassification explain...,Could the misclassification of gestational age...,Is the variation in birthweights between Austr...,Can the difference in birthweights between Aus...,Might the misclassification of gestational age...,Does the misclassification of gestational age ...,Is gestational age misclassification a reason ...,Can the observed birthweight disparities betwe...,Could the differences in birthweights between ...,...,Could the disparity in birthweights of Austral...,Are differences in assessing gestational age r...,Does the variation in birthweights between Aus...,Is the disparity in birthweights between Austr...,Might the classification of gestational age er...,Is the difference in birthweights between Aust...,2,yes,no,maybe
996,To evaluate the accuracy of ultrasonographic e...,Is there any interest to perform ultrasonograp...,Is performing ultrasonography on boys with und...,Does the prospect of using ultrasonography in ...,Is there any reason to consider ultrasonograph...,Is there an interest in utilizing ultrasonogra...,Are there any benefits to conducting ultrasono...,Is there any value in performing ultrasonograp...,Is ultrasonography a valued method for evaluat...,Does ultrasonography provide any advantages in...,...,What is the level of interest in the applicati...,Is performing ultrasonography deemed interesti...,Could ultrasonography be of any interest when ...,Would there be any merit in performing ultraso...,Is it worthwhile to perform ultrasonography on...,Is there any clinical interest in using ultras...,2,yes,no,maybe
997,We analyzed the pharmacokinetic-pharmacodynami...,Is peak concentration needed in therapeutic dr...,Is it necessary to measure peak concentration ...,Should peak concentration be considered in the...,Is assessing peak concentration essential in t...,Do we need to monitor peak concentration in th...,Is measuring peak concentration required in va...,Is checking peak concentration important in th...,Is it important to include peak concentration ...,Is the assessment of peak concentration necess...,...,Is the measurement of peak concentration cruci...,Does therapeutic drug monitoring of vancomycin...,Should peak concentration measurements be incl...,Is it essential to determine peak concentratio...,Is evaluating peak concentration a necessity i...,Is it critical to monitor peak concentration w...,2,yes,no,maybe
998,This investigation assesses the effect of plat...,Can autologous platelet-rich plasma gel enhanc...,Does autologous platelet-rich plasma gel impro...,Is the healing after mandibular third molar ex...,Can the use of autologous platelet-rich plasma...,Does applying autologous platelet-rich plasma ...,Is surgical healing improved by autologous pla...,Can the healing outcome be enhanced by autolog...,Does the application of autologous platelet-ri...,Is recovery after surgical extraction of mandi...,...,Is the healing period reduced after mandibular...,Does autologous platelet-rich plasma gel contr...,Can the healing process be enhanced with autol...,Does the use of autologous platelet-rich plasm...,Is healing after mandibular third molar surgic...,Are healing outcomes better with autologous pl...,1,yes,no,maybe


### NQ

In [1]:
import re
import json
import time
import openai
import random
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from datasets import load_dataset
from concurrent.futures import ThreadPoolExecutor, as_completed

In [10]:
# Load the validation split of google-research-datasets/natural_questions via datasets.load_dataset.
raw = load_dataset("google-research-datasets/natural_questions", split="validation")
# Dump it to a local .jsonl file; a later cell re-reads this path line by line with json.loads.
raw.to_json('Data/Raw/NQ/NQ.jsonl')

Creating json from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

3712458270

In [2]:
# Shared OpenAI client; the key is read from the OPENAI_API_KEY environment
# variable (KeyError if it is not set).
# NOTE(review): `os` is not imported in this section's import cell; this line
# relies on the `import os` of the first section having run in the same kernel.
openai_client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
# Chat model name used by call_gpt.
openai_model = "gpt-4o"

def call_gpt(message_user, retries=50, max_tokens=512):
    """Send one user message to the configured chat model and return the reply text.

    Parameters
    ----------
    message_user : str
        Content of the user turn; a fixed system prompt is prepended.
    retries : int
        Maximum number of attempts. Only rate-limit errors are retried.
    max_tokens : int
        Completion token cap forwarded to the API. Defaults to the value that
        used to be hard-coded in this section (512).

    Returns
    -------
    The ``message.content`` of the first choice in the response.

    Raises
    ------
    Exception
        Any non-rate-limit error is printed and re-raised unchanged; a generic
        ``Exception`` is raised once all attempts hit the rate limit.
    """
    messages = [
        {"role": "system", "content": 'You are a helpful assistant.'},
        {"role": "user", "content": message_user},
    ]
    for attempt in range(retries):
        try:
            response = openai_client.chat.completions.create(
                model=openai_model, messages=messages, max_tokens=max_tokens
            )
            return response.choices[0].message.content
        except openai.RateLimitError:
            # Linear back-off: 0s, 1s, 2s, ... (the first retry is immediate).
            time.sleep(attempt)
        except Exception as e:
            print(f"An error occurred: {e}")
            raise
    raise Exception(f"Failed to call GPT-4 after {retries} retries.")

In [3]:
# Few-shot prompt; {QUESTION} is the only placeholder.
template_rephrase = \
'''
### Instruction
Do not respond to the question. 
Instead, follow the provided examples and rephrase the given question into 20 other versions that are semantically equivalent.

### Example
## Question: Which option best summarizes the comparative efficacy of regimen and comparator for managing condition?
## Version 1: Choose an option that best describes the efficacy of regimen compared to comparator when used to treat condition.
## Version 2: Select the option that most accurately reflects the effectiveness of regimen versus comparator in treating condition.
## Version 3: Identify the option that best summarizes the effectiveness of regimen versus comparator in treating condition.
## Version 4: Which option most effectively illustrates the efficacy of regimen when compared with comparator for condition?

### Your Task
## Question: {QUESTION}
'''

def rephrase_qn(question, retries=20):
    """Ask the model for paraphrases of `question` and return 20 distinct wordings.

    Parameters
    ----------
    question : str
        The original question; it is always element 0 of the result.
    retries : int
        How many completions to request before giving up.

    Returns
    -------
    list
        ``[question, v1, ..., v19]`` with all 20 entries distinct, or
        ``[question] + [None]*19`` when no completion yielded 19 usable,
        distinct versions.

    Notes
    -----
    Captured versions are stripped of surrounding whitespace (``.`` matches a
    trailing ``\\r``), whitespace-only captures count as missing, and an empty
    or ``None`` completion counts as a failed attempt instead of crashing
    ``re.search``.
    """
    query = template_rephrase.format(QUESTION=question)
    # Versions 1..19 plus the original question make 20 wordings.
    patterns = [re.compile(fr"## Version {i}: (.*)") for i in range(1, 20)]
    for _ in range(retries):
        response = call_gpt(query) or ""
        versions = []
        for pattern in patterns:
            match = pattern.search(response)
            text = match.group(1).strip() if match else ""
            if not text:
                break
            versions.append(text)
        else:
            questions = [question] + versions
            if len(set(questions)) == 20:
                return questions
    return [question] + [None]*19

In [4]:
# Few-shot prompt; {QUESTION} and {ANSWER} are the only placeholders.
template_mcq = \
'''
### Instruction
Using the provided examples as a guide, transform the given question with a correct answer into a multiple-choice question. 
Provide two additional incorrect options that are similar in type or category to the correct answer.

# Example 1
## Question: Which continent is the largest by land area?
## Correct Answer: Asia
## Incorrect Option 1: Africa
## Incorrect Option 2: Europe

# Example 2
## Question: Is the last name scott irish or scottish?
## Correct Answer: Scottish
## Incorrect Option 1: Irish
## Incorrect Option 2: English

# Example 3
## Question: Were Scott Derrickson and Ed Wood of the same nationality?
## Correct Answer: Yes
## Incorrect Option 1: No
## Incorrect Option 2: Maybe

# Your Task
## Question: {QUESTION}
## Correct Answer: {ANSWER}
'''

def convert_mcq(question, answer, retries=20):
    """Build three shuffled answer options: the correct one plus two model-written distractors.

    Parameters
    ----------
    question, answer : str
        The question and its correct answer, inserted into ``template_mcq``.
    retries : int
        How many completions to request before giving up.

    Returns
    -------
    list
        Three distinct ``str.capitalize()``-d options in random order, or
        ``[answer.capitalize(), None, None]`` when no completion produced two
        usable distractors.

    Notes
    -----
    Distractors are stripped of surrounding whitespace (``.`` matches a
    trailing ``\\r``) before use, and an empty or ``None`` completion counts as
    a failed attempt. The answer is only capitalised, never stripped, because
    the caller locates it with ``options.index(answer.capitalize())``.
    """
    query = template_mcq.format(QUESTION=question, ANSWER=answer)
    for _ in range(retries):
        response = call_gpt(query) or ""
        found = [
            re.search(fr"## Incorrect Option {i}: (.*)", response)
            for i in (1, 2)
        ]
        distractors = [match.group(1).strip() if match else "" for match in found]
        if all(distractors):
            options = [answer.capitalize()] + [each.capitalize() for each in distractors]
            random.shuffle(options)
            if len(set(options)) == 3:
                return options
    return [answer.capitalize(), None, None]

In [5]:
def extract_evidence(evidence):
    """Return the text of every non-empty <p> element in `evidence`, joined by single spaces."""
    soup = BeautifulSoup(evidence, "html.parser")
    paragraphs = []
    for tag in soup.find_all("p"):
        text = tag.get_text(strip=True)
        if text:
            paragraphs.append(text)
    return ' '.join(paragraphs)

def prepare_row(idx, row):
    """Build one output record for the NQ example `row`.

    The first entry of `row['answer']` is used as the correct answer. The
    record is `(evidence, 20 question variants, 1-based position of the
    correct option, 3 options)`, returned together with `idx` so results
    that complete out of order can be put back in place.
    """
    question = row['question']
    gold = row['answer'][0]
    paraphrases = rephrase_qn(question)
    choices = convert_mcq(question, gold)
    gold_position = choices.index(gold.capitalize()) + 1
    # Look up the source document by example id and rebuild its text from tokens.
    source = raw[raw_id2idx[row['id']]]
    tokens = source['document']['tokens']['token']
    passage = extract_evidence(' '.join(tokens))
    return idx, (passage, *paraphrases, gold_position, *choices)

# Both inputs are JSON-lines files (one JSON object per line). Read them inside
# `with` blocks so the file handles are closed once parsing is done.
with open('Data/Raw/NQ/NQ.jsonl') as f:
    raw = [json.loads(line) for line in f]
with open('Data/Raw/NQ/NQ-GenRead.jsonl') as f:
    raw_genread = [json.loads(line) for line in f]
# Map each record id to its position in `raw` (used by `prepare_row`).
raw_id2idx = {each['id']:idx for idx, each in enumerate(raw)}

# Process every GenRead row on a thread pool. Results are stored by row index
# because `as_completed` yields them in completion order, not submission order.
dataset, max_workers = {}, 50
with ThreadPoolExecutor(max_workers=max_workers) as exe:
    futures = [exe.submit(prepare_row, idx, row) for idx, row in enumerate(raw_genread)]
    for future in tqdm(as_completed(futures), total=len(futures)):
        try:
            idx, row = future.result()
            dataset[idx] = row
        except Exception as e:
            # Best effort: a failed row is reported and left out of `dataset`.
            print(f"Error: {e}")

100%|██████████████████████████████████████████████████████████████████████████████████| 3610/3610 [24:17<00:00,  2.48it/s]


In [6]:
# Column layout: evidence, 20 question variants, answer position, 3 options.
columns = [
    'evidence',
    *(f'question {i}' for i in range(1, 21)),
    'answer',
    *(f'option {i}' for i in range(1, 4)),
]
# Restore the original row order, then drop incomplete rows and every row whose
# evidence + question variants (the first 21 columns) occur more than once.
dataset = pd.DataFrame([dataset[key] for key in sorted(dataset)], columns=columns)
dataset = (
    dataset
    .dropna()
    .drop_duplicates(subset=columns[:21], keep=False)
    .reset_index(drop=True)
)
dataset.to_csv('Data/Input/NQ.csv', index=False)

In [7]:
# Show the `dataset` DataFrame as the cell output.
dataset

Unnamed: 0,evidence,question 1,question 2,question 3,question 4,question 5,question 6,question 7,question 8,question 9,...,question 15,question 16,question 17,question 18,question 19,question 20,answer,option 1,option 2,option 3
0,The Nobel Prize in Physics ( Swedish : Nobelpr...,who got the first nobel prize in physics?,Who was the inaugural recipient of the Nobel P...,Who was awarded the first Nobel Prize in Physics?,Can you tell me who received the first Nobel P...,Who is recognized as the first Nobel laureate ...,Who was honored with the first Nobel Prize in ...,Who was the first individual to win the Nobel ...,Can you name the first person to be awarded th...,Who was the earliest recipient of the Nobel Pr...,...,Who was the first to be honored with the Nobel...,Who achieved the first Nobel Prize award in Ph...,Who was the first laureate of the Nobel Prize ...,Who gained the inaugural Nobel Prize in Physics?,Who was the debut winner of the Nobel Prize in...,Who earned the first Nobel Prize in the Physic...,1,Wilhelm conrad röntgen,Marie curie,Albert einstein
1,Deadpool 2 is an upcoming American superhero f...,when is the next deadpool movie being released?,What is the release date for the next Deadpool...,Can you tell me when the upcoming Deadpool mov...,When will the next installment of the Deadpool...,Do you know the release date for the upcoming ...,When should we expect the next Deadpool movie ...,Could you provide the release date for the new...,What is the scheduled release date for the nex...,When is the anticipated release date of the up...,...,What is the release schedule for the upcoming ...,When is the official release date for the next...,Can you tell me when the next Deadpool film wi...,When is the release schedule for the next Dead...,Do you know when the next film in the Deadpool...,When will the new Deadpool movie be released t...,3,"July 10, 2018","June 1, 2018","May 18 , 2018"
2,Shortwave radio is radio transmission using sh...,which mode is used for short wave broadcast se...,What is the mode employed for short wave broad...,Which mode is utilized for broadcasting short ...,Identify the mode that is applied in short wav...,What mode is adopted for short wave broadcast ...,Which mode serves as the standard for short wa...,"In short wave broadcast service, which mode is...",What mode is typically used for services broad...,Determine the mode that is used for short wave...,...,Which operational mode is used in short wave b...,What is the preferred mode for short wave broa...,Specify the mode that is used for short wave b...,What mode do short wave broadcasting services ...,Which mode is designated for use in short wave...,What mode is standard for short wave broadcast...,3,Rtty,Psk31,Olivia
3,Nigeria is a country in West Africa . Nigeria ...,the south west wind blows across nigeria between?,During which period does the southwest wind bl...,Across which time does the southwest wind make...,When does the southwest wind sweep across Nige...,At what times do the southwest winds affect Ni...,Over what span does Nigeria experience southwe...,The southwest wind crosses Nigeria at what time?,What is the time frame for the southwest wind ...,During what months do the southwest winds occu...,...,Across which season do southwest winds blow in...,At what time span does Nigeria experience the ...,When is the southwest wind active across Nigeria?,The influence of the southwest wind is felt in...,Which period marks the presence of southwest w...,During what period is Nigeria under the impact...,3,Beginning of august,End of october,Till september
4,Health or vitality is an attribute assigned to...,what does hp mean in war and order?,What is the meaning of hp in the context of Wa...,"In War and Order, what does the term hp refer to?",Could you explain what hp stands for in War an...,"In War and Order, what does hp denote?",What does hp signify within War and Order?,How is hp defined in War and Order?,What is the interpretation of hp in War and Or...,"In the game War and Order, what does hp mean?",...,What does hp imply in War and Order?,"In War and Order, what does the acronym hp sta...",What does the term hp mean in War and Order ga...,"In the context of War and Order, what is hp?",Can you clarify what hp means in War and Order?,What does hp stand for in the game War and Order?,1,Hit points or health points,Hero points,High power
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3591,"In astronomy , the geocentric model ( also kno...",who challenged the aristotelian model of a geo...,Who questioned the Aristotelian idea of a geoc...,Who disputed the concept of a geocentric unive...,Which individual contested the Aristotelian vi...,Who opposed the traditional Aristotelian geoce...,Who contradicted the idea of an Earth-centered...,Which person challenged Aristotle's notion of ...,Who was the critic of the Earth-centered unive...,Who was responsible for challenging the Aristo...,...,Who defied the understanding of a geocentric u...,Which scholar challenged the geocentric worldv...,Who undermined the model of a universe with Ea...,Who called into question the Aristotelian asse...,Who offered a challenge to the Earth-centric c...,Who contested the notion of an Earth-centered ...,2,Ptolemy,Copernicus,Galileo
3592,The Miraculous Journey of Edward Tulane is a 2...,when was the miraculous journey of edward tula...,"What is the publication date of ""The Miraculou...","In which year was ""The Miraculous Journey of E...","When did ""The Miraculous Journey of Edward Tul...","Can you tell me the year ""The Miraculous Journ...","When did the book titled ""The Miraculous Journ...","What year was ""The Miraculous Journey of Edwar...","When was ""The Miraculous Journey of Edward Tul...","Could you specify the publication year of ""The...",...,"Can you provide the year that ""The Miraculous ...","When was the date of publication for ""The Mira...","Could you inform me when ""The Miraculous Journ...","When did ""The Miraculous Journey of Edward Tul...","What is the official publication year of ""The ...","In which year did ""The Miraculous Journey of E...",2,2004,2006,2008
3593,"Lord Banquo / ˈbæŋkwoʊ / , the Thane of Lochab...",character in macbeth who is murdered and appea...,Which character in Macbeth is killed and later...,Identify the character in Macbeth who is slain...,"In Macbeth, which character is murdered and su...",Who in Macbeth is assassinated and later shows...,"In Macbeth, which individual is killed and rea...",Which character meets their end in Macbeth but...,Identify the murdered character in Macbeth who...,Who is the character that is slain in Macbeth ...,...,Identify the ghostly character in Macbeth who ...,Who appears as a ghost in Macbeth after being ...,"In Macbeth, whose death is followed by a ghost...","In Shakespeare's Macbeth, which character is m...",Which person is murdered in Macbeth and then s...,Who is the character in Macbeth that returns a...,3,Macduff,Duncan,Banquo
3594,As You Like It is a pastoral comedy by William...,when was as you like it first performed?,"In what year did the play ""As You Like It"" hav...","Can you tell me the first time ""As You Like It...","When did the initial performance of ""As You Li...",What is the date of the premiere performance o...,"What year marks the debut performance of ""As Y...","When was ""As You Like It"" staged for the first...","Could you inform me about the year ""As You Lik...","What year was the first staging of ""As You Lik...",...,"When did audiences first see ""As You Like It"" ...","Can you specify the year ""As You Like It"" was ...","What year did ""As You Like It"" have its inaugu...","When did the play ""As You Like It"" first appea...","Which year saw the first performance of ""As Yo...","Can you indicate when ""As You Like It"" was fir...",2,1611,1603,1599


### HotpotQA

In [1]:
import json
import os
import random
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import openai
import pandas as pd
from tqdm import tqdm

In [2]:
# The API key is read from the environment rather than stored in the notebook.
openai_client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
openai_model = "gpt-4o"

def call_gpt(message_user, retries=50):
    """Send one user message to `openai_model` and return the reply text.

    Args:
        message_user: Content of the user turn.
        retries: Number of attempts made while the API reports a rate limit.

    Returns:
        The reply text. An empty string is returned when the reply carries no
        text content, so callers can always pass the result to `re.search`.

    Raises:
        Exception: after `retries` consecutive rate-limit errors. Any other
            error is printed and re-raised immediately.
    """
    messages = [{"role": "system", "content": 'You are a helpful assistant.'}, 
                {"role": "user", "content": message_user}]
    last_error = None
    for i in range(retries):
        try:
            response = openai_client.chat.completions.create(
                model = openai_model, messages = messages, max_tokens = 512
            )
            # `message.content` can be None; normalize it to "" so the regex
            # parsing in the callers retries instead of raising a TypeError.
            return response.choices[0].message.content or ""
        except openai.RateLimitError as e:
            last_error = e
            # Linear back-off: wait 0s, 1s, 2s, ... between attempts.
            time.sleep(i)
        except Exception as e:
            print(f"An error occurred: {e}")
            raise
    raise Exception(f"Failed to call GPT-4 after {retries} retries.") from last_error

In [3]:
# Prompt used by `rephrase_qn` to obtain paraphrases of `{QUESTION}` (filled in
# with `str.format`). The reply is parsed for "## Version N: ..." lines.
# NOTE(review): the instruction asks for 20 versions, but `rephrase_qn` only reads
# versions 1-19 and uses the original question as the 20th - confirm this is intended.
template_rephrase = \
'''
### Instruction
Do not respond to the question. 
Instead, follow the provided examples and rephrase the given question into 20 other versions that are semantically equivalent.

### Example
## Question: Which option best summarizes the comparative efficacy of regimen and comparator for managing condition?
## Version 1: Choose an option that best describes the efficacy of regimen compared to comparator when used to treat condition.
## Version 2: Select the option that most accurately reflects the effectiveness of regimen versus comparator in treating condition.
## Version 3: Identify the option that best summarizes the effectiveness of regimen versus comparator in treating condition.
## Version 4: Which option most effectively illustrates the efficacy of regimen when compared with comparator for condition?

### Your Task
## Question: {QUESTION}
'''

def rephrase_qn(question, retries=20):
    """Return `question` followed by 19 LLM-generated paraphrases.

    Args:
        question: The question to paraphrase.
        retries: Maximum number of LLM calls before giving up.

    Returns:
        A list of 20 distinct strings (the original question first), or
        `[question] + [None]*19` if no reply yielded 19 usable, distinct
        paraphrases.
    """
    query = template_rephrase.format(**{'QUESTION':question})
    for _ in range(retries):
        response = call_gpt(query)
        matches = [re.search(fr"## Version {i}: (.*)", response) for i in range(1, 20)]
        if not all(matches):
            continue
        # `.` stops at "\n" but still captures trailing spaces and "\r";
        # strip them so blank or padded paraphrases are not accepted as-is.
        rephrased = [each.group(1).strip() for each in matches]
        if not all(rephrased):
            continue
        questions = [question] + rephrased
        if len(set(questions)) == 20: return questions
    return [question] + [None]*19

In [4]:
# Few-shot prompt used by `convert_mcq` to turn a (question, correct answer) pair
# into a multiple-choice item. `{QUESTION}` and `{ANSWER}` are filled in with
# `str.format`; the reply is parsed for its "## Incorrect Option 1/2:" lines.
template_mcq = \
'''
Using the provided examples as a guide, transform the given question with a correct answer into a multiple-choice question. 
Provide two additional incorrect options that are similar in type or category to the correct answer.

# Example 1
## Question: Which continent is the largest by land area?
## Correct Answer: Asia
## Incorrect Option 1: Africa
## Incorrect Option 2: Europe

# Example 2
## Question: Is the last name scott irish or scottish?
## Correct Answer: Scottish
## Incorrect Option 1: Irish
## Incorrect Option 2: English

# Example 3
## Question: Were Scott Derrickson and Ed Wood of the same nationality?
## Correct Answer: Yes
## Incorrect Option 1: No
## Incorrect Option 2: Maybe

# Your task
## Question: {QUESTION}
## Correct Answer: {ANSWER}
'''

def convert_mcq(question, answer, retries=20):
    """Ask the LLM for two distractors and return a shuffled 3-option list.

    Args:
        question: Question text inserted into `template_mcq`.
        answer: Correct answer. It appears in the result as
            `answer.capitalize()`, so callers can locate it with `list.index`.
        retries: Maximum number of LLM calls before giving up.

    Returns:
        Three distinct capitalized options in random order, or
        `[answer.capitalize(), None, None]` if no usable reply was obtained.
    """
    query = template_mcq.format(**{'QUESTION':question, 'ANSWER':answer})
    correct = answer.capitalize()
    for _ in range(retries):
        response = call_gpt(query)
        match1 = re.search(r"## Incorrect Option 1: (.*)", response)
        match2 = re.search(r"## Incorrect Option 2: (.*)", response)
        if not (match1 and match2):
            continue
        # `.` stops at "\n" but still captures trailing spaces and "\r";
        # strip them so blank or padded distractors are not accepted as-is.
        distractors = [match1.group(1).strip(), match2.group(1).strip()]
        if not all(distractors):
            continue
        options = [correct] + [each.capitalize() for each in distractors]
        # Distinctness is judged on whitespace-trimmed text.
        if len({each.strip() for each in options}) != 3:
            continue
        random.shuffle(options)
        return options
    return [correct, None, None]

In [5]:
def prepare_row(idx, row):
    """Build one output record for the HotpotQA example `row`.

    Args:
        idx: Position of `row` in the input; returned unchanged so results
            that complete out of order can be put back in place.
        row: Example with `question`, `answer`, `context` (pairs of title and
            list of text pieces) and `supporting_facts` (pairs whose first
            element is a context title).

    Returns:
        `(idx, record)` where `record` is `(evidence, 20 question variants,
        1-based position of the correct option, 3 options)`.
    """
    question, answer = row['question'], row['answer']
    questions = rephrase_qn(question)
    options = convert_mcq(question, answer)
    answer = options.index(answer.capitalize())+1
    name2paragraph = {each[0]:''.join(each[1]) for each in row['context']}
    # Several supporting facts can name the same paragraph title (the second
    # field of each pair is ignored here). Keep each title once, in first-seen
    # order, so a paragraph is not repeated in the evidence.
    names = list(dict.fromkeys(name for name, _ in row['supporting_facts']))
    evidence = '\n\n'.join(name2paragraph[name] for name in names)
    row = (evidence, *questions, answer, *options)
    return idx, row

# Read the distractor dev split inside a `with` block so the handle is closed.
with open('Data/Raw/HotpotQA/hotpot_dev_distractor_v1.json') as f:
    raw = json.load(f)

# Process every example on a thread pool. Results are stored by row index
# because `as_completed` yields them in completion order, not submission order.
dataset, max_workers = {}, 50
with ThreadPoolExecutor(max_workers=max_workers) as exe:
    futures = [exe.submit(prepare_row, idx, row) for idx, row in enumerate(raw)]
    for future in tqdm(as_completed(futures), total=len(futures)):
        try:
            idx, row = future.result()
            dataset[idx] = row
        except Exception as e:
            # Best effort: a failed row is reported and left out of `dataset`.
            print(f"Error: {e}")

100%|████████████████████████████████████████████████████████████████████████████████| 7405/7405 [2:30:47<00:00,  1.22s/it]


In [6]:
# Column layout: evidence, 20 question variants, answer position, 3 options.
columns = [
    'evidence',
    *(f'question {i}' for i in range(1, 21)),
    'answer',
    *(f'option {i}' for i in range(1, 4)),
]
# Restore the original row order, then drop incomplete rows and every row whose
# evidence + question variants (the first 21 columns) occur more than once.
dataset = pd.DataFrame([dataset[key] for key in sorted(dataset)], columns=columns)
dataset = (
    dataset
    .dropna()
    .drop_duplicates(subset=columns[:21], keep=False)
    .reset_index(drop=True)
)
dataset.to_csv('Data/Input/HotpotQA.csv', index=False)

In [7]:
# Show the `dataset` DataFrame as the cell output.
dataset

Unnamed: 0,evidence,question 1,question 2,question 3,question 4,question 5,question 6,question 7,question 8,question 9,...,question 15,question 16,question 17,question 18,question 19,question 20,answer,option 1,option 2,option 3
0,"Scott Derrickson (born July 16, 1966) is an Am...",Were Scott Derrickson and Ed Wood of the same ...,Are Scott Derrickson and Ed Wood from the same...,Did Scott Derrickson and Ed Wood share the sam...,Do Scott Derrickson and Ed Wood have the same ...,Is the nationality of Scott Derrickson the sam...,Are the national origins of Scott Derrickson a...,Is Scott Derrickson's nationality the same as ...,Do Scott Derrickson and Ed Wood belong to the ...,Are Scott Derrickson and Ed Wood citizens of t...,...,Do Scott Derrickson and Ed Wood have the same ...,Were Scott Derrickson and Ed Wood born in the ...,Is the national affiliation of Scott Derrickso...,Are Scott Derrickson and Ed Wood of identical ...,Do Scott Derrickson and Ed Wood share the same...,Are Scott Derrickson and Ed Wood nationals of ...,3,No,Possibly,Yes
1,Kiss and Tell is a 1945 American comedy film s...,What government position was held by the woman...,What official government role did the actress ...,Which public office was occupied by the woman ...,"In Kiss and Tell, what governmental role was h...",The actress playing Corliss Archer in the film...,What was the official government title of the ...,What government office did the woman in the ro...,Which government position did the actress behi...,What government office did the woman portrayin...,...,What was the governmental title of the woman w...,"In Kiss and Tell, what was the government role...",The actress who portrayed Corliss Archer in Ki...,What was the government role of the woman who ...,Which government office did the actress known ...,"In the film Kiss and Tell, what official gover...",3,Ambassador to the united nations,Secretary of state,Chief of protocol
2,"The Laleli Mosque (Turkish: ""Laleli Camii, or ...",Are the Laleli Mosque and Esma Sultan Mansion ...,Is the Laleli Mosque situated in the same neig...,Do the Laleli Mosque and Esma Sultan Mansion s...,Can the Laleli Mosque and Esma Sultan Mansion ...,Are the Laleli Mosque and the Esma Sultan Mans...,Are the locations of the Laleli Mosque and Esm...,Do the Laleli Mosque and Esma Sultan Mansion r...,Is the neighborhood that contains the Laleli M...,Are the Laleli Mosque and Esma Sultan Mansion ...,...,Does the Laleli Mosque share the same neighbor...,Is the neighborhood of the Laleli Mosque the s...,Are the Laleli Mosque and Esma Sultan Mansion ...,Is the neighborhood for the Laleli Mosque the ...,Are both the Laleli Mosque and the Esma Sultan...,Do Laleli Mosque and Esma Sultan Mansion share...,1,No,Yes,Maybe
3,Big Stone Gap is a 2014 American drama romanti...,"The director of the romantic comedy ""Big Stone...",In which New York city is the director of the ...,"""Big Stone Gap"" is a romantic comedy; the dire...",Identify the New York city where the director ...,Which New York city serves as the home base fo...,"The director of the movie ""Big Stone Gap,"" a r...",Locate the New York city that is the base for ...,"What city in New York is the director of ""Big ...","The romantic comedy ""Big Stone Gap"" has a dire...",...,Name the New York city where the romantic come...,Which city in New York serves as the residence...,Find the New York city that is home to the dir...,What is the New York city of residence for the...,Which New York city is considered the base for...,"The romantic comedy ""Big Stone Gap"" director i...",2,"Tribeca, new york city","Greenwich village, new york city","Upper east side, new york city"
4,2014 S/S is the debut album of South Korean gr...,2014 S/S is the debut album of a South Korean ...,Who is responsible for forming the South Korea...,Who founded the South Korean boy group whose f...,"The debut album ""2014 S/S"" was released by a S...",The creator of the South Korean boy group that...,Who established the South Korean boy group tha...,Who was behind the formation of the South Kore...,Which individual or entity formed the South Ko...,Who is the founder of the South Korean boy gro...,...,Who was the driving force behind the South Kor...,Who played a key role in forming the South Kor...,"The South Korean boy group that launched ""2014...",Who initiated the creation of the South Korean...,"The South Korean boy band's debut album ""2014 ...",Who is credited with the formation of the Sout...,1,Yg entertainment,Sm entertainment,Jyp entertainment
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6114,"Robert Sargent Shriver Jr. ( ; November 9, 191...",Who was the brother of the wife of the Democra...,Who was the sibling of the spouse of the Democ...,Identify the brother of the woman married to t...,Can you name the brother of the 1972 Democrati...,Who is the brother of the Democratic Party's 1...,What is the name of the brother of the woman w...,Who is identified as the brother of the spouse...,Name the brother of the Democratic Vice Presid...,Which individual was the brother of the wife o...,...,Who can be named as the brother of the spouse ...,Who was the sibling of the wife of the 1972 De...,What is the identity of the brother of the 197...,Can you provide the name of the brother of the...,Who was the brother of the partner of the 1972...,State the name of the brother of the wife of t...,1,President john f. kennedy,Senator robert f. kennedy,Attorney general robert b. kennedy
6115,Volvic is a brand of mineral water. Its source...,Are both Volvic and Canfield's Diet Chocolate ...,Can we confirm that Volvic and Canfield's Diet...,Is it accurate to say that both Volvic and Can...,Are Volvic and Canfield's Diet Chocolate Fudge...,Would both Volvic and Canfield's Diet Chocolat...,Are Volvic and Canfield's Diet Chocolate Fudge...,Do both Volvic and Canfield's Diet Chocolate F...,Is it true that both Volvic and Canfield's Die...,Are both Volvic and Canfield's Diet Chocolate ...,...,Are Volvic and Canfield's Diet Chocolate Fudge...,Do the waters Volvic and Canfield's Diet Choco...,Are both Volvic and Canfield's Diet Chocolate ...,Can both Volvic and Canfield's Diet Chocolate ...,Are Volvic and Canfield's Diet Chocolate Fudge...,Is the term natural spring water applicable to...,2,Possibly,No,Yes
6116,The Bosnian Coarse-haired Hound or Bosanski Oš...,Are Billy and Barak both breeds of scenthound?...,Is it true that both Billy and Barak are scent...,Can Billy and Barak be classified as breeds of...,"Do both Billy and the Barak, also known as the...",Would Billy and Barak qualify as breeds within...,"Are the dog breeds Billy and Barak, also calle...",Do the breeds Billy and Barak both fall under ...,Can both Billy and the Bosnian Coarse-haired H...,Are Billy and Barak recognized as breeds of sc...,...,Would both the Billy and Barak breeds be liste...,"Are Billy and the Barak breed, which is also c...",Could both Billy and Barak be grouped as scent...,Do the dog breeds Billy and Barak fall within ...,Are Billy and Barak considered members of the ...,"Do Billy and Barak, also known as Bosnian Coar...",3,Unsure,No,Yes
6117,Dig is an American alternative rock band from ...,Were both of the following rock groups formed ...,Is it true that both the rock bands Dig and Th...,Can it be confirmed that the rock groups Dig a...,"Did both the bands, Dig and Thinking Fellers U...",Are Dig and Thinking Fellers Union Local 282 b...,"Were the origins of both rock groups, Dig and ...",Is California the formation site for both the ...,Were the rock groups Dig and Thinking Fellers ...,"Did both bands, Dig and Thinking Fellers Union...",...,Is there evidence that both Dig and Thinking F...,Were both Dig and Thinking Fellers Union Local...,"Did the two bands, Dig and Thinking Fellers Un...",Are both Dig and Thinking Fellers Union Local ...,Were the musical groups Dig and Thinking Felle...,"Was the establishment of both rock groups, Dig...",1,Yes,Uncertain,No


In [None]:
# Add a `retrieved` column holding the full-wiki context of each question,
# falling back to the distractor context when the full-wiki one is empty.
dataset = pd.read_csv('Data/Input/HotpotQA.csv')
with open('Data/Raw/HotpotQA/hotpot_dev_fullwiki_v1.json') as f:
    raw = json.load(f)
with open('Data/Raw/HotpotQA/hotpot_dev_distractor_v1.json') as f:
    raw_ = json.load(f)

# Index each split by question text. The distractor record is looked up by
# question rather than by reusing the full-wiki position, so the fallback does
# not depend on both files listing their examples in the same order.
question2id = {each['question']:idx for idx, each in enumerate(raw)}
question2id_ = {each['question']:idx for idx, each in enumerate(raw_)}

retrieveds = []
for question in dataset['question 1']:
    idx = question2id[question]
    context = raw[idx]['context']
    if not len(context):
        # Fall back to the same position if the question text is not found.
        context = raw_[question2id_.get(question, idx)]['context']
    retrieveds.append('\n\n'.join(''.join(each[1]) for each in context))
dataset['retrieved'] = retrieveds
dataset.to_csv('Data/Input/HotpotQA.csv', index=False)