## 1. Download the SQuAD Dataset  

In [None]:
from datasets import load_dataset

# Load the "squad" dataset through the `datasets` library. The result is
# indexed by split name in the cells below (they all read squad['train']).
squad = load_dataset("squad")

## 2. Knowledge Base Preparation  

In [None]:
import pandas as pd
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
import torch
from tqdm import tqdm
import numpy as np

import os  # imported here because this cell runs before the later cell that imports os

# Extract the passage text and article title of every training example.
contexts = squad['train']['context']
titles = squad['train']['title']

# The training split is indexed per example, so a passage shared by several
# questions shows up once per question. Keep each (title, context) pair only
# once so the knowledge base contains unique passages.
knowledge_base = (
    pd.DataFrame({'title': titles, 'context': contexts})
    .drop_duplicates()
    .reset_index(drop=True)
)

# Create the output directory first so this cell also works on a fresh
# checkout (Restart Kernel -> Run All) where '../finetune_data' does not exist.
kb_path = '../finetune_data/t_squad-kb.csv'
os.makedirs(os.path.dirname(kb_path), exist_ok=True)

# Save the knowledge base as a tab-separated file with a header row and no
# index column. No embeddings are computed or stored at this stage.
knowledge_base.to_csv(kb_path, index=False, sep='\t')

: 

## 3. Training Data Generation  

In [None]:
import os

# Create the directory that receives the training question/answer files.
train_dir = '../finetune_data/squad-training-data'
os.makedirs(train_dir, exist_ok=True)

# Extract every training question and the first reference answer for it.
questions = squad['train']['question']
answers = [a['text'][0] for a in squad['train']['answers']]

# Write one example per line: train.source holds the questions and
# train.target the answers, so line i of both files describes example i.
for filename, texts in (('train.source', questions), ('train.target', answers)):
    # Explicit UTF-8 keeps non-ASCII text intact regardless of the platform's
    # default encoding.
    with open(os.path.join(train_dir, filename), 'w', encoding='utf-8') as f_out:
        for text in texts:
            # An embedded line break would split one example over two lines
            # and shift every following line, so flatten it to a space.
            flat = text.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
            f_out.write(flat + '\n')


## 4. Retrieval Evaluation Data Generation

In [4]:
import os
from datasets import load_dataset

# Load the dataset again inside this cell; together with the imports above
# this means the cell does not rely on variables from earlier cells.
squad = load_dataset("squad")

# Create the directory that receives the retrieval-evaluation files.
# NOTE(review): the examples below are taken from the 'train' split even though
# this section produces evaluation data - confirm that this is intended.
train_dir = '../finetune_data/squad-training-data/original'
os.makedirs(train_dir, exist_ok=True)

# Extract, per example: the question, the first reference answer, and the
# title of the article the example belongs to.
questions = squad['train']['question']
answers = [a['text'][0] for a in squad['train']['answers']]
retrievals = squad['train']['title']

# Write three line-aligned files: line i of 'source' (question), 'target'
# (answer) and 'retrieval' (article title) all describe example i.
outputs = (('source', questions), ('target', answers), ('retrieval', retrievals))
for filename, texts in outputs:
    # Explicit UTF-8 keeps non-ASCII text intact regardless of the platform's
    # default encoding.
    with open(os.path.join(train_dir, filename), 'w', encoding='utf-8') as f_out:
        for text in texts:
            # An embedded line break would split one example over two lines
            # and shift every following line, so flatten it to a space.
            flat = text.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
            f_out.write(flat + '\n')
        
