Generate Synthetic Data (Q&A)
===


In this notebook, we generate a synthetic dataset of question-and-answer pairs from the WikiText-2 raw test dataset. By curating the dataset in this way, we can use it to evaluate the performance of our full pipeline.

In [1]:
import os

In [2]:
# Display the kernel's current working directory. Plain Python is used instead
# of the bare `pwd` automagic so the cell also works outside IPython.
os.getcwd()

'c:\\Users\\kuotz\\rag-evaluation\\notebooks'

In [3]:
# Step up to the project root only while still inside the notebooks folder, so
# re-running this cell does not keep climbing the directory tree.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('../')

In [7]:
import json

from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import MetadataMode

In [8]:
def load_corpus(files, verbose=False):
    """Read the given files and split them into text chunks.

    Args:
        files: Paths handed to ``SimpleDirectoryReader`` as ``input_files``.
        verbose: When True, print progress messages and ask the node parser
            to show its progress bar.

    Returns:
        A dict mapping each node's ``node_id`` to its content, requested with
        ``MetadataMode.NONE``.
    """
    if verbose:
        print(f"Loading files {files}")

    documents = SimpleDirectoryReader(input_files=files).load_data()
    if verbose:
        print(f'Loaded {len(documents)} docs')

    node_parser = SimpleNodeParser.from_defaults()
    parsed_nodes = node_parser.get_nodes_from_documents(documents, show_progress=verbose)
    if verbose:
        print(f'Parsed {len(parsed_nodes)} nodes')

    # Build the id -> text mapping one node at a time.
    chunk_by_id = {}
    for parsed in parsed_nodes:
        chunk_by_id[parsed.node_id] = parsed.get_content(metadata_mode=MetadataMode.NONE)
    return chunk_by_id

In [9]:
# Input file(s) for the corpus; the list currently holds the raw test split.
TRAIN_FILES = ['./data/raw/test.txt']
train_corpus = load_corpus(files=TRAIN_FILES, verbose=True)

Loading files ['./data/raw/test.txt']
Loaded 1 docs


Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsed 368 nodes


In [10]:
# Number of entries (node id -> chunk text) in the corpus
len(train_corpus)

368

In [11]:
TRAIN_CORPUS_FPATH = './data/processed/corpus.json'

# Create the file's actual parent directory (including intermediate folders);
# exist_ok keeps the cell safe to re-run.
os.makedirs(os.path.dirname(TRAIN_CORPUS_FPATH), exist_ok=True)

# Write-only mode is enough: nothing is read back through this handle.
with open(TRAIN_CORPUS_FPATH, 'w', encoding='utf-8') as f:
    json.dump(train_corpus, f)

In [14]:
import re
import uuid

from llama_index.llms.openai import OpenAI
from llama_index.core.schema import MetadataMode
from tqdm.notebook import tqdm

In [15]:
# Output locations for the generated artifacts; each file holds one JSON dict
# keyed by question id.
TRAIN_QUERIES_FPATH = './data/processed/queries.json'              # question id -> question text
TRAIN_RELEVANT_DOCS_FPATH = './data/processed/relevant_docs.json'  # question id -> [source node id]
TRAIN_ANSWERS_FPATH = './data/processed/answers.json'              # question id -> answer text

In [16]:
# Reload the corpus from disk. Read-only mode is sufficient for json.load and
# also works when the file is not writable.
with open(TRAIN_CORPUS_FPATH, 'r', encoding='utf-8') as f:
    train_corpus = json.load(f)

In [17]:
# Work on a shallow copy of the loaded corpus (same keys, same order)
train_corpus = dict(train_corpus)

In [24]:
# Preview the first 10 corpus chunks: node id followed by its text
for node_id in list(train_corpus)[:10]:
    print(node_id, train_corpus[node_id])

273a6e33-7a57-4b02-b704-9c13264f1769 = Robert Boulter = 
 Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy 's Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . 
 In 2006 , Boulter starred alongside Whishaw in the play Citizenship written by Mark

In [27]:
# SECURITY: API keys must never be stored in a notebook. Any key that was ever
# pasted into this cell should be treated as compromised and revoked.
# Use the key already present in the environment; otherwise prompt for it
# without echoing it to the notebook output.
from getpass import getpass

if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [28]:
###### Generate queries and answers #####
def _parse_qa_pairs(raw_text):
    """Turn a raw completion into a list of cleaned (question, answer) tuples.

    The non-blank lines of ``raw_text`` are read as alternating question and
    answer lines.
    """
    # Drop blank lines first: a blank separator between pairs would otherwise
    # shift the question/answer alternation for everything that follows it.
    lines = [line.strip() for line in raw_text.strip().split("\n")]
    lines = [line for line in lines if line]

    pairs = []
    for question, answer in zip(lines[0::2], lines[1::2]):
        # Strip a leading list number such as "1." or "2)" from the question
        # only; an answer may legitimately start with a number.
        question = re.sub(r"^\d+[\).\s]", "", question).strip()
        question = question.replace("Question:", "").strip()
        answer = answer.replace("Answer:", "").strip()
        # Test for emptiness after the labels are removed, so a bare
        # "Question:" or "Answer:" line is not stored as content.
        if question and answer:
            pairs.append((question, answer))
    return pairs


def generate_queries_and_answers(
    corpus,
    num_questions_per_chunk=2,
    prompt_template=None,
    verbose=False,
):
    """
    Automatically generate hypothetical questions and answers that could be
    answered with the doc in the corpus.

    Args:
        corpus: Dict mapping node id to chunk text. One LLM completion is
            requested per entry.
        num_questions_per_chunk: Number of question/answer pairs requested in
            the prompt for each chunk.
        prompt_template: Optional template containing the ``{context_str}``
            and ``{num_questions_per_chunk}`` placeholders. Defaults to the
            built-in quiz prompt below.
        verbose: When True, print a one-line summary once generation is done.

    Returns:
        A ``(queries, answers, relevant_docs)`` tuple of dicts keyed by a
        freshly generated UUID string per question: the question text, the
        answer text, and a one-element list holding the source node id.
    """
    llm = OpenAI(model='gpt-3.5-turbo')

    prompt_template = prompt_template or """\
    Context information is provided below.

    ---------------------
    {context_str}
    ---------------------

    With the provided context information and no prior knowledge,
    create {num_questions_per_chunk} question(s) and their corresponding answer(s) 
    for an upcoming quiz/examination. Answers should be concise, limited to 1-5 words. 
    The questions and answers should be diverse in nature across the document 
    and directly related to the context information.
"""

    queries = {}
    answers = {}
    relevant_docs = {}
    for node_id, text in tqdm(corpus.items()):
        query = prompt_template.format(context_str=text, num_questions_per_chunk=num_questions_per_chunk)
        response = llm.complete(query)

        for question, answer in _parse_qa_pairs(str(response)):
            # One id ties the question, its answer and its source chunk together.
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            answers[question_id] = answer
            relevant_docs[question_id] = [node_id]

    if verbose:
        print(f'Generated {len(queries)} question/answer pairs from {len(corpus)} chunks')

    return queries, answers, relevant_docs


In [29]:
# Generate one question/answer pair per corpus chunk. The function issues one
# LLM completion per chunk, so this cell makes as many API calls as there are
# entries in train_corpus.
train_queries, train_answers, train_relevant_docs = generate_queries_and_answers(
    train_corpus,
    num_questions_per_chunk=1,
    verbose=True,
)

  0%|          | 0/368 [00:00<?, ?it/s]

In [30]:
# Persist each generated mapping to its own JSON file.
for out_path, payload in (
    (TRAIN_QUERIES_FPATH, train_queries),
    (TRAIN_ANSWERS_FPATH, train_answers),
    (TRAIN_RELEVANT_DOCS_FPATH, train_relevant_docs),
):
    with open(out_path, 'w+') as f:
        json.dump(payload, f)

In [31]:
# Destination of the combined dataset file (queries, answers, corpus, relevant docs)
TRAIN_DATASET_FPATH = './data/processed/dataset.json'

In [32]:
# Bundle the corpus and everything generated from it into a single dict.
train_dataset = dict(
    queries=train_queries,
    answers=train_answers,
    corpus=train_corpus,
    relevant_docs=train_relevant_docs,
)

In [33]:
# Delete any previous dataset file before writing the new one. Opening with
# 'w+' already truncates an existing file, so the removal is a precaution.
if os.path.exists(TRAIN_DATASET_FPATH):
    os.remove(TRAIN_DATASET_FPATH)
with open(TRAIN_DATASET_FPATH, 'w+') as f:
    json.dump(train_dataset, f)

---