### This colab file augments the QA dataset with questions generated from **governance-v2**, the **whitepaper** and the **YieldSpace paper**. It also augments the earlier document sources, **docs-v2** and **addendum-docs**, via custom prompting.

### This colab file also experiments with an integrated QA-generation pipeline to ensure a smooth and trustworthy data generation flow.

In [None]:
# Install dependencies
# NOTE(review): `langchain===0.0.230` uses pip's `===` (arbitrary-equality) operator, while the
# other pins in this cell use `==` — confirm whether `==` was intended.
!pip install langchain===0.0.230 openai chromadb==0.3.26 pydantic==1.10.8 GitPython ipython tiktoken
!pip install unstructured==0.8.5 pdf2image pytesseract pypdf
# System packages; presumably the native backends for pdf2image / pytesseract — confirm.
!apt-get install -y poppler-utils tesseract-ocr

In [None]:
import os
from langchain.document_loaders import GitLoader, UnstructuredPDFLoader, OnlinePDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain, RetrievalQA
from IPython.display import display
from IPython.display import Markdown
from getpass import getpass
from pathlib import Path
from langchain.callbacks import StdOutCallbackHandler
from langchain.prompts import PromptTemplate
from langchain.callbacks import get_openai_callback
from langchain import LLMChain
from langchain.chat_models import ChatOpenAI
import json

stdout_handler = StdOutCallbackHandler()

In [None]:
# Prompt for the OpenAI API key interactively; getpass() does not echo the typed value.
OPENAI_API_KEY = getpass()

In [None]:
# Export the key so that libraries reading OPENAI_API_KEY from the environment can pick it up.
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

## Load up repo and pdf documents and generate questions

In [None]:
def load_repo(remote_repo_url, local_repo_path, branch, file_filter=None):
    """Load documents from a git repository through GitLoader.

    When `local_repo_path` is already a directory (second and later runs) the
    loader is pointed at that checkout only; otherwise `remote_repo_url` is
    additionally passed as `clone_url`.

    Args:
        remote_repo_url: URL handed to GitLoader as `clone_url` when no local checkout exists.
        local_repo_path: Filesystem path of the local checkout.
        branch: Branch name passed to GitLoader.
        file_filter: Optional predicate on a file path selecting which files to load.

    Returns:
        Whatever `GitLoader.load()` returns for the selected files.
    """
    loader_kwargs = {
        'repo_path': local_repo_path,
        'branch': branch,
        'file_filter': file_filter,
    }
    # Only supply a clone URL when there is no local checkout to reuse.
    if not Path(local_repo_path).is_dir():
        loader_kwargs['clone_url'] = remote_repo_url

    return GitLoader(**loader_kwargs).load()

def load_pdf(file_path):
    """Load the PDF at `file_path` with UnstructuredPDFLoader and return the result of its `load()`."""
    return UnstructuredPDFLoader(file_path).load()

def load_online_pdf(file_url):
    """Load the PDF at `file_url` with OnlinePDFLoader and return the result of its `load()`."""
    return OnlinePDFLoader(file_url).load()

def load_md(markdown_path):
    """Load a single markdown file from drive/filesystem as raw text.

    TextLoader is used on purpose so the markdown is not parsed, see
    https://github.com/langchain-ai/langchain/issues/3591
    """
    return TextLoader(markdown_path).load()

def split_docs(docs, language, chunk_size, chunk_overlap, force_category):
    """Split documents into chunks with a language-aware recursive splitter.

    Every chunk is paired with its own metadata dict: a copy of the source
    document's metadata with `category` set to `force_category`. The input
    documents' metadata is left untouched, and chunks of the same document do
    not share a dict, so annotating one chunk later cannot leak into its
    siblings.

    For any document whose `source` ends in the `md` extension, each chunk is
    prefixed with a `# <file name without extension>` heading. Such documents
    must therefore also carry a `file_name` metadata entry.

    Args:
        docs: Iterable of loaded documents exposing `page_content` and `metadata`.
        language: Language value passed to `RecursiveCharacterTextSplitter.from_language`.
        chunk_size: Chunk size passed to the splitter.
        chunk_overlap: Chunk overlap passed to the splitter.
        force_category: Value stored under the `category` metadata key of every chunk.

    Returns:
        dict with `all_splits` (list of chunk strings) and `all_metadatas`
        (list of metadata dicts, index-aligned with `all_splits`).
    """
    text_splitter = RecursiveCharacterTextSplitter.from_language(language=language, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    all_splits = []
    all_metadatas = []
    for d in docs:
        # Work on a copy so the caller's document metadata is not mutated.
        metadata = {**d.metadata, 'category': force_category}
        splits = text_splitter.split_text(d.page_content)

        # Applies to every markdown source (not only governance files): the file
        # name becomes a heading so each chunk stays attributable to its file.
        if metadata['source'].split('.')[-1] == 'md':
            title = f"# {metadata['file_name'].split('.')[0]}"
            splits = [f"{title}\n\n{chunk}" for chunk in splits]

        all_splits.extend(splits)
        # One independent dict per chunk instead of N references to the same dict.
        all_metadatas.extend(dict(metadata) for _ in splits)

    return {
        'all_splits': all_splits,
        'all_metadatas': all_metadatas # pack as dict/json
    }

def split_pdf(docs, chunk_size, chunk_overlap, force_category):
    """Split PDF-derived documents into plain-text chunks.

    No language is given to the splitter because the PDF content is handled
    as plain text. Every chunk gets its own copy of the source document's
    metadata with `category` set to `force_category`; the input documents'
    metadata is not modified.

    Args:
        docs: Iterable of loaded documents exposing `page_content` and `metadata`.
        chunk_size: Chunk size passed to RecursiveCharacterTextSplitter.
        chunk_overlap: Chunk overlap passed to RecursiveCharacterTextSplitter.
        force_category: Value stored under the `category` metadata key of every chunk.

    Returns:
        dict with `all_splits` (list of chunk strings) and `all_metadatas`
        (list of metadata dicts, index-aligned with `all_splits`).
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    all_splits = []
    all_metadatas = []
    for d in docs:
        # Work on a copy so the caller's document metadata is not mutated.
        metadata = {**d.metadata, 'category': force_category}
        splits = text_splitter.split_text(d.page_content)

        all_splits.extend(splits)
        # One independent dict per chunk instead of N references to the same dict.
        all_metadatas.extend(dict(metadata) for _ in splits)

    return {
        'all_splits': all_splits,
        'all_metadatas': all_metadatas # pack as dict/json
    }

In [None]:
from langchain.text_splitter import MarkdownTextSplitter
def split_md(docs, chunk_size, chunk_overlap, force_category):
    """Split markdown documents into chunks with MarkdownTextSplitter.

    Every chunk gets its own copy of the source document's metadata with
    `category` set to `force_category`; the input documents' metadata is not
    modified.

    Args:
        docs: Iterable of loaded documents exposing `page_content` and `metadata`.
        chunk_size: Chunk size passed to MarkdownTextSplitter.
        chunk_overlap: Chunk overlap passed to MarkdownTextSplitter.
        force_category: Value stored under the `category` metadata key of every chunk.

    Returns:
        dict with `all_splits` (list of chunk strings) and `all_metadatas`
        (list of metadata dicts, index-aligned with `all_splits`).
    """
    text_splitter = MarkdownTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    all_splits = []
    all_metadatas = []
    for d in docs:
        # Work on a copy so the caller's document metadata is not mutated.
        metadata = {**d.metadata, 'category': force_category}
        splits = text_splitter.split_text(d.page_content)

        all_splits.extend(splits)
        # One independent dict per chunk instead of N references to the same dict.
        all_metadatas.extend(dict(metadata) for _ in splits)

    return {
        'all_splits': all_splits,
        'all_metadatas': all_metadatas # pack as dict/json
    }

In [None]:
# Clone/load the three Yield Protocol repositories and split their markdown files into chunks.

# documentation repo
remote_repo_docsV2_url="https://github.com/yieldprotocol/docs-v2" # the docs all coming from the github repo (public)
local_repo_docsV2_path="/tmp/yield_docs_v2_repo"

# cookbook repo
remote_repo_addendum_url="https://github.com/yieldprotocol/addendum-docs"
local_repo_addendum_path="/tmp/yield_addendum-docs"

# governance repo: its documents are thin on content, most of them only link to external files
# open question: should the files affected by governance-v2 be loaded as well?
remote_repo_governance_url = "https://github.com/yieldprotocol/governance-v2"
local_repo_governance_path = "/tmp/governance-v2"

branch="main" # specify the branch of the remote repo urls
file_filter=lambda file_path: file_path.endswith(".md") # select only .md files as 'documentations'

chunk_size_chars = 2000 # context length required, in SFT chunk it to 1000 at evaluation
chunk_overlap_chars = 0 # no overlapping since containing code chunk

documentation_docs = load_repo(remote_repo_docsV2_url, local_repo_docsV2_path, branch, file_filter)
addendum_docs = load_repo(remote_repo_addendum_url, local_repo_addendum_path, branch, file_filter)
governance_docs = load_repo(remote_repo_governance_url, local_repo_governance_path, branch, file_filter)

# Each source gets its own category; answer() filters retrieval on this metadata key.
documentation_splits = split_docs(documentation_docs, Language.MARKDOWN, chunk_size_chars, chunk_overlap_chars, force_category='general')
addendum_splits = split_docs(addendum_docs, Language.MARKDOWN, chunk_size_chars, chunk_overlap_chars, force_category='technical')
governance_splits = split_docs(governance_docs, Language.MARKDOWN, chunk_size_chars, chunk_overlap_chars, force_category='governance')

# Drop the first governance chunk (placeholder content); both lists are sliced identically to stay aligned.
# NOTE(review): the original comment said the first *and last* files are placeholders, but only the
# first chunk is removed here — confirm whether the last one should be dropped too.
governance_splits['all_splits'] = governance_splits['all_splits'][1:]
governance_splits['all_metadatas'] = governance_splits['all_metadatas'][1:]

In [None]:
# Load the whitepaper (online PDF) and the YieldSpace paper (markdown copy on Google Drive), then split both.
import nltk
# NLTK resources; presumably required by the PDF loading stack — confirm.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from google.colab import drive
# Mount Google Drive: YieldSpace.md is read from it here and save_qa_pair() writes under it.
drive.mount('/content/drive/')

whitepaper_data = load_online_pdf(file_url="https://yieldprotocol.com/Yield.pdf")
yieldspace_data = load_md(markdown_path="/content/drive/MyDrive/YieldSpace.md")

# sanitize the source: overwrite the loader-provided `source` with the public URL of each paper
# (the YieldSpace text is read from the markdown file but attributed to the published PDF URL)
whitepaper_data[0].metadata['source'] = "https://yieldprotocol.com/Yield.pdf"
yieldspace_data[0].metadata['source'] = "https://yieldprotocol.com/YieldSpace.pdf"

# Both papers share the 'paper' category; the chunk sizes are hard-coded here rather than taken from chunk_size_chars.
whitepaper_splits = split_pdf(whitepaper_data, 2000, chunk_overlap_chars, force_category="paper")
yieldspace_splits = split_md(yieldspace_data, 2500, 0, force_category="paper")

In [None]:
# Render every whitepaper chunk under a numbered heading so the split can be inspected by eye.
for chunk_no, chunk_text in enumerate(whitepaper_splits['all_splits'], start=1):
    display(Markdown(f"# Chunk {chunk_no}"))
    display(Markdown(chunk_text))

In [None]:
# One Chroma store, embedded with OpenAI embeddings, holds the chunks of every source.
vector_db = Chroma(embedding_function=OpenAIEmbeddings())
# The texts are already chunked by the split_* helpers; keep what each add_texts call returns per source.
documentation_splits_ids = vector_db.add_texts(
    texts=documentation_splits['all_splits'],
    metadatas=documentation_splits['all_metadatas'],
)
addendum_splits_ids = vector_db.add_texts(
    texts=addendum_splits['all_splits'],
    metadatas=addendum_splits['all_metadatas'],
)
governance_splits_ids = vector_db.add_texts(
    texts=governance_splits['all_splits'],
    metadatas=governance_splits['all_metadatas'],
)
whitepaper_splits_ids = vector_db.add_texts(
    texts=whitepaper_splits['all_splits'],
    metadatas=whitepaper_splits['all_metadatas'],
)
yieldspace_splits_ids = vector_db.add_texts(
    texts=yieldspace_splits['all_splits'],
    metadatas=yieldspace_splits['all_metadatas'],
)

In [None]:
# Alternative, more general persona line kept for reference (the active template below targets the YieldSpace paper):
# You are a Web3 user who wants to learn about the Yield protocol and its inner workings, concepts and methods to integrate using both client-side and smart contract code.

# Prompt that asks for 5 plain-text questions, one per line, about a single document chunk.
# The persona is specific to the YieldSpace paper, so adjust the first line before reusing it on other sources.
# Note: this is a normal (non-raw) string, so the '\\\\' below reaches the model as two backslashes.
question_bank_prompt_template = """
You are an inquisitive Web3 user who wants to learn about the YieldSpace paper released by Yield Protocol that explains the mathematical foundations of YieldSpace.

# INSTRUCTIONS
- Generate 5 questions to ask about the protocol from the document provided below, always ensure the questions are related to the document text and do not make up any information.
- When you ask a question, make sure that you can also find evidences in the document that makes it answerable, but do not attempt to answer.
- The questions should be in plain text as much as possible, when the question contains latex formula, write the latex symbols in double slash '\\\\' to avoid json decoder error.
- Organize the results in lines, each question takes a line. Do not enumerate the questions, just write them to lines.

# DOCUMENT
{document}

# RESULT"""

# The only template variable is the chunk text itself.
QUESTION_BANK_PROMPT = PromptTemplate(
    template=question_bank_prompt_template, input_variables=["document"]
)

# one single document (chunk) and asking 5 questions
# 16k context is also available.
# gen_QA_pairs() expects a chain like this one: it calls chain.run(document=...) and splits the reply on newlines.
question_bank_llm_chain = LLMChain(
    llm=ChatOpenAI(temperature=0.10, model="gpt-4"),
    prompt=QUESTION_BANK_PROMPT
)

# code-related hallucination --> maybe turn the code instructions off once the first finetuned LLAMA-V2 has been tested.

# Answer prompt used by answer(). Template variables:
#   augment_context - the chunk the question was generated from
#   context         - the documents retrieved from the vector store
#   question        - the user query, wrapped in triple back ticks by the template
answer_prompt_template = """
You are a Web3 expert who is able to answer any user query on Yield protocol's documentation, code, whitepapers and many other such topics.

# INSTRUCTIONS
- The user's query will be wrapped in triple back ticks
- Only answer the query using the provided context below which may include general information and code, do not make up any information.
- If you want to explicitly support your answer from the context, quote the part in the document in your answer.
- If the query is related to code or integration, provide a step by step explanation on the process, use code suggestions whereever relevant and always use markdown format annotated with the language to show the code.
- If the query is related to mathematic formulas or mathematic proofs, explain in natural languguage in an easy-to-understand manner and use markdown/latex to compose math formulas if applicable.
- For code suggestions, use comments to explain each important concept, class, variable, function and parameter.
- However, if you could not find documented code reference in the context, do not attempt to deliniate your answer in code, instead, use natural language supported from the context retrieved.
- Yield Protocol has no JS SDK so always use ethers package for JS code suggestions.

# CONTEXT
{augment_context}
{context}

# QUERY
```
{question}
```
"""
ANSWER_PROMPT = PromptTemplate(
    template=answer_prompt_template, input_variables=["context", "augment_context", "question"]
)


In [None]:
from langchain.callbacks.manager import (
    AsyncCallbackManagerForToolRun,
    CallbackManagerForToolRun,
)
from langchain.agents import AgentType, initialize_agent
from langchain.chat_models import ChatOpenAI
from langchain.tools import BaseTool, StructuredTool, Tool, tool
from typing import Optional, Type

# Prompt for long, first-person case-study questions plus the evidence lines that make them answerable.
# Template variables:
#   role_description - persona sentence, taken from role_dict by the generators below
#   k                - number of questions to compose
#   document         - the chunk to ask about
# The reply is expected as `evidence: ...` / `question: ...` lines, which the generators parse line by line.
complex_template = """
You are a web3 expert using Yield protocol and its services. {role_description}
Write me engaging and profound case study questions rooted in real life web3 user experience scenario using first-person pronoun (I) in the text, based on the provided documents.

# INSTRUCTIONS
- Compose {k} such long, qualitative and engaging real-life case study question that is arisen from the following document.
- Always ensure the questions are related to the document text and do not make up any information.
- Before you ask a question, make sure that you can also find evidences in the document in the form of sentences, phrases, or function descriptions that makes it answerable.
- Organize the results in lines, each question or each evidence takes a line. Do not enumerate the questions and evidences, just write them to lines.

# DOCUMENT
{document}

# EXAMPLE RESULT
evidence: <evidence 1>
evidence: <evidence 2>
evidence: <evidence 3>
...
question: <question 1>
question: <question 2>
question: <question 3>
...

# YOUR RESULT
"""

COMPLEX_QUESTION_PROMPT = PromptTemplate(
    template=complex_template, input_variables=["document", "role_description", "k"]
)

# Persona sentences keyed by source type; the generators look one up with role_dict[role]
# and pass it to the question prompt as `role_description`.
role_dict = {
    "doc" : "Here is a piece of document located at Yield Protocol's website and you have questions on it.",
    "addendum" : "Here is a piece of techinical document related to protocol operations. You are curious on how these code snippets work.",
    "governance" : "Here is a governance proposal proposing changes in the protocol. Remember that proposal numbers comes in chronologicial order, with larger number being recent proposals, and smaller number are old proposals.",
    "whitepaper" : "You are looking at its whitepaper and you want to talk to the authors of this paper to discuss about the details in this paper.",
    "yieldspace" : "You are looking at its newly published paper called 'yield space' and you want to talk to the authors of this paper to discuss about the details in this paper. Especially you noticed that this paper is heavily focused on mathematical formulas related to the concept of 'invariants' in decentralized finance. You are very interested in studying the maths behind this paper, pay special attention to the numeric relationships between objects in this passage."
}

# def ask_complex_question(document, role, k="3"):
#   complex_question_llm_chain = LLMChain(
#       llm=llm,
#       prompt=COMPLEX_QUESTION_PROMPT
#   )
#   print(f"Document:{document}")
#   print(f"Role: {role_dict[role]}")
#   print(f"k: {k}")
#   result = complex_question_llm_chain.run(document, role_description=role_dict[role], k=k)
#   return result

# question_writer = StructuredTool.from_function(
#         name="Question Writer",
#         func=ask_complex_question,
#         description="useful for when you need to ask engaging and high-quality questions on the given document in Yield Protocol related stuff.",
#     )

# Answer prompt used by complex_answer(). Same layout as the simple answer prompt, plus an
# EVIDENCE section. Template variables:
#   evidence        - newline-joined evidence lines produced together with the questions
#   augment_context - the chunk the question was generated from
#   context         - the documents retrieved from the vector store
#   question        - the user query, wrapped in triple back ticks by the template
complex_answer_prompt_template = """
You are Yield Protocol's Web3 expert who is able to answer any user query on Yield protocol's documentation, code, whitepapers and many other such topics.

# INSTRUCTIONS
- The user's query will be wrapped in triple back ticks.
- Only answer the query using the provided context below which may include general information and code, do not make up any information.
- Aside from the documented context, there's also a list of evidences straightforwardly related to the question asked, if you found the fetched documents are contradicting or confusing, you can use the concise evidences to aid your navigation inside the context.
- If you want to support your answer from the context, quote the part in the document in your answer.
- If the query is related to code or integration, provide a step by step explanation on the process, use code suggestions whereever relevant and always use markdown format annotated with the language to show the code.
- If the query is related to mathematic formulas or mathematic proofs, explain in natural languguage in an easy-to-understand manner and use markdown/latex to compose math formulas if applicable.
- For code suggestions, use comments to explain each important concept, class, variable, function and parameter.
- However, if you could not find documented code reference in the context, do not attempt to deliniate your answer in code, instead, use natural language supported from the context retrieved.
- Yield Protocol has no JS SDK so always use ethers package for JS code suggestions.

# EVIDENCE
{evidence}

# CONTEXT
{augment_context}
{context}

# QUERY
```
{question}
```
"""
COMPLEX_ANSWER_PROMPT = PromptTemplate(
    template=complex_answer_prompt_template, input_variables=["context", "augment_context", "evidence", "question"]
)


In [None]:
from google.colab import drive
import json


def answer(query, category, injected_doc, top_k=5, show_sources=False):
    """Answer `query` with GPT-4 using the originating chunk plus retrieved chunks.

    Retrieval uses MMR search on the shared `vector_db`, restricted to chunks
    whose `category` metadata equals `category`.

    Args:
        query: The question text.
        category: Metadata category used as the retrieval filter.
        injected_doc: `(chunk_text, chunk_metadata)` of the chunk the question was
            generated from; the text is always placed in the prompt and the
            metadata's `category` is copied into the returned record.
        top_k: Number of chunks requested from the retriever.
        show_sources: When true, also render the injected and retrieved chunks.

    Returns:
        dict with keys `question`, `answer`, `category` and `contexts`
        (injected chunk text first, then the retrieved chunk texts).
    """
    display(Markdown(f"# Answering query:\n{query}"))

    llm_chain = LLMChain(
        llm=ChatOpenAI(temperature=0.1, model="gpt-4"),
        prompt=ANSWER_PROMPT
    )

    retriever = vector_db.as_retriever(search_type="mmr", search_kwargs={
        'k': top_k,
        'filter': {'category': category}
    })

    context = retriever.get_relevant_documents(query)
    injected_context, injected_meta = injected_doc
    retrieved_texts = [con.page_content for con in context]

    # context is multiple, we store them in the dataset for augmenting knowledge purpose (to inject to llama prompts)
    ret = {
        'question': query,
        'answer': "",
        'category': injected_meta['category'],
        'contexts': [injected_context] + retrieved_texts,
    }

    with get_openai_callback() as cb:
        # Pass the chunk texts, not the Document objects: formatting a list of
        # Documents into the prompt would insert their repr (escaped newlines,
        # metadata dicts) instead of readable text, and would not match the
        # `contexts` stored in the dataset record.
        ans = llm_chain.run(
            question=query,
            context="\n\n".join(retrieved_texts),
            augment_context=injected_context,
        )
        ret['answer'] = ans
        display(Markdown("# Final Answer"))
        display(Markdown(ans))
        print(cb)

        if show_sources:
            display(Markdown("### Sources"))
            display(Markdown("**[Source ORIGINAL]**"))
            display(Markdown(injected_context))
            # Paper chunks have no repository file path; other chunks may lack one
            # too (e.g. loaded from a single file), so only show it when present.
            if injected_meta['category'] != 'paper' and 'file_path' in injected_meta:
                display(Markdown(f"*File path: {injected_meta['file_path']}*"))

            for i, d in enumerate(context):
                display(Markdown(f"**[Source {i+1}]**"))
                display(Markdown(d.page_content))
                if d.metadata['category'] != 'paper' and 'file_path' in d.metadata:
                    display(Markdown(f"*File path: {d.metadata['file_path']}*"))

    return ret


def save_qa_pair(file_name, qa_entities):
    """Append QA records to `/content/drive/MyDrive/<file_name>` in JSON Lines form.

    The file is opened in append mode, so repeated calls accumulate records;
    each entry of `qa_entities` is serialized with json and written on its own line.
    """
    target = f"/content/drive/MyDrive/{file_name}"
    with open(target, 'a+') as out:
        for record in qa_entities:
            out.write(json.dumps(record))
            out.write("\n")


def gen_QA_pairs(chain, texts, metadatas, file_name):
    """Generate questions for each chunk, answer them, and save the pairs per chunk.

    One call covers one section of the document store (general, technical,
    whitepaper, ...). For every chunk the question chain is run, each non-blank
    line of its reply is treated as one question and answered with `answer()`,
    and the resulting records are appended to `file_name` right away, so the
    work done for earlier chunks is kept if a later chunk fails.

    Args:
        chain: Object whose `run(document=...)` returns the questions, one per line.
        texts: Chunk texts to generate questions for.
        metadatas: Metadata dicts index-aligned with `texts`; each needs a `category`.
        file_name: File name handed to `save_qa_pair`.
    """
    print(f"total texts: {len(texts)}")

    for i, t in enumerate(texts):
        with get_openai_callback() as cb:
            print(f"Working on text #{i+1}")
            qresult = chain.run(document=t)
            print(qresult)
            # Ignore blank lines (leading/trailing newlines, spacing between
            # questions); otherwise each one becomes an empty question that costs
            # an answer call and ends up as a junk record in the dataset.
            q_array = [line.strip() for line in qresult.split('\n') if line.strip()]
            print(f"Asked questions: {q_array}")
            print(cb)

            # attempt answer the question in the run freshly with context
            qa_array = [
                answer(q, category=metadatas[i]['category'], injected_doc=(t, metadatas[i]), show_sources=False)
                for q in q_array
            ]

            print(f"\nSaving QA pairs for text #{i+1}\n")
            save_qa_pair(file_name, qa_array)

In [None]:
import json
import re

def complex_answer(query, category, injected_doc, evidences, top_k=5, show_sources=False):
    """Answer a case-study `query` with GPT-4 using evidence, the originating chunk and retrieved chunks.

    Retrieval uses MMR search on the shared `vector_db` across all categories.

    Args:
        query: The question text.
        category: Accepted for signature parity with `answer()`; it is not used to
            filter retrieval here. NOTE(review): confirm the unfiltered search is intended.
        injected_doc: `(chunk_text, chunk_metadata)` of the chunk the question was
            generated from; the text is always placed in the prompt and the
            metadata's `category` is copied into the returned record.
        evidences: Evidence text inserted into the prompt's EVIDENCE section.
        top_k: Number of chunks requested from the retriever.
        show_sources: When true, also render the injected and retrieved chunks.

    Returns:
        dict with keys `question`, `answer`, `category` and `contexts`
        (injected chunk text first, then the retrieved chunk texts).
    """
    display(Markdown(f"# Answering query:\n{query}"))

    llm_chain = LLMChain(
        llm=ChatOpenAI(temperature=0, model="gpt-4"),
        prompt=COMPLEX_ANSWER_PROMPT
    )

    retriever = vector_db.as_retriever(search_type="mmr", search_kwargs={
        'k': top_k,
    })

    context = retriever.get_relevant_documents(query)
    injected_context, injected_meta = injected_doc
    retrieved_texts = [con.page_content for con in context]

    # context is multiple, we store them in the dataset for augmenting knowledge purpose (to inject to llama prompts)
    ret = {
        'question': query,
        'answer': "",
        'category': injected_meta['category'],
        'contexts': [injected_context] + retrieved_texts,
    }

    with get_openai_callback() as cb:
        # Pass the chunk texts, not the Document objects: formatting a list of
        # Documents into the prompt would insert their repr (escaped newlines,
        # metadata dicts) instead of readable text, and would not match the
        # `contexts` stored in the dataset record.
        ans = llm_chain.run(
            question=query,
            context="\n\n".join(retrieved_texts),
            augment_context=injected_context,
            evidence=evidences,
        )
        ret['answer'] = ans
        display(Markdown("# Final Answer"))
        display(Markdown(ans))
        print(cb)

        if show_sources:
            display(Markdown("### Sources"))
            display(Markdown("**[Source ORIGINAL]**"))
            display(Markdown(injected_context))
            # Paper chunks have no repository file path; other chunks may lack one
            # too (e.g. loaded from a single file), so only show it when present.
            if injected_meta['category'] != 'paper' and 'file_path' in injected_meta:
                display(Markdown(f"*File path: {injected_meta['file_path']}*"))

            for i, d in enumerate(context):
                display(Markdown(f"**[Source {i+1}]**"))
                display(Markdown(d.page_content))
                if d.metadata['category'] != 'paper' and 'file_path' in d.metadata:
                    display(Markdown(f"*File path: {d.metadata['file_path']}*"))

    return ret

def gen_complex_QA_pairs(texts, chain, role, questions_per_text, metadatas, file_name):
    """Generate case-study questions with evidence per chunk, answer them, and save the pairs.

    For every chunk the question chain is run with the persona from
    `role_dict[role]`. Its reply is parsed line by line: lines labelled
    `question:` become questions, lines labelled `evidence:` become evidence.
    All evidence of the chunk is passed to `complex_answer()` for each question,
    and the records are appended to `file_name` after every chunk.

    Args:
        texts: Chunk texts to generate questions for.
        chain: Object whose `run(document=..., role_description=..., k=...)` returns the labelled lines.
        role: Key into `role_dict` selecting the persona sentence.
        questions_per_text: Number of questions requested per chunk (forwarded as `k`).
        metadatas: Metadata dicts index-aligned with `texts`; each needs a `category`.
        file_name: File name handed to `save_qa_pair`.
    """
    print(f"total texts: {len(texts)}")

    for i, t in enumerate(texts):
        with get_openai_callback() as cb:
            print(f"Working on text #{i+1}")
            display(Markdown(t))
            qresult = chain.run(document=t, role_description=role_dict[role], k=questions_per_text)
            print(qresult)

            q_array = []
            evids = []
            for line in qresult.split('\n'):
                # Split on the FIRST colon only, so colons inside the text
                # (ratios like 1:2, times, code) survive intact.
                label, _, body = line.partition(':')
                label = label.strip().lower()
                body = body.strip()
                if not body:
                    continue  # unlabelled line or empty entry: nothing to ask or cite
                if label == 'question':
                    q_array.append(body)
                elif label == 'evidence':
                    evids.append(body)

            for q in q_array:
                print(f"Question asked: {q}")
            for e in evids:
                print(f"Evidences: {e}")
            print(cb)

            evid_str = "\n".join(evids)
            qa_array = [
                complex_answer(q, category=metadatas[i]['category'], injected_doc=(t, metadatas[i]), evidences=evid_str, show_sources=False)
                for q in q_array
            ]

            print(f"\nSaving QA pairs for text #{i+1}\n")
            save_qa_pair(file_name, qa_array)

def gen_governance_QA_pairs(texts, chain, metadatas, file_name, role='governance', questions_per_text="3"):
    """Generate, answer and save case-study questions for governance proposal chunks.

    Works like the generic complex-question generator, with one addition: when
    the first line of a chunk is exactly a `# YPP-dddd` heading, every question
    generated for that chunk is prefixed with `[YPP-dddd]` so it names the
    proposal it refers to.

    Args:
        texts: Chunk texts to generate questions for.
        chain: Object whose `run(document=..., role_description=..., k=...)` returns
            `question:` / `evidence:` labelled lines.
        metadatas: Metadata dicts index-aligned with `texts`; each needs a `category`.
        file_name: File name handed to `save_qa_pair`.
        role: Key into `role_dict` selecting the persona sentence.
        questions_per_text: Number of questions requested per chunk (forwarded as `k`).
    """
    header_pattern = re.compile(r"^# YPP-\d{4}$") # YPP-0004, and so on.

    with get_openai_callback() as cb:
        for i, t in enumerate(texts):
            header = t.split('\n')[0]

            print(f"Working on text #{i+1}:{header}...")
            display(Markdown(t))
            qresult = chain.run(document=t, role_description=role_dict[role], k=questions_per_text)

            # parse the data right here programmatically and not lending this to LLM.
            print(qresult)

            q_array = []
            evids = []
            for line in qresult.split('\n'):
                # Split on the FIRST colon only, so colons inside the text
                # (ratios like 1:2, times, code) survive intact.
                label, _, body = line.partition(':')
                label = label.strip().lower()
                body = body.strip()
                if not body:
                    continue  # unlabelled line or empty entry: nothing to ask or cite
                if label == 'question':
                    q_array.append(body)
                elif label == 'evidence':
                    evids.append(body)

            # The header is the same for every question of this chunk, so match it once.
            if header_pattern.fullmatch(header):
                proposal_tag = header[2:]  # strip the leading "# "
                q_array = [f"[{proposal_tag}] {q}" for q in q_array] # make it like [YPP-0004] <question>

            for q in q_array:
                print(f"Question asked: {q}")
            for e in evids:
                print(f"Evidences: {e}")
            evid_str = "\n".join(evids)

            qa_array = [
                complex_answer(q, category=metadatas[i]['category'], injected_doc=(t, metadatas[i]), evidences=evid_str, show_sources=False)
                for q in q_array
            ]

            print(f"\nSaving QA pairs for text #{i+1}\n")
            save_qa_pair(file_name, qa_array)

        # Token/cost usage accumulated over the whole run.
        print(cb)

In [None]:
# #Uncomment the lines to run functions, or run your own generation partitions.
# drive.mount('/content/drive/')
# llm=ChatOpenAI(temperature=0.01, model="gpt-4")
# complex_question_llm_chain = LLMChain(llm=llm, prompt=COMPLEX_QUESTION_PROMPT)
# gen_governance_QA_pairs(governance_splits['all_splits'][501:], complex_question_llm_chain, governance_splits['all_metadatas'][501:], "governance_long_qa_3.jsonl")

In [None]:
# #Uncomment the lines to run functions, or run your own generation partitions.
# drive.mount('/content/drive/')
# llm=ChatOpenAI(temperature=0.01, model="gpt-4")
# complex_question_llm_chain = LLMChain(llm=llm, prompt=COMPLEX_QUESTION_PROMPT)
# gen_complex_QA_pairs(yieldspace_splits['all_splits'][24:27], chain=complex_question_llm_chain, role="yieldspace", questions_per_text="3", metadatas=yieldspace_splits['all_metadatas'][24:27], file_name="yieldspace_long_qa.jsonl")

The following sections are no longer in use.

In [None]:
# # Discarded due to redundancy
# governance_proposals_question_prompt_template = """
# You are a web3 expert using Yield protocol and its services. Here is a governance proposal proposing changes in the protocol.
# Write me engaging and profound case study questions rooted in real life web3 user experience scenario using first-person pronoun (I) in the text, based on the provided documents.

# # INSTRUCTIONS
# - Compose 3 such long, qualitative and engaging real-life case study question that is arisen from the following proposal document
# - Always ensure the questions are related to the document text and do not make up any information.
# - When you ask a question, make sure that you can also find evidences in the document that makes it answerable.
# - The result should be wrapped in JSON format like {{"questions": []}}.

# # DOCUMENT
# {document}

# # RESULT"""

# GOV_QUESTION_BANK_PROMPT = PromptTemplate(
#     template=governance_proposals_question_prompt_template, input_variables=["document"]
# )

# governance_question_bank_llm_chain = LLMChain(
#     llm=ChatOpenAI(temperature=0.01, model="gpt-4"),
#     prompt=GOV_QUESTION_BANK_PROMPT
# )
# # prompt script for generating long whitepaper questions
# # how could it ask questions that can be answered in the
# whitepaper_question_prompt_template = """
# You are a web3 expert using Yield protocol and its services. You are looking at its whitepaper and you want to talk to the authors of this paper to discuss about the details in this paper.
# Write me engaging and profound case study questions rooted in real life web3 user experience scenario using first-person pronoun (I) in the text, based on the provided whitepaper pages.

# # INSTRUCTIONS
# - Compose 4 such long, qualitative and engaging real-life case study question that is arisen from the following whitepaper document.
# - Always ensure the questions are related to the document text and do not make up any information.
# - When you ask a question, make sure that you can also find evidences in the document that makes it answerable.
# - The result should be wrapped in JSON format like {{"questions": []}}.

# # DOCUMENT
# {document}

# # RESULT"""

# WP_QUESTION_BANK_PROMPT = PromptTemplate(
#     template = whitepaper_question_prompt_template, input_variables=["document"]
# )

# whitepaper_question_bank_llm_chain = LLMChain(
#     llm=ChatOpenAI(temperature=0.01, model="gpt-4"),
#     prompt=WP_QUESTION_BANK_PROMPT
# )
# yieldspace_question_prompt_template = """
# You are a web3 expert using Yield protocol and its services. You are looking at its newly published paper called 'yield space' and you want to talk to the authors of this paper to discuss about the details in this paper.
# Especially you noticed that this paper is heavily focused on mathematical formulas related to the concept of 'invariants' in decentralized finance.
# You are very interested in studying the maths behind this paper, pay special attention to the numeric relationships between objects in this passage.
# Write me engaging and profound case study questions rooted in web3 finance researcher experience using first-person pronoun (I) in the text, based on the provided paper pages.

# # INSTRUCTIONS
# - Compose 4 such long, qualitative and engaging real-life case study question that is arisen from the following whitepaper document.
# - Always ensure the questions are related to the document text and do not make up any information.
# - When you ask a question, make sure that you can also find evidences in the document that makes it answerable.
# - The result should be wrapped in JSON format like {{"questions": []}}.

# # DOCUMENT
# {document}

# # RESULT"""

# YS_QUESTION_BANK_PROMPT = PromptTemplate(
#     template = yieldspace_question_prompt_template, input_variables=["document"]
# )

# yieldspace_question_bank_llm_chain = LLMChain(
#     llm=ChatOpenAI(temperature=0.01, model="gpt-4"),
#     prompt=YS_QUESTION_BANK_PROMPT
# )