In this notebook we will automatically generate a set of evaluation questions based on the Weights & Biases (wandb) documentation.

In [1]:
import re
import os
import random
import time
from tqdm.auto import tqdm

import openai

import wandb
from wandb.integration.openai import autolog

In [None]:
# !pip install "langchain>=0.0.175" "wandb>=0.15.3" openai cohere tqdm

In [2]:
# Download a repo with documentation pages saved as .md files
# !git clone https://github.com/wandb/docodile.git

Set Weights & Biases Project and Entity

In [4]:
# W&B destination for this notebook's logging; both values are passed to
# `autolog(...)` as the "project" and "entity" settings further down.
PROJECT = "wandbot_synth" 
ENTITY = "wandbot"

Authenticate with OpenAI

In [5]:
import openai
from getpass import getpass

def get_openai_key():
  """Make sure an OpenAI API key is available and hand it to the `openai` client.

  The key is read from the ``OPENAI_API_KEY`` environment variable; when the
  variable is unset the user is prompted for it with ``getpass`` (input hidden).
  The key is validated *before* it is stored, so a mistyped key is never
  written to the environment and re-running the cell prompts again.

  Raises:
    AssertionError: if the key does not start with ``"sk-"``.
  """
  key = os.getenv("OPENAI_API_KEY")
  if key is None:
    if any('VSCODE' in x for x in os.environ):
      print('Please enter password in the VS Code prompt at the top of your VS Code window!')
    # Strip stray whitespace/newlines that commonly sneak in when pasting.
    key = getpass("Paste your OpenAI key from: https://platform.openai.com/account/api-keys\n").strip()
  assert key.startswith("sk-"), "This doesn't look like a valid OpenAI API key"
  os.environ["OPENAI_API_KEY"] = key
  # Set the key on the client unconditionally: the variable may have been
  # exported after `openai` was imported, in which case the client has not seen it.
  openai.api_key = key
  print("OpenAI API key configured")

# Prompt for the key if the environment does not provide one, and validate it
# up front so a bad key fails here rather than in the middle of generation.
get_openai_key()

Please enter password in the VS Code prompt at the top of your VS Code window!
OpenAI API key configured


# Generate Synthetic User Questions using ChatGPT

In [35]:
# This function finds all the markdown files in a directory and returns each file's path and content

def find_md_files(directory):
    """Recursively collect the Markdown files found under ``directory``.

    Args:
        directory: root folder to walk.

    Returns:
        A list of ``(file_path, content)`` tuples, one per file whose name
        ends with ``.md``; ``content`` is the file text decoded as UTF-8.
    """
    # First gather the matching paths, in the order os.walk yields them.
    md_paths = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith(".md")
    ]

    # Then read each file, pairing its path with its full text.
    collected = []
    for md_path in md_paths:
        with open(md_path, 'r', encoding='utf-8') as handle:
            collected.append((md_path, handle.read()))
    return collected

In [36]:
# Seed for the document shuffle so the processing order is the same on every run.
SHUFFLE_SEED = 42

# Load every markdown page from the cloned docs repo as (path, content) tuples.
documents = find_md_files('../docodile/docs/')
# A wrong path yields an empty list rather than an error, so fail loudly here.
assert documents, "No .md files found - did you clone the docs repo next to this notebook?"

# Directory traversal order depends on the filesystem: sort first, then shuffle
# with a private seeded generator (leaves the global `random` state untouched).
documents.sort()
random.Random(SHUFFLE_SEED).shuffle(documents)
len(documents)

288

In [38]:
# System message sent with every generation request. Written as one sentence per
# line, joined with newlines, so that no stray quote characters or source-code
# indentation end up in the text the model actually receives.
system_prompt = "\n".join([
    "You are a tester for a support bot for the Weights & Biases (aka wandb or W&B) MLOps python library.",
    "Your goal is to generate 40 questions of varying difficulty that can be answered by reading a document in the wandb documentation.",
    "Given a document, you need to imagine a hypothetical question from a user that has not read the document before.",
    "It should be possible to answer the question by reading and reasoning over the document.",
    "This question should be feasible to come from a user learning about wandb.",
    "The question should be answerable by a human.",
    "Each question should be unique and not a duplicate of another question.",
    "Each question should be separated by a new line.",
])

## Generate Questions

In [None]:
# W&B Autolog: log the OpenAI API calls made below to the configured W&B project
autolog({"project":PROJECT, "entity":ENTITY, "name":"synth_question_generation"})

MAX_ATTEMPTS = 5          # API attempts per document before giving up on it
RETRY_DELAY_SECONDS = 10  # pause between failed attempts
N_DOCS = None             # set to a small int for a quick test run; None = all documents

# One entry per successfully processed document:
# {'prompt': <user prompt>, 'document': <file path>, 'question': <raw completion text>}
res = []
for i, (source, doc) in enumerate(tqdm(documents[:N_DOCS])):
    # The prompt depends only on the document, so build it once rather than on every retry.
    generation_prompt = f'''Let's start!
                Please generate 40 questions of varying difficulty that can be answered by reading this document from the wandb documentation;
                Document: {doc}
                Questions:'''

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": generation_prompt},
                    ]
            )
            generation = response.choices[0].message.content
        except Exception as e:
            # A document that overflows the context window can never succeed,
            # so skip it instead of retrying. Matching on the generic phrase
            # keeps this working if the exact token limit in the message changes.
            if "maximum context length" in str(e):
                print(f'skipping {source}: document does not fit in the model context window')
                break
            if attempt == MAX_ATTEMPTS:
                # Out of attempts: report the loss instead of dropping the document silently.
                print(f'giving up on {source} after {MAX_ATTEMPTS} attempts: {e}')
                break
            print(f'retrying {i}: {e}')
            # wait before retrying (no point sleeping after the final attempt)
            time.sleep(RETRY_DELAY_SECONDS)
        else:
            res.append({
                'prompt': generation_prompt,
                'document': source,
                'question': generation
            })
            # we don't need to retry if we get here
            break

### Post Process
Post-process the OpenAI completions into structured outputs

In [None]:
import pandas as pd

print(f"{res[:3]}\n")

# A leading list marker: "1. ", "12) ", "3." or a "- " / "* " bullet.
# Digits are only removed when followed by "." or ")", so a question that
# genuinely starts with a number (e.g. "3D plots ...") is left intact.
LIST_MARKER = re.compile(r'^\s*(?:\d+\s*[.)]\s*|[-*]\s+)')


def clean_question(line):
    """Return `line` without its leading list marker and surrounding whitespace."""
    return LIST_MARKER.sub('', line).strip()


# Split each generation into one record per question (one question per line),
# clean every line, and drop the blank lines the model often puts between questions.
res = [
    {'prompt': x['prompt'], 'document': x['document'], 'question': question}
    for x in res
    for question in map(clean_question, x['question'].splitlines())
    if question
]

# Save to DataFrame and CSV
df = pd.DataFrame(res)
df.to_csv('sythetic-user-questions_2023-05-16.csv', index=False)
print(f"{df.head()}\n")
print(f"\n{len(df)}")

In [None]:
# Upload the questions DataFrame to W&B as a table, then close the run
questions_table = wandb.Table(dataframe=df)
wandb.log({'generated_questions_table': questions_table})
wandb.finish()