In [None]:
from dotenv import load_dotenv, find_dotenv

# Load environment variables from the "us.env" file located by find_dotenv.
_ = load_dotenv(find_dotenv("us.env"))  # result bound to `_` so the cell shows no output

In [None]:
from pathlib import Path
from tqdm import tqdm
import sys

# Put the parent directory on the import path so that the project's `src`
# package (imported further down as `src.core.index`) can be resolved.
# NOTE(review): ".." is resolved against the current working directory —
# presumably the notebook's own folder; confirm when running from elsewhere.
sys.path.append(Path("..").resolve().as_posix())

## Load the library

...and split it by categories.

In [None]:
# Absolute path of the documentation tree; document paths are later made
# relative to this directory.
lib_path = Path("../../docs_md").resolve()

In [None]:
from llama_index.core import SimpleDirectoryReader

# Read documents from `lib_path`, descending into subdirectories.
docs_reader = SimpleDirectoryReader(lib_path.as_posix(), recursive=True)
documents = docs_reader.load_data()

In [None]:
from src.core.index import DOC_MAP

# Tag every loaded document with the documentation section it belongs to.
# DOC_MAP maps a path prefix (relative to `lib_path`) to a section name.
doc_map_keys = list(DOC_MAP.keys())

docs_with_meta = []

for doc in documents:
    # Path of the source file relative to the docs root, with forward slashes.
    rel_path = (
        Path(doc.metadata["file_path"]).resolve().relative_to(lib_path).as_posix()
    )

    matching_keys = [key for key in doc_map_keys if rel_path.startswith(key)]
    if not matching_keys:
        raise ValueError(f"No DOC_MAP section matches document path {rel_path!r}")

    # Several prefixes can match the same path (e.g. "cli" and "clients");
    # pick the most specific one explicitly instead of relying on the
    # insertion order of DOC_MAP.
    section_key = max(matching_keys, key=len)

    # The document is annotated in place and collected for later filtering.
    doc.metadata["section"] = DOC_MAP[section_key]
    docs_with_meta.append(doc)

## Prepare LangChain pipeline

We're using a Pydantic structured output chain because we need the data to adhere to a certain structure.

In [None]:
from langchain.output_parsers import PydanticOutputParser
from langchain_community.chat_models import AzureChatOpenAI
from langchain_core.messages import HumanMessage
from langchain_core.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

from langchain_core.runnables import RunnableSerializable

from langchain_core.pydantic_v1 import BaseModel, Field
from typing import Type, List

In [None]:
# One exchange in a generated script: a user message plus the assistant's reply.
# NOTE(review): documented with comments rather than a class docstring because a
# docstring would presumably end up in the model's JSON schema, and therefore in
# the format instructions injected into the prompt — verify before adding one.
class MessagePair(BaseModel):
    human: str = Field(description="Message from the user.")
    assistant: str = Field(description="Helpful response from the assistant.")


# Structured output expected from the LLM: an ordered list of MessagePair turns.
# This model is handed to PydanticOutputParser, so its field descriptions are
# part of what the model is asked to produce.
class Conversation(BaseModel):
    message_pairs: List[MessagePair] = Field(
        description="A short series of back and forth messages between the human and the assistant."
    )


# System prompt for the conversation-writing model.
# - `{format_instructions}` is a template placeholder that is filled in later.
# - Literal braces in the SDL examples are doubled (`{{` / `}}`) so that they
#   survive template formatting.
# - The syntax rule must agree with the examples below it: the up-to-date SDL
#   syntax uses ":" where the old one used "->".
system_template = """You are an assistant that helps write scripts of conversations about EdgeDB.
    Below you will find a piece of official EdgeDB documentation denoted by ---.
    Your job is to write a script of a conversation based on that piece of documentation.
    There're two participants: a human who wants to learn about / needs help with EdgeDB, EdgeQL, SDL, DDL, clent integrations etc.
    There's also a helpful assistant, whos job is two help the human out.

    Please only use the provided piece of documentation and no prior knowledge to create the conversation.
    Make sure some code snippets are involved.
    When providing SDL examples, please replace old syntax with up to date syntax, in which ":" is used over "->", and "property" and "link" are ommitted for non-computed properties/links.
    Examples:
    Old:
    ```sdl
    type Movie {{
    required property title -> str;
    multi link actors -> Person;
    }}
    ```
    Up to date:
    ```sdl
    type Movie {{
    required title: str;
    multi actors: Person;
    }}
    ```

    {format_instructions}
    """

# Human turn: the documentation piece, delimited by "---" as announced in the
# system prompt. `{doc}` is supplied when the chain is invoked.
human_template = """---{doc}---
    """

# Chat model that writes the conversations.
llm = AzureChatOpenAI(
    azure_deployment="gpt-4-1106",
    openai_api_version="2023-07-01-preview",
    temperature=0.1,
)

# Parser that turns the model output into a `Conversation` object and also
# supplies the format instructions injected into the prompt.
parser = PydanticOutputParser(pydantic_object=Conversation)

system_message = SystemMessagePromptTemplate.from_template(template=system_template)
human_message = HumanMessagePromptTemplate.from_template(template=human_template)

# Pre-fill `{format_instructions}` so that only `{doc}` has to be provided
# when the chain is invoked.
prompt = ChatPromptTemplate.from_messages(
    [system_message, human_message]
).partial(format_instructions=parser.get_format_instructions())

# prompt -> model -> structured output
chain = prompt | llm | parser

- As a result of calling this chain for a piece of documentation, we're expecting to get a `Conversation` object.
- In it, there's going to be a series of generated user questions and assistant answers wrapped as `MessagePair` objects.

In [None]:
import random

# For this example we only want docs that are related to EdgeQL and SDL
docs_edgeql_sdl_all = [
    doc for doc in docs_with_meta if doc.metadata["section"] == "edgeql_and_sdl"
]

# Draw at most 20 documents. The size is clamped because random.sample raises
# ValueError when asked for more items than the population holds.
# NOTE: no seed is set in this cell, so every run selects a different subset.
sample_size = min(20, len(docs_edgeql_sdl_all))
docs_edgeql_sdl = random.sample(docs_edgeql_sdl_all, sample_size)

len(docs_edgeql_sdl)

In [None]:
# Generate one conversation per sampled document and persist each result
# immediately, so that work already done survives a later failure.
suffix = "_val"
conversations_path = Path(f"edgeql_sdl_conversations{suffix}.jsonl")

responses = []
failed_docs = []  # documents for which generation or parsing failed

for doc in tqdm(docs_edgeql_sdl):
    try:
        # Pass the document text, not the Document object: formatting the
        # object into the prompt uses str(doc), which in llama_index is a
        # shortened preview rather than the full content.
        response = chain.invoke({"doc": doc.get_content()})
    except Exception as exc:
        # Best effort: skip documents that fail (API error, unparsable
        # output, ...) but say so, and let KeyboardInterrupt through.
        tqdm.write(
            f"Skipping {doc.metadata.get('file_path', '<unknown>')}: {exc!r}"
        )
        failed_docs.append(doc)
        continue
    # Append mode: re-running this cell adds to the existing file.
    with conversations_path.open("a") as f:
        f.write(response.json())
        f.write("\n")
    responses.append(response)

In [None]:
# Display the first generated dialog, if any generation succeeded

if responses:
    for turn in responses[0].message_pairs:
        print(f"Human: \n\n{turn.human}\n")
        print(f"Assistant: \n\n{turn.assistant}\n")
else:
    # Every document may have been skipped during generation.
    print("No conversations were generated.")

## Repackage results

We need them to fit the OpenAI data format.

In [None]:
from typing import Literal


# A single chat message: who is speaking (`role`) and what is said (`content`).
class Message(BaseModel):
    role: Literal["system", "user", "assistant"]  # only these three values are accepted
    content: str

# One conversation as an ordered list of messages; serialized as one JSONL line.
class Chat(BaseModel):
    messages: List[Message]

# Container for all repackaged conversations.
class Dataset(BaseModel):
    chats: List[Chat]

In [None]:
# Flatten every generated conversation into alternating user / assistant
# messages, preserving the order of the message pairs.
chats = [
    Chat(
        messages=[
            Message(role=role, content=text)
            for pair in response.message_pairs
            for role, text in (("user", pair.human), ("assistant", pair.assistant))
        ]
    )
    for response in responses
]

dataset = Dataset(chats=chats)

In [None]:
formatted_path = Path(f"edgeql_sdl_formatted_v1{suffix}.jsonl")

# Write one JSON-serialized chat per line, overwriting any previous file.
with formatted_path.open("w") as out_file:
    out_file.writelines(f"{chat.json()}\n" for chat in dataset.chats)
