In [None]:
%load_ext autoreload
%autoreload 2


In [None]:
from openai import OpenAI
import os 
import sys 
from pprint import pprint
from IPython.display import display, Markdown, Latex
from IPython import get_ipython
from pathlib import Path

sys.path.append((Path(os.getcwd())/ '../').resolve().as_posix())
from pipeline.utils.pdf import read_pdf
from pydantic import BaseModel, Field
from typing import List
import json

from dotenv import find_dotenv, load_dotenv
from pipeline.helpers import get_json_response, get_messages_response, split_text, list_of_dicts_to_dict_of_lists, upload_to_hf, once, get_async_client, get_json_response_async, get_messages_response_async
from datasets import Dataset
load_dotenv()

In [None]:
# Structured-output schema passed as `response_format` when SUGGESTER_PROMPT is sent.
class SuggesterModel(BaseModel):
    # Refinement suggestions; the prompt asks for them as a list of strings.
    suggestions: List[str]
    # The prompt asks the model to set this to True when it has no suggestions.
    finished: bool

# Structured-output schema passed as `response_format` when EDITOR_PROMPT is sent:
# the rewritten question/answer pair produced after applying the suggestions.
class EditorModel(BaseModel):
    # Rewritten question.
    question: str
    # Rewritten answer.
    answer : str 

# Structured-output schema passed as `response_format` when REFINED_QUESTIONS_PROMPT is sent.
class RefinedQuestionsModel(BaseModel):
    # Newly generated questions; the prompt asks for a list of them.
    questions: List[str]


In [None]:
# Async client shared by every model call in this notebook (each call passes `client=client`).
# NOTE(review): endpoint and credentials are configured inside the project helper — not visible here.
client = get_async_client()

In [None]:
# System prompt for the suggester step. `{question}` and `{answer}` are filled with
# str.format; the requested output (a list of suggestions plus a `finished` flag)
# lines up with the fields of SuggesterModel.
SUGGESTER_PROMPT = """ 
    You are provided a pair of question and answer documents.
    Your job is to provide suggestions to refine both the question and answer to increase understanding of the context.

    Question: 
    {question}
    Answer: 
    {answer}
    
    Return your suggestions as a list of strings in JSON. 
    If you have no suggestions, return an empty list and set finished to True.


"""


In [None]:
# Seed question for the experiments below; it is substituted into the suggester,
# editor and question-refinement prompts.
question = "Summarize the AgentInstruct methodology for creating synthetic datasets for supervised fine-tuning and instruction-tuning."
answer = """The AgentInstruct methodology is a structured approach to create synthetic datasets for supervised fine-tuning and instruction-tuning of Large Language Models (LLMs). The methodology consists of three main flows: Content Transformation Flow, Seed Instruction Generation Flow, and Instruction Refinement Flow.

**Content Transformation Flow**

1. Assemble a collection of raw seeds (e.g., textbook chapters, web articles, code snippets).
2. Transform the seed with the aid of one or more content transformation agents to create an intermediate representation that simplifies the creation of instructions tailored to specific objectives.
3. This flow is instrumental in generating high-quality data and introducing diversity.

**Seed Instruction Generation Flow**

1. Take as input the transformed seed from the Content Transformation Flow.
2. Generate a set of diverse instructions using multiple agents, each targeting different question types (e.g., literal comprehension, critical comprehension, evaluative comprehension).
3. This flow introduces diversity by relying on a pre-defined taxonomy.

**Instruction Refinement Flow**

1. Take as input the instructions from the Seed Instruction Generation Flow.
2. Iteratively enhance their complexity and quality using Suggester-Editor Agents.
3. The refinement flow contains multiple suggester-editor agents that modify the passage, question, or answer choices to make them complex or unanswerable.

The AgentInstruct methodology is designed to automate the generation process, leveraging raw articles as seeds to foster diversity and ensure that problems generated in different iterations are distinct and of broad coverage. This enables the creation of data at scale with high diversity and varying complexity.

**Key Benefits**

1. **Automation**: The agentic flows can run autonomously, reducing or eliminating human intervention.
2. **Diversity**: AgentInstruct generates both prompts and responses using a large number of agents and a taxonomy of over 100 subcategories.
3. **Large quantities of data**: AgentInstruct can create vast amounts of diverse data.

**Implementation**

The AgentInstruct methodology has been implemented for 17 different skills, each having multiple subcategories. The skills include reading comprehension, question answering, coding, retrieval augmented generation, creative writing, tool use, and more.

Source: "AgentInstruct: Toward Generative Teaching with Agentic Flows" by Arindam Mitra et al., Microsoft Research."""

In [None]:
MODEL_3B= "llama-3.2-3b-instruct"
resp = await get_json_response_async(
    client=client,
    model=MODEL_3B,
    messages=[
                    {"role": "system", "content": SUGGESTER_PROMPT.format(question=question, answer=answer)},
                ],
    response_format=SuggesterModel
)

In [None]:
resp.model_dump()

In [None]:
resp.suggestions

In [None]:
# System prompt for the editor step. `{question}`, `{answer}` and `{suggestions}`
# are filled with str.format; the model is asked to rewrite the pair by applying
# the suggestions. The opening sentence previously read "a list of suggestions a
# pair of ..." (missing "and"), which garbled the instruction sent to the model.
EDITOR_PROMPT = """ 
    You are provided a list of suggestions and a pair of question and answer documents.
    Your job is to apply the suggestions to the question and answer and generate a new answer and question.

    Question: 
    {question}
    Answer: 
    {answer}
    
    Suggestions:
    {suggestions}

"""


In [None]:
resp2 = await get_messages_response_async(
    client=client,
    model=MODEL_3B,
    messages=[
                    {"role": "system", "content": EDITOR_PROMPT.format(question=question, answer=answer, suggestions="\n".join(resp.suggestions[0:5]))},
                ],
)

In [None]:
resp = await get_json_response_async(
    client=client,
    model=MODEL_3B,
    messages=[
                    {"role": "system", "content": EDITOR_PROMPT.format(question=question, answer=answer, suggestions="\n".join(resp.suggestions[0:5]))},
                ],
    response_format=EditorModel
)

In [None]:
resp.model_dump()['question']

In [None]:
question

In [None]:
print(answer)

In [None]:
print(resp.model_dump()['answer'])

In [None]:
# System prompt for the question-diversification step. `{question}` and `{answer}`
# are filled with str.format; the requested output (a list of new questions)
# lines up with RefinedQuestionsModel.
REFINED_QUESTIONS_PROMPT = """
    You are provided with a question and an answer.
    Your job is to generate a set of new questions that can be answered with the given answer but is diverse and approaches 
    the original question from different perspectives.

    Ensure that the generated questions are clear, purposeful, specific, and invoke critical thinking
    Question:
    {question}

    Answer:
    {answer}

    Return a list of new questions in JSON format.
"""

In [None]:
resp = await get_json_response_async(
    client=client,
    model=MODEL_3B,
    messages=[
                    {"role": "system", "content": REFINED_QUESTIONS_PROMPT.format(question=question, answer=answer)},
                ],
    response_format=RefinedQuestionsModel
)

In [None]:
question

In [None]:
resp.questions

* List of 17 capabilities for which we implemented AgentInstruct Flows
    * The fine-tuned model could not answer this completely

In [None]:
print(answer)

Leverage RAG to also get similar documents

In [None]:
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
import chromadb


In [None]:
# Embedding function for the vector store: the OpenAI-compatible embeddings client
# is pointed at a server on localhost:1234 rather than the hosted OpenAI API.
# NOTE(review): `api_key` looks like a placeholder — presumably the local server
# ignores it; confirm.
EMBEDDING_MODEL = "text-embedding-nomic-embed-text-v1.5@f32" # on LM Studio
embeddings_func = OpenAIEmbeddings(
    model=EMBEDDING_MODEL,
    base_url="http://localhost:1234/v1",
    api_key="terst",
    check_embedding_ctx_length=False # https://github.com/langchain-ai/langchain/issues/21318
)

In [None]:
# Chroma collection named "test", embedded with `embeddings_func` and persisted
# on disk under ./chroma_langchain_db (relative to the working directory).
# NOTE(review): nothing in this notebook adds documents — the collection is
# presumably populated elsewhere; confirm before relying on search results.
vector_store = Chroma(
    collection_name="test",
    embedding_function=embeddings_func,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

In [None]:
type(vector_store)

In [None]:
# System prompt for the RAG answer-refinement step. `{question}`, `{answer}` and
# `{docs}` are filled with str.format: the question to answer, the initial answer,
# and the retrieved supporting passages. The model is told to reply with
# "NO ANSWER FOUND" when the text does not contain the answer.
REFINED_RAG_ANSWER_PROMPT = """
    You are tasked with answering questions based on a provided text.
    You are provided with a question and an initial answer.
    You are also provided with some supporting documentation to help create a new response

    Your goal is to generate high-quality, detailed answers by following these instructions:
    
    # Instructions:
    1. Reference the Text: Answer directly using relevant details from the text. Avoid introducing unsupported claims.
    2. Comprehensive Response: Address all parts of the question thoroughly, covering multiple aspects if needed.
    3. Detail-Oriented: Highlight key elements like techniques, processes, models, or challenges, expanding on them for clarity.
    4. Organized Structure: Use clear paragraphs or points for complex answers.
    5. Clarity and Examples: Ensure the answer is precise and easy to follow. Include examples or quotes from the text when applicable.
    6. Include Sources: Clearly reference the source information at the end of the answer.

    If the answer is not found in the text, respond with "NO ANSWER FOUND"

    Question:
    {question}

    Initial Answer:
    {answer}

    Supporting Documentation:
    {docs}

"""


In [None]:

# Structured-output schema passed as `response_format` when REFINED_RAG_ANSWER_PROMPT is sent.
class AnswerModel(BaseModel):
    # Full text of the refined answer.
    answer: str

In [None]:
def get_rag_docs(vector_store: Chroma, question: str, k: int = 5) -> str:
    """Return the text of the passages most similar to ``question``.

    Runs ``similarity_search_with_score`` on ``vector_store`` asking for ``k``
    hits, keeps only the document half of each (document, score) hit, and
    joins the documents' ``page_content`` with newlines. Scores are discarded.
    """
    scored_hits = vector_store.similarity_search_with_score(question, k=k)

    passages = []
    for hit in scored_hits:
        # Each hit is indexed as (document, score); only the document is used.
        passages.append(hit[0].page_content)

    return "\n".join(passages)


In [None]:
# Retrieve the five most similar passages for one of the generated questions.
# NOTE(review): index 7 is hard-coded, so this raises IndexError when fewer than
# eight questions were returned; the same index is reused when the answer prompt
# is built further down.
rag_docs = get_rag_docs(vector_store, resp.questions[7], k=5)


In [None]:
answer_resp = await get_messages_response_async(
    client=client,
    model=MODEL_3B,
    messages=[
                    {"role": "system", "content": REFINED_RAG_ANSWER_PROMPT.format(question=resp.questions[7], answer=answer, docs=rag_docs)},
                ],
)

In [None]:
print(answer_resp)

In [None]:
answer_resp2 = await get_json_response_async(
    client=client,
    model=MODEL_3B,
    messages=[
                    {"role": "system", "content": REFINED_RAG_ANSWER_PROMPT.format(question=resp.questions[7], answer=answer, docs=rag_docs)},
                ],
    response_format=AnswerModel
)

In [None]:
len(answer_resp2.answer)

In [None]:
print(answer_resp2.answer)

In [None]:
import PyPDF2

In [None]:
# Read the paper's title from the PDF document-information metadata.
# Raises ValueError only when the title is actually missing; previously the
# raise sat after the `if` and therefore fired even when a title had been found.
with open("../data2/agentinstruct.pdf", "rb") as file:
    reader = PyPDF2.PdfReader(file)
    metadata = reader.metadata
    # `metadata` is None when the PDF has no document-information dictionary,
    # so guard before testing for the key.
    if metadata is None or "/Title" not in metadata:
        raise ValueError("No title found in metadata")
    title = metadata["/Title"]
