In [1]:
import os
import time
import openai
import requests
import html2text
import pandas as pd
import numpy as np
from typing import List, Dict, Any
from readability import Document
from docstring_parser import parse
from pydantic import BaseModel, Field
from duckduckgo_search import DDGS
import faiss

from dotenv import load_dotenv
load_dotenv("/Users/arshath/play/openautonomy/olas-predict-benchmark/.env")

openai.api_key = os.getenv("OPENAI_API_KEY")

from IPython.display import display, Markdown

### Prepare Questions From Benchmark

In [2]:
# load all questions
path = "../data/autocast/autocast_questions.json"
df = pd.read_json(path)
print(df.shape)

# filter out non-true/false questions
df = df[df["qtype"] == "t/f"].reset_index(drop=True)
print(df.shape)

# make sure answers is not None
df = df[df["answer"].notnull()].reset_index(drop=True)
print(df.shape)

# make sure source_links is not []
df = df[df["source_links"].map(len) > 0].reset_index(drop=True)
print(df.shape)

(6532, 14)
(3225, 14)
(2003, 14)
(1403, 14)


### Test RAG

Idea:
- question/prompt comes in
- generate n queries
- use duckduckgo for getting n urls per query
- make faiss 
- do rag and answer


What to use:
- use langchain
- use funciton calling

### RAG

##### 1. Query Generation

In [3]:
class OpenAISchema(BaseModel):  # type: ignore[misc]
    @classmethod  # type: ignore[misc]
    @property
    def openai_schema(cls) -> Dict[str, Any]:
        """
        Return the schema in the format of OpenAI's schema as jsonschema

        Note:
            Its important to add a docstring to describe how to best use this class, it will be included in the description attribute and be part of the prompt.

        Returns:
            model_json_schema (dict): A dictionary in the format of OpenAI's schema as jsonschema
        """
        schema = cls.model_json_schema()
        docstring = parse(cls.__doc__ or "")
        parameters = {
            k: v for k, v in schema.items() if k not in ("title", "description")
        }
        for param in docstring.params:
            if (name := param.arg_name) in parameters["properties"] and (
                description := param.description
            ):
                if "description" not in parameters["properties"][name]:
                    parameters["properties"][name]["description"] = description

        parameters["required"] = sorted(
            k for k, v in parameters["properties"].items() if "default" not in v
        )

        if "description" not in schema:
            if docstring.short_description:
                schema["description"] = docstring.short_description
            else:
                schema["description"] = (
                    f"Correctly extracted `{cls.__name__}` with all "
                    f"the required parameters with correct types"
                )

        return {
            "name": schema["title"],
            "description": schema["description"],
            "parameters": parameters,
        }
    
    @classmethod
    def from_response(cls, completion: Dict[str, Any]) -> "OpenAISchema":
        """
        Convert the response from OpenAI into the class instance

        Args:
            completion (dict): The response from OpenAI

        Returns:
            OpenAISchema: The instance of the class
        """

        message = completion.choices[0].message

        return cls.model_validate_json(
            message.function_call.arguments,
        )

In [4]:
class Queries(OpenAISchema):
    queries: List[str]

In [5]:
question = df["question"].iloc[0]
N_QUERY = 5
N_URLS = 3
model = "gpt-3.5-turbo"
temperature = 0.
max_tokens = 300

system_template = """You are a world class algorithm for generating structured output from a given input."""
user_template = """
Given the user's question: please generate {N_QUERY} diverse and relevant search queries that can be used to find information on the internet to answer the initial question. 
Focus on capturing different aspects and interpretations of the question to ensure comprehensive coverage of the topic.

USER's QUESTION: {question}
"""

messages = [
    {
        "role": "system",
        "content": system_template,
    },
    {
        "role": "user",
        "content": user_template.format(N_QUERY=N_QUERY, question=question),
    },
]

response = openai.ChatCompletion.create(
    model=model,
    messages=messages,
    temperature=temperature,
    max_tokens=max_tokens,
    n=1,
    timeout=150,
    request_timeout=150,
    stop=None,
    functions=[Queries.openai_schema],
)

queries = Queries.from_response(response)
print(queries)

queries=['Upcoming initial public offerings on Shanghai Stock Exchange', 'Upcoming initial public offerings on Shenzhen Stock Exchange', 'List of companies that went public on Shanghai Stock Exchange before 1 January 2016', 'List of companies that went public on Shenzhen Stock Exchange before 1 January 2016', 'Comparison of initial public offerings on Shanghai Stock Exchange and Shenzhen Stock Exchange before 1 January 2016']


##### 2. Get URLs for queries

In [10]:
def get_urls(query, n_urls):
    ddgs = DDGS()
    search = ddgs.text(query)

    urls = [url['href'] for url in search]

    return urls[:n_urls]

In [11]:
urls = []

for query in queries.queries:
    urls += get_urls(query, N_URLS)

urls = list(set(urls))
print(urls)

  ddgs = DDGS()


['https://www.statista.com/statistics/1219302/china-number-of-newly-listed-companies-at-the-shenzhen-stock-exchange/', 'https://www.statista.com/statistics/1293751/stock-exchanges-with-highest-proceeds-of-ipos-worldwide/', 'https://en.wikipedia.org/wiki/List_of_companies_listed_on_the_Shenzhen_Stock_Exchange', 'https://en.wikipedia.org/wiki/Category:Companies_listed_on_the_Shanghai_Stock_Exchange', 'http://www.eyeshenzhen.com/content/2023-09/04/content_30451355.htm', 'https://en.wikipedia.org/wiki/Category:Companies_listed_on_the_Shenzhen_Stock_Exchange', 'https://www2.deloitte.com/cn/en/pages/audit/articles/2023-review-and-2024-outlook-for-chinese-mainland-and-hk-ipo-markets.html', 'https://www2.deloitte.com/cn/en/pages/audit/articles/mainland-and-hk-ipo-markets-in-q3-2023.html', 'https://topforeignstocks.com/listed-companies-lists/the-complete-list-of-listed-companies-on-the-shanghai-stock-exchange/', 'https://en.wikipedia.org/wiki/Shanghai_Stock_Exchange', 'https://www.reuters.com/m

##### 3. Get text

In [12]:
def get_text_from_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        doc = Document(response.text)
        doc = doc.summary()
        h = html2text.HTML2Text()
        h.ignore_links = True
        h.ignore_images = True
        h.ignore_emphasis = True
        text = h.handle(doc)
        return text
    
    except Exception as e:
        print(e)
        return None

In [13]:
docs = []
for url in urls:
    text = get_text_from_url(url)
    if text:
        docs.append(text)

401 Client Error: HTTP Forbidden for url: https://www.reuters.com/markets/deals/chinas-first-batch-bluechips-under-new-ipo-system-surge-debut-2023-04-10/
401 Client Error: HTTP Forbidden for url: https://www.reuters.com/business/syngenta-files-10-bln-shanghai-ipo-prospectus-2021-07-02/


In [14]:
len(docs)

11

##### 4. Generate Embedding/Retriever

In [15]:
def recursive_character_text_splitter(text, max_tokens, overlap):
    if len(text) <= max_tokens:
        return [text]
    else:
        return [text[i:i+max_tokens] for i in range(0, len(text), max_tokens - overlap)]

In [16]:
split_docs = []
for doc in docs:
    split_docs += recursive_character_text_splitter(doc, 2000, 100)

In [17]:
len(split_docs)

59

In [18]:
EMBEDDING_MODEL = "text-embedding-ada-002"
BATCH_SIZE = 1000

def get_embeddings(split_docs):

    # Make chunks to embeddings mapping
    chunk_to_embedding = {}
    for batch_start in range(0, len(split_docs), BATCH_SIZE):
        batch_end = batch_start + BATCH_SIZE
        batch = split_docs[batch_start:batch_end]
        print(f"Batch {batch_start} to {batch_end-1}")
        response = openai.Embedding.create(
            model=EMBEDDING_MODEL,
            input=batch,
        )

        for i, be in enumerate(response["data"]):
            assert i == be["index"]

        batch_embeddings = [e["embedding"] for e in response["data"]]
        for chunk, embedding in zip(batch, batch_embeddings):
            chunk_to_embedding[chunk] = embedding

    return chunk_to_embedding

def find_similar_chunks(
    query: str, chunk_to_embedding: Dict, k: int = 4
) -> List:
    """Similarity search to find similar chunks to a query"""


    query_embedding = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query,
    )["data"][0]["embedding"]

    index = faiss.IndexFlatIP(1536)
    index.add(np.array(list(chunk_to_embedding.values())))
    D, I = index.search(np.array([query_embedding]), k)

    return [list(chunk_to_embedding.keys())[i] for i in I[0]]

In [19]:
chunk_to_embedding = get_embeddings(split_docs)

Batch 0 to 999


In [20]:
retrieved_chunks = find_similar_chunks(question, chunk_to_embedding, 5)

##### 5. Retrieve and generate

In [21]:
def format_docs(docs):
    return "\n\n".join(doc for doc in docs)

In [22]:
formatted_docs = format_docs(retrieved_chunks)

In [23]:
class Results(OpenAISchema):
    p_yes: float =  Field(description="Estimated probability that the event in the USER_QUESTION occurs.")
    p_no: float = Field(description="Estimated probability that the event in the USER_QUESTION does not occur.")
    confidence: float = Field(description="A value between 0 and 1 indicating the confidence in the prediction. 0 indicates lowest confidence value; 1 maximum confidence value.")
    info_utility: float = Field(description="Utility of the information provided in ADDITIONAL_INFORMATION to help you make the prediction. 0 indicates lowest utility; 1 maximum utility.")

In [24]:
prediction_system_template = """You are a world class algorithm for generating structured output from a given input. 
You make predictions about the probability of an event happening based on the information provided in the input.
"""

prediction_user_template = """
You are an LLM inside a multi-agent system that takes in a prompt of a user requesting a probability estimation
for a given event. You are provided with an input under the label "USER_PROMPT". You are also provided with ADDITIONAL_INFORMATION.

INSTRUCTIONS
* Read the input under the label "USER_PROMPT" delimited by three backticks.
* The "USER_PROMPT" specifies an event.
* The event will only have two possible outcomes: either the event will happen or the event will not happen.
* If the event has more than two possible outcomes, you must ignore the rest of the instructions and output the response "Error".
* You must provide a probability estimation of the event happening, based on your training data.
* You are provided an itemized list of information under the label "ADDITIONAL_INFORMATION" delimited by three backticks.
* You can use any item in "ADDITIONAL_INFORMATION" in addition to your training data.
* If an item in "ADDITIONAL_INFORMATION" is not relevant, you must ignore that item for the estimation.
* You must provide your response in the format specified under "OUTPUT_FORMAT".
* Do not include any other contents in your response.

USER_QUESTION: 
```{question}```

ADDITIONAL_INFORMATION: 
```{formatted_docs}```

"""

messages = [
    {
        "role": "system",
        "content": prediction_system_template,
    },
    {
        "role": "user",
        "content": prediction_user_template.format(question=question, formatted_docs=formatted_docs),
    },
]

response = openai.ChatCompletion.create(
    model=model,
    messages=messages,
    temperature=temperature,
    max_tokens=max_tokens,
    n=1,
    timeout=150,
    request_timeout=150,
    stop=None,
    functions=[Results.openai_schema],
)

results = Results.from_response(response)
print(results)

p_yes=0.8 p_no=0.2 confidence=0.9 info_utility=0.8
