In [None]:
import nest_asyncio
nest_asyncio.apply()
import asyncio
import os
from openai import OpenAI, AsyncOpenAI
import pandas as pd
from dotenv import load_dotenv

import json
from bs4 import BeautifulSoup
from bs4.element import Comment
import markdown
import markdownify
import frontmatter
import html
import regex as re

# Load variables from a local .env file into the process environment.
# The OpenAI clients below are created without explicit keys and the Cohere
# client reads CO_API_KEY via os.getenv, so credentials presumably come from here.
load_dotenv()


In [None]:
import pathlib
# Root folder of the markdown docs; every *.md file below it is collected
# recursively and sorted so the file order is stable between runs.
docs_dir = pathlib.Path("../data/wandb_docs")
docs_files = sorted(docs_dir.rglob("*.md"))

print(f"Number of files: {len(docs_files)}\n")
print(
    "First 5 files:\n{files}".format(
        files="\n".join(str(path) for path in docs_files[:5])
    )
)

In [None]:
# One record per markdown file: the raw text plus its path relative to docs_dir.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale encoding.
docs = [
    {
        "content": file.read_text(encoding="utf-8"),
        "source": str(file.relative_to(docs_dir)),
    }
    for file in docs_files
]


In [None]:
# Tabular view of the loaded docs: one row per file with "content" and "source" columns.
docs_df = pd.DataFrame(docs)

In [None]:
from typing import Dict, Any



def convert_contents_to_soup(contents: str) -> BeautifulSoup:
    """Render a markdown document to HTML and parse it into a simplified soup.

    The frontmatter is split off and discarded, the remaining markdown is
    rendered to HTML with Python-Markdown plus a set of pymdownx extensions,
    and the HTML is parsed with BeautifulSoup's ``html.parser``. The tree is
    then simplified: links are replaced by their text, images are removed, and
    paragraphs that look like JavaScript ``import ...;`` statements are deleted.

    Args:
        contents: Raw file contents (markdown, optionally with frontmatter).

    Returns:
        The parsed and simplified BeautifulSoup tree.
    """
    _, content = frontmatter.parse(contents)

    # Use some extensions to convert the markdown to html. Each extension is
    # listed exactly once ("pymdownx.pathconverter" used to appear twice).
    markdown_document = markdown.markdown(
        content,
        extensions=[
            "toc",
            "pymdownx.extra",
            "pymdownx.blocks.admonition",
            "pymdownx.magiclink",
            "pymdownx.blocks.tab",
            "pymdownx.pathconverter",
            "pymdownx.saneheaders",
            "pymdownx.striphtml",
            "pymdownx.highlight",
            "pymdownx.escapeall",
        ],
    )
    soup = BeautifulSoup(markdown_document, "html.parser")

    # For hyperlinks, keep the visible text but drop the link itself.
    for a_tag in soup.find_all("a"):
        a_tag.replace_with(a_tag.text)

    # Remove all images.
    for img_tag in soup.find_all("img"):
        img_tag.decompose()

    # Drop paragraphs that are JavaScript import statements: text starting
    # with "import" that also contains a semicolon.
    for p_tag in soup.find_all("p"):
        paragraph_text = p_tag.text
        if paragraph_text.strip().startswith("import") and ";" in paragraph_text:
            p_tag.decompose()

    return soup


def clean_soup(soup: BeautifulSoup) -> BeautifulSoup:
    """Strip images, comments and empty paragraphs from a soup, in place.

    Args:
        soup: The BeautifulSoup object to clean.

    Returns:
        The same BeautifulSoup object after cleaning.
    """
    # Detach every <img> that carries a ``src`` attribute.
    for image in soup.find_all("img", src=True):
        image.extract()

    # Detach HTML comment nodes.
    for comment_node in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment_node.extract()

    # Destroy paragraphs that contain nothing but whitespace.
    for paragraph in soup.find_all("p"):
        if paragraph.text.strip():
            continue
        paragraph.decompose()

    return soup


def clean_contents(contents: str) -> str:
    """Convert raw markdown into cleaned-up markdown text.

    The contents are rendered to a soup, cleaned, and converted back to
    markdown (ATX headings). Leftover ``@theme`` import lines, empty image
    placeholders and markdown links are then removed or flattened, runs of
    blank lines are collapsed, and any frontmatter is stripped.

    Args:
        contents: The contents to clean.

    Returns:
        The cleaned contents.
    """
    soup = clean_soup(convert_contents_to_soup(contents))
    converter = markdownify.MarkdownConverter(heading_style="ATX")
    text = converter.convert_soup(soup)

    # Remove JavaScript import lines that pull from @theme/...
    text = re.sub(r"import .* from [‘’']@theme/.*[‘’'];\s*\n*", "", text)
    # Empty image placeholders become a line break.
    text = text.replace("![]()", "\n")
    # Markdown links "[label](target)" are reduced to "label".
    text = re.sub(r"\[([^]]+)\]\([^)]+\)", r"\1", text)
    # Three or more consecutive newlines collapse to exactly two.
    text = re.sub(r"\n{3,}", "\n\n", text)

    return frontmatter.loads(text).content


def extract_frontmatter(file_path: pathlib.Path) -> Dict[str, Any]:
    """Extracts the frontmatter from a file.

    Args:
        file_path: The path to the file.

    Returns:
        A plain dict with one entry per frontmatter key.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        post = frontmatter.load(f)
    return {key: post[key] for key in post.keys()}


def strip_markdown_content(file_content):
    """Flatten a markdown document into plain text with simple block markers.

    Headers are replaced by a ``---`` rule followed by the header text, fenced
    code is re-wrapped in triple backticks, and inline code in single
    backticks. These markers survive into the returned text, where they can
    serve as split points.

    Args:
        file_content: Raw markdown (optionally with frontmatter).

    Returns:
        The flattened plain-text rendering of the document.
    """
    soup = convert_contents_to_soup(file_content)

    # Headers: a rule line, then the header text, padded with blank lines.
    heading_names = ["h1", "h2", "h3", "h4", "h5", "h6"]
    for heading in soup.find_all(heading_names):
        heading.replace_with(f"\n\n---\n\n{heading.get_text()}\n\n\n")

    # Line breaks become blank lines.
    for line_break in soup.find_all("br"):
        line_break.replace_with("\n\n\n")

    # Every paragraph gets trailing blank lines.
    for paragraph in soup.find_all("p"):
        paragraph.append("\n\n\n")

    # Multiline code (<pre>) is wrapped in triple backticks on their own lines.
    for pre_block in soup.find_all("pre"):
        code_body = pre_block.get_text().strip("\n")
        pre_block.replace_with(f"\n\n\n```\n{code_body}\n\n```\n\n\n")

    # Inline code: any <code> whose direct parent is not a <pre>.
    for code_tag in soup.find_all("code"):
        if code_tag.parent.name == "pre":
            continue
        code_tag.replace_with(f"`{code_tag.get_text()}`")

    # Pull the text out of the tree and decode HTML entities.
    text = html.unescape(soup.get_text())

    # Undo markdown escaping of underscores and backticks.
    text = re.sub(r"\\_", "_", text)
    text = re.sub(r"\\`", "`", text)

    # Replace each pair of consecutive newlines with a single newline, then
    # cap any remaining run of three or more newlines at two.
    text = re.sub(r"\n\n", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text


def clean_markdown_chunk(chunk):
    """Remove rule/fence markers from a chunk and squeeze its newlines.

    Every ``---`` and triple-backtick marker is replaced by a blank line, runs
    of newlines are collapsed to a single newline, and surrounding whitespace
    is stripped.
    """
    for marker in ("---", "```"):
        chunk = chunk.replace(marker, "\n\n")
    return re.sub(r"\n+", "\n", chunk).strip()

In [None]:
import tiktoken

class Tokenizer:
    """Thin wrapper around a tiktoken encoding selected by model name."""

    def __init__(self, model_name: str) -> None:
        # Resolve the encoding tiktoken associates with this model name.
        self.tokenizer = tiktoken.encoding_for_model(model_name)

    def encode(self, text: str):
        """Return the token ids tiktoken produces for ``text``.

        ``allowed_special="all"`` is passed through to tiktoken; presumably this
        lets text containing special-token strings be encoded instead of
        rejected — confirm against the tiktoken documentation.
        """
        return self.tokenizer.encode(text, allowed_special="all")

    def decode(self, tokens):
        """Turn token ids back into text using the same encoding."""
        return self.tokenizer.decode(tokens)

# Module-level tokenizer built for the "gpt-4o" model name; length_function below
# uses it to count tokens.
tokenizer = Tokenizer("gpt-4o")

def length_function(content: str) -> int:
    """Return the number of tokens in ``content`` according to the shared ``tokenizer``."""
    token_ids = tokenizer.encode(content)
    return len(token_ids)


In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Splitter for the flattened docs. Separators are tried in the order listed:
# the "---" rule lines, then code-fence lines, then blank lines, then single
# newlines. Chunk size is measured with length_function (tokens), not characters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n---\n", "\n```\n", "\n\n", "\n"],
    is_separator_regex=True,
    keep_separator=False,
    chunk_size=512,
    chunk_overlap=0,
    length_function=length_function,
)

In [None]:
# Flatten every document, split it into chunks, and emit one row per chunk.
# explode() keeps the originating document's index label on each chunk row and
# turns an empty chunk list (a document with no text left after flattening)
# into NaN; those rows are dropped so clean_markdown_chunk only ever sees strings.
chunks_df = pd.DataFrame(
    docs_df["content"]
    .map(strip_markdown_content)
    .map(text_splitter.split_text)
    .explode()
    .dropna()
    .map(clean_markdown_chunk)
)

In [None]:
# Attach each chunk's source path. The assignment aligns on index labels, and
# chunk rows still carry the index label of the document they came from.
chunks_df["source"] = docs_df["source"]
# Keep only chunks that are at least 20 tokens long.
chunks_df = chunks_df.loc[chunks_df["content"].map(length_function).ge(20)]

In [None]:
from typing import List
# Shared async OpenAI client used by embed_batch. No API key is passed here, so it
# presumably comes from the environment (see load_dotenv() in the first cell).
client = AsyncOpenAI()
import numpy as np

async def embed_batch(texts: List[str]) -> np.ndarray:
    """Embed one batch of texts with the ``text-embedding-3-small`` model.

    Args:
        texts: The strings to embed in a single API request.

    Returns:
        An array with one row per item in the API response, in response order.
    """
    response = await client.embeddings.create(
        input=texts,
        model="text-embedding-3-small",
    )
    return np.array([item.embedding for item in response.data])

async def embed_data(texts: List[str], batch_size: int = 100) -> np.ndarray:
    """Embed an arbitrary number of texts, one batch at a time.

    Batches are sent sequentially (each request is awaited before the next one
    starts) and the per-batch arrays are stacked in input order.

    Args:
        texts: The strings to embed.
        batch_size: Maximum number of texts sent per request.

    Returns:
        An array with one row per input text. For empty input an empty
        ``(0, 0)`` array is returned without calling the API.
    """
    if len(texts) == 0:
        # np.concatenate raises ValueError on an empty list of arrays.
        return np.empty((0, 0))

    batch_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        batch_embeddings.append(await embed_batch(batch))
    return np.concatenate(batch_embeddings)

In [None]:
# Embed every chunk's text. nest_asyncio.apply() was called in the first cell,
# presumably so that asyncio.run() can be used from inside the notebook.
embeddings = asyncio.run(embed_data(chunks_df["content"].tolist()))

In [None]:
# Store one embedding (as a plain list) per chunk row. The list is assigned by
# position, so it relies on `embeddings` having been computed from chunks_df's
# current rows in the same order.
chunks_df['embedding'] = embeddings.tolist()


In [None]:
import cohere

In [None]:
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import cosine_similarity

# Shared async Cohere client used by rerank_context; the API key is read from the
# CO_API_KEY environment variable.
co_client = cohere.AsyncClient(api_key=os.getenv("CO_API_KEY"))
async def rerank_context(query: str, context_df: pd.DataFrame, top_n: int = 10) -> pd.DataFrame:
    """Rerank candidate chunks against a query with Cohere's rerank model.

    Args:
        query: The query the documents are ranked against.
        context_df: Candidate chunks with "content" and "source" columns.
        top_n: Number of results requested from the reranker.

    Returns:
        A frame with "content", "source" and "score" columns, with rows in the
        order the reranker returned them.
    """
    response = await co_client.rerank(
        model="rerank-english-v3.0",
        query=query,
        documents=context_df["content"].tolist(),
        top_n=top_n,
        return_documents=False,
    )

    # Each result carries the position of the document in the submitted list
    # and its relevance score.
    ranked_positions = []
    relevance_scores = []
    for result in response.results:
        ranked_positions.append(result.index)
        relevance_scores.append(result.relevance_score)

    ranked_rows = context_df.iloc[ranked_positions]
    return pd.DataFrame(
        {
            "content": ranked_rows["content"],
            "source": ranked_rows["source"],
            "score": relevance_scores,
        }
    )
    

async def get_context_docs(df: pd.DataFrame, query: str, answer: str, top_k: int = 10) -> pd.DataFrame:
    """Retrieve candidate chunks for a question/answer pair and rerank them.

    The query and the answer are embedded together. The ``top_k`` most
    similar chunks (cosine similarity) are taken for each of the two, the two
    sets are merged without duplicates, and the merged candidates are reranked
    against the query.

    Args:
        df: Chunk frame with "content", "source" and "embedding" columns.
        query: The question text.
        answer: The reference answer text.
        top_k: Per-embedding candidate count and the rerank ``top_n``.

    Returns:
        The reranked candidates as returned by ``rerank_context``.
    """
    query_and_answer = await embed_batch([query, answer])
    chunk_matrix = np.array(df["embedding"].tolist())
    similarities = cosine_similarity(query_and_answer, chunk_matrix)

    # Row 0 holds similarities to the query, row 1 to the answer.
    query_top = similarities[0].argsort()[::-1][:top_k]
    answer_top = similarities[1].argsort()[::-1][:top_k]
    candidate_positions = np.unique(np.concatenate([query_top, answer_top]))

    candidates = df.iloc[candidate_positions]
    context_df = pd.DataFrame(
        {"content": candidates["content"], "source": candidates["source"]}
    )
    return await rerank_context(query, context_df, top_k)

In [None]:
# Load the sample question/answer pairs from a JSON Lines file (one record per
# line). Later cells read its "question" and "answer" columns.
sample_questions = pd.read_json("../data/wandbot_sample_questions_answers.jsonl", lines=True, orient="records")
# sample = sample_questions.sample(1).iloc[0]

In [None]:
from instructor.utils import disable_pydantic_error_url
import instructor
from pydantic import BaseModel, Field, model_validator
from typing import Dict, List
# NOTE(review): presumably stops pydantic from appending documentation URLs to
# validation error messages — confirm against instructor.utils.
disable_pydantic_error_url()

# NOTE: the class docstring and the Field descriptions are part of the model's
# JSON schema, so they are runtime text and are left exactly as they were.
class Response(BaseModel):
    """The final scores and relevance of the documents"""
    final_scores: Dict[str, int] = Field(..., description="The final scores for each document based on the criteria")
    relevance: List[int] = Field(..., description="The ranked order of relevance of the documents")

    # An "after" validator runs on the constructed model, so it is written as
    # an instance method that returns ``self`` rather than stacked with
    # @classmethod.
    @model_validator(mode="after")
    def validate_relevance(self) -> "Response":
        """Check that ``relevance`` has no repeats and matches ``final_scores`` in length.

        Raises:
            ValueError: If ``relevance`` contains duplicates or its length
                differs from the number of entries in ``final_scores``.
        """
        if len(self.relevance) != len(set(self.relevance)):
            raise ValueError("The relevance list must be unique.")
        if len(self.relevance) != len(self.final_scores):
            raise ValueError("The relevance list must be the same length as the final scores.")
        return self
    

# Async OpenAI client wrapped by instructor in JSON mode; process_sample calls it
# with response_model=Response to get the reply back as a Response instance.
ins_client = instructor.from_openai(AsyncOpenAI(),  mode=instructor.Mode.JSON)

async def create_message(question: str, answer: str, context_df: pd.DataFrame) -> dict:
    """Build the user message that presents a question, its answer and the candidate documents.

    Each row of ``context_df`` is wrapped in ``<doc_N>...</doc_N>`` tags, where
    N is the row's position (0-based) in the frame, not its index label.

    Args:
        question: The question text.
        answer: The reference answer text.
        context_df: Candidate documents; only the "content" column is read.

    Returns:
        A chat message dict with role "user" and the assembled prompt as content.
    """
    # reset_index() makes the iteration key the row position.
    doc_blocks = [
        f"<doc_{position}>\n{record['content']}\n</doc_{position}>\n"
        for position, record in context_df.reset_index().iterrows()
    ]
    documents = "".join(doc_blocks)

    prompt = f"""
    <question>
    {question}
    </question>
    <answer>
    {answer}
    </answer>
    {documents}
    """

    return {"role": "user", "content": prompt}

import copy

async def process_sample(question: str, answer: str, chunks_df: pd.DataFrame) -> dict:
    """Retrieve contexts for one question/answer pair and have the LLM grade them.

    The base messages are loaded from ``prompts/retrieval_generation_prompt.json``,
    candidate chunks are retrieved and reranked, and a user message containing
    the question, answer and candidates is appended. The "gpt-4o" reply is
    parsed into a ``Response``. Its scores are stored in a "relevance" column
    and the rows are reordered according to the reply's ``relevance`` list.

    Args:
        question: The question text.
        answer: The reference answer text.
        chunks_df: Chunk frame passed to ``get_context_docs``.

    Returns:
        A dict with "question", "answer" and "contexts" (a list of row records).
    """
    # The file is parsed fresh on every call, so the resulting list can be
    # mutated freely; the context manager closes the handle promptly.
    with open("prompts/retrieval_generation_prompt.json", encoding="utf-8") as prompt_file:
        messages = json.load(prompt_file)

    context_df = await get_context_docs(chunks_df, question, answer)
    message = await create_message(question, answer, context_df)
    messages.append(message)

    relevance_response = await ins_client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        temperature=0.0,
        response_model=Response,
        max_retries=5,
        max_tokens=250,
    )
    relevance_results = relevance_response.model_dump()

    # NOTE(review): scores are paired with rows purely by position, which
    # assumes the model returns exactly one score per document, in document
    # order — verify against the prompt.
    relevance_scores = list(relevance_results["final_scores"].values())
    context_df["relevance"] = relevance_scores
    # The "relevance" list is used as positional row indices into context_df.
    context_df = context_df.iloc[relevance_results["relevance"]]
    contexts = context_df.to_dict(orient="records")

    return {"question": question, "answer": answer, "contexts": contexts}

async def get_test_set(sample_questions: pd.DataFrame, chunks_df: pd.DataFrame) -> pd.DataFrame:
    """Run ``process_sample`` for every question/answer row and collect the results.

    All rows are processed concurrently through a single ``asyncio.gather``.

    Args:
        sample_questions: Frame with "question" and "answer" columns.
        chunks_df: Chunk frame forwarded to ``process_sample``.

    Returns:
        A frame with one row per processed sample, in input order.
    """
    pending = []
    for _, row in sample_questions.iterrows():
        pending.append(process_sample(row["question"], row["answer"], chunks_df))

    results = await asyncio.gather(*pending)
    return pd.DataFrame(results)




In [238]:
# Questions that do not yet have a row in sample_test_set.
# NOTE(review): `sample_test_set` is not defined in any cell visible in this
# notebook, so this cell fails on a fresh kernel — presumably the cell that
# built it (e.g. via get_test_set) was deleted; restore it before re-running.
# isin() does a vectorised membership test instead of rebuilding a Python list
# for every row.
undone_questions = sample_questions[
    ~sample_questions["question"].isin(sample_test_set["question"])
]

In [239]:
# Build test-set rows for the questions that are still missing; this runs the full
# retrieval + LLM grading pipeline (process_sample) for each of them.
undone_sample_test_set = asyncio.run(get_test_set(undone_questions, chunks_df))

In [241]:
# Combine the previously generated rows with the newly generated ones.
# ignore_index=True gives the result a fresh 0..n-1 index; without it both frames
# keep their own labels and the combined frame ends up with duplicate index labels.
full_sample_test_set = pd.concat(
    [sample_test_set, undone_sample_test_set], ignore_index=True
)

In [242]:
# Persist the combined test set as JSON Lines (one record per row).
full_sample_test_set.to_json("../data/eval/full_test_dataset.jsonl", lines=True, orient="records")

In [285]:
# Keep only samples whose contexts carry exactly three distinct "relevance" values.
complete_sample_test_set = full_sample_test_set[
    full_sample_test_set["contexts"].map(
        lambda contexts: len({context.get("relevance") for context in contexts})
    )
    == 3
]

In [286]:
# Per sample: how many contexts received each relevance value.
sample_counts = complete_sample_test_set["contexts"].map(
    lambda contexts: pd.Series(
        [context.get("relevance") for context in contexts]
    ).value_counts().to_dict()
)
# Keep samples with at least two contexts at each of the levels 0, 1 and 2.
# .get(level, 0) treats a missing level as a count of zero, so a sample whose
# three distinct values are not exactly 0/1/2 is filtered out instead of raising
# KeyError.
subset_sample = complete_sample_test_set[
    sample_counts.map(
        lambda counts: all(counts.get(level, 0) >= 2 for level in (0, 1, 2))
    )
]


In [287]:
# Per sample: how many contexts received each relevance value.
subset_sample_counts = subset_sample["contexts"].map(
    lambda contexts: pd.Series(
        [context.get("relevance") for context in contexts]
    ).value_counts().to_dict()
)
# Keep samples where level 2 is at least as frequent as level 1, or level 1 is
# at least as frequent as level 0.
final_test_set = subset_sample[
    subset_sample_counts.map(
        lambda counts: counts[2] >= counts[1] or counts[1] >= counts[0]
    )
]
final_test_set = final_test_set.drop_duplicates(subset=["question"])
# A fixed seed makes the saved dataset reproducible across kernel restarts.
# sample() raises ValueError if fewer than 100 unique questions remain.
final_test_set = final_test_set.sample(100, replace=False, random_state=42)


final_test_set.to_json("../data/eval/final_test_dataset.jsonl", lines=True, orient="records")

In [298]:
# Eval set: 50 samples whose questions are not in final_test_set.
# Duplicated questions are removed *before* sampling (the same order used for
# final_test_set), so the result always has exactly 50 unique questions;
# sample() raises ValueError if fewer than 50 remain. A fixed seed keeps the
# saved dataset reproducible.
eval_set = (
    subset_sample[~subset_sample["question"].isin(final_test_set["question"])]
    .drop_duplicates(subset=["question"])
    .sample(50, replace=False, random_state=42)
)
eval_set.to_json("../data/eval/final_eval_dataset.jsonl", lines=True, orient="records")
eval_set.shape


(50, 3)