# Chat with Git Repositories

## Step 1: Index git repo
1) Clone the git repo
2) Split the files into chunks
3) Create an embedding for each chunk
4) Save all embeddings and chunks in an np.array + a list of strings

In [22]:
import asyncio
import os
import subprocess
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Iterable

import numpy as np
import numpy.typing as npt
import openai
import tiktoken


In [23]:
# Name of the OpenAI model used to embed both the code chunks and the questions.
EMBEDDING_MODEL = "text-embedding-ada-002"
# The API key is read from the environment so that it is never stored in the notebook.
# It is an empty string when the OPENAI_API_KEY variable is not set.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY", "")
# Name of the OpenAI chat model used to answer the questions.
CHATGPT_MODEL = "gpt-3.5-turbo"


In [24]:
def chunk_python_file(file_path, tokenizer, max_len=1000):
    """Yield text chunks of a file, cutting only at blank-line separator tokens.

    The file is tokenized and the token stream is cut only after occurrences of
    the separator token (the first token the tokenizer produces for two
    consecutive newlines) and at the end of the document. Neighbouring segments
    are merged greedily as long as the merged chunk holds at most ``max_len``
    tokens. A single segment that is longer than ``max_len`` will NOT be split
    into multiple chunks; it is yielded as is. The text that follows the last
    separator is yielded too, so no token of the document is dropped.

    Args:
        file_path: path of the file to read. It is decoded as UTF-8 and
            undecodable bytes are replaced instead of raising.
        tokenizer: object with ``encode(text)`` returning a list of tokens and
            ``decode(tokens)`` returning text.
        max_len: maximal number of tokens per chunk (exceeded only by a single
            oversized segment).

    Yields:
        The decoded text of each chunk, in document order. Nothing is yielded
        for an empty file.

    TODO YONIGO: maybe split python file with:

        ast.parse(file_path.read())
        for child_node in ast.iter_child_nodes(tree):
            print(ast.unparse(child_node))

    So that each chunk will be a python node. problem is ast removes comments and docstrings.
    """
    with open(file_path, "r", encoding="utf-8", errors="replace") as file:
        document = file.read()
    sep_t = tokenizer.encode("\n\n")[0]
    doc_t = tokenizer.encode(document)
    if not doc_t:
        return

    end_indices = [i for i, c in enumerate(doc_t) if c == sep_t]
    # The last token always closes a segment; without this boundary everything
    # after the final separator would silently be lost.
    last_index = len(doc_t) - 1
    if not end_indices or end_indices[-1] != last_index:
        end_indices.append(last_index)

    current_start = 0
    current_end = end_indices[0]
    for i in end_indices[1:]:
        # The candidate chunk spans current_start..i inclusive, i.e. i - current_start + 1 tokens.
        if i - current_start < max_len:
            current_end = i
        else:
            yield tokenizer.decode(doc_t[current_start : current_end + 1])
            current_start = current_end + 1
            current_end = i
    # At this point current_start <= current_end always holds, so the pending
    # chunk contains at least one token.
    yield tokenizer.decode(doc_t[current_start : current_end + 1])


def iterate_repo_chunks(git_repo: str, tokenizer, chunk_size=1000):
    """Clone a git repo and yield text chunks of its python files.

    The repo is cloned into a temporary directory that is removed once the
    generator is exhausted or closed. Every ``*.py`` file whose name does not
    start with ``test_`` is split with ``chunk_python_file`` and each chunk is
    prefixed with the file path relative to the repo root.

    Args:
        git_repo: url or path handed to ``git clone``.
        tokenizer: tokenizer forwarded to ``chunk_python_file``.
        chunk_size: maximal number of tokens per chunk, forwarded to ``chunk_python_file``.

    Yields:
        Strings of the form ``"file name: <relative path>\\nfile content:\\n<chunk>"``.

    Raises:
        subprocess.CalledProcessError: if ``git clone`` exits with a non-zero status.
        FileNotFoundError: if the ``git`` executable cannot be found.

    TODO YONIGO: make exclude_dirs a list of regexes.
    """
    with TemporaryDirectory() as tmpdir:
        # An argument list (no shell) keeps a crafted repo string from injecting
        # shell commands, "--" keeps git from parsing it as an option, and
        # check=True surfaces a failed clone instead of silently indexing nothing.
        subprocess.run(["git", "clone", "--", git_repo, tmpdir], check=True)
        for python_file in Path(tmpdir).glob("**/*.py"):
            # Skip directories (and anything else that is not a readable file) named "*.py".
            if not python_file.is_file():
                continue
            if python_file.name.startswith("test_"):
                continue
            relative_name = python_file.relative_to(tmpdir)
            for chunk in chunk_python_file(python_file, tokenizer, chunk_size):
                yield f"file name: {relative_name}\nfile content:\n{chunk}"


def batch_generator(iterable: Iterable, batch_size: int):
    """Group the items of ``iterable`` into lists of ``batch_size`` items.

    The openai embeddings api accepts a batch of chunks per request, so the
    chunk stream is regrouped here. The last batch may be shorter; an empty
    iterable yields nothing.
    """
    pending = []
    for element in iterable:
        pending.append(element)
        if len(pending) != batch_size:
            continue
        # The batch is full: hand it out and start collecting a fresh one.
        yield pending
        pending = []
    # Flush the leftover items that did not fill a whole batch.
    if len(pending) > 0:
        yield pending


In [25]:
class VectorDB:
    """Minimal in-memory vector store: a list of texts plus one embedding row per text."""

    def __init__(self, texts: list[str], embeddings: list[npt.NDArray]) -> None:
        """Store the texts and stack their embeddings into a single 2-D array.

        ``embeddings[i]`` is expected to be the embedding of ``texts[i]``.
        """
        self.texts = texts
        # Yes the vector db is a np.ndarray. Using Pinecone/ANN would be better for much larger datasets.
        # np.stack raises on an empty list, so an empty index is kept as an empty matrix.
        self.vectors = np.stack(embeddings) if len(embeddings) > 0 else np.empty((0, 0))

    def search(self, vector: npt.NDArray[np.float32], k: int = 5, min_similarity=0.0) -> list[str]:
        """Search for the k nearest documents to the given vector with cosine similarity.

        Args:
            vector: query embedding with the same dimension as the stored embeddings.
            k: maximal number of results. Values larger than the number of stored
                vectors are clamped; ``k <= 0`` returns an empty list.
            min_similarity: only texts whose cosine similarity is strictly greater
                than this value are returned.

        Returns:
            At most ``k`` texts, ordered from the most to the least similar.
        """
        k = min(k, len(self.vectors))
        if k <= 0:
            return []
        norms = np.linalg.norm(self.vectors, axis=1) * np.linalg.norm(vector)
        scores = np.dot(self.vectors, vector) / norms
        # argpartition selects the k best scores without a full sort but returns
        # them in arbitrary order, so only those k are sorted afterwards.
        top = np.argpartition(scores, -k)[-k:]
        top = top[np.argsort(scores[top])[::-1]]
        return [self.texts[i] for i in top if scores[i] > min_similarity]


In [26]:
def index_repo(repo_path: str, openai_key: str, chunk_size: int = 1000) -> VectorDB:
    """Build a VectorDB holding an embedding for every chunk of the repo's python files.

    The chunks produced by ``iterate_repo_chunks`` are sent to the OpenAI
    embeddings endpoint in batches; the texts and their embeddings are
    collected in the same order and wrapped in a ``VectorDB``.

    Args:
        repo_path: git url or path of the repo to index.
        openai_key: API key passed to every embeddings request.
        chunk_size: maximal number of tokens per chunk.
    """
    # Local constants; EMBEDDING_MODEL shadows the module-level constant of the same name.
    EMBEDDING_MODEL = "text-embedding-ada-002"
    EMBEDDING_BATCH_SIZE = 500
    tokenizer = tiktoken.get_encoding("gpt2")
    all_texts = []
    all_embeddings = []
    chunk_stream = iterate_repo_chunks(repo_path, tokenizer, chunk_size)
    for batch in batch_generator(chunk_stream, EMBEDDING_BATCH_SIZE):
        # TODO YONIGO: can be run concurrently with acreate
        reply = openai.Embedding.create(model=EMBEDDING_MODEL, input=batch, api_key=openai_key)
        all_texts += batch
        all_embeddings += [np.array(item["embedding"]) for item in reply["data"]]

    return VectorDB(all_texts, all_embeddings)


# Index the FastAPI repository: clones it, embeds every chunk of its python files and keeps the result in `db`.
db = index_repo("https://github.com/tiangolo/fastapi.git", OPENAI_KEY, chunk_size=1000)


Cloning into '/var/folders/2x/bfct2nqs5h54sr249_przxy00000gn/T/tmpgsx4_q0j'...


## Retrieval
For any given `question` I will use `topk_chunks` to retrieve the most relevant chunks.
The most relevant chunks are chunks with highest cosine similarity between the question embedding and the chunk embedding.

In [27]:
def topk_chunks(question: str, db: VectorDB, openai_key: str, k: int = 5, min_similarity=0.0) -> list[str]:
    """Embed ``question`` and return the chunks of ``db`` that are most similar to it.

    Args:
        question: the question to look up.
        db: the index to search.
        openai_key: API key passed to the embeddings request.
        k: number of chunks requested from ``db.search``.
        min_similarity: similarity threshold forwarded to ``db.search``.
    """
    embedding_reply = openai.Embedding.create(model=EMBEDDING_MODEL, input=[question], api_key=openai_key)
    # A single input was sent, so the first entry of "data" holds its embedding.
    first_item = embedding_reply["data"][0]
    query_vector = np.array(first_item["embedding"])
    hits = db.search(query_vector, k=k, min_similarity=min_similarity)
    return hits


In [28]:
# Retrieve up to 5 chunks for the question; the bare `topk` expression shows them as the cell output.
topk = topk_chunks("how are responses serialized?", db, OPENAI_KEY, k=5, min_similarity=0.0)
topk


['file name: fastapi/routing.py\nfile content:\n\nasync def serialize_response(\n    *,\n    field: Optional[ModelField] = None,\n    response_content: Any,\n    include: Optional[Union[SetIntStr, DictIntStrAny]] = None,\n    exclude: Optional[Union[SetIntStr, DictIntStrAny]] = None,\n    by_alias: bool = True,\n    exclude_unset: bool = False,\n    exclude_defaults: bool = False,\n    exclude_none: bool = False,\n    is_coroutine: bool = True,\n) -> Any:\n    if field:\n        errors = []\n        response_content = _prepare_response_content(\n            response_content,\n            exclude_unset=exclude_unset,\n            exclude_defaults=exclude_defaults,\n            exclude_none=exclude_none,\n        )\n        if is_coroutine:\n            value, errors_ = field.validate(response_content, {}, loc=("response",))\n        else:\n            value, errors_ = await run_in_threadpool(\n                field.validate, response_content, {}, loc=("response",)\n            )\n      

## Question Answering
Steps:
1) get the top-k chunks for the question
2) send each chunk, together with the question, to ChatGPT
3) send all the answers to ChatGPT again to reduce them to a single answer

In [29]:
# Prompt template used by answer_chunk: `{context}` receives one code chunk, `{question}` the user question.
aq_prompt = """
    Use the following portion of a python codebase to try to answer the question. 
    python code:\n
    {context}
    Question: {question}
    ANSWER:
"""
# Prompt template used by reduce_answers: `{answers}` receives the per-chunk answers, `{question}` the user question.
reduce_prompt = """
    These answers are based on different portions of the codebase.
    combine them to get a better coherent single answer to the question.
    question: {question}
    list of answers:\n
    {answers}\n
    SINGLE ANSWER:
"""


In [30]:
async def answer_chunk(question, chunk, openai_key) -> str:
    """Ask the chat model to answer ``question`` using one code chunk as context.

    Args:
        question: the user question.
        chunk: the code chunk inserted into ``aq_prompt`` as context.
        openai_key: API key used for this request.

    Returns:
        The content of the first choice of the chat completion.
    """
    messages = [
        {"role": "system", "content": "You answer questions about a python git repository"},
        {"role": "user", "content": aq_prompt.format(context=chunk, question=question)},
    ]

    response = await openai.ChatCompletion.acreate(
        messages=messages,
        model=CHATGPT_MODEL,
        max_tokens=2000,
        temperature=0,
        # Use the key given by the caller; the parameter used to be ignored in
        # favour of the module-level OPENAI_KEY.
        api_key=openai_key,
    )
    return response["choices"][0]["message"]["content"]


async def reduce_answers(question, answers, openai_key) -> str:
    """Ask the chat model to merge several partial answers into a single answer.

    Args:
        question: the user question.
        answers: the answers obtained from the individual chunks; each one is
            inserted into ``reduce_prompt`` prefixed with ``answer:``.
        openai_key: API key used for this request.

    Returns:
        The content of the first choice of the chat completion.
    """
    joined_answers = "\n".join(f"answer:{a}\n" for a in answers)
    messages = [
        {"role": "system", "content": "You answer questions about a python git repository"},
        {
            "role": "user",
            "content": reduce_prompt.format(question=question, answers=joined_answers),
        },
    ]

    response = await openai.ChatCompletion.acreate(
        messages=messages,
        model=CHATGPT_MODEL,
        max_tokens=2000,
        temperature=0,
        # Use the key given by the caller; the parameter used to be ignored in
        # favour of the module-level OPENAI_KEY.
        api_key=openai_key,
    )
    return response["choices"][0]["message"]["content"]


async def map_reduce(question: str, topk_chunks: list[str], openai_key: str) -> str:
    """Answer ``question`` from every chunk concurrently, then merge the answers into one.

    Args:
        question: the user question.
        topk_chunks: the retrieved chunks; one chat request is issued per chunk.
        openai_key: API key forwarded to every request.
    """
    # Map step: one coroutine per chunk, awaited together.
    pending = [answer_chunk(question, chunk, openai_key) for chunk in topk_chunks]
    partial_answers = await asyncio.gather(*pending)
    # Reduce step: a final request that combines the partial answers.
    combined = await reduce_answers(question, partial_answers, openai_key)
    return combined


In [31]:
# Top-level `await` is only valid in an interactive/notebook session; in a plain script
# the coroutine would have to be run with asyncio.run(...).
answer = await map_reduce("how are responses serialized?", topk, OPENAI_KEY)
print(answer)


Responses in the python git repository are serialized using JSON format and the `jsonable_encoder` function from the `fastapi.encoders` module is used to convert the response data to a JSON serializable format. The `JSONResponse` class from the `fastapi.responses` module is then used to create a response object with the serialized data. Additionally, there are two custom response classes, `UJSONResponse` and `ORJSONResponse`, which use the `ujson` and `orjson` libraries respectively to serialize the response content. The serialization is done using the `render` method of the response class. The `CustomORJSONResponse` class's `render` method also uses the `orjson` library to serialize the response content into a JSON-formatted byte string.


## end2end

In [32]:
# The whole pipeline in one cell: index the repo, retrieve the chunks for the question, answer with map-reduce.
db = index_repo("https://github.com/tiangolo/fastapi.git", OPENAI_KEY, chunk_size=1000)
question = "how are responses serialized?"
topk = topk_chunks(question, db, OPENAI_KEY, k=5, min_similarity=0.0)
# Top-level `await` is only valid in an interactive/notebook session.
answer = await map_reduce(question, topk, OPENAI_KEY)


Cloning into '/var/folders/2x/bfct2nqs5h54sr249_przxy00000gn/T/tmp5pjttvzz'...


In [34]:
# Show the answer computed by the end-to-end cell.
print(answer)


Responses in the python git repository are serialized using various methods. The `serialize_response` function defined in the `fastapi/routing.py` file is used to serialize responses by taking in various parameters such as the response content, fields to include or exclude, and whether to exclude unset, default, or None values. The `jsonable_encoder` function from the `fastapi.encoders` module is also used to convert the response data to a JSON serializable format. Additionally, the `JSONResponse` class from the `fastapi.responses` module is used to create a response object with the serialized data. The `CustomORJSONResponse` class's `render` method uses the `orjson` library to serialize the response content into bytes. There are also two custom response classes, `UJSONResponse` and `ORJSONResponse`, which use the `ujson` and `orjson` libraries respectively to serialize responses.
