In [2]:
import numpy as np

from src.chromaDB_setup import *
from src.embedding import *
from src.github_parser import *

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the files of the target repository, identified by its "owner/name" slug.
# NOTE(review): get_repo_files comes from a star import (presumably
# src.github_parser) — confirm its return type; it is passed straight to
# chunk_repository_files in the next cell.
# In the recorded run one file (public/spinning-disc.gltf) was reported as
# unprocessable ("unsupported encoding") and the following cells still ran.
repo_files = get_repo_files("vl4duu/locomobile.co")

Error processing file public/spinning-disc.gltf: unsupported encoding: none


In [4]:
# Split the fetched repository files into chunks. The result is indexed below
# by 'textual_chunks' and 'code_chunks', each an iterable of dicts that carry
# the chunk text under "content".
chunked_docs = chunk_repository_files(repo_files)

# One embedding per textual chunk, stacked into a single array.
textual_embeddings = np.array([embed_textual_metadata(doc["content"]) for doc in chunked_docs['textual_chunks']])

# Process code chunks with CodeBERT
code_embeddings = np.array([generate_code_embedding(doc["content"]) for doc in chunked_docs['code_chunks']])

# Both values are already ndarrays; store them as-is instead of wrapping them
# in np.array() a second time, which would only copy every embedding matrix.
embedded_chunks = {
    'textual_embeddings': textual_embeddings,
    'code_embeddings': code_embeddings
}

In [5]:
# Build the vector-store collections from the chunks and their embeddings.
# Later cells index the result by 'textual_collection' and 'code_collection'
# and call .query(...) on each entry.
# NOTE(review): whether this call persists data or is safe to re-run on a live
# kernel is not visible here — confirm in setup_chroma_collections.
chromaDb_collections = setup_chroma_collections(chunked_docs, embedded_chunks)




In [6]:
def _rank_query_results(results, top_k):
    """Convert one query result into its ``top_k`` best ``(score, document, metadata)`` tuples."""
    if "distances" not in results or not results["distances"]:
        print("Warning: 'distances' key missing or empty in results. Returning empty list.")
        return []

    # Every field of a query result is nested once per query embedding. A
    # single query is issued, so row 0 of *each* field holds the hits; the
    # documents and metadatas must be unwrapped the same way as the distances,
    # otherwise zip() pairs the first score with the whole list of documents.
    distances = np.array(results["distances"][0])
    scores = 1 - distances  # Invert distance to get similarity score
    documents = results["documents"][0]
    metadatas = results["metadatas"][0]

    # Sort on the score alone: comparing whole tuples would fall through to the
    # metadata dicts on a tie and raise TypeError. sorted() is stable, so tied
    # hits keep the order the collection returned them in.
    ranked = sorted(zip(scores, documents, metadatas), key=lambda hit: hit[0], reverse=True)
    return ranked[:top_k]


def retrieve_relevant_chunks(query, collections, top_k=3):
    """Fetch the chunks most similar to ``query`` from both collections.

    Args:
        query: Natural-language question to search for.
        collections: Mapping with 'textual_collection' and 'code_collection'
            entries, each exposing a Chroma-style ``query`` method.
        top_k: Number of hits to keep per collection. Twice as many are
            requested from each collection before truncating.

    Returns:
        ``{"textual": [...], "code": [...]}`` where each list holds up to
        ``top_k`` ``(score, document, metadata)`` tuples, best score first.
        ``score`` is ``1 - distance``.
    """
    textual_results = collections['textual_collection'].query(
        query_embeddings=[embed_textual_metadata(query)],
        n_results=top_k * 2,
        include=['documents', 'metadatas', 'distances']
    )

    # NOTE(review): the code embedding is passed via .tolist() without the
    # extra list wrapper used above — presumably generate_code_embedding
    # already returns a 2-D (1, dim) array; confirm against src.embedding.
    code_results = collections['code_collection'].query(
        query_embeddings=generate_code_embedding(query).tolist(),
        n_results=top_k * 2,
        include=['documents', 'metadatas', 'distances']
    )

    return {
        "textual": _rank_query_results(textual_results, top_k),
        "code": _rank_query_results(code_results, top_k),
    }


In [7]:
def construct_rag_prompt(query, relevant_chunks):
    """Assemble the RAG prompt: instructions, the question, then the retrieved context.

    ``relevant_chunks`` maps a chunk type (e.g. "textual", "code") to a list of
    ``(score, text, metadata)`` tuples. Each non-empty type gets its own
    section; an empty type is mentioned explicitly so the model knows nothing
    was found for it. The prompt ends with an "Answer:" cue.
    """
    parts = [
        f"You are a repository analyser, use the provided chunks to answer any related questions about the repository:\n\nQuestion: {query}\n\nContext:\n"
    ]

    for kind, entries in relevant_chunks.items():
        if not entries:
            parts.append(f"\n--- No {kind} chunks found ---\n")
            continue

        parts.append(f"\n--- {kind.capitalize()} Chunks ---\n")
        for score, text, metadata in entries:
            parts.append(f"Score: {score:.4f}\n")
            parts.append(f"Content: {text}\n")
            parts.append(f"Metadata: {metadata}\n")

    parts.append("\nAnswer:")
    return "".join(parts)


In [8]:
import openai


def query_ai_model(prompt, model="gpt-3.5-turbo", max_tokens=500, temperature=0.1):
    """Send ``prompt`` to an OpenAI chat model and return the reply text.

    Args:
        prompt: Full user message, typically the output of ``construct_rag_prompt``.
        model: Chat model name. Defaults to the previously hard-coded model.
        max_tokens: Upper bound on the length of the completion.
        temperature: Sampling temperature; the low default keeps answers
            close to the supplied context.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or an empty string when the reply carries no content.

    Note:
        Uses ``openai.ChatCompletion.create``, the interface of the pre-1.0
        ``openai`` package that the recorded run of this notebook relied on.
    """
    messages = [
        {"role": "system",
         "content": "You are a helpful assistant.  Answer the question using only the provided context."},
        {"role": "user", "content": prompt},
    ]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        n=1,
        stop=None,
        temperature=temperature,
    )
    content = response.choices[0].message["content"]
    # Guard against a reply without text so .strip() cannot fail on None.
    return (content or "").strip()



In [9]:
def answer_question(query, collections):
    """Answer ``query`` end to end: retrieve context, build the prompt, ask the model."""
    context_chunks = retrieve_relevant_chunks(query, collections)
    return query_ai_model(construct_rag_prompt(query, context_chunks))


In [10]:
# Example run: ask one question against the collections built earlier.
query = "What technologies does the repo use?"
# Retrieval uses the default top_k=3, i.e. 6 results are requested from each
# collection; in the recorded run the collections held fewer entries than that,
# so the store reduced n_results itself (see the notices in the output below).
answer = answer_question(query, chromaDb_collections)
print(f"\nAI Answer:\n{answer}")


Number of requested results 6 is greater than number of elements in index 1, updating n_results = 1
Number of requested results 6 is greater than number of elements in index 3, updating n_results = 3



AI Answer:
The repository uses Vue.js, Vite, and Node.js.
