In [7]:
from git import Repo
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser  
from langchain_text_splitters import Language, MarkdownTextSplitter
import os

# Imports
from dotenv import load_dotenv
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain 
from langchain_core.prompts import ChatPromptTemplate
from langchain_chroma import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain_cohere import CohereEmbeddings


# Load environment variables
load_dotenv()
groq_api_key = os.getenv("GROQ_API_KEY")
tavily_api_key = os.getenv("TAVILY_API_KEY") 
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
langchain_endpoint = os.getenv("LANGCHAIN_ENDPOINT")
os.environ["COHERE_API_KEY"] =  os.getenv("COHERE_API_KEY")
os.environ["LANGCHAIN_PROJECT"] = "git_runner"





In [8]:
# Initialize embeddings and model
# embeddings = OllamaEmbeddings(model="mistral", base_url="http://host.docker.internal:11434")
model = ChatGroq(temperature=1, model_name="llama3-70b-8192", api_key=groq_api_key)
embeddings = CohereEmbeddings(model="embed-english-light-v3.0")


In [10]:

def update_chroma_db(repos_list, local_repo_path, db_path):
    for repo_info in repos_list:
        repo_path = os.path.join(local_repo_path, repo_info['name'])
        # Check if the repository directory exists, if not, clone it
        if not os.path.exists(repo_path):
            Repo.clone_from(repo_info['url'], to_path=repo_path)
        else:
            print(f"The directory '{repo_path}' already exists. Skipping cloning.")
            continue

        core_path = os.path.join(repo_path)
        # Load Python and Jupyter Notebook files
        python_notebook_loader = GenericLoader.from_filesystem(
            core_path,
            suffixes=[".py", ".ipynb", ".md"],
            exclude=["**/non-utf8-encoding.py"],
            parser=LanguageParser(parser_threshold=500),
        )
        python_documents = python_notebook_loader.load()

        python_splitter = RecursiveCharacterTextSplitter.from_language(
            language=Language.PYTHON, chunk_size=2000, chunk_overlap=200
        )

        texts = python_splitter.split_documents(python_documents)

        return texts

In [11]:


db_directory = "/workspaces/myStuff/RAG01/chromadb"
os.makedirs(db_directory, exist_ok=True)

local_repo_path = "/workspaces/myStuff/RAG01/git-repos"
repos_list = [
    {'name': 'achainflow', 'url': 'https://github.com/vcappuccio/achainflow'},
]


texts= update_chroma_db(repos_list, local_repo_path, db_directory)
vectordb = Chroma.from_documents(texts, embedding=embeddings, persist_directory=db_directory)


InvalidDimensionException: Embedding dimension 384 does not match collection dimensionality 4096

In [89]:

retriever = vectordb.as_retriever(
    search_type="mmr",  # Also test "similarity"
    search_kwargs={"k": 8},
)

# Define prompts
retriever_prompt = ChatPromptTemplate.from_messages(
    [
        ("placeholder", "{chat_history}"),
        ("user", "{input}"),
        (
            "user",
            "Given the above conversation, generate a search query to look up to get information relevant to the conversation",
        ),
    ]
)

document_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Answer the user's questions based on the below context:\n\n{context}",
        ),
        ("placeholder", "{chat_history}"),
        ("user", "{input}"),
    ]
)

# Create retriever and document chains
retriever_chain = create_history_aware_retriever(model, retriever, retriever_prompt)
document_chain = create_stuff_documents_chain(model, document_prompt)

# Create final QA chain
qa = create_retrieval_chain(retriever_chain, document_chain)

In [90]:
question = "What is the code doing?"
result = qa.invoke({"input": question})
print(result["answer"])

The code is a Python script that implements a Streamlit application that uses a chain of AI advisors to provide a comprehensive answer to a user's problem. Here's a breakdown of what the code does:

1. **User Input**: The user inputs a problem or question into a text area.
2. **Advisor Evaluation**: The user's input is processed by a series of AI advisors, each with its own model and expertise. The advisors provide their input, suggestions, and ideas related to the user's problem.
3. **Advisor Consultation**: The code uses asynchronous programming to consult multiple advisors in a chain, where each advisor builds upon the previous one's output. The consultants are called using the `consult_advisor_async` function, which uses the Ollama API to interact with the advisors.
4. **Final Answer**: The output from the chain of advisors is processed and consolidated into a single, comprehensive answer. This answer is then returned to the user as the final response.
5. **Streamlit App**: The ent

In [91]:

system_template = """
Answer the user's questions based on the below context.
If you don't know the answer, just say that you don't know, don't try to make up an answer. 
Use three sentences maximum and keep the answer as concise as possible:

{context}
"""

# First we need a prompt that we can pass into an LLM to generate this search query
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_template),
        ("user", "{input}"),
    ]
)
document_chain = create_stuff_documents_chain(model, prompt)

qa_chain = create_retrieval_chain(retriever, document_chain)

In [92]:
print(qa_chain.pick("answer").invoke({"input": "What and how can I improve this code?"}))

Here are some suggestions to improve the code:

1. **Consistent naming conventions**: The code uses both camelCase and underscore notation for variable and function names. It's better to stick to a single convention throughout the code. In Python, underscore notation is more commonly used.

2. **Function and variable naming**: Some function and variable names are not very descriptive. For example, `go_groq_go` and `system_message3` could be renamed to something more meaningful.

3. **Code repetition**: The code has a lot of repeated code blocks, especially in the `consult_advisor_async` and `get_over_a_chain_of_advisors_async` functions. Consider extracting the common logic into separate functions to reduce code duplication.

4. **Error handling**: The code does not handle errors or exceptions. Consider adding try-except blocks to handle potential errors, such as network errors or API errors.

5. **Documentation**: The code has some docstrings, but they could be more detailed and descr

In [93]:
def handle_query(qa_chain, input_query):
    return qa_chain.pick("answer").invoke({"input": input_query})

 
# Detailed input query specifying the issues and requesting complete code solutions
input_query = """
Please provide complete solutions for the following issues, including all necessary code:
1. **Code Duplication**: The functions `go_groq_go`, `consult_advisor_async`, and `get_over_a_chain_of_advisors_async` are almost identical. Implement a unified function with appropriate parameters to minimize duplication.
2. **Error Handling**: Current error management is inadequate. Integrate try-except blocks to manage potential errors like API key issues, connection interruptions, or JSON parsing errors.
3. **Type Hints and Docstrings**: Absence of type hints and docstrings in function definitions. Add type hints for better readability and error prevention. Include docstrings to describe the functions' purposes, parameters, and expected outputs.
4. **Code Organization**: The extensive codebase could benefit from modularization. Consider separating the code into distinct modules, such as one for the API client, another for advisor models, and a third for the Streamlit application.
5. **asyncio Usage**: The necessity for `asyncio` is unclear. Evaluate if the code can be converted to synchronous operations for simplicity and maintainability.
6. **Testing**: The absence of tests is a critical oversight. Develop unit and integration tests to verify functionality and detect regressions.
"""

result = handle_query(qa_chain, input_query)
print(result)

Here are the solutions to the issues mentioned:

**1. Code Duplication:**

We can unify the functions `go_groq_go`, `consult_advisor_async`, and `get_over_a_chain_of_advisors_async` into a single function with parameters. Here's how you can do it:

```
async def consult_api(model, system_message, user_message):
    """
    Asynchronously consult an advisor using the API and return the response message content.

    Args:
        model (str): The model to consult.
        system_message (str): The system message to be included in the consultation.
        user_message (str or list): The user message(s) to be included in the consultation.

    Returns:
        str: The advisor's response message content.
    """
    messages = [{"role": "system", "content": system_message}]
    
    if isinstance(user_message, list):
        messages.extend(
            {"role": "user", "content": msg} if isinstance(msg, str) else msg
            for msg in user_message
        )
    else:
        messag