In [1]:
import os
import sys
app_path = os.path.abspath('..')
sys.path.insert(0, app_path)


from pymongo import MongoClient
from langchain_core.documents import Document # Keep Document as it's used in the query function signature if needed later
from langchain import PromptTemplate
from langchain_ollama import ChatOllama

from app.chroma_client import chroma_client

In [2]:
MONGODB_URI = "mongodb://127.0.0.1:27017/mydb"  # Use the service name 'mongo'
mongo_client = MongoClient(MONGODB_URI)
mdb = mongo_client.mydb
mongo_collection = mdb.mycollection

In [21]:
user_query = "What is the most toxic comments?"
separator = "#_next_#"
expand_to_n = 5

In [22]:
llm_llama_3b = ChatOllama(
    model="llama3.2:3b",
)

In [23]:
llm_r1_8b = ChatOllama(
    model="deepseek-r1:8b",
)

In [24]:
hyde_prompt = PromptTemplate(
    input_variables=["user_query", "expand_to_n"],
    template="""
    You have information about comments on pull requests.
    Given the question '{user_query}', generate {expand_to_n} hypothetical answers that directly answer this question.
    
    Output ONLY the hypothetical answers without any other text, explanations, authors, or headers.
    """,
)
hyde_chain = hyde_prompt | llm_llama_3b
raw_additional_questions = hyde_chain.invoke({"user_query": user_query, "expand_to_n": expand_to_n}).content
raw_additional_questions

'1. Comments with phrases like "this is stupid" or "you\'re wrong"\n2. Comments containing insults such as "idiot" or "unprofessional"\n3. Posts stating that a contributor\'s work is not good enough\n4. Repeated comments asking the same question over and over again\n5. Comments claiming that someone is "too slow" or "can\'t keep up"'

In [25]:
query = f"{user_query}/n/n{raw_additional_questions}"

In [26]:
results = chroma_client.similarity_search_with_relevance_scores(query)

context_comments = "\n --- \n".join([f"Score: {score} - {doc.page_content}" for doc, score in results])
print(context_comments)

Score: -0.11732434145706727 - This is a solid contribution. Thanks for your hard work!
 --- 
Score: -0.13601440256062358 - Please reformat this section to adhere to PEP 8 guidelines.
 --- 
Score: -0.14860558640591148 - The comments in this file are outdated.
 --- 
Score: -0.15707402720083574 - The logging in this section is not very informative.


  results = chroma_client.similarity_search_with_relevance_scores(query)


In [None]:
augmented_query = f"{user_query}/n/n{raw_additional_questions}"
print(augmented_query)

GENERATE_PROMPT_TEMPLATE = """
You are an AI assistant analyzing Fisheye/Stash code review comments.
Based solely on the following comments provided as context, please answer the user's question.
If the comments don't provide enough information, state that.

Context Comments:
{context_comments}

User Question: {query}

Answer:
"""

In [37]:
EXPANTION_PROMPT_TEMPLATE = f"""You are an AI language model assistant.
Your task is to generate {expand_to_n} different versions of the given user question or sentence to 
retrieve relevant documents from a vector database.
By generating multiple perspectives on the user question or sentence, your goal is to help
the user overcome some of the limitations of the distance-based similarity search.
Database stores comments to pull-requests.
Provide these alternative questions or sentences seperated by '{separator}'. 
Don't add to your answer additional information.
Original question: {user_query}

Additional questions:
"""

EXPANTION_WITH_EXAMPLES_PROMPT_TEMPLATE = f"""
You have information about comments on pull requests.
Generate {expand_to_n} concrete example answers for the following question.

Original question: {user_query}

Output ONLY the example answers. 
Do not include any other text, explanations, or headers.

For example:
    QUESTION:
    What are the most negative comments?
    RESPONSE:
    1. This code will never compile due to the mismatched syntax
    2. This bug will cause issues in production
    3. Lack of unit tests
"""

EXPANTION_WITH_EXAMPLES_PROMPT_TEMPLATE = f"""
You have information about comments in Instagram.
Generate {expand_to_n} concrete example answers for the following question.

Original question: {user_query}

Output ONLY the example answers. 
Do not include any other text, explanations, or headers.
"""

EXPANTION_WITH_EXAMPLES_PROMPT_TEMPLATE = f"""
You have information about comments on pull requests.
Given the question '{user_query}', generate {expand_to_n} hypothetical answers that directly answer this question.

Output ONLY the hypothetical answers. 
Do not include any other text, explanations, or headers.
"""

prompt = EXPANTION_WITH_EXAMPLES_PROMPT_TEMPLATE

In [32]:
hyde_prompt = PromptTemplate(
    input_variables=["user_query", "expand_to_n"],
    template="""
    You have information about comments on pull requests.
    Given the question '{user_query}', generate {expand_to_n} hypothetical answers that directly answer this question.
    
    Output ONLY the hypothetical answers without any other text, explanations, authors, or headers.
    """,
)

In [33]:
hyde_chain = hyde_prompt | llm_llama_3b
raw_additional_questions = hyde_chain.invoke({"user_query": user_query, "expand_to_n": expand_to_n}).content
raw_additional_questions

NameError: name 'llm_llama_3b' is not defined

In [34]:
llm_r1_8b = ChatOllama(
    model="deepseek-r1:8b",
)

In [35]:
hyde_chain_r1_8b = hyde_prompt | llm_r1_8b
raw_additional_questions = hyde_chain_r1_8b.invoke({"user_query": user_query, "expand_to_n": expand_to_n}).content
raw_additional_questions

'<think>\nAlright, so I need to figure out how to approach this user\'s query. They provided a specific instruction where they want me to generate five hypothetical answers to the question \'What is the most positive comments?\' based on information about comments on pull requests.\n\nFirst, I should understand what exactly they\'re looking for. They mentioned that the response should only contain the hypothetical answers without any other text or explanations. So my task is straightforward: create five concise and varied responses that directly address what makes a comment the most positive in this context.\n\nI need to consider what aspects make a comment positive. Factors could include being encouraging, constructive, specific, genuine, or supportive. Each response should highlight different qualities to ensure diversity.\n\nI\'ll start by brainstorming words or phrases related to positivity: uplifting, encouraging, constructive, genuine, supportive, meaningful, thoughtful, specific

In [None]:
# _llm_response = llm_model.invoke(prompt)
# raw_additional_questions = _llm_response.content
# additional_questions = _llm_response.content.split(separator)[1:]
# raw_additional_questions

In [None]:
# for question in additional_questions:
#     print(question)

# additional_questions_str = " ".join(additional_questions).replace('\n\n', ' ')

In [None]:
augmented_query = f"{user_query} {raw_additional_questions}"
print(augmented_query)

What is the most positive comments? 1. "Great job on implementing the new feature! The code looks clean and well-organized."
2. "I love how you handled the edge case. It's a great example of robust error handling."
3. "The documentation is excellent. You've done a great job making it easy for others to understand the code."
4. "This pull request has greatly improved our team's workflow. Well done!"
5. "Your explanation of the code changes was clear and concise, thanks for taking the time to write it."


In [None]:
results = chroma_client.similarity_search(augmented_query)

context_comments = "\n\n --- \n\n".join([doc.page_content for doc in results])

In [None]:
print(context_comments)

Great job on this pull request! The code is clean and well-documented.

 --- 

The user interface changes look good.

 --- 

I'm impressed with the quality of this code. Keep up the good work!

 --- 

The comments in this file are outdated.
