# Introduction

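This notebook supports the evaluation of LaPSUM: each response to a code-quality question is scored from 1 to 10, with a short explanation, by a local LLM (Mistral served through Ollama via LangChain).

It expects the data from the evaluation trial to be loaded into a DataFrame with `repository`, `question`, and `response` columns, where `response` holds the list of responses given to that question. The CSV load is currently commented out and a small hard-coded sample is used instead.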

# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import random

from langchain.chat_models import ChatOllama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.agents import create_openai_functions_agent
from langchain.agents import AgentExecutor
import re  # For regex-based extraction of scores and explanations


ModuleNotFoundError: No module named 'matplotlib'

# Load the Data

In [104]:
# data_path = 'data.csv'
# df = pd.read_csv(data_path)

In [105]:
# Hard-coded sample records: three repositories with three questions each,
# and three candidate responses per question. The three lists are aligned by
# position (entry i of each list describes the same record).
data = dict(
    repository=[f"project_{number}" for number in (1, 2, 3) for _ in range(3)],
    question=[
        # project_1
        "How well does the code adhere to SOLID principles?",
        "Are there any security vulnerabilities in the code?",
        "How comprehensive are the unit tests?",
        # project_2
        "Does the code follow consistent naming conventions?",
        "Is the code well-documented?",
        "How efficient is the algorithm?",
        # project_3
        "Does the code follow consistent naming conventions?",
        "How easy is it to extend or modify the code?",
        "How well is the code optimized for performance?",
    ],
    response=[
        # project_1 / SOLID principles
        [
            "The code mostly follows SOLID principles, but there are some areas that could be improved, particularly with respect to single responsibility.",
            "SOLID principles are well maintained, but there could be a better separation of concerns.",
            "The code does a decent job with SOLID principles, but has minor violations in dependency inversion.",
        ],
        # project_1 / security
        [
            "There are a few minor security issues, such as the use of outdated libraries.",
            "The code has some potential security flaws related to hardcoded credentials and lack of input validation.",
            "No obvious security issues, but some areas could benefit from more input sanitization.",
        ],
        # project_1 / unit tests
        [
            "Unit tests are mostly comprehensive, but there are some edge cases that haven't been covered.",
            "The unit tests are thorough and cover most of the code, but a few areas, particularly error handling, are missing tests.",
            "The tests are somewhat comprehensive, but certain critical paths are not tested.",
        ],
        # project_2 / naming conventions
        [
            "The naming conventions are mostly consistent, with some occasional deviations.",
            "The code follows naming conventions well, though there are some inconsistencies in variable names.",
            "Naming conventions are adhered to in most places, but some methods and classes have unclear names.",
        ],
        # project_2 / documentation
        [
            "The code is well-documented, with clear function-level docstrings and usage examples.",
            "Documentation is decent, but some parts of the codebase lack sufficient comments and explanations.",
            "The code is under-documented, and some complex parts of the system could use more detailed explanations.",
        ],
        # project_2 / algorithm efficiency
        [
            "The algorithm is efficient, but could be improved by optimizing the data structures used in certain parts of the code.",
            "The algorithm works, but it could be more efficient, especially with large datasets.",
            "The algorithm is relatively efficient but could benefit from better memory management and reduced complexity.",
        ],
        # project_3 / naming conventions
        [
            "The naming conventions are consistently followed, making the codebase easy to navigate.",
            "The naming conventions are good but could be slightly more descriptive in some areas.",
            "There are a few inconsistencies in naming conventions, particularly with function names.",
        ],
        # project_3 / extensibility
        [
            "The code is highly modular and easy to extend, with clear separation of concerns.",
            "The code is extendable, but some parts of the logic could be refactored to make it easier to add new features.",
            "The code is somewhat extendable, but some tight coupling between components makes modifications challenging.",
        ],
        # project_3 / performance
        [
            "The code is optimized for performance and handles large datasets efficiently.",
            "The performance is decent, but some areas of the code could benefit from further optimization, especially in terms of memory usage.",
            "Performance is a concern in the current implementation, as there are some bottlenecks that could be optimized.",
        ],
    ],
)


In [106]:
# Turn the column-oriented sample dict into a DataFrame: one row per
# (repository, question) record, with the list of responses kept in a single cell.
df = pd.DataFrame(data=data)

In [None]:
# Preview the first five rows to confirm the frame was built as expected.
df.head(n=5)

# Evaluation

In [117]:
import re

# Function to send query to Ollama using LangChain (no memory)
def send_to_ollama(query, llm=None):
    """Send an evaluation prompt to a chat model and parse the scored reply.

    Parameters
    ----------
    query : str
        The complete prompt text. It is sent to the model verbatim.
    llm : object, optional
        Any object exposing ``invoke(prompt)`` whose return value has a
        ``content`` attribute holding the reply text. When omitted, a
        ``ChatOllama(model="mistral", temperature=0.5)`` instance is created,
        which is what this function always used before the parameter existed.
        Passing one in lets callers reuse a single client across many calls.

    Returns
    -------
    tuple[list[int], list[str]]
        ``(scores, explanations)``: two lists of equal length, one entry per
        score/explanation pair found in the reply. Both are empty when the
        reply does not follow the expected format.

    Raises
    ------
    ValueError
        If the model's return value has no ``content`` attribute.
    """
    if llm is None:
        llm = ChatOllama(
            model="mistral",  # Specify your model here
            temperature=0.5
        )

    # The prompt is passed as-is rather than being used as a PromptTemplate:
    # used as a template, any "{...}" inside a question or response would be
    # interpreted as a template variable and the call would fail.
    response = llm.invoke(query)

    # Check for the attribute that is actually read (the reply text lives in
    # `content`).
    response_text = getattr(response, "content", None)
    if response_text is None:
        raise ValueError("Response does not contain a content attribute.")

    return _parse_evaluation(response_text)


# Accepted separators after the keyword: hyphen, colon, en dash, em dash.
# The prompt asks for "Score - X Explanation: ...", but models drift between
# these separators, so all of them are tolerated.
_SCORE_PATTERN = re.compile(r"Score\s*[-:\u2013\u2014]\s*(\d+)", re.IGNORECASE)
_EXPLANATION_PATTERN = re.compile(r"Explanation\s*[-:\u2013\u2014]\s*(.*)", re.IGNORECASE)


def _parse_evaluation(response_text):
    """Extract (scores, explanations) from a reply, scanning it line by line."""
    scores = []
    explanations = []

    # A score and its explanation may sit on the same line or on consecutive
    # lines, so each is held until its partner shows up.
    pending_score = None
    pending_explanation = None

    for line in response_text.splitlines():
        score_match = _SCORE_PATTERN.search(line)
        if score_match:
            pending_score = int(score_match.group(1))

        explanation_match = _EXPLANATION_PATTERN.search(line)
        if explanation_match:
            pending_explanation = explanation_match.group(1).strip()

        if pending_score is not None and pending_explanation is not None:
            scores.append(pending_score)
            explanations.append(pending_explanation)
            # Reset so the next pair starts clean.
            pending_score = None
            pending_explanation = None

    return scores, explanations


In [159]:
def process_and_map_scores(flattened_data, evaluator=None):
    """Score every response in ``flattened_data`` with an LLM evaluator.

    Parameters
    ----------
    flattened_data : pandas.DataFrame
        One row per individual response. The columns ``repository``,
        ``question``, ``response``, ``original_index`` and ``response_order``
        are read from each row.
    evaluator : callable, optional
        Called as ``evaluator(query)`` with the full prompt text; must return
        ``(scores, explanations)`` as two lists. Defaults to
        ``send_to_ollama``.

    Returns
    -------
    pandas.DataFrame
        The five input columns listed above plus ``score`` and
        ``explanation``, one row per input row, in input order. When the
        evaluator returns no score or no explanation for a row (for example
        because the reply could not be parsed), that field is left missing
        instead of aborting the whole run.
    """
    query_base = """
        You are an experienced evaluator assessing responses from developers to technical questions.

        For each question, the responses from the developers have been provided. Evaluate how well each response answers the question on a scale of 1-10, where:
        - 1 means the response is completely irrelevant or fails to answer the question.
        - 10 means the response fully answers the question, providing a thorough, clear, and correct explanation.

        For each response:
        - Consider the relevance of the response to the question.
        - Consider the completeness of the answer. Does the response cover all parts of the question?
        - Consider the clarity of the response. Is it easy to understand and well-explained?

        Return your evaluation in the following format:
        Response X: Score - X Explanation: Explanation of why you gave this score.

        Example Response:
        Response 1: Score - 4 Explanation: While the response is relevant to the question it fails to provide contextual details.

        Here is the question and response:
    """

    if evaluator is None:
        # Looked up at call time so the default follows whatever
        # `send_to_ollama` is currently defined in the notebook.
        evaluator = send_to_ollama

    result_columns = [
        'repository', 'question', 'response',
        'original_index', 'response_order',
        'score', 'explanation',
    ]
    evaluation_results = []

    # Each response is sent in its own query so it is scored independently.
    for row in flattened_data.to_dict("records"):
        question = row['question']
        response = row['response']

        query = query_base + f"\t{question}: {response}\n"
        query += "\t\t---\n\t\tPlease provide a score (1-10) and an explanation for this response."

        scores, explanations = evaluator(query)

        evaluation_results.append({
            'repository': row['repository'],
            'question': question,
            'response': response,
            'original_index': row['original_index'],
            'response_order': row['response_order'],
            # One response per query, so only the first parsed pair is used;
            # an empty list means the reply was not in the expected format.
            'score': scores[0] if scores else None,
            'explanation': explanations[0] if explanations else None,
        })

    # Fixing the columns keeps the schema stable even for empty input, so
    # downstream sorting by 'original_index'/'response_order' still works.
    return pd.DataFrame(evaluation_results, columns=result_columns)


In [169]:
# Tag every source row with its position, and record on `df` the list of
# response positions (0..n-1) for that row.
df['original_index'] = df.index
df['response_order'] = df['response'].apply(lambda answers: [*range(len(answers))])

# Expand to one row per individual response. `response_order` is then
# replaced by each response's running position within its
# (repository, question) group.
df_exploded = df.explode('response', ignore_index=True).assign(
    response_order=lambda frame: frame.groupby(['repository', 'question']).cumcount()
)

# Randomise the row order with a fixed seed; 'original_index' and
# 'response_order' travel with each row so the original order can be
# recovered later.
df_shuffled = df_exploded.sample(frac=1, random_state=42).reset_index(drop=True)

In [170]:
# Score the shuffled responses.
evaluation_df = process_and_map_scores(flattened_data=df_shuffled)

In [171]:
# Undo the shuffle: order rows by source row, then by response position,
# and renumber the index from zero.
restore_sorted = evaluation_df.sort_values(['original_index', 'response_order'])
df_restored = restore_sorted.reset_index(drop=True)

In [173]:
# Display the results. A bare trailing expression is rendered with the
# notebook's rich table view, whereas print() falls back to plain text that
# wraps columns and truncates long cell values.
df_restored

   repository                                           question  \
0   project_1  How well does the code adhere to SOLID princip...   
1   project_1  How well does the code adhere to SOLID princip...   
2   project_1  How well does the code adhere to SOLID princip...   
3   project_1  Are there any security vulnerabilities in the ...   
4   project_1  Are there any security vulnerabilities in the ...   
5   project_1  Are there any security vulnerabilities in the ...   
6   project_1              How comprehensive are the unit tests?   
7   project_1              How comprehensive are the unit tests?   
8   project_1              How comprehensive are the unit tests?   
9   project_2  Does the code follow consistent naming convent...   
10  project_2  Does the code follow consistent naming convent...   
11  project_2  Does the code follow consistent naming convent...   
12  project_2                       Is the code well-documented?   
13  project_2                       Is the code 