In [None]:
# User
# - Realistic
# - Consistent
# - Colorful personality
# - Coherent Follow ups


# Assistant
# - Tailors to user
# - Maximally Helpful
# - Code Quality
#   - Optimal Code
#   - Code Follows PEP8 Standards
# - Text Quality 
#   - Spelling
#   - Grammar
#   - Capitalization & Punctuation
#   - Information Density (Should be a sweet spot leaning on the concise side, but not too concise... definitely not too verbose)
#   - Explains Code Well
# - Markdown Formatting

In [3]:
import os
import random
import json

from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())
from pydantic import BaseModel
from llama_index.llms.openai import OpenAI


api_key = os.environ["OPENAI_API_KEY"]
file_list = os.listdir("../../jsonl_conversations/Batch 4/")
random.seed(0)
random_item = random.choice(file_list)

with open(f"../../jsonl_conversations/Batch 4/{random_item}") as f:
    conversation = json.load(f)

conversation

{'id': '18l9F7WDEqk8zVEJtKPKddmGQMv5Ejpc7',
 'metadata': {'topic': 'unit_testing_methodology > testing_asynchronous_code',
  'topic__top_level': 'unit_testing_methodology',
  'summary': 'User inquiring about testing async code and service availability.',
  'number_of_turns': '2'},
 'messages': [{'role': 'User',
   'content': "What's the proper way to test an async function that depends on an external service using mocks in pytest?",
   'type': 'markdown'},
  {'role': 'Assistant',
   'content': "There are a few key steps to ensure that the tests are properly isolated and asynchronous behavior is handled correctly. Here's how:\n\n**1. Install Necessary Packages:**\nEnsure you have `pytest`, `pytest-asyncio` (for async support), and `unittest.mock` (or `pytest-mock` for more features) installed.\n\n```shell\npip install pytest pytest-asyncio\n```\n\n\n**4. Write a Test Using Pytest:**\nUse the `pytest.mark.asyncio` decorator to mark your test as an async test. Use `unittest.mock` to creat

## GPT Inspection

In [4]:
from pydantic import BaseModel, Field
from typing import List
from llama_index.program import OpenAIPydanticProgram



class Feedback(BaseModel):
    score: int = Field(description="A score representing how good the conversation is in the given quality aspect, 1 is terrible, 5 is exemplary and flawless.", ge=1, le=5)
    issues: List[str] = Field(description="A concrete list of issues in the conversation. 15 words or less each.")
    praises: List[str] = Field(description="A concrete list of praise for exceptional behavior the conversation. 15 words or less each.")


class QualityAspect(BaseModel):
    name: str = Field(description="The name of the quality aspect.")
    instruction: str = Field(description="Instructions & details on how to inspect this quality aspect.")


def inspect_conversation_quality_aspect(conversation: List[List[dict]], quality_aspect: QualityAspect):
    """Inspect a conversation for a given quality aspect."""

    prompt_template_str = """
    Given the following conversation, please rate the quality of the conversation according to the given quality aspect.
    You are one of many specialized inspectors, so precisely focus on your quality aspect.

    Quality Aspect:
    {quality_aspect}

    Conversation:
    {conversation}
    """
    program = OpenAIPydanticProgram.from_defaults(
        llm=OpenAI(api_key=api_key, model="gpt-4-1106-preview", temperature=0),
        output_cls=Feedback, 
        prompt_template_str=prompt_template_str, 
    )
    output = program( 
        quality_aspect=quality_aspect.model_dump(),
        conversation=conversation["messages"],
        description="Judge the quality of the conversation according to the given quality aspect."
    )
    return output


def inspect_all_conversation_quality_aspects(conversation) -> dict:
    """Inspect a conversation for all quality aspects."""
    
    user_qualities = [
        QualityAspect(
            name="Natural & Realistic", 
            instruction="""
            How does the user interaction resemble a real conversation and interactions a real user would have with a highly intelligent coding assistant.

            ONLY JUDGE THE USER MESSAGES. DO NOT JUDGE THE ASSISTANT MESSAGES.
            """
        ),
        QualityAspect(
            name="Consistent", 
            instruction="""
            How consistent is the user's personality throughout the conversation.

            ONLY JUDGE THE USER MESSAGES. DO NOT JUDGE THE ASSISTANT MESSAGES.
            """
        ),
        QualityAspect(
            name="Colorful Personality", 
            instruction="""
            How colorful is the user's personality throughout the conversation, Do any personality traits shine through? If it's just standard don't rate it highly.

            ONLY JUDGE THE USER MESSAGES. DO NOT JUDGE THE ASSISTANT MESSAGES.
            """
        ),
        QualityAspect(
            name="Coherent Follow ups", 
            instruction="""
            How coherent are the user's follow up messages to the assistant's messages in the conversation as a whole.
            Ideally, the user would incrementally build on the conversation to achieve their goal.

            ONLY JUDGE THE USER MESSAGES. DO NOT JUDGE THE ASSISTANT MESSAGES.
            """
        ),
    ]

    assistant_qualities = [
        QualityAspect(
            name="Tailors to User", 
            instruction="""
            How well does the assistant tailor the way it's responding to the user's needs and preferences.

            ONLY JUDGE THE ASSISTANT MESSAGES. DO NOT JUDGE THE USER MESSAGES.
            """
        ),
        QualityAspect(
            name="Maximally Helpful", 
            instruction="""
            How helpful is the assistant in helping the user achieve their goals.

            ONLY JUDGE THE ASSISTANT MESSAGES. DO NOT JUDGE THE USER MESSAGES.
            """
        ),
        QualityAspect(
            name="Code Quality", 
            instruction="""
            How good is the code that the assistant generates.
            Qualities:
            #   - Correctness
            #   - Optimality
            #   - PEP8 Compliance & Readability

            ONLY JUDGE THE ASSISTANT MESSAGES. DO NOT JUDGE THE USER MESSAGES.
            """
        ),
        QualityAspect(
            name="Text Quality", 
            instruction="""
            How good is the text that the assistant generates.
            Qualities:
            #   - Spelling
            #   - Grammar
            #   - Capitalization & Punctuation
            #   - Information Density (Should be a sweet spot leaning on the concise side, but not too concise... definitely not too verbose)
            #   - Explains Code Well

            ONLY JUDGE THE ASSISTANT MESSAGES. DO NOT JUDGE THE USER MESSAGES.
            """
        ),
        QualityAspect(
            name="Markdown Formatting", 
            instruction="""
            How good is the markdown formatting that the assistant generates. Is it leveraging markdown syntax tools to maximize the readability of the text?

            ONLY JUDGE THE ASSISTANT MESSAGES. DO NOT JUDGE THE USER MESSAGES.
            """
        ),
        QualityAspect(
            name="Richness",
            instruction="""
            How rich is the assistant's response to the user's messages. Does the code and text capitalize on opportunities to provide in-depth and diverse insights?

            ONLY JUDGE THE ASSISTANT MESSAGES. DO NOT JUDGE THE USER MESSAGES.
            """
        ),
    ]

    quality_results = {}
    for quality_aspect in user_qualities:
        r = inspect_conversation_quality_aspect(conversation, quality_aspect)
        quality_results[f"User - {quality_aspect.name}"] = r.model_dump()
    for quality_aspect in assistant_qualities:
        r = inspect_conversation_quality_aspect(conversation, quality_aspect)
        quality_results[f"Assistant - {quality_aspect.name}"] = r.model_dump()

    return quality_results

In [24]:
from tqdm import tqdm
import concurrent.futures
results = []
def process_file(file):
    with open(f"../../jsonl_conversations/Batch 4/{file}") as f:
        conversation = json.load(f)
    output = inspect_all_conversation_quality_aspects(conversation)
    record = {
        "id": conversation["id"],
        "colab_link": f"https://colab.research.google.com/drive/{conversation['id']}"
    }
    record.update(output)
    return record

with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor:
    futures = [executor.submit(process_file, file) for file in file_list]
    progress_bar = tqdm(total=len(futures))
    for future in concurrent.futures.as_completed(futures):
        results.append(future.result())
        progress_bar.update(1)
    progress_bar.close()


 19%|█▊        | 95/512 [14:33<1:03:52,  9.19s/it]
100%|██████████| 512/512 [18:13<00:00,  2.14s/it]


In [28]:
results[0]

{'id': '1_HLk3ZzRK97Qmxg6_X-XUa8MJtZfhNa2',
 'colab_link': 'https://colab.research.google.com/drive/1_HLk3ZzRK97Qmxg6_X-XUa8MJtZfhNa2',
 'User - Natural & Realistic': {'score': 5,
  'issues': [],
  'praises': ["User's questions are clear and relevant to the context"]},
 'User - Consistent': {'score': 5,
  'issues': [],
  'praises': ["User's questions are consistently on-topic and technically oriented."]},
 'User - Colorful Personality': {'score': 1,
  'issues': ["The user's messages are purely informational, no personality traits discernible."],
  'praises': []},
 'User - Coherent Follow ups': {'score': 5,
  'issues': [],
  'praises': ["User's follow-ups build on previous information and show clear progression."]},
 'Assistant - Tailors to User': {'score': 5,
  'issues': [],
  'praises': ['Assistant provided clear and relevant examples',
   "Responses were tailored to user's requests",
   'Included additional information to enhance understanding']},
 'Assistant - Maximally Helpful': {'

In [46]:
import pandas as pd

# Cleanup scores from previous run
for result in results:
    keys = list(result.keys())
    for key in keys:
        if key.startswith("User") and key.endswith("score"):
            result.pop(key)
        if key.startswith("Assistant") and key.endswith("score"):
            result.pop(key)

# Add avg score
score_keys = set()
for result in results:
    user_scores = []
    for key in result:
        if key.startswith("User"):
            user_scores.append(result[key]["score"])
            score_keys.add(key)
    result["avg_user_score"] = sum(user_scores) / len(user_scores)

    assistant_scores = []
    for key in result:
        if key.startswith("Assistant"):
            assistant_scores.append(result[key]["score"])
            score_keys.add(key)
    result["avg_assistant_score"] = sum(assistant_scores) / len(assistant_scores)

    result["avg_score"] = (result["avg_user_score"] + result["avg_assistant_score"]) / 2
    result["min_score"] = min(user_scores + assistant_scores)

# Add score_keys per result
final_score_keys = []
for result in results:
    for key in score_keys:
        if key.startswith("User") or key.startswith("Assistant"):
            result[f"{key} - score"] = result[key]["score"]
            final_score_keys.append(f"{key} - score")


sorted_results = sorted(results, key=lambda x: x["min_score"], reverse=True)
df_inspection = pd.DataFrame(sorted_results)
df_inspection[["colab_link", "avg_score", "min_score", "avg_user_score", "avg_assistant_score"]+list(set(final_score_keys))]

Unnamed: 0,colab_link,avg_score,min_score,avg_user_score,avg_assistant_score,Assistant - Code Quality - score,Assistant - Markdown Formatting - score,User - Coherent Follow ups - score,Assistant - Text Quality - score,Assistant - Maximally Helpful - score,User - Consistent - score,Assistant - Richness - score,User - Natural & Realistic - score,User - Colorful Personality - score,Assistant - Tailors to User - score
0,https://colab.research.google.com/drive/1Q6bj48hdGstRYcM40l2sufb0XQ1v7HUq,4.666667,3,4.50,4.833333,5,5,5,5,5,5,4,5,3,5
1,https://colab.research.google.com/drive/1dF5eoJk1QgF-gzHq1S902cKdOF_jpM84,4.541667,3,4.25,4.833333,4,5,5,5,5,5,5,4,3,5
2,https://colab.research.google.com/drive/1F8Y3ih-JO6yN-uOgzcPC0bCkz6m7Jyt7,4.291667,2,4.25,4.333333,4,4,5,4,5,5,4,5,2,5
3,https://colab.research.google.com/drive/1-iW7k_y5N7RufI442HCf4q8KK-FhCT7K,4.375000,2,4.25,4.500000,4,5,5,4,5,5,4,5,2,5
4,https://colab.research.google.com/drive/1cpQEi66K_cWibTx-JpGqs0Nxfn0TK2-Y,4.208333,2,4.25,4.166667,4,4,5,4,5,5,4,5,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507,https://colab.research.google.com/drive/18OBn8VwiBxiVAidtkwJskkt0CE2LlO5I,4.500000,1,4.00,5.000000,5,5,5,5,5,5,5,5,1,5
508,https://colab.research.google.com/drive/1AswrNSJSs5YRadnCNvh4ez8GYGAielTT,4.250000,1,4.00,4.500000,4,5,5,4,5,5,4,5,1,5
509,https://colab.research.google.com/drive/1VsYuzR9GcRANOdSarQURN96dvk30Dwoc,4.166667,1,4.00,4.333333,3,5,5,4,5,5,4,5,1,5
510,https://colab.research.google.com/drive/1-GgbEGJNm15JLg3K63cduSJjvCtFRN6k,4.208333,1,3.75,4.666667,5,5,5,4,5,5,4,4,1,5


In [47]:
score_keys

{'Assistant - Code Quality',
 'Assistant - Markdown Formatting',
 'Assistant - Maximally Helpful',
 'Assistant - Richness',
 'Assistant - Tailors to User',
 'Assistant - Text Quality',
 'User - Coherent Follow ups',
 'User - Colorful Personality',
 'User - Consistent',
 'User - Natural & Realistic'}

In [49]:
df_inspection[df_inspection["avg_score"] < 4][["colab_link", "avg_score", "min_score"]]

Unnamed: 0,colab_link,avg_score,min_score
6,https://colab.research.google.com/drive/1Za8l7nXUMPkJjqg5EXrptRFjM3qC0Ngo,3.958333,2
31,https://colab.research.google.com/drive/1VvoO2ztE8ZLNZZm1gnGl4AGZvMBClN9y,3.833333,2
64,https://colab.research.google.com/drive/102vi_sT5b0jxWRwiDp7EitmqjMWEepkd,3.958333,2
101,https://colab.research.google.com/drive/1QFUDWwrIKEZVOgxdhFx3huQrIT91tMoD,2.666667,1
102,https://colab.research.google.com/drive/1npUbGeHoZX4nyfGA8RfhQ6TK0vLZb2Bu,3.666667,1
114,https://colab.research.google.com/drive/11dUGlfX9wJbCvaszE_Z67cGf4CJOMLtc,3.833333,1
123,https://colab.research.google.com/drive/1pDAyzShkJh_Ffcb-BX6ZTUzGbp5VEwUL,3.916667,1
135,https://colab.research.google.com/drive/1L1BznbY_YX3QSJghMk0cl77-UcotAvnZ,3.833333,1
140,https://colab.research.google.com/drive/1DoNeqvYpEyYz5VQGmj5rSAePI4e19ZgL,3.833333,1
141,https://colab.research.google.com/drive/1INHVuPWNQAYJeG4gRgBC-KZz1voaH9PS,3.833333,1


In [33]:
import pandas as pd
pd.set_option('display.max_colwidth', 3 * pd.get_option('display.max_colwidth'))

df_inspection[["colab_link", "avg_score", "avg_user_score", "avg_assistant_score"]]

Unnamed: 0,colab_link,avg_score,avg_user_score,avg_assistant_score
0,https://colab.research.google.com/drive/1Q6bj48hdGstRYcM40l2sufb0XQ1v7HUq,4.666667,4.50,4.833333
1,https://colab.research.google.com/drive/19B-GTgUCxG909E8_DHONjmnQuy2k011Q,4.625000,4.25,5.000000
2,https://colab.research.google.com/drive/1dOLDuVQ-LGEdGY1bfk3oeN9xjdJcEb_W,4.625000,4.25,5.000000
3,https://colab.research.google.com/drive/14TFOINca7Z1NTdeC-Y6PvHVEBl65Nbcr,4.625000,4.25,5.000000
4,https://colab.research.google.com/drive/1PUlAPbpgnj_LDj1w3hAJTirTZhG-SC6c,4.541667,4.25,4.833333
...,...,...,...,...
507,https://colab.research.google.com/drive/1nIWZ-uBGojlmPx3zIYtX157Tf7F3Wqng,3.625000,4.25,3.000000
508,https://colab.research.google.com/drive/1MbkOqrNNTUgteR7lPjcrAnOUv9D68cjb,3.583333,4.00,3.166667
509,https://colab.research.google.com/drive/1lxxrPqudOYpQLSAReLHTxK8msuHigIeq,3.583333,4.00,3.166667
510,https://colab.research.google.com/drive/1vLOF3TF-OipY9-76XUiRGV5N9c59seN2,3.583333,3.50,3.666667


In [None]:
from src.sheets_utils import upload_df_to_sheet


service_account_path = "../../cres/google__sa.json"
tracking_sheet_id = "1qBU7Kvuuij2fxbqPxebReKMxWgIBmOIE5Gi4ZuX0j_4"

upload_df_to_sheet(
    service_account_path=service_account_path,
    sheet_id=tracking_sheet_id,
    sheet_name="",
    df=df_inspection[["colab_link", "avg_score", "avg_user_score", "avg_assistant_score"]],
)