In [3]:

from langchain_openai import ChatOpenAI

# Chat model handle for this cell; the following cell constructs `llm` again
# with the same model name after calling load_dotenv().
llm = ChatOpenAI(model="gpt-4o-mini")

In [None]:
from dotenv import load_dotenv
from typing import Optional, Union
from pydantic import Field, BaseModel
from langchain_openai import ChatOpenAI
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.messages import BaseMessage, SystemMessage, HumanMessage
from langgraph.graph import START, MessagesState, StateGraph
from langchain_core.messages.ai import AIMessage
import time
import json

# Load environment variables
# Called before the client is constructed; presumably the OpenAI credentials
# are expected to come from a .env file — confirm against deployment setup.
load_dotenv()

# Initialize LLM
# Every structured-output call below goes through this single model handle.
llm = ChatOpenAI(model="gpt-4o-mini")

# Define evaluation schemas
# Maps each Evaluation field name to the text passed as that field's
# `Field(description=...)`. These strings are runtime text (presumably part of
# the schema the model sees — verify), so edit them deliberately.
# NOTE(review): "result" only defines 1 and -1, while "status" includes an
# "Incorrect and same as previous code snippet" case with no matching result
# value — confirm whether a neutral value is intended.
descriptions = {
    "status": """
        This field will be one of the following values:
        - Incomplete or erroneous
            Meaning: Code is incomplete or it is producing errors (it cannot be run succesfully).
        - Incorrect but better
            Meaning: Code is incorrect but better than previous code snippet in terms of syntax, logic and correctness. This means the user is moving in right direction towards the correct code.
        - Incorrect and worse
            Meaning: Code is incorrect and worse than previous code snippet in terms of syntax, logic -  This means the user is moving in wrong direction away from the correct code.
        - Incorrect and same as previous code snippet
            Meaning: Code is incorrect and same as previous code snippet and there is no improvement in the code.
        - Partially correct
            Meaning: Code is partially correct. It covers some of the requirements from the question but not all.
        - Correct but can be improved
            Meaning: Syntax is correct, code does not produce any error on running it and output is as expected BUT the code can be improved (improvements can be in terms of code optimization, better logic, better variable names, better comments, etc.)
        - Correct
            Meaning: Syntax is correct, code does not produce any error on running it and output is as expected. Code is following the guidelines provided in the question. There is no redundancy in the code. Code is optimized and well written.
        """,

    "score": """
        An approximate score out of 100 as the quality of the code snippet provided by user. This score should be based on the quality of code snippet and how well it is following the guidelines provided in the question.
    """,

    "result": """
        - 1
            Meaning: The given code is better than the previous code snippet in terms of syntax, logic and correctness. This means the user is moving in right direction towards the correct code.
        - -1
            Meaning: The given code is worse than the previous code snippet in terms of syntax, logic or correctness. This means the user is moving in wrong direction away from the correct code.
    """,

    "comment": """
        One line comment showing the reason for the given score.
    """
}

# Structured verdict for one submitted code snippet.
# NOTE(review): the class docstring and Field descriptions are presumably
# forwarded to the model as schema text via `with_structured_output` — treat
# them as prompt content and confirm before rewording.
class Evaluation(BaseModel):
    """Evaluation of the code provided by the user."""
    # Plain str: the allowed labels are only described in the prompt text, not enforced by the type.
    status: str = Field(description=descriptions["status"])
    # Plain int: the "out of 100" range is described, not validated.
    score: int = Field(description=descriptions["score"])
    # Plain int: the description defines 1 (better) and -1 (worse).
    result: int = Field(description=descriptions["result"])
    # Short justification for the score, per the description text.
    comment: str = Field(description=descriptions["comment"])

# Alternative payload for turns where the model answers in free text instead
# of producing an Evaluation (see the Union in FinalResponse).
# NOTE(review): the docstring below is presumably sent to the model as the
# schema description — confirm before rewording.
class ConversationalResponse(BaseModel):
    """Respond in a conversational manner. Be kind and helpful."""
    # The only field; it has none of the status/score/result/comment keys of Evaluation.
    response: str = Field(description="A conversational response to the user's query")

# Top-level schema handed to `llm.with_structured_output`: a single-field
# wrapper that holds one of the two payload models.
class FinalResponse(BaseModel):
    # Either an Evaluation or a ConversationalResponse; consumers must handle both shapes.
    final_output: Union[Evaluation, ConversationalResponse]

# Configure LLM for structured output
# call_model invokes this runnable with the message list and reads
# `.final_output` from the returned object.
model_with_structured_output = llm.with_structured_output(FinalResponse)

# Define state and workflow
# Graph state: the running conversation plus the most recent structured response.
class AgentState(MessagesState):
    # NOTE(review): re-annotating `messages` presumably overrides the field
    # inherited from MessagesState (and any reducer attached to it) — confirm.
    # call_model returns the complete list each time, which matches
    # replace-on-update semantics.
    messages: list[BaseMessage]
    # Written by call_model with the object returned by the structured-output model.
    # NOTE(review): if MessagesState is a TypedDict, this `= None` default is
    # not applied at runtime — confirm.
    final_output: Optional[FinalResponse] = None

def call_model(state: AgentState) -> AgentState:
    """Graph node: query the structured-output model and extend the history.

    Args:
        state: Current graph state; only ``state["messages"]`` is read.

    Returns:
        A state update containing the previous messages followed by one
        AIMessage (the string form of the structured payload) under
        ``"messages"``, and the raw structured response under
        ``"final_output"``.
    """
    history = state["messages"]
    structured = model_with_structured_output.invoke(history)
    print("RESPONSE", structured)

    # The structured payload is stringified so it can live in the chat history.
    reply = AIMessage(content=str(structured.final_output))
    return {"messages": history + [reply], "final_output": structured}

# Setup workflow
# Single-node graph: START leads to "model", which runs call_model.
workflow = StateGraph(state_schema=AgentState)
workflow.add_node("model", call_model)
workflow.add_edge(START, "model")

# Initialize memory and compile workflow
# NOTE(review): the in-memory checkpointer presumably keys saved state by the
# `thread_id` that get_output puts in its config — confirm against LangGraph docs.
memory = MemorySaver()
app = workflow.compile(checkpointer=memory)

# Helper functions
def get_code(filename: str) -> str:
    """Return the full text of the source file at ``filename``.

    The file is decoded as UTF-8 (the default encoding of Python source)
    rather than the platform default, so snippets containing non-ASCII
    characters read the same on every OS.

    Args:
        filename: Path of the file to read.

    Returns:
        The file contents as a single string (empty string for an empty file).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filename, encoding="utf-8") as f:
        return f.read()

def get_user_code_snippet(filename: str = "test.py") -> list[BaseMessage]:
    """Read the user's current code and wrap it as a one-element message list.

    Args:
        filename: Path of the file holding the user's snippet. Defaults to
            "test.py", the path that was previously hard-coded.

    Returns:
        A list containing a single HumanMessage whose content is the file text.
    """
    code_snippet = get_code(filename)
    return [HumanMessage(content=code_snippet)]

def get_output(messages: list[BaseMessage], thread_id: str = "12355") -> dict:
    """Run the compiled graph on ``messages`` and return history plus parsed result.

    Args:
        messages: Conversation to pass to the graph as its ``"messages"`` input.
        thread_id: Value placed under ``configurable.thread_id`` in the run
            config. Defaults to "12355", the value that was previously
            hard-coded.

    Returns:
        A dict with:
          - "messages": the message list returned by the graph.
          - "final_output": a plain dict of the inner payload, which has
            status/score/result/comment keys for an Evaluation or a single
            "response" key for a ConversationalResponse.
    """
    config = {"configurable": {"thread_id": thread_id}}
    output = app.invoke({"messages": messages}, config)

    response = output["final_output"]
    # Prefer the pydantic v2 serializer when available; fall back to the
    # v1-style `.json()` so either pydantic major version works.
    to_json = getattr(response, "model_dump_json", None) or response.json
    final_output = json.loads(to_json())["final_output"]

    if output["messages"]:
        output["messages"][-1].pretty_print()
        # The payload may be a ConversationalResponse, which lacks the
        # evaluation keys; print only the fields that are present instead of
        # raising KeyError.
        for label, key in (
            ("Status", "status"),
            ("Score", "score"),
            ("Result", "result"),
            ("Comment", "comment"),
        ):
            if key in final_output:
                print(label, final_output[key])

    return {"messages": output["messages"], "final_output": final_output}

def init():
    """Send the evaluator's system prompt through the graph.

    Returns:
        Whatever ``get_output`` returns for a conversation consisting only of
        the system message (a dict with "messages" and "final_output").
    """
    # Joined with newlines; the empty first and last entries reproduce the
    # leading and trailing newline of the prompt text.
    prompt_lines = (
        "",
        "You are a python code quality evaluator.",
        "1. You will receive a question in the next prompt.",
        "2. Then you will receive code snippets from users periodically. ",
        "3. You must evaluate the code quality and provide output.",
        "4. Do not provide solutions, suggestions, or hints.",
        "",
    )
    system_message = SystemMessage(content="\n".join(prompt_lines))
    return get_output([system_message])

def main():
    """Run the interactive evaluation loop.

    Sends the system prompt and the question once, then polls the user's
    snippet every 5 seconds and evaluates it whenever its text has changed
    since the last successful evaluation. Stops on Ctrl-C.
    """
    # Initialize the evaluator. init() returns a dict, so take its "messages"
    # list; concatenating the dict itself with a list raises TypeError.
    messages = init()["messages"]

    # Set the question
    question = "Write a program that takes input first name and last name from the user and passes it to the function greeting_full_name to print the full name of the user with a greeting."
    question_message = SystemMessage(content=f"Question: {question}")
    messages = get_output(messages + [question_message])["messages"]

    prev_code_snippet = ""

    # Main evaluation loop. KeyboardInterrupt is handled around the whole loop
    # so Ctrl-C during any sleep (including after an error) exits cleanly.
    try:
        while True:
            try:
                user_message = get_user_code_snippet()
                current_snippet = user_message[0].content
                if current_snippet == prev_code_snippet:
                    print("Code snippet is same as previous. Skipping evaluation.")
                else:
                    output = get_output(messages + user_message)
                    messages = output["messages"]
                    # Only remember the snippet once it was evaluated, so a
                    # failed evaluation is retried on the next poll.
                    prev_code_snippet = current_snippet
            except Exception as e:
                # Top-level boundary: report and keep polling.
                print(f"Error occurred: {e}")
            time.sleep(5)
    except KeyboardInterrupt:
        print("\nEvaluation stopped by user")

# Entry point: start the evaluation loop when this module is the main program.
if __name__ == "__main__":
    main()

RESPONSE final_output=Evaluation(status='Incomplete or erroneous', score=20, result=-1, comment='The code contains significant errors and cannot be executed successfully.')

status='Incomplete or erroneous' score=20 result=-1 comment='The code contains significant errors and cannot be executed successfully.'
Status Incomplete or erroneous
Score 20
Result -1
Comment The code contains significant errors and cannot be executed successfully.
RESPONSE final_output=Evaluation(status='Incomplete or erroneous', score=20, result=-1, comment='The code contains significant errors and cannot be executed successfully.')

status='Incomplete or erroneous' score=20 result=-1 comment='The code contains significant errors and cannot be executed successfully.'
Status Incomplete or erroneous
Score 20
Result -1
Comment The code contains significant errors and cannot be executed successfully.
Code snippet is same as previous. Skipping evaluation.
Code snippet is same as previous. Skipping evaluation.
Code s

hello
