In [None]:
import dspy
from dotenv import load_dotenv
import os
# Load variables from a .env file into the process environment.
load_dotenv()

# The API key is taken from the environment so it is never written into the notebook.
lm = dspy.LM(
    "openai/gpt-4.1-mini-2025-04-14",
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Register this LM as the one DSPy should use.
dspy.configure(lm=lm)

In [None]:
# Smoke test: send a throwaway prompt straight to the configured LM and display the reply.
lm("testing")

In [None]:
from typing import Literal

class ResponseSignature(dspy.Signature):
    # DSPy signature for checking whether a PR's problem statement is
    # contradicted by its hints text.
    #
    # NOTE: documented with comments rather than a docstring on purpose of
    # safety: DSPy uses a Signature's docstring as the task instructions in the
    # prompt, so adding one would change what is sent to the model. Here the
    # instructions are supplied at call time through the `context` input.

    # --- Inputs ---------------------------------------------------------
    context: str = dspy.InputField(
        description="Context for the task. This should be a clear and concise description of the task to be performed by the model."
    )
    problem_statement: str = dspy.InputField(
        description="The problem statement provided in the PR. This should be a clear and concise description of the issue being addressed by the PR."
    )
    hints_text: str = dspy.InputField(
        description="Hints text related to the PR. Can be empty as well. If empty do not flag the PR as problematic."
    )

    # --- Outputs --------------------------------------------------------
    # Free-form category label. This was previously typed `Literal[""]`, which
    # allows only the empty string and therefore could never carry a category.
    # `str` still accepts "" so existing behaviour remains valid.
    # TODO(review): narrow to `Literal[...]` once the category set is decided.
    problem_category: str = dspy.OutputField(
        description="Category of the problem statement.."
    )

    explanation: str = dspy.OutputField(
        description="Explanation for the decision if the PR is flagged as problematic,Should be CONCISE. If not flagged, this should be an empty string. "
    )
    flagged: bool = dspy.OutputField(
        description="Flag indicating whether the PR is problematic if there is a contradiction between the problem statement and the hints_text. If there is a contradiction, this should be True; otherwise, it should be False."
    )

# Task instructions for the model. This text is passed as the `context` input
# field of ResponseSignature on every call to `predict`.
context = """
Your task is to determine whether the **problem statement provided in a GitHub pull request (PR)** is consistent with the **hints text**.
Specifically, check if any hints text contradicts or corrects the original problem statement. For example, the PR description might say the bug is in file X,
but a hint might clarify that the real issue is in file Y. In such cases, you should flag the PR as problematic.
Provide an explanation for your decision IF you flag the input.

If the problem statement is consistent and unambiguous based on the hints text, return an EMPTY STRING and DO NOT flag the input.
"""

# DSPy predictor built from ResponseSignature; it is called with `context`,
# `problem_statement` and `hints_text` keyword arguments.
predict = dspy.Predict(ResponseSignature)


In [None]:
import pandas as pd
from datasets import load_dataset
# Fetch the "dev" split of the SWE-bench dataset.
dataset = load_dataset("princeton-nlp/SWE-bench", split="dev")

# Convert to a DataFrame so individual rows and columns are easy to inspect.
df = pd.DataFrame(data=dataset)

# Preview the first three rows.
df.head(n=3)

In [None]:
# Try the predictor on a single example: the row with index label 15.
sample_row = df.loc[15]

response = predict(
    context=context,
    problem_statement=sample_row["problem_statement"],
    hints_text=sample_row["hints_text"],
)

# Display the structured prediction.
response

In [None]:
import json

def analyze_responses(df, output_file="analysis_results.json", n=20):
    """Run the consistency check on the first ``n`` rows of ``df`` and save the verdicts as JSON.

    Every selected row is sent to the notebook-level ``predict`` callable together
    with the notebook-level ``context`` prompt, so both names must already be
    defined when this function is called.

    Args:
        df: DataFrame with ``problem_statement`` and ``hints_text`` columns.
        output_file: Path of the JSON file to write. An existing file is
            overwritten.
        n: Maximum number of rows to analyze, counted by position from the top
            of ``df`` (index labels are irrelevant). ``None`` analyzes every
            row; zero or a negative value analyzes none.

    Returns:
        None. The result is a JSON list written to ``output_file`` containing
        one ``{"flagged": ..., "explanation": ...}`` object per analyzed row,
        in row order.
    """
    # Select rows by position. Comparing the index label against ``n`` (as a
    # plain ``for i, row in df.iterrows()`` loop would) is only correct for a
    # default 0..N-1 index: a filtered or re-indexed frame would be truncated
    # at the wrong place and a non-numeric index would raise TypeError.
    selected = df if n is None else df.head(max(n, 0))

    results = []
    for _, row in selected.iterrows():
        response = predict(
            context=context,
            problem_statement=row["problem_statement"],
            hints_text=row["hints_text"],
        )
        results.append({
            "flagged": response.flagged,
            "explanation": response.explanation,
        })

    # Overwrite the file (reset) with new results
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    print(f"Results saved to {output_file}")


In [None]:
# Run the batch analysis with the function's default row limit and write the results to disk.
analyze_responses(df=df, output_file="analysis_results.json")