In [None]:
import pandas as pd
import json
import time
import openai
import os
from dotenv import load_dotenv

# ✅ Run python-dotenv's loader before any environment lookups below
load_dotenv()

# ✅ Build the OpenAI client from the OPENAI_API_KEY environment variable
#    (os.environ.get yields None when the variable is not set)
API_KEY = os.environ.get("OPENAI_API_KEY")
client = openai.Client(api_key=API_KEY)

# ✅ Read the question table; the main loop reads its columns
#    qid, subid, subquestion and full_question
df = pd.read_csv("unsw.csv")

# ✅ Prompt template with full background

def build_prompt(full_question, subquestion):
    """Assemble the solver prompt for one sub-question.

    Args:
        full_question: Background text of the whole problem; interpolated
            with normal string formatting, so non-string values are rendered
            via ``format()``.
        subquestion: The specific sub-question to solve; interpolated the
            same way.

    Returns:
        The prompt as a single newline-terminated string that asks for a
        JSON array of step objects (step / desc / expr / value).
    """
    prompt_lines = (
        "You are a statistics master student.",
        "",
        "Here is the full background of the problem:",
        f"{full_question}",
        "",
        "Now, please solve the following sub-question step-by-step using structured reasoning:",
        "",
        f"{subquestion}",
        "",
        "For each step, return a JSON object with:",
        "- step: the step number (or \"final\" if it's the final answer),",
        "- desc: a short description of what you are doing,",
        "- expr: a math expression if applicable,",
        "- value: the computed result if any",
        "",
        "Format the full output as a JSON array of steps.",
        "Return ONLY a valid JSON array. No explanations. No markdown.",
        "",  # trailing empty entry yields the final newline
    )
    return "\n".join(prompt_lines)

# ✅ GPT call function (strips Markdown code fences; returns an error record instead of raising)
def call_gpt4o(prompt, qid, subid, model="gpt-4o"):
    """Send one prompt to the chat model and decode its JSON reply.

    Args:
        prompt: Prompt text, sent as a single user message.
        qid: Question identifier; used only in the printed error lines.
        subid: Sub-question identifier; used only in the printed error lines.
        model: Name of the chat model passed to the API.

    Returns:
        The decoded JSON reply on success (the prompt requests a list of
        step objects, but the shape is not validated here). If the reply is
        not valid JSON, a one-element list
        ``[{"step": "error", "desc": "Invalid JSON output", "raw": ...}]``
        carrying the first 300 characters of the reply. For any other
        failure, ``[{"step": "error", "desc": <exception text>}]``.
    """
    # Bound before the try block so the JSON handler below can always report
    # it, even when a JSONDecodeError is raised before any reply text exists.
    reply = ""
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )
        # The message content is optional in the API response; treat a
        # missing body as empty text so it is reported as invalid JSON
        # rather than as an API failure.
        reply = (response.choices[0].message.content or "").strip()

        # Clean markdown wrappers
        if reply.startswith("```json"):
            reply = reply.removeprefix("```json").removesuffix("```").strip()
        elif reply.startswith("```"):
            reply = reply.removeprefix("```").removesuffix("```").strip()

        return json.loads(reply)

    except json.JSONDecodeError as jde:
        print(f"❌ JSON decoding failed ({qid}-{subid}):", jde)
        return [{"step": "error", "desc": "Invalid JSON output", "raw": reply[:300]}]
    except Exception as e:
        # Deliberate catch-all: one failed call must not abort the batch.
        print(f"❌ API call failed ({qid}-{subid}):", e)
        return [{"step": "error", "desc": str(e)}]

# ✅ Solve every sub-question in the table, one API call per row
solutions = []

for _, row in df.iterrows():
    qid, subid = row["qid"], row["subid"]
    subq, fullq = row["subquestion"], row["full_question"]

    print(f"\n🧠 Solving: {qid}-{subid}")

    prompt = build_prompt(fullq, subq)
    steps = call_gpt4o(prompt, qid, subid)

    # Keep the inputs next to the model's steps so each record is self-contained
    record = dict(
        qid=qid,
        subid=subid,
        subquestion=subq,
        full_question=fullq,
        steps=steps,
    )
    solutions.append(record)

    # Pause between consecutive requests
    time.sleep(2)

# ✅ Write the collected records to disk
with open("gpt4o_subquestion_solutions.json", "w") as out_file:
    json.dump(solutions, out_file, indent=2, ensure_ascii=False)

print("\n✅ All sub-questions processed! Results saved to gpt4o_subquestion_solutions.json")

import math

# ✅ Recursively convert NaN to None (valid JSON null)
def clean_nan(obj):
    """Return a copy of *obj* in which every float NaN is replaced by None.

    Dicts and lists are rebuilt recursively (dict keys are left as they
    are); any other value, including tuples and non-NaN floats, is returned
    unchanged.
    """
    if isinstance(obj, dict):
        return {key: clean_nan(val) for key, val in obj.items()}
    if isinstance(obj, list):
        return list(map(clean_nan, obj))
    is_nan_float = isinstance(obj, float) and math.isnan(obj)
    return None if is_nan_float else obj

# ✅ Swap every NaN in the collected solutions for None
cleaned_solutions = clean_nan(solutions)

# ✅ Overwrite the earlier dump with the NaN-free version
with open("gpt4o_subquestion_solutions.json", mode="w") as out_file:
    json.dump(cleaned_solutions, out_file, indent=2, ensure_ascii=False)

print("\n✅ Cleaned and saved (NaN → null). File written: gpt4o_subquestion_solutions.json")


In [None]:
import requests

# Re-run the dotenv loader, then read the DeepSeek credentials
load_dotenv()
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"  # Verify endpoint

# Read back the solutions file written by the solving step
with open("gpt4o_subquestion_solutions.json") as solutions_file:
    solutions = json.loads(solutions_file.read())

# Grading prompt template
def build_grading_prompt(subquestion, steps_json):
    """Assemble the grading prompt for one solved sub-question.

    Args:
        subquestion: Text of the sub-question being graded.
        steps_json: JSON-serialisable solution steps; rendered into the
            prompt with 2-space indentation and without ASCII escaping.

    Returns:
        The prompt as a single newline-terminated string containing the
        sub-question, the rendered steps, the 1-5 scoring rubric and the
        JSON object layout the grader must reply with.
    """
    rendered_steps = json.dumps(steps_json, ensure_ascii=False, indent=2)

    # Part 1: what is being graded
    submission = (
        "You are a statistics tutor. Please grade a student's step-by-step solution to a sub-question.\n"
        "\n"
        "Sub-question:\n"
        f"{subquestion}\n"
        "\n"
        "Student's steps:\n"
        f"{rendered_steps}\n"
        "\n"
    )

    # Part 2: grading instructions and rubric
    instructions = (
        "Now, do the following:\n"
        "1. Evaluate if each step is correct or flawed. If flawed, explain why.\n"
        "2. Give a short comment for each step.\n"
        "3. Give an overall score out of 5 and a short feedback.\n"
        "4. Use the following scoring rubric:\n"
        "   1 - Completely incorrect: Major logical flaws, fundamental misunderstandings, or missing core steps. Lacks basic understanding.\n"
        "   2 - Weak: Some grasp of the method, but contains multiple errors, flawed reasoning, or incoherent structure.\n"
        "   3 - Satisfactory: Main method is correct, includes key steps, but has some calculation or explanation issues.\n"
        "   4 - Good: Mostly correct, logically structured, only minor issues such as small errors or slightly informal reasoning.\n"
        "   5 - Excellent: Fully correct, well-organized, rigorous and clear reasoning. A model solution.\n"
        "5. Format your output as the following JSON:\n"
        "\n"
    )

    # Part 3: required reply layout (plain strings, so braces need no escaping)
    reply_layout = (
        "{\n"
        '  "score": X,\n'
        '  "total": 5,\n'
        '  "feedback": "...",\n'
        '  "step_feedback": [{"step": ..., "comment": "..."}, ...]\n'
        "}\n"
        "Return ONLY a valid JSON object. No markdown, no extra explanation.\n"
    )

    return submission + instructions + reply_layout

# DeepSeek grading function (posts to the chat-completions endpoint with requests)
def grade_with_deepseek(question, steps, qid, subid, model="deepseek-chat"):
    """Ask DeepSeek to grade one step-by-step solution.

    Args:
        question: Text of the sub-question that was solved.
        steps: The solution steps to grade (JSON-serialisable).
        qid: Question identifier; used only in the printed error lines.
        subid: Sub-question identifier; used only in the printed error lines.
        model: DeepSeek model name sent in the request payload.

    Returns:
        The decoded JSON object from the grader on success. On any failure
        (HTTP/network error, timeout, reply that is not a JSON object, or an
        unexpected exception) a fallback dict with ``score`` 0, ``total`` 5,
        a ``feedback`` message and an empty ``step_feedback`` list, so the
        caller can always use ``.get`` on the result.
    """
    prompt = build_grading_prompt(question, steps)
    try:
        headers = {
            "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
            "Content-Type": "application/json"
        }
        payload = {
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0
        }

        # (connect, read) timeout: without one a stalled connection would
        # block the whole batch indefinitely. A timeout raises a
        # RequestException subclass and is handled below.
        response = requests.post(
            DEEPSEEK_API_URL, headers=headers, json=payload, timeout=(10, 120)
        )
        response.raise_for_status()

        reply = response.json()["choices"][0]["message"]["content"].strip()

        # Clean markdown formatting. removeprefix/removesuffix only drop the
        # fence when it is actually present, so a reply without a closing
        # fence does not lose the last characters of its JSON.
        if reply.startswith("```json"):
            reply = reply.removeprefix("```json").removesuffix("```").strip()
        elif reply.startswith("```"):
            reply = reply.removeprefix("```").removesuffix("```").strip()

        grading = json.loads(reply)
        if not isinstance(grading, dict):
            # Valid JSON but not an object (e.g. a bare list)
            print(f"❌ JSON parsing failed ({qid}-{subid}): expected a JSON object, got {type(grading).__name__}")
            return {"score": 0, "total": 5, "feedback": "Invalid JSON output", "step_feedback": []}
        return grading

    except json.JSONDecodeError as jde:
        print(f"❌ JSON parsing failed ({qid}-{subid}):", jde)
        return {"score": 0, "total": 5, "feedback": "Invalid JSON output", "step_feedback": []}
    except requests.exceptions.RequestException as re:
        print(f"❌ API request failed ({qid}-{subid}):", re)
        return {"score": 0, "total": 5, "feedback": f"API Error: {str(re)}", "step_feedback": []}
    except Exception as e:
        # Deliberate catch-all: one failed grading must not abort the batch.
        print(f"❌ Unexpected error ({qid}-{subid}):", e)
        return {"score": 0, "total": 5, "feedback": str(e), "step_feedback": []}

# Grade every stored solution with DeepSeek, one request per item
graded_results = []
for item in solutions:
    qid, subid = item["qid"], item["subid"]
    question, steps = item["subquestion"], item["steps"]

    print(f"📝 Grading: {qid}-{subid}")
    grading = grade_with_deepseek(question, steps, qid, subid)

    # Identification fields first, then the grader's fields with fallbacks
    # for anything the reply left out
    result = {"qid": qid, "subid": subid, "subquestion": question}
    for field, fallback in (("score", 0), ("total", 5), ("feedback", ""), ("step_feedback", [])):
        result[field] = grading.get(field, fallback)
    graded_results.append(result)

    time.sleep(1)  # Rate limiting

# Persist the grading results as UTF-8 JSON
with open("deepseek_grading_results.json", "w", encoding='utf-8') as out_file:
    json.dump(graded_results, out_file, indent=2, ensure_ascii=False)

print("✅ All sub-questions graded and saved as deepseek_grading_results.json")

In [None]:
# Load the grading results produced by the grading step
with open("deepseek_grading_results.json", "r", encoding="utf-8") as results_file:
    graded = json.load(results_file)

# Keep a copy of each item scored below its total, tagged with its position
# in the graded list (missing fields default to score 0 / total 5)
wrong = [
    {**entry, "index": position}
    for position, entry in enumerate(graded)
    if entry.get("score", 0) < entry.get("total", 5)
]

# Write the filtered list out as JSON
with open("wrong_subquestions.json", "w", encoding="utf-8") as wrong_file:
    json.dump(wrong, wrong_file, indent=2, ensure_ascii=False)

print(f"✅ Extracted {len(wrong)} incorrect sub-questions and saved to wrong_subquestions.json with original indices recorded.")

In [None]:
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import numpy as np

# ✅ Load grading results
with open("deepseek_grading_results.json", "r", encoding="utf-8") as f:
    graded = json.load(f)

# ✅ Define index ranges for each course category
#    (positions in the graded list; items outside every range are ignored)
category_ranges = {
    "Statistical Inference": range(0, 39),
    "Computational Methods for Finance": range(39, 119),
    "Optimization": range(119, 134)
}

# ✅ Top of the grading scale. Scores are given out of 5, so this is a fixed
#    constant rather than the highest score that happens to occur in a
#    category (a category without any 5s must still report 0 at score 5).
full_score = 5

# ✅ Initialize statistics container
category_distribution = defaultdict(lambda: {"score_counts": Counter(), "total": 0})

# ✅ Iterate over grading results and categorize by index
for idx, item in enumerate(graded):
    for category, index_range in category_ranges.items():
        if idx in index_range:
            score = item.get("score", 0)
            category_distribution[category]["score_counts"][score] += 1
            category_distribution[category]["total"] += 1
            break

# ✅ Build summary table (one row per configured category, including
#    categories that received no items, so every row has the same columns)
summary_data = []
for category in category_ranges:
    data = category_distribution[category]
    total_questions = data["total"]
    # Accuracy = share of sub-questions that received the full score
    accuracy = data["score_counts"].get(full_score, 0) / total_questions if total_questions > 0 else 0

    row = {
        "Category": category,
        "Total Questions": total_questions,
        "Accuracy": f"{accuracy:.2%}"
    }

    # Add count and proportion for each score level (0 .. full_score)
    for score in range(full_score + 1):
        count = data["score_counts"].get(score, 0)
        row[f"Score {score} Count"] = count
        row[f"Score {score} Ratio"] = f"{(count / total_questions):.1%}" if total_questions > 0 else "0%"

    summary_data.append(row)

# ✅ Convert to DataFrame
df_summary = pd.DataFrame(summary_data)

# ✅ Custom subject order and score levels
#    (score 0 is not plotted)
custom_order = ["Optimization", "Computational Methods for Finance", "Statistical Inference"]
score_levels = [1, 2, 3, 4, 5]

# ✅ Compute ratio data for plotting straight from the counts, instead of
#    parsing the rounded percentage strings back out of the summary table
subject_ratios = {}
for cat in custom_order:
    cat_stats = category_distribution[cat]
    cat_total = cat_stats["total"]
    subject_ratios[cat] = [
        cat_stats["score_counts"].get(score, 0) / cat_total if cat_total > 0 else 0.0
        for score in score_levels
    ]

# ✅ Convert to matrix (score as x-axis)
ratios_matrix = np.array(list(subject_ratios.values())).T  # shape: (num_scores, num_categories)

# ✅ Plot settings
bar_width = 0.2
x = np.arange(len(score_levels))
colors = ["#1f77b4", "#ff7f0e", "#2ca02c"]  # Blue, Orange, Green — match custom_order

plt.figure(figsize=(10, 6))

# ✅ Draw bar chart for each subject
for i, cat in enumerate(custom_order):
    # Centre the group of bars on each tick (equals (i - 1) for three subjects)
    offsets = x + (i - (len(custom_order) - 1) / 2) * bar_width
    heights = ratios_matrix[:, i]
    bars = plt.bar(offsets, heights, width=bar_width, label=cat, color=colors[i])

    # ✅ Add percentage labels
    for bar, height in zip(bars, heights):
        plt.text(bar.get_x() + bar.get_width()/2, height + 0.01,
                 f"{height:.1%}", ha='center', va='bottom', fontsize=9)

# ✅ Chart decoration
plt.xticks(x, [str(s) for s in score_levels])
plt.xlabel("Score")
plt.ylabel("Proportion")
plt.ylim(0, 1.05)
plt.title("Score Distribution of gpt4o by Subject")
plt.legend(loc="upper left")
plt.grid(axis="y", linestyle="--", alpha=0.5)

plt.tight_layout()
plt.show()


In [None]:
# Error types mapped to their counts (hard-coded tallies; how they were
# obtained is not shown in this notebook)
error_type_counts = {
    "Lack of Justification or Explanation": 26,
    "Less Precise": 22,
    "Missing Intermediate Steps or Details": 6,
    "Unclear or Inconsistent Notation": 5,
    "Poor Structure or Lack of Formal Presentation": 2,
    "Incomplete Gradient/Hessian Analysis": 2,
    "Lack of Logical Flow": 2,
    "Missing Eigenvalue/Matrix-Related Reasoning": 1,
}
categories = list(error_type_counts)
counts = list(error_type_counts.values())

# Draw the pie chart, one wedge per error type with a percentage label
plt.figure(figsize=(8, 8))
plt.pie(
    counts,
    labels=categories,
    autopct='%1.1f%%',
    startangle=140,
    textprops={'fontsize': 10},
)
plt.title("GPT-4o Error Type Distribution on UNSW Dataset", fontsize=14)
plt.axis('equal')

plt.tight_layout()
plt.show()