In [1]:
import os

# Run the notebook from the repository root so that the relative "data/..."
# paths used by later cells resolve. Set the AI_LIBRARIAN_ROOT environment
# variable to point at your own checkout; the fallback is the original
# author's local path and only exists on that machine.
os.chdir(
    os.environ.get(
        "AI_LIBRARIAN_ROOT", "/Users/yenchenchou/Documents/GitHub/ai-librarian"
    )
)

In [12]:
import asyncio
import json
import logging
import os
import random
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

import aiohttp
import openai
import requests
from openai import OpenAI
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from tqdm import tqdm


In [13]:
# Load environment variables, then build the OpenAI client that the
# classification cell uses, passing it the OPENAI_API_KEY value (None if unset).
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [62]:
# Load the questions data (later cells read the "question" key of each entry).
# The encoding is pinned to UTF-8 so that reading does not depend on the
# platform's locale default.
with open("data/questions/uwm_questions.json", encoding="utf-8") as f:
    questions = json.load(f)


def classify_question(question: str) -> str:
    """
    Classify a question using OpenAI API.

    Args:
        question: The question text; it is substituted into the
            classification prompt below.

    Returns:
        The raw text content of the first choice in the model's response.
        The prompt asks for a JSON object with ``category``, ``confidence``
        and ``explanation``, but the text is returned unparsed; use
        ``parse_classification`` to turn it into a dict.

    Raises:
        ValueError: If the response's first choice carries no text content.
    """
    prompt = """Classify the following questions from University of Wisconsin-Milwaukee (UWM) https://uwm.libanswers.com/search/ into one of these categories.
    Some questions seem to be genneral questions but you need take into the the factor that they are from UWM students.
    1. GENERAL_QA: General questions not related to the University of Wisconsin-Milwaukee.
    2. SCHOOL_SPECIFIC_WITH_AI_AGENT_POTENTIAL: Questions requiring UWM-specific knowledge or even internet resources that would benefit from AI assistance to provide comprehensive answers, such as developing research strategies or creating step-by-step plans using UWM resources.
    3. SCHOOL_SPECIFIC_WITH_FACT_ONLY: Questions requiring UWM-specific knowledge that can be answered with straightforward facts, without needing additional reasoning or strategic planning. These typically have a single, definitive answer from UWM policies or resources.

    Question: {question}

    Respond in JSON format (without markdown code block) with:
    - category: The chosen category
    - confidence: Confidence score (0-1)
    - explanation: Brief explanation of classification
    """

    response = client.chat.completions.create(
        # model="gpt-4o-2024-08-06",
        model="o4-mini",
        messages=[
            {"role": "system", "content": "You are a library question classifier."},
            {"role": "user", "content": prompt.format(question=question)},
        ],
        # temperature=0.0,
    )

    content = response.choices[0].message.content
    if content is None:
        # Fail here with a clear message instead of handing None to the JSON
        # parsing step, where it would surface as an unrelated AttributeError.
        raise ValueError(
            f"Model returned no text content for question: {question!r}"
        )
    return content


def parse_classification(classification_str: str) -> dict:
    """
    Parse classification string that might be wrapped in markdown code block.

    Surrounding whitespace is ignored, and an opening fence is accepted both
    with a language tag (```json) and without one (```).

    Args:
        classification_str: JSON text, optionally enclosed in a markdown
            code fence.

    Returns:
        The value decoded by ``json.loads`` from the unwrapped text.

    Raises:
        json.JSONDecodeError: If the unwrapped text is not valid JSON.
    """
    # Strip first so that leading/trailing newlines around the fences do not
    # hide them from the startswith/endswith checks below.
    text = classification_str.strip()

    # Remove the opening fence and, if present, its "json" language tag.
    if text.startswith("```"):
        text = text[3:]
        if text[:4].lower() == "json":
            text = text[4:]

    # Remove the closing fence.
    if text.endswith("```"):
        text = text[:-3]

    # Clean up whitespace left inside the fences and parse JSON
    return json.loads(text.strip())


# Classify all questions: one API call per question, parsed into a dict and
# stored next to the original question text.
print("Classifying questions...")
classifications = []
for q in tqdm(questions):
    result = classify_question(q["question"])
    classifications.append(
        {"question": q["question"], "classification": parse_classification(result)}
    )

# Aggregate results: count how many questions fell into each category.
category_counts = {}
for c in classifications:
    cat = c["classification"]["category"]
    category_counts[cat] = category_counts.get(cat, 0) + 1

# Save detailed results
output_path = Path("data/analysis/classification_results.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(classifications, f, indent=2)

# Display summary
print("\nClassification Summary:")
print("-" * 40)
total = len(classifications)
# Size the label column to the longest category name (never narrower than the
# previous fixed width of 25) so the count/percentage columns stay aligned.
label_width = max(25, max((len(str(name)) for name in category_counts), default=0))
for category, count in category_counts.items():
    percentage = (count / total) * 100
    print(f"{category:{label_width}} {count:3d} ({percentage:.1f}%)")

# Show examples
print("\nExample Questions by Category:")
print("-" * 40)
for category in category_counts:
    print(f"\n{category}:")
    examples = [
        c["question"]
        for c in classifications
        if c["classification"]["category"] == category
    ]
    for ex in examples[:2]:  # Show up to 2 examples
        print(f"- {ex}")

print(f"\nDetailed results saved to {output_path}")

Classifying questions...


100%|██████████| 74/74 [04:00<00:00,  3.25s/it]


Classification Summary:
----------------------------------------
SCHOOL_SPECIFIC_WITH_FACT_ONLY  55 (74.3%)
GENERAL_QA                 10 (13.5%)
SCHOOL_SPECIFIC_WITH_AI_AGENT_POTENTIAL   9 (12.2%)

Example Questions by Category:
----------------------------------------

SCHOOL_SPECIFIC_WITH_FACT_ONLY:
- What is Search@UW?
- How can I search for items that are only in the Curriculum Collection at UW-Milwaukee?

GENERAL_QA:
- What are "scholarly sources"?
- What is open access and how do I know if the article is open access?

SCHOOL_SPECIFIC_WITH_AI_AGENT_POTENTIAL:
- How do I get help with research?
- Can you help me find autobiographies and biographies for students in grades 1-3?

Detailed results saved to data/analysis/classification_results.json





In [63]:
# Display the per-category tallies built by the classification cell.
category_counts

{'SCHOOL_SPECIFIC_WITH_FACT_ONLY': 55,
 'GENERAL_QA': 10,
 'SCHOOL_SPECIFIC_WITH_AI_AGENT_POTENTIAL': 9}