In [1]:
import sys
from pathlib import Path
import pandas as pd
from rdflib import Graph, URIRef, Literal, BNode, Namespace
from rdflib.namespace import RDF, RDFS, OWL, XSD, DC
from tqdm import tqdm

# Add the src directory to the Python path
sys.path.append(str(Path().resolve() / "src"))

## Parsing

In [14]:
from parser import parse_and_extract_articles_langs_from_dirs
import tomllib

ROOT = Path().resolve()
config_path =  "config.toml"

with open(config_path, "rb") as f:
    config = tomllib.load(f)

rdf_dirs = [ROOT / Path(d) for d in config["paths"]["rdf_dirs"]]

cache_dir = ROOT / "data" / "cache"

data_path = ROOT / "data" / "cinii_topics_benchmark" / "article_labels.csv"

In [3]:
df_en = pd.read_pickle(cache_dir / "en_articles.pkl")
df_jp = pd.read_pickle(cache_dir / "jp_articles.pkl")

In [4]:
df_csv = pd.read_csv(data_path)

data = pd.merge(df_csv, df_en, on="uri", how="inner")

data = data[data['confidence'] != 'skip']

In [6]:
avg_char_length = data['abstract'].apply(len).mean()
avg_word_length = data['abstract'].apply(lambda x: len(x.split())).mean()

print(f"Average number of characters: {avg_char_length:.2f}")
print(f"Average number of words: {avg_word_length:.2f}")

Average number of characters: 982.59
Average number of words: 146.67


In [7]:
from indexes import HNSWWrapper
from indexes import build_minhash, build_lshforest_index

from evaluation import (
    precision_at_k, recall_at_k, average_precision_at_k, ndcg_at_k, f1_at_k,  # f1 optional
)

assert {"uri", "title", "abstract"}.issubset(data.columns), "data must have uri/title/abstract"
assert data["uri"].is_unique, "uri must be unique"

In [8]:
texts = (data["title"].fillna("") + " " + data["abstract"].fillna("")).tolist()
uris  = data["uri"].astype(str).tolist()

hnsw = HNSWWrapper(
    texts=texts,
    uris=uris,
    emb_model_name="sentence-transformers/all-MiniLM-L6-v2",
    use_cosine=True,
    ef_search=128,
)

In [9]:
import json

tokens_path = ROOT / "data" / "tokens"
word_tokens_path = tokens_path / "words" / "en_words.json"
shingle_tokens_path = tokens_path / "k_shingles" / "en_k_shingles.json"

with open(word_tokens_path, "r") as f:
    word_tokens = json.load(f)

with open(shingle_tokens_path, "r") as f:
    shingle_tokens = json.load(f)

valid_uris = set(data["uri"])

word_tokens = {k: v for k, v in word_tokens.items() if k in valid_uris}
shingle_tokens = {k: v for k, v in shingle_tokens.items() if k in valid_uris}

In [15]:
if config["lsh"]["token_type"] == "words":
    tokens = word_tokens
    print("Using word tokens for LSH")

elif config["lsh"]["token_type"] == "k_shingles":
    tokens = shingle_tokens
    print("Using k-shingle tokens for LSH")

from datasketch import MinHash

mh_by_uri: dict[str, MinHash] = {}
for u, toks in tokens.items():
    if not isinstance(toks, list):
        raise TypeError("Each df1['tokens'] element must be a list[str]")
    mh_by_uri[u] = build_minhash([str(t) for t in toks], num_perm=config["lsh"]["num_perm"])

forest = build_lshforest_index(mh_by_uri, num_perm=config["lsh"]["num_perm"], l=config["lsh"]["forest_l"])

Using k-shingle tokens for LSH


In [None]:
label_col = "subtopic"  # or "subtopic"
k = 5
exclude_self = True

label_by_uri = dict(zip(data["uri"].astype(str), data[label_col].astype(str)))

def gold_set(query_uri: str) -> set[str]:
    lbl = label_by_uri[query_uri]
    return {u for u in uris if u != query_uri and label_by_uri[u] == lbl}

# HNSW IR 
p_list, r_list, ap_list, nd_list = [], [], [], []
for q in uris:
    gold = gold_set(q)
    if not gold:
        continue  # skip queries with no relevant items in gold

    raw = hnsw.query_by_uri(q, topk=k + (1 if exclude_self else 0), return_scores=False, exclude_self=False)
    ranked = [u for u in raw if (not exclude_self or u != q)][:k]

    rel = [1 if u in gold else 0 for u in ranked]
    num_rel = len(gold)

    p_list.append(precision_at_k(rel, k))
    r_list.append(recall_at_k(rel, k, num_rel))
    ap_list.append(average_precision_at_k(rel, k))
    nd_list.append(ndcg_at_k(rel, k, num_rel))

hnsw_results = {
    f"P@{k}": sum(p_list)/len(p_list) if p_list else float("nan"),
    f"R@{k}": sum(r_list)/len(r_list) if r_list else float("nan"),
    f"mAP@{k}": sum(ap_list)/len(ap_list) if ap_list else float("nan"),
    f"nDCG@{k}": sum(nd_list)/len(nd_list) if nd_list else float("nan"),
    "Queries": len(p_list),
}
hnsw_results

{'P@5': 0.3272727272727273,
 'R@5': 0.22972027972027972,
 'mAP@5': 0.5344696969696969,
 'nDCG@5': 0.36614279465027433,
 'Queries': 143}

In [None]:
# LSH Forest IR 
p_list, r_list, ap_list, nd_list = [], [], [], []
for q in uris:
    gold = gold_set(q)
    if not gold:
        continue

    cand = forest.query(mh_by_uri[q], k + (1 if exclude_self else 0))
    ranked = [u for u in cand if (not exclude_self or u != q)][:k]

    rel = [1 if u in gold else 0 for u in ranked]
    num_rel = len(gold)

    p_list.append(precision_at_k(rel, k))
    r_list.append(recall_at_k(rel, k, num_rel))
    ap_list.append(average_precision_at_k(rel, k))
    nd_list.append(ndcg_at_k(rel, k, num_rel))

lsh_results = {
    f"P@{k}": sum(p_list)/len(p_list) if p_list else float("nan"),
    f"R@{k}": sum(r_list)/len(r_list) if r_list else float("nan"),
    f"mAP@{k}": sum(ap_list)/len(ap_list) if ap_list else float("nan"),
    f"nDCG@{k}": sum(nd_list)/len(nd_list) if nd_list else float("nan"),
    "Queries": len(p_list),
}
lsh_results

{'P@5': 0.07552447552447553,
 'R@5': 0.04543865225683408,
 'mAP@5': 0.15297202797202797,
 'nDCG@5': 0.07519038691928903,
 'Queries': 143}

In [17]:
pd.DataFrame(
    [
        {"Method": "HNSW", **hnsw_results},
        {"Method": "MinHash+LSHForest", **lsh_results},
    ]
)[["Method", f"P@{k}", f"R@{k}", f"mAP@{k}", f"nDCG@{k}", "Queries"]]

Unnamed: 0,Method,P@5,R@5,mAP@5,nDCG@5,Queries
0,HNSW,0.56,0.110509,0.71613,0.575544,150
1,MinHash+LSHForest,0.332,0.040739,0.452343,0.324514,150


In [34]:
FOS_SUBTOPICS = [
    "Mathematics",
    "Computer and Information Sciences",
    "Physical Sciences",
    "Chemical Sciences",
    "Earth and Related Environmental Sciences",
    "Biological Sciences",
    "Other Natural Sciences",
    "Civil Engineering",
    "Electrical Engineering, Electronic Engineering, Information Engineering",
    "Mechanical Engineering",
    "Chemical Engineering",
    "Materials Engineering",
    "Medical Engineering",
    "Environmental Engineering",
    "Environmental Biotechnology",
    "Industrial Biotechnology",
    "Nanotechnology",
    "Other Engineering and Technologies",
    "Basic Medicine",
    "Clinical Medicine",
    "Health Sciences",
    "Medical Biotechnology",
    "Other Medical Sciences",
    "Agriculture, Forestry, and Fisheries",
    "Animal and Dairy Science",
    "Veterinary Science",
    "Agricultural Biotechnology",
    "Other Agricultural Sciences",
    "Psychology",
    "Economics and Business",
    "Educational Sciences",
    "Sociology",
    "Law",
    "Political Science",
    "Social and Economic Geography",
    "Media and Communications",
    "Other Social Sciences",
    "History and Archaeology",
    "Languages and Literature",
    "Philosophy, Ethics and Religion",
    "Art (Arts, History of Arts, Performing Arts, Music)",
    "Other Humanities",
]
FOS_MAJOR_TOPICS = [
    "Natural Sciences",
    "Engineering and Technology",
    "Medical and Health Sciences",
    "Agricultural Sciences",
    "Social Sciences",
    "Humanities",
]
ALLOWED_TOPICS = FOS_SUBTOPICS

In [None]:
import os, time
import pandas as pd
from typing import Literal
from pydantic import BaseModel, create_model
from openai import OpenAI
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# columns: ['uri', 'title', 'abstract', 'subtopic']
assert {"title", "abstract", "subtopic"}.issubset(data.columns)

# Dynamic Pydantic model with a Literal enum over topics
# (All fields required, additionalProperties=false under the hood)
TopicPred = create_model(
    "TopicPred",
    predicted_subtopic=(Literal[tuple(ALLOWED_TOPICS)], ...)  # required
)

In [None]:
MODEL = os.getenv("OPENAI_MODEL", "gpt-5-mini")

SYSTEM = "You classify scholarly records into ONE OECD FOS topic from the allowed set. Output must follow the schema."
USER_TPL = """Choose ONE subtopic from the allowed list.

ALLOWED SUBTOPICS:
{allowed}

RECORD
Title: {title}
Abstract: {abstract}
"""

def predict_one(title: str, abstract: str) -> str:
    user = USER_TPL.format(
        allowed="\n- " + "\n- ".join(ALLOWED_TOPICS),
        title=(title or "").strip(),
        abstract=(abstract or "").strip()
    )

    resp = client.responses.parse(
        model=MODEL,
        input=[{"role": "system", "content": SYSTEM},
               {"role": "user", "content": user}],
        text_format=TopicPred,         # <- Structured Outputs via Pydantic
    )

    # resp.output_parsed is a TopicPred instance
    return resp.output_parsed.predicted_subtopic

# Run predictions
preds = []
for _, row in data.iterrows():
    preds.append(predict_one(row["title"], row["abstract"]))
    time.sleep(0.1)  # pacing if needed

data_eval = data.copy()
data_eval["pred_subtopic"] = preds

In [None]:
gold = data_eval["subtopic"].astype(str).tolist() # major_topic or subtopic
pred = data_eval["pred_subtopic"].astype(str).tolist()

acc = accuracy_score(gold, pred)
prec, rec, f1, _ = precision_recall_fscore_support(gold, pred, average="macro", zero_division=0)

print(f"Accuracy:         {acc:.4f}")
print(f"Macro Precision:  {prec:.4f}")
print(f"Macro Recall:     {rec:.4f}")
print(f"Macro F1:         {f1:.4f}")

print("\nPer-class (head):")
print(classification_report(gold, pred, zero_division=0)[:2000])

Accuracy:         0.6467
Macro Precision:  0.4933
Macro Recall:     0.5174
Macro F1:         0.4788

Per-class (head):
                                                                         precision    recall  f1-score   support

                                             Agricultural Biotechnology       0.00      0.00      0.00         1
                                   Agriculture, Forestry, and Fisheries       0.80      0.50      0.62         8
                                               Animal and Dairy Science       0.33      1.00      0.50         1
                                                         Basic Medicine       0.60      0.50      0.55         6
                                                    Biological Sciences       0.64      0.75      0.69        12
                                                   Chemical Engineering       0.00      0.00      0.00         3
                                                      Chemical Sciences       0.67      0