In [None]:
import os
import pickle
from pathlib import Path
from typing import Any

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# The relative paths used throughout this notebook ("./data/...", "./results/...")
# are resolved against the directory that contains ``data/blocking``. Step up to
# the parent only when the current directory is not already that directory, so
# re-running this cell does not climb one more level on every execution.
if not Path("./data/blocking").is_dir():
    os.chdir(os.path.dirname(os.getcwd()))

# ``Path.iterdir`` yields entries in arbitrary order; sorting by name keeps runs
# (and positional slices such as ``data_dirs[1:]``) reproducible. Entries that
# are not directories are skipped because every dataset is later read as a
# directory containing ``matches.csv``.
data_dirs = sorted(
    (
        d
        for d in Path("./data/blocking").iterdir()
        if d.is_dir() and d.name not in ["songs", "citeseer-dblp"]
    ),
    key=lambda d: d.name,
)

In [None]:
# Directory holding the pickled candidate results; the loop in this cell reads
# "<result_dir>/sparse_join/<dataset>.pickle" and "<result_dir>/dense/<dataset>.pickle".
result_dir = Path("./results/debug/")

def evaluate(
    candidates: list[set[tuple[Any, Any]]],
    matches: set[tuple[Any, Any]],
    *,
    threshold: float = 0.9,
) -> dict:
    """Score cumulative candidate batches against the ground-truth pairs.

    The batches are unioned one after another. After each union the precision
    and recall of the accumulated set are recorded, and the step-wise average
    precision ``sum(precision_i * (recall_i - recall_{i-1}))`` is accumulated.
    The reported operating point is the first step whose recall is strictly
    greater than ``threshold``; if no step qualifies, the last step is used.

    Args:
        candidates: Candidate pair sets, one per step, in evaluation order.
        matches: Ground-truth pairs.
        threshold: Recall level (fraction, not percent) that selects the
            reported operating point.

    Returns:
        A dict with, in this order, ``"AP"`` (average precision), ``"PC"``
        (recall at the operating point), ``"PQ"`` (precision at the operating
        point) and ``"F1"``, all scaled to percent, plus ``"K"``, the index of
        the operating point as a float (0 means "before any batch").
        NOTE(review): PC/PQ presumably stand for pair completeness / pair
        quality -- confirm against the paper or report this feeds.

    Undefined ratios do not raise: precision of an empty accumulated set is
    taken as 1.0 (matching the seed point), recall against an empty ``matches``
    is 0.0, and F1 is 0.0 when precision and recall are both zero.
    """
    total_matches = len(matches)
    pool = set()
    # Seed point of the curve: nothing retrieved yet -> precision 1, recall 0.
    precisions, recalls = [1], [0]

    average_precision = 0
    for batch in candidates:
        # In-place union: ``pool`` is local, the caller's batch is not modified.
        pool |= batch

        tp = len(pool & matches)
        precision = tp / len(pool) if pool else 1.0
        recall = tp / total_matches if total_matches else 0.0
        average_precision += precision * (recall - recalls[-1])

        precisions.append(precision)
        recalls.append(recall)

    # First step whose recall exceeds the threshold, else the final step.
    k = next(
        (i for i, r in enumerate(recalls) if r > threshold),
        len(recalls) - 1,
    )
    precision, recall = precisions[k], recalls[k]

    pr_sum = precision + recall
    f1 = 2 * (precision * recall) / pr_sum if pr_sum else 0.0

    return {
        "AP": average_precision * 100,
        "PC": recall * 100,
        "PQ": precision * 100,
        "F1": f1 * 100,
        "K": float(k),
    }

for d in data_dirs:
    print(d.name)

    # Ground truth: every row of matches.csv becomes one plain tuple in the set.
    matches_path = d / "matches.csv"
    matches = {
        row
        for row in pd.read_csv(matches_path).itertuples(index=False, name=None)
    }

    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # load result files that were produced by this project.
    sparse_pickle = result_dir / "sparse_join" / f"{d.name}.pickle"
    with sparse_pickle.open("rb") as fh:
        candidates_sparse = pickle.load(fh)

    dense_pickle = result_dir / "dense" / f"{d.name}.pickle"
    with dense_pickle.open("rb") as fh:
        candidates_dense = pickle.load(fh)

    # Merge the two result lists position by position. Each merged step keeps
    # only the pairs that no earlier merged step has already emitted.
    # (zip stops at the shorter of the two lists.)
    emitted = set()
    candidates = []
    for sparse_step, dense_step in zip(candidates_sparse, candidates_dense):
        fresh = (sparse_step | dense_step) - emitted
        candidates.append(fresh)
        emitted.update(fresh)

    # One "&"-separated row of metric values per dataset.
    metrics = evaluate(candidates, matches)
    print(" & ".join(str(value) for value in metrics.values()))
    
#     prs = []
#     _, recalls, precisions = evaluate(candidates_sparse, matches)
#     for i, (r, p) in enumerate(zip(recalls, precisions)):
#         prs.append({
#             "Data": d.name,
#             "Type": "sparse",
#             "Recall": r,
#             "Precision": p,
#             "k": i,
#         })

#     _, recalls, precisions = evaluate(candidates_dense, matches)
#     for i, (r, p) in enumerate(zip(recalls, precisions)):
#         prs.append({
#             "Data": d.name,
#             "Type": "dense",
#             "Recall": r,
#             "Precision": p,
#             "k": i,
#         })
    
#     df = pd.DataFrame(prs)
#     sns.relplot(data=df, x="k", y="Recall", kind="line", hue="Type", sort=False).set(title=f"{d.name}")

In [None]:
# Directory holding the pickled candidate results; the loop at the end of this
# cell reads "<result_dir>/sparse_join/<dataset>.pickle" and
# "<result_dir>/dense/<dataset>.pickle".
result_dir = Path("./results/debug/")

def evaluate(
    candidates: list[set[tuple[Any, Any]]],
    matches: set[tuple[Any, Any]],
    *,
    threshold: float = 0.9,
) -> tuple[dict, list, list]:
    """Score cumulative candidate batches and return rounded percent metrics.

    The batches are unioned one after another and the precision/recall of the
    accumulated set is recorded after each union. ``"AP"`` is the area under
    that precision-recall curve by the trapezoidal rule. The area is
    accumulated inline because this notebook never imports or defines the
    ``auc`` helper this function used to call.

    NOTE: a second ``evaluate`` is defined right after this one in the same
    cell and replaces it at run time.

    Args:
        candidates: Candidate pair sets, one per step, in evaluation order.
        matches: Ground-truth pairs.
        threshold: Recall level (fraction) selecting the reported operating
            point: the first step whose recall is strictly greater, or the
            last step if none is.

    Returns:
        ``(metrics, recalls, precisions)``. ``metrics`` holds ``"AP"``,
        ``"PC"`` (recall), ``"PQ"`` (precision) and ``"F1"`` in percent rounded
        to two decimals, plus the integer step index ``"K"``. ``recalls`` and
        ``precisions`` are the full curves, starting with the seed point
        (recall 0, precision 1).

    Undefined ratios do not raise: precision of an empty accumulated set is
    1.0, recall against an empty ``matches`` is 0.0, and F1 is 0.0 when
    precision and recall are both zero.
    """
    total_matches = len(matches)
    pool = set()
    # Seed point of the curve: nothing retrieved yet -> precision 1, recall 0.
    precisions, recalls = [1], [0]

    average_precision = 0.0
    for batch in candidates:
        # In-place union: ``pool`` is local, the caller's batch is not modified.
        pool |= batch

        tp = len(pool & matches)
        precision = tp / len(pool) if pool else 1.0
        recall = tp / total_matches if total_matches else 0.0

        # Trapezoid between the previous curve point and this one.
        average_precision += (recall - recalls[-1]) * (precision + precisions[-1]) / 2

        precisions.append(precision)
        recalls.append(recall)

    # First step whose recall exceeds the threshold, else the final step.
    k = next(
        (i for i, r in enumerate(recalls) if r > threshold),
        len(recalls) - 1,
    )
    precision, recall = precisions[k], recalls[k]

    pr_sum = precision + recall
    f1 = 2 * (precision * recall) / pr_sum if pr_sum else 0.0

    return {
        "AP": round(average_precision * 100, 2),
        "PC": round(recall * 100, 2),
        "PQ": round(precision * 100, 2),
        "F1": round(f1 * 100, 2),
        "K": k,
    }, recalls, precisions

def evaluate(
    candidates: list[set[tuple[Any, Any]]],
    matches: set[tuple[Any, Any]],
    *,
    threshold: float = 0.9,
) -> tuple[dict, list, list]:
    """Score cumulative candidate batches; return metrics and the full curves.

    The batches are unioned one after another and the precision/recall of the
    accumulated set is recorded after each union. Two summaries of the
    resulting precision-recall curve are produced:

    * ``"AP"``: step-wise sum of ``precision_i * (recall_i - recall_{i-1})``.
    * ``"auc"``: trapezoidal area under the same curve.

    The trapezoidal area is accumulated inline. The previous code did
    ``auc = auc(recalls, precisions)``, which makes ``auc`` a local name and
    therefore always raised ``UnboundLocalError``; in addition no ``auc``
    helper is imported or defined in this notebook.

    Args:
        candidates: Candidate pair sets, one per step, in evaluation order.
        matches: Ground-truth pairs.
        threshold: Recall level (fraction) selecting the reported operating
            point: the first step whose recall is strictly greater, or the
            last step if none is.

    Returns:
        ``(metrics, recalls, precisions)``. ``metrics`` holds ``"AP"``,
        ``"auc"``, ``"PC"`` (recall), ``"PQ"`` (precision) and ``"F1"`` as
        fractions (not percent), plus the step index ``"K"`` as a float.
        ``recalls`` and ``precisions`` are the full curves, starting with the
        seed point (recall 0, precision 1).

    Undefined ratios do not raise: precision of an empty accumulated set is
    1.0, recall against an empty ``matches`` is 0.0, and F1 is 0.0 when
    precision and recall are both zero.
    """
    total_matches = len(matches)
    pool = set()
    # Seed point of the curve: nothing retrieved yet -> precision 1, recall 0.
    precisions, recalls = [1], [0]

    average_precision = 0
    area = 0.0
    for batch in candidates:
        # In-place union: ``pool`` is local, the caller's batch is not modified.
        pool |= batch

        tp = len(pool & matches)
        precision = tp / len(pool) if pool else 1.0
        recall = tp / total_matches if total_matches else 0.0

        recall_gain = recall - recalls[-1]
        average_precision += recall_gain * precision
        # Trapezoid between the previous curve point and this one.
        area += recall_gain * (precision + precisions[-1]) / 2

        precisions.append(precision)
        recalls.append(recall)

    # First step whose recall exceeds the threshold, else the final step.
    k = next(
        (i for i, r in enumerate(recalls) if r > threshold),
        len(recalls) - 1,
    )
    precision, recall = precisions[k], recalls[k]

    pr_sum = precision + recall
    f1 = 2 * (precision * recall) / pr_sum if pr_sum else 0.0

    return {
        "AP": average_precision,
        "auc": area,
        "PC": recall,
        "PQ": precision,
        "F1": f1,
        "K": float(k),
    }, recalls, precisions

for d in data_dirs[1:]:
    print(d.name)

    # Ground truth: every row of matches.csv becomes one plain tuple in the set.
    matches_path = d / "matches.csv"
    matches = {
        row
        for row in pd.read_csv(matches_path).itertuples(index=False, name=None)
    }

    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # load result files that were produced by this project.
    sparse_pickle = result_dir / "sparse_join" / f"{d.name}.pickle"
    with sparse_pickle.open("rb") as fh:
        candidates_sparse = pickle.load(fh)

    dense_pickle = result_dir / "dense" / f"{d.name}.pickle"
    with dense_pickle.open("rb") as fh:
        candidates_dense = pickle.load(fh)

    # Disabled alternative: evaluate the position-wise merge of both result
    # lists instead of plotting each one separately.
#     flag = set()
#     candidates = []
#     for s1, s2 in zip(candidates_sparse, candidates_dense):
#         s = s1 | s2
#         s = s - flag
#         candidates.append(s)
#         flag |= s

#     metrics, _, _ = evaluate(candidates, matches)
#     print(" & ".join(map(str, metrics.values())))

    # One record per (method, step) holding the cumulative recall/precision
    # after that step; sparse records come first, then dense.
    prs = []
    for kind, per_step in (("sparse", candidates_sparse), ("dense", candidates_dense)):
        _, recalls, precisions = evaluate(per_step, matches)
        prs.extend(
            {
                "Data": d.name,
                "Type": kind,
                "Recall": r,
                "Precision": p,
                "k": step,
            }
            for step, (r, p) in enumerate(zip(recalls, precisions))
        )

    # Recall as a function of the step index, one line per method.
    df = pd.DataFrame(prs)
    grid = sns.relplot(
        data=df, x="k", y="Recall", kind="line", hue="Type", sort=False
    )
    grid.set(title=f"{d.name}")