In [None]:
import re
import pandas as pd
from collections import Counter
from indexes import build_minhash, build_lsh_index  # your helpers

def ensure_uri(df: pd.DataFrame, lang_code: str) -> pd.DataFrame:
    """Return a copy of `df` reduced to the columns uri, title, abstract, lang.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it must already have "title" and "abstract" columns
        (a missing one raises KeyError on the final column selection).
    lang_code : str
        Value written to every row of the "lang" column. It is also the
        prefix of generated URIs.

    Returns
    -------
    pd.DataFrame
        A new frame; `df` itself is not modified. When `df` has no "uri"
        column, URIs are generated as "<lang_code>_<row position>".
    """
    wanted = ["uri", "title", "abstract", "lang"]
    frame = df.copy()
    has_uri = "uri" in frame.columns
    if not has_uri:
        # Positional ids (0..n-1), independent of the frame's index labels.
        frame["uri"] = [f"{lang_code}_{pos}" for pos in range(len(frame))]
    frame["lang"] = lang_code
    return frame[wanted]

# Bring both language frames onto the shared (uri, title, abstract, lang) schema.
df_en2 = ensure_uri(df_en, "en")
df_jp2 = ensure_uri(df_jp, "ja")

# Corpus selection: exactly one assignment is active; swap the comments to switch.
# df_all = pd.concat([df_en2, df_jp2], ignore_index=True)
# df_all = df_en2.copy()  # only English for now
df_all = df_jp2.copy()  # only Japanese for now

# Discard rows that carry no text at all: a row is kept when its stripped title
# or its stripped abstract is non-empty (missing values count as empty).
title_clean = df_all["title"].fillna("").astype(str).str.strip()
abs_clean = df_all["abstract"].fillna("").astype(str).str.strip()
mask_nonempty = title_clean.ne("") | abs_clean.ne("")
df_all = df_all[mask_nonempty].reset_index(drop=True)
len(df_all)

In [None]:
# Drop rows whose abstract is a placeholder rather than real text.
# Abstracts are coerced with fillna("").astype(str) so the .str methods return a
# clean boolean mask even when the column holds non-string values.

# 1) Abstracts containing the literal marker "identifier:" (plain substring, not a regex).
before = len(df_all)
mask = df_all["abstract"].fillna("").astype(str).str.contains("identifier:", regex=False)
df_all = df_all.loc[~mask].reset_index(drop=True)
removed = int(mask.sum())
print(f"Removed {removed} rows (before: {before}, after: {len(df_all)})")

# 2) Abstracts that are nothing but an ellipsis placeholder, ignoring surrounding
#    whitespace: "...", "…", "‥", "……" or "。。。".
#    An earlier extra pass used the unescaped regex r"^..." for the "..." case.
#    Every "." in that regex is a wildcard, so it dropped any abstract starting
#    with three non-newline characters. That pass is removed; the anchored
#    pattern below already covers the literal "..." placeholder.
pattern = r"^\s*(\.{3}|…|‥|…{2}|。{3})\s*$"
before = len(df_all)
mask = df_all["abstract"].fillna("").astype(str).str.match(pattern)
df_all = df_all.loc[~mask].reset_index(drop=True)

removed = int(mask.sum())
print(f"Removed {removed} rows (before: {before}, after: {len(df_all)})")

In [None]:
# Token n-gram sizes; used as the default `ns` of make_shingles.
SHINGLE_NS   = (2, 3)
# Number of MinHash permutations; passed to both build_minhash and build_lsh_index.
NUM_PERM     = 128
# Threshold passed to build_lsh_index.
# NOTE(review): the value is 0.3, which lies outside the 0.5–0.8 range the
# trailing hint suggests — confirm which one is intended.
LSH_THRESHOLD = 0.3  # tune 0.5–0.8 for more/less bucket collisions

# A token is a maximal run of Unicode word characters (letters, digits, underscore).
word_re = re.compile(r"\w+", re.UNICODE)

def tokenize(text: str) -> list[str]:
    """Lowercase `text` and return its word tokens in order of appearance.

    Anything that is not a `str` (e.g. None or a float NaN) yields an empty list.

    NOTE(review): text written without spaces between words (such as Japanese)
    comes back as long punctuation-delimited runs rather than words — confirm
    this is acceptable for the shingling that follows.
    """
    if not isinstance(text, str):
        return []
    return word_re.findall(str(text).lower())

def make_shingles(tokens: list[str], ns=SHINGLE_NS) -> set[str]:
    """Build the set of space-joined token n-grams for every size in `ns`.

    Parameters
    ----------
    tokens : list[str]
        Tokens in document order.
    ns : iterable of int
        N-gram sizes to generate (defaults to SHINGLE_NS).

    Returns
    -------
    set[str]
        All n-grams of the requested sizes. If `tokens` is non-empty but
        shorter than every requested size, the whole token sequence is
        returned as a single shingle, so short texts do not all collapse
        to the same empty set. An empty `tokens` still yields an empty set.
    """
    shingles: set[str] = set()
    for n in ns:
        # range() is empty when there are fewer than n tokens, so no guard is needed.
        shingles.update(
            " ".join(tokens[start:start + n])
            for start in range(len(tokens) - n + 1)
        )
    if not shingles and tokens:
        # Too few tokens for any requested size: keep the text distinguishable.
        shingles.add(" ".join(tokens))
    return shingles

# One text per document: title and abstract joined by a single space,
# with missing values treated as empty strings.
texts = (df_all["title"].fillna("") + " " + df_all["abstract"].fillna("")).tolist()
uris = df_all["uri"].tolist()

# URI -> MinHash of the document's shingles. The shingles are sorted so that
# build_minhash receives them in a deterministic order. A URI that occurs
# more than once keeps only its last MinHash.
minhash_dict = {
    uri: build_minhash(sorted(make_shingles(tokenize(text))), num_perm=NUM_PERM)
    for uri, text in zip(uris, texts)
}

lsh = build_lsh_index(minhash_dict, threshold=LSH_THRESHOLD, num_perm=NUM_PERM)

In [None]:
def lsh_unique_buckets(lsh, min_size: int = 3, top_n: int = 10):
    """
    Collect the largest distinct collision groups of an LSH index.

    A group is the set of keys stored under one bucket of one band. Groups
    with the same members (after converting keys to strings) are reported
    once, for the first band/bucket in which they are encountered.

    Parameters
    ----------
    lsh : datasketch.MinHashLSH
        An already-built index. Only `lsh.get_counts()` (one mapping of
        bucket key -> count per band) and `lsh.hashtables[band].get(bucket)`
        are used.
    min_size : int
        Groups with fewer members than this are ignored.
    top_n : int
        Maximum number of groups returned; None or 0 returns all of them.

    Returns
    -------
    List[dict]
        Sorted by size, largest first. Each dict has the keys
        {"band", "bucket", "size", "members"}; "members" is a sorted list of
        string keys.
    """

    def _as_text(key) -> str:
        # bytes keys are decoded (undecodable bytes dropped); others go through str().
        return key.decode("utf-8", "ignore") if isinstance(key, bytes) else str(key)

    per_band_counts = lsh.get_counts()

    groups = {}  # member signature (sorted tuple of str) -> first row seen
    for band_idx in range(len(per_band_counts)):
        table = lsh.hashtables[band_idx]
        for bucket_key, n_keys in per_band_counts[band_idx].items():
            if n_keys >= min_size:
                signature = tuple(sorted(map(_as_text, table.get(bucket_key))))
                if len(signature) >= min_size:
                    # setdefault keeps the row of the first band that produced this group.
                    groups.setdefault(
                        signature,
                        {
                            "band": band_idx,
                            "bucket": bucket_key,
                            "size": len(signature),
                            "members": list(signature),
                        },
                    )

    ranked = sorted(groups.values(), key=lambda row: row["size"], reverse=True)
    if top_n:
        return ranked[:top_n]
    return ranked

# NOTE: lsh_unique_buckets defaults to top_n=10, so bucket_list (and the count
# printed below) holds at most the 10 largest groups.
bucket_list = lsh_unique_buckets(lsh, min_size=3)
print(f"Found {len(bucket_list)} buckets with >=3 members")
# print the sizes of the top 5 largest buckets (just the sizes)
for i, b in enumerate(bucket_list[:5]):
    print(f"Bucket {i+1}: size {len(b['members'])}")

# Show the members of one bucket with their titles and abstracts.
# The rank is 1-based; 8 keeps the bucket the cell actually read before
# (bucket_list[7]), which its old label wrongly called "bucket 5".
INSPECT_RANK = 8
if len(bucket_list) < INSPECT_RANK:
    # Guard against fewer buckets than the requested rank instead of raising IndexError.
    print(f"\nOnly {len(bucket_list)} buckets available; cannot show bucket {INSPECT_RANK}")
else:
    bucket_inspected = bucket_list[INSPECT_RANK - 1]
    print(f"\nMembers of bucket {INSPECT_RANK} (size {len(bucket_inspected['members'])}):")
    # Bucket members are string-canonicalised keys, so compare against the
    # string form of the uri column; computed once instead of per member.
    uri_text = df_all["uri"].astype(str)
    for uri in bucket_inspected["members"]:
        hits = df_all.loc[uri_text == uri]
        if hits.empty:
            print(f"- URI: {uri}\n  (not found in df_all)\n")
            continue
        row = hits.iloc[0]
        title = row["title"]
        abstract = row["abstract"]
        # Rows with a title but no abstract survive the earlier filtering;
        # a NaN abstract cannot be sliced, so show it as empty text.
        abstract = "" if pd.isna(abstract) else str(abstract)
        print(f"- URI: {uri}\n  Title: {title}\n  Abstract: {abstract[:100]}\n")

In [None]:
from preprocess import extract_keywords_df

# Extract keywords from the DataFrame
# df_en_kw, topics = extract_keywords_df(
#     df_en,
#     text_cols=("title", "abstract"),
#     top_k=8,
#     ngram_range=(1, 3),
#     stop_words="english",
#     keywords_col="keywords"
# )

In [None]:
k = 5           # number of neighbours to show per index
query_idx = 20  # row position of the query document in `data` (was `id`, which shadowed the builtin)

# Compare URIs as strings throughout: `q` is read from the string-cast column,
# so matching it against the raw column would miss rows whose URI is not a str.
uri_text = data["uri"].astype(str)
q = uri_text.iloc[query_idx]
q_title = data.loc[uri_text == q, "title"].values[0]
print(f"Querying for URI: {q}\nTitle: {q_title}\n")


def _print_neighbors(heading, neighbor_uris):
    """Print `heading`, then the title and abstract of each URI found in `data`."""
    print(heading)
    for uri in neighbor_uris:
        hits = data.loc[uri_text == str(uri)]
        if hits.empty:
            # Report a missing URI instead of failing with IndexError.
            print(f"(URI {uri} not found in data)\n")
            continue
        row = hits.iloc[0]
        print(f"Title: {row['title']}\nAbstract: {row['abstract']}\n")


# HNSW neighbors (URIs), self excluded here
hnsw_neighbors = hnsw.query_by_uri(q, topk=k, return_scores=False, exclude_self=True)

# LSH Forest neighbors: ask for k + 1, then drop the query itself if it is
# returned, so both lists hold up to k neighbours other than the query.
lsh_neighbors = forest.query(mh_by_uri[q], k + 1)
lsh_neighbors = [u for u in lsh_neighbors if str(u) != q][:k]

# Print the top-k neighbours' titles and abstracts for each index.
_print_neighbors("HNSW Neighbors:", hnsw_neighbors[:k])
_print_neighbors("LSH Neighbors:", lsh_neighbors)