In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [2]:
# Locations of the data files used by this notebook, keyed by dataset name.
paths = dict(
    cleaned_for_bert="../data/cleaned_for_bert.csv",
    absa_training_set="../data/absa_training_set.csv",
)

# Separate training data for manual ABSA labeling

In [3]:
# Source corpus and destination of the sampled subset.
infile, outfile = paths["cleaned_for_bert"], paths["absa_training_set"]

# Pass 2 reads every column of interest; pass 1 only needs the first three.
useCols_full = ["review_id", "rating", "gmap_id", "sentence_id", "sentence_text"]
useCols_pass1 = ["review_id", "rating", "gmap_id"]

# Explicit dtypes for each pass.
dtypes_pass1 = dict(review_id="string", rating="int8", gmap_id="string")
dtypes_full = dict(
    zip(
        useCols_full,
        ["string", "int8", "string", "int32", "string"],
    )
)

# Target sample size: 2,000 unique reviews per rating, spread over gmap_ids.
num_per_rating = 2_000

# Rows read per chunk while streaming the CSV.
CHUNKSIZE = 500_000

# Fixed seed so the sample is reproducible.
seed = 120
rng = np.random.default_rng(seed)

# Upper bound on sampled review_ids per gmap_id within each rating, so that a
# single business cannot dominate the sample. Raise or lower it to trade
# business diversity against how quickly the reservoirs fill.
gmap_cap_per_rating = 20


def reservoir_update_ids_capped(res_ids, res_gmap_counts, new_pairs, seen_before, k, rng, gmap_cap,
                                rid_to_gid=None):
    """
    Reservoir sampling over a stream of UNIQUE (review_id, gmap_id) pairs,
    with a soft cap on how many samples can come from a single gmap_id.

    Parameters
    ----------
    res_ids : list
        Sampled review_ids (size <= k). Mutated in place.
    res_gmap_counts : dict
        gmap_id -> number of reservoir entries from that gmap_id. Mutated in place.
    new_pairs : iterable of (review_id, gmap_id)
        Pairs that have not been offered to the sampler before.
    seen_before : int
        Number of pairs already offered to this reservoir.
    k : int
        Reservoir capacity.
    rng : numpy.random.Generator
        Source of randomness; one integer is drawn per pair once the reservoir is full.
    gmap_cap : int
        Maximum number of reservoir entries allowed per gmap_id.
    rid_to_gid : dict, optional
        review_id -> gmap_id for ids currently held in the reservoir. The function
        keeps it up to date itself (adds on insertion, removes on eviction). When
        omitted, the module-level ``rid_to_gmap`` dict is used, which preserves the
        original calling convention.

    Returns
    -------
    (res_ids, res_gmap_counts, seen)
        The same two containers that were passed in, plus the updated stream count.
        ``seen`` counts every offered pair, including pairs rejected by the cap.
    """
    if rid_to_gid is None:
        # Backward compatible path: share the notebook-level mapping.
        rid_to_gid = rid_to_gmap

    seen = seen_before

    for rid, gid in new_pairs:
        seen += 1

        # Fill phase: accept unless this gmap_id already reached its cap.
        if len(res_ids) < k:
            if res_gmap_counts.get(gid, 0) < gmap_cap:
                res_ids.append(rid)
                res_gmap_counts[gid] = res_gmap_counts.get(gid, 0) + 1
                rid_to_gid[rid] = gid
            continue

        # Reservoir full: classic replacement step, j uniform over [0, seen-1].
        j = int(rng.integers(0, seen))
        if j >= k:
            continue

        old_rid = res_ids[j]
        old_gid = rid_to_gid.get(old_rid)

        # Reject the swap if it would push the incoming gmap_id over its cap.
        # Replacing an entry of the same gmap_id leaves the count unchanged,
        # so that case is always allowed.
        if old_gid != gid and res_gmap_counts.get(gid, 0) >= gmap_cap:
            continue

        # Apply the replacement and keep the id -> gmap mapping in sync.
        res_ids[j] = rid
        rid_to_gid.pop(old_rid, None)
        rid_to_gid[rid] = gid

        # Decrement the evicted entry's gmap count, dropping exhausted keys so
        # len(res_gmap_counts) stays equal to the number of distinct gmap_ids.
        if old_gid is not None:
            remaining = res_gmap_counts.get(old_gid, 0) - 1
            if remaining > 0:
                res_gmap_counts[old_gid] = remaining
            else:
                res_gmap_counts.pop(old_gid, None)

        res_gmap_counts[gid] = res_gmap_counts.get(gid, 0) + 1

    return res_ids, res_gmap_counts, seen


# -------------------------
# PASS 1: sample review_ids (unique) with gmap spread
# -------------------------
sample_ids = {r: [] for r in range(1, 6)}          # reservoirs (lists) of ids
seen_unique = {r: 0 for r in range(1, 6)}          # how many unique ids seen per rating
gmap_counts = {r: {} for r in range(1, 6)}         # per-rating gmap_id counts in reservoir

# review_id -> gmap_id. The sampler looks up the gmap_id of the reservoir entry
# it is about to replace, so only ids currently held in a reservoir need to be
# kept here; everything else is pruned at the end of each chunk (see below).
rid_to_gmap = {}

# Every review_id offered to the sampler so far, so each id is considered
# exactly once even when its rows are spread over several chunks.
global_seen_review_ids = set()

reader1 = pd.read_csv(
    infile,
    usecols=useCols_pass1,
    dtype=dtypes_pass1,
    chunksize=CHUNKSIZE,
)

for chunk_idx, chunk in enumerate(reader1, start=1):

    # Defensive: keep only expected ratings
    chunk = chunk[chunk["rating"].between(1, 5)]

    # Pass 1 needs each review_id once; drop within-chunk duplicates first
    # to shrink the work done below.
    chunk = chunk.drop_duplicates(subset=["review_id"])

    # Remove review_ids already processed in previous chunks
    # (so each review_id enters the sampler exactly once)
    mask_new = ~chunk["review_id"].isin(global_seen_review_ids)
    new_chunk = chunk.loc[mask_new]

    # Remember them as seen
    new_ids_all = new_chunk["review_id"].astype(str).tolist()
    global_seen_review_ids.update(new_ids_all)

    # Update reservoirs per rating with those new unique ids (+ gmap_id for spreading)
    for r in range(1, 6):
        sub = new_chunk.loc[new_chunk["rating"] == r, ["review_id", "gmap_id"]]

        if sub.empty:
            continue

        # Convert to python strings for stable hashing / dict keys
        pairs = list(zip(sub["review_id"].astype(str).tolist(), sub["gmap_id"].astype(str).tolist()))

        # Register rid->gid before sampling so that any of these ids that lands
        # in the reservoir can have its gmap count decremented when it is
        # later replaced.
        for rid, gid in pairs:
            if rid not in rid_to_gmap:
                rid_to_gmap[rid] = gid

        sample_ids[r], gmap_counts[r], seen_unique[r] = reservoir_update_ids_capped(
            sample_ids[r],
            gmap_counts[r],
            pairs,
            seen_unique[r],
            num_per_rating,
            rng,
            gmap_cap_per_rating,
        )

    # Prune the mapping down to ids that are still in some reservoir. Ids that
    # were rejected or evicted are never looked up again (each id is offered
    # once), so without this step the dict would grow to hold every review_id
    # in the file. The dict is mutated in place so existing references stay valid.
    in_reservoir = set().union(*(sample_ids[r] for r in range(1, 6)))
    for rid in [rid for rid in rid_to_gmap if rid not in in_reservoir]:
        del rid_to_gmap[rid]

    if chunk_idx % 10 == 0:
        sizes = {r: len(sample_ids[r]) for r in range(1, 6)}
        uniq_gmaps = {r: len(gmap_counts[r]) for r in range(1, 6)}
        print(f"[Pass 1] chunks={chunk_idx} sizes={sizes} unique_gmap_ids={uniq_gmaps}")

selected_ids = set().union(*[set(sample_ids[r]) for r in range(1, 6)])
print("Selected unique review_ids:", len(selected_ids))

# Report how many ids were sampled per rating (target: num_per_rating each)
for r in range(1, 6):
    print(f"rating {r}: {len(sample_ids[r])} ids; unique gmap_ids in sample: {len(gmap_counts[r])}")


# ---------------------------------------------
# PASS 2: write all rows for those review_ids
# ---------------------------------------------
# Second streaming read of the input: every chunk is filtered down to the
# selected review_ids and appended to the output file straight away, so the
# selected sentences are never all held in memory at once.

reader2 = pd.read_csv(infile, usecols=useCols_full, dtype=dtypes_full, chunksize=CHUNKSIZE)

# The first non-empty batch creates the file with a header; later ones append.
first_write = True
rows_written = 0

for chunk_idx, chunk in enumerate(reader2, start=1):

    # Rows belonging to a sampled review
    m = chunk["review_id"].astype(str).isin(selected_ids)
    out = chunk.loc[m]

    if len(out) > 0:
        if first_write:
            out.to_csv(outfile, mode="w", index=False, header=True)
            first_write = False
        else:
            out.to_csv(outfile, mode="a", index=False, header=False)
        rows_written += len(out)

    if chunk_idx % 10 == 0:
        print(f"[Pass 2] chunks={chunk_idx} rows_written={rows_written}")

print("Done. Wrote:", outfile, "rows:", rows_written)

[Pass 1] chunks=10 sizes={1: 2000, 2: 2000, 3: 2000, 4: 2000, 5: 2000} unique_gmap_ids={1: 1845, 2: 1750, 3: 1675, 4: 1703, 5: 1825}
[Pass 1] chunks=20 sizes={1: 2000, 2: 2000, 3: 2000, 4: 2000, 5: 2000} unique_gmap_ids={1: 1923, 2: 1893, 3: 1898, 4: 1903, 5: 1923}
[Pass 1] chunks=30 sizes={1: 2000, 2: 2000, 3: 2000, 4: 2000, 5: 2000} unique_gmap_ids={1: 1944, 2: 1907, 3: 1919, 4: 1924, 5: 1935}
[Pass 1] chunks=40 sizes={1: 2000, 2: 2000, 3: 2000, 4: 2000, 5: 2000} unique_gmap_ids={1: 1957, 2: 1917, 3: 1918, 4: 1926, 5: 1931}
[Pass 1] chunks=50 sizes={1: 2000, 2: 2000, 3: 2000, 4: 2000, 5: 2000} unique_gmap_ids={1: 1955, 2: 1931, 3: 1937, 4: 1948, 5: 1934}
[Pass 1] chunks=60 sizes={1: 2000, 2: 2000, 3: 2000, 4: 2000, 5: 2000} unique_gmap_ids={1: 1950, 2: 1932, 3: 1937, 4: 1962, 5: 1946}
[Pass 1] chunks=70 sizes={1: 2000, 2: 2000, 3: 2000, 4: 2000, 5: 2000} unique_gmap_ids={1: 1952, 2: 1940, 3: 1937, 4: 1963, 5: 1958}
[Pass 1] chunks=80 sizes={1: 2000, 2: 2000, 3: 2000, 4: 2000, 5: 2000

# Widget for Manual ABSA Data Labeling

In [4]:
import os
import pandas as pd
from IPython.display import display, clear_output
import ipywidgets as widgets
from datetime import datetime

# =====================================================
# CONFIG
# =====================================================

sent_file = paths["absa_training_set"]
labels_file = "../data/absa_labels_long.csv"
progress_file = "../data/absa_labeling_progress.txt"

# Aspect categories offered in the dropdown.
ASPECTS = [
    "food_quality",
    "service",
    "wait_time",
    "price_value",
    "cleanliness",
    "atmosphere",
    "general",
]

# Sentiment values offered for each aspect.
SENTIMENTS = ["positive", "negative"]

# =====================================================
# LOAD DATA
# =====================================================

df = pd.read_csv(
    sent_file,
    usecols=["review_id", "rating", "gmap_id", "sentence_id", "sentence_text"]
)

df["review_id"] = df["review_id"].astype(str)
df["gmap_id"] = df["gmap_id"].astype(str)
df["sentence_text"] = df["sentence_text"].astype(str)
# One key per sentence: "<review_id>::<sentence_id>"
df["key"] = df["review_id"] + "::" + df["sentence_id"].astype(str)

# Previously saved labels (long format: one row per aspect/sentiment pair).
if os.path.exists(labels_file):
    labels = pd.read_csv(labels_file)
else:
    labels = pd.DataFrame(columns=["review_id", "sentence_id", "aspect", "sentiment"])

# Keys of sentences that already have at least one saved label row.
labeled_keys = set(labels["review_id"].astype(str) + "::" +
                   labels["sentence_id"].astype(str))

# Resume position written by autosave(); fall back to the first row when the
# file is missing, unreadable, or does not contain a non-negative integer.
start_idx = 0
if os.path.exists(progress_file):
    try:
        with open(progress_file) as _progress_fh:
            start_idx = max(0, int(_progress_fh.read().strip()))
    except (OSError, ValueError):
        start_idx = 0

def find_next_unlabeled(i):
    """
    Return the first row position >= i whose key is not in ``labeled_keys``.

    Negative positions are treated as 0 so a bad resume index cannot wrap
    around to the end of the frame. If every remaining row is already handled
    the result is ``len(df)``; a start position past the end is returned
    unchanged.
    """
    keys = df["key"]
    n_rows = len(keys)
    i = max(i, 0)
    # .iat reads a single scalar; df.iloc[i] would build a whole row Series
    # on every step of the scan.
    while i < n_rows and keys.iat[i] in labeled_keys:
        i += 1
    return i

# Session state: current row position, pairs staged for the current sentence,
# and a stack of completed actions used by Undo.
idx = find_next_unlabeled(start_idx)
pending_pairs = []
history = []

# =====================================================
# WIDGETS
# =====================================================

# Six initially empty HTML panes, filled in by update_header() / render().
header, status, meta_area, text_area, pairs_box, msg = (
    widgets.HTML("") for _ in range(6)
)

# Aspect picker with a placeholder entry that means "nothing selected".
aspect_dd = widgets.Dropdown(
    options=["(choose)", *ASPECTS],
    value="(choose)",
    description="Aspect:",
)

sent_dd = widgets.ToggleButtons(options=SENTIMENTS, description="Sentiment:")

# Action buttons
add_btn = widgets.Button(description="Add Pair", button_style="info")
clear_btn = widgets.Button(description="Clear Pairs")
skip_btn = widgets.Button(description="Skip", button_style="warning")
save_btn = widgets.Button(description="Save & Next", button_style="success")
undo_btn = widgets.Button(description="Undo", button_style="danger")

# Layout rows
controls = widgets.HBox([add_btn, clear_btn, skip_btn, save_btn, undo_btn])
selectors = widgets.HBox([aspect_dd, sent_dd])

# Full labeling panel, stacked top to bottom.
ui = widgets.VBox(
    [header, status, meta_area, text_area, selectors, controls, pairs_box, msg]
)

# =====================================================
# HEADER METRICS
# =====================================================

def update_header():
    """Refresh the summary box: progress, pair totals, per-aspect and per-sentiment counts, last save time."""

    def _joined_counts(column, names):
        # "name: count" for each expected name, in the given order; absent names show 0.
        tally = labels[column].value_counts().to_dict()
        return " | ".join(f"{name}: {tally.get(name, 0)}" for name in names)

    total_sentences = len(df)
    labeled_sentences = len(labeled_keys)
    percent = (labeled_sentences / total_sentences) * 100
    total_pairs = len(labels)

    aspect_html = _joined_counts("aspect", ASPECTS)
    sentiment_html = _joined_counts("sentiment", SENTIMENTS)

    # Modification time of the labels file, or "N/A" when nothing has been saved yet.
    if os.path.exists(labels_file):
        saved_at = datetime.fromtimestamp(os.path.getmtime(labels_file))
        last_saved = saved_at.strftime("%Y-%m-%d %H:%M:%S")
    else:
        last_saved = "N/A"

    header.value = f"""
    <div style='padding:10px; border:2px solid #333; border-radius:8px; background:#f5f5f5'>
        <b>Progress:</b> {labeled_sentences:,} / {total_sentences:,} sentences ({percent:.2f}%)<br>
        <b>Total Aspect-Sentiment Pairs:</b> {total_pairs:,}<br>
        <b>Aspect Counts:</b> {aspect_html}<br>
        <b>Sentiment Counts:</b> {sentiment_html}<br>
        <b>Last Saved:</b> {last_saved}
    </div>
    """

# =====================================================
# CORE FUNCTIONS
# =====================================================

def render():
    """
    Redraw the labeling panel for the row at ``idx``.

    Clears the cell output, then either shows a completion message (when
    ``idx`` is past the last row) or refreshes every pane and displays ``ui``.
    Always resets ``msg`` to empty, so callers that want to show a message
    must set it after calling this function.
    """
    clear_output(wait=True)

    if idx >= len(df):
        # Party-popper emoji written as a numeric entity so the source stays ASCII.
        display(widgets.HTML("<h3>Done! &#127881;</h3>"))
        return

    update_header()

    row = df.iloc[idx]

    status.value = f"<b>Row:</b> {idx+1:,} / {len(df):,}"

    meta_area.value = (
        f"<b>Rating:</b> {row['rating']} &nbsp;&nbsp; "
        f"<b>Review ID:</b> {row['review_id']} &nbsp;&nbsp; "
        f"<b>Sentence ID:</b> {row['sentence_id']} &nbsp;&nbsp; "
        f"<b>Business:</b> {row['gmap_id']}"
    )

    # Escape the review text so it is shown literally instead of being
    # interpreted as markup; "&" must be replaced first.
    safe_text = (
        row["sentence_text"]
        .replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
    )

    text_area.value = f"""
    <div style='font-size:20px; padding:12px;
                border:1px solid #ddd; border-radius:8px;
                background:#fafafa;'>
        {safe_text}
    </div>
    """

    if pending_pairs:
        # Right arrow written as an entity (&rarr;) to avoid encoding problems.
        pairs_html = "<ul>" + "".join(
            [f"<li><b>{a}</b> &rarr; {s}</li>" for a, s in pending_pairs]
        ) + "</ul>"
    else:
        pairs_html = "<i>No labels added yet.</i>"

    pairs_box.value = f"<b>Current Labels:</b>{pairs_html}"
    msg.value = ""

    display(ui)

def autosave():
    """Overwrite the progress file with the current row position."""
    with open(progress_file, "w") as handle:
        handle.write(f"{idx}")

def append_labels(rows_df):
    """
    Append label rows to the labels CSV and to the in-memory ``labels`` frame.

    rows_df: DataFrame of label rows to persist (one row per aspect/sentiment pair).

    The CSV header is written when the file does not exist yet or exists but
    is empty, so the file always starts with a header line.
    """
    global labels

    needs_header = (
        not os.path.exists(labels_file) or os.path.getsize(labels_file) == 0
    )
    rows_df.to_csv(labels_file, mode="a", index=False, header=needs_header)

    if labels.empty:
        # Concatenating with an empty frame is deprecated in recent pandas and
        # would only contribute its (object) dtypes; start from the new rows.
        labels = rows_df.reset_index(drop=True)
    else:
        labels = pd.concat([labels, rows_df], ignore_index=True)

def on_add(_):
    """Stage the selected (aspect, sentiment) pair for the current sentence."""
    chosen_aspect = aspect_dd.value

    # The placeholder entry means no aspect was picked.
    if chosen_aspect == "(choose)":
        msg.value = "<b style='color:red'>Select an aspect first.</b>"
        return

    pending_pairs.append((chosen_aspect, sent_dd.value))
    # Reset the dropdown so the next pair has to be chosen explicitly.
    aspect_dd.value = "(choose)"
    render()

def on_clear(_):
    """Discard every pair staged for the current sentence and redraw."""
    del pending_pairs[:]
    render()

def on_skip(_):
    """Mark the current sentence as handled without saving labels, then advance."""
    global idx

    current = idx
    # "rows": None tells Undo that nothing was written to the labels file.
    history.append({"idx": current, "rows": None})
    labeled_keys.add(df.iloc[current]["key"])
    pending_pairs.clear()

    idx = find_next_unlabeled(current + 1)
    autosave()
    render()

def on_save(_):
    """Persist the staged pairs for the current sentence and move to the next one."""
    global idx

    row = df.iloc[idx]

    # Saving with nothing staged behaves exactly like Skip.
    if not pending_pairs:
        on_skip(None)
        return

    # One output row per staged pair, all sharing this sentence's identifiers.
    n_pairs = len(pending_pairs)
    aspects, sentiments = zip(*pending_pairs)
    out = pd.DataFrame({
        "review_id": [row["review_id"]] * n_pairs,
        "sentence_id": [row["sentence_id"]] * n_pairs,
        "aspect": list(aspects),
        "sentiment": list(sentiments),
    })

    append_labels(out)

    # Remember what was written so Undo can take it back.
    history.append({"idx": idx, "rows": out})
    labeled_keys.add(row["key"])
    pending_pairs.clear()

    idx = find_next_unlabeled(idx + 1)
    autosave()
    render()

def on_undo(_):
    """
    Revert the most recent Save or Skip and return to that sentence.

    For a Save, every row in the labels file that belongs to the sentence is
    removed and the in-memory ``labels`` frame is replaced by the file's
    remaining content. For a Skip, only the in-memory bookkeeping is reverted.
    """
    global idx, labels

    if not history:
        # render() blanks msg, so set the message without re-rendering;
        # otherwise it would be erased before it could be seen.
        msg.value = "<b style='color:red'>Nothing to undo.</b>"
        return

    last = history.pop()
    idx = last["idx"]
    row = df.iloc[idx]

    labeled_keys.discard(row["key"])

    if last["rows"] is not None and os.path.exists(labels_file):
        lab = pd.read_csv(labels_file)
        # Compare as strings: read_csv may infer a numeric dtype for the id
        # columns, while df["review_id"] is str, and a mixed-type comparison
        # would match nothing and leave the undone rows on disk.
        same_sentence = (
            (lab["review_id"].astype(str) == str(row["review_id"]))
            & (lab["sentence_id"].astype(str) == str(row["sentence_id"]))
        )
        lab = lab[~same_sentence].reset_index(drop=True)
        lab.to_csv(labels_file, index=False)
        labels = lab

    pending_pairs.clear()
    autosave()
    render()

# Connect each button to its click handler, then draw the panel once.
for _button, _handler in (
    (add_btn, on_add),
    (clear_btn, on_clear),
    (skip_btn, on_skip),
    (save_btn, on_save),
    (undo_btn, on_undo),
):
    _button.on_click(_handler)

render()

VBox(children=(HTML(value="\n    <div style='padding:10px; border:2px solid #333; border-radius:8px; backgrounâ€¦