In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [None]:
# Locations of the CSV inputs used by this notebook, keyed by a short dataset name.
paths = dict(
    cleaned_for_bert="../data/cleaned_for_bert.csv",
    absa_training_set="../data/absa_training_set.csv",
)

# Separate training data for manual ABSA labeling
## Do not rerun this code (data already separated and saved as absa_training_set.csv)

# Widget for Manual ABSA Data Labeling
## Start here if labeling data

In [None]:
import os
import pandas as pd
from IPython.display import display, clear_output
import ipywidgets as widgets
from datetime import datetime

# =====================================================
# CONFIG
# =====================================================

# Sentences to label (input), plus the two files this widget writes:
# the long-format label table and the resume position.
sent_file = paths["absa_training_set"]
labels_file, progress_file = (
    "../data/absa_labels_long.csv",
    "../data/absa_labeling_progress.txt",
)

# Aspect categories offered in the dropdown, in display order.
ASPECTS = [
    "food_quality", "service", "wait_time", "price_value",
    "cleanliness", "atmosphere", "general",
]

# Sentiment choices offered as toggle buttons, in display order.
SENTIMENTS = ["positive", "negative"]

# =====================================================
# LOAD DATA
# =====================================================

# Read only the columns the widget displays, force the identifier/text columns
# to str, and derive a "<review_id>::<sentence_id>" key for each sentence.
df = (
    pd.read_csv(
        sent_file,
        usecols=["review_id", "rating", "gmap_id", "sentence_id", "sentence_text"],
    )
    .astype({"review_id": str, "gmap_id": str, "sentence_text": str})
    .assign(
        key=lambda frame: frame["review_id"] + "::" + frame["sentence_id"].astype(str)
    )
)

# Previously saved labels if the file is there, otherwise an empty table with
# the same four columns.
labels = (
    pd.read_csv(labels_file)
    if os.path.exists(labels_file)
    else pd.DataFrame(columns=["review_id", "sentence_id", "aspect", "sentiment"])
)

# Keys ("<review_id>::<sentence_id>") of every sentence that already has a label row.
_label_review_ids = labels["review_id"].astype(str)
_label_sentence_ids = labels["sentence_id"].astype(str)
labeled_keys = set(_label_review_ids + "::" + _label_sentence_ids)

# Resume position written by autosave(); fall back to the first row when the
# file is missing, unreadable, or does not contain an integer.
start_idx = 0
if os.path.exists(progress_file):
    try:
        # Context manager so the handle is closed even if parsing fails.
        with open(progress_file) as _progress_fh:
            start_idx = int(_progress_fh.read().strip())
    except (OSError, ValueError):
        # Only expected failures are swallowed; KeyboardInterrupt etc. propagate.
        start_idx = 0
    # A negative value would make positional indexing wrap around to the end.
    start_idx = max(start_idx, 0)

def find_next_unlabeled(i):
    """Return the first position >= ``i`` whose key is not in ``labeled_keys``.

    Parameters
    ----------
    i : int
        Position in ``df`` to start scanning from. Negative values are treated
        as 0 so positional indexing never wraps around to the end of the frame.

    Returns
    -------
    int
        Position of the next sentence that is not in ``labeled_keys``, or a
        value >= ``len(df)`` when no such sentence remains.
    """
    keys = df["key"]
    total = len(keys)
    i = max(i, 0)
    # Scalar access on the key column avoids building a whole row per probe.
    while i < total and keys.iat[i] in labeled_keys:
        i += 1
    return i

# Mutable session state shared by the button callbacks.
history = []        # undo stack: one {"idx": ..., "rows": ...} dict per Skip/Save
pending_pairs = []  # (aspect, sentiment) tuples staged for the current sentence
idx = find_next_unlabeled(start_idx)  # position in df of the sentence being shown

# =====================================================
# WIDGETS
# =====================================================

# Six HTML panes, all starting empty; their .value is filled in on each redraw.
header, status, meta_area, text_area, pairs_box, msg = (
    widgets.HTML("") for _ in range(6)
)

# Aspect picker: a "(choose)" placeholder followed by the configured aspects.
aspect_dd = widgets.Dropdown(
    description="Aspect:",
    options=["(choose)", *ASPECTS],
    value="(choose)",
)

# Sentiment picker rendered as toggle buttons.
sent_dd = widgets.ToggleButtons(description="Sentiment:", options=SENTIMENTS)

# Action buttons; handlers are attached further down with on_click().
add_btn = widgets.Button(button_style="info", description="Add Pair")
clear_btn = widgets.Button(description="Clear Pairs")
skip_btn = widgets.Button(button_style="warning", description="Skip")
save_btn = widgets.Button(button_style="success", description="Save & Next")
undo_btn = widgets.Button(button_style="danger", description="Undo")

# Horizontal rows: the two selectors, and the five action buttons.
selectors = widgets.HBox([aspect_dd, sent_dd])
controls = widgets.HBox([add_btn, clear_btn, skip_btn, save_btn, undo_btn])

# Whole UI, stacked top to bottom.
ui = widgets.VBox(
    [header, status, meta_area, text_area, selectors, controls, pairs_box, msg]
)

# =====================================================
# HEADER METRICS
# =====================================================

def update_header():
    """Rebuild the summary box shown in ``header``.

    Reports the size of ``labeled_keys`` against the number of rows in ``df``,
    the number of rows in ``labels``, per-aspect and per-sentiment counts, and
    the modification time of ``labels_file`` ("N/A" when the file is absent).
    """

    def tally(column, names):
        # "name: count" for every configured name, 0 when it has no rows yet.
        counts = labels[column].value_counts().to_dict()
        return " | ".join(f"{name}: {counts.get(name,0)}" for name in names)

    n_total = len(df)
    n_done = len(labeled_keys)
    pct_done = (n_done / n_total) * 100

    n_pairs = len(labels)

    aspect_html = tally("aspect", ASPECTS)
    sentiment_html = tally("sentiment", SENTIMENTS)

    if os.path.exists(labels_file):
        saved_at = datetime.fromtimestamp(os.path.getmtime(labels_file))
        last_saved = saved_at.strftime("%Y-%m-%d %H:%M:%S")
    else:
        last_saved = "N/A"

    header.value = f"""
    <div style='padding:10px; border:2px solid #333; border-radius:8px; background:#f5f5f5'>
        <b>Progress:</b> {n_done:,} / {n_total:,} sentences ({pct_done:.2f}%)<br>
        <b>Total Aspect-Sentiment Pairs:</b> {n_pairs:,}<br>
        <b>Aspect Counts:</b> {aspect_html}<br>
        <b>Sentiment Counts:</b> {sentiment_html}<br>
        <b>Last Saved:</b> {last_saved}
    </div>
    """

# =====================================================
# CORE FUNCTIONS
# =====================================================

def render():
    """Redraw the labeling UI for the sentence at position ``idx``.

    Clears the current cell output first. When ``idx`` is past the last row of
    ``df`` only a "Done!" banner is displayed. Otherwise the header metrics,
    row counter, review metadata, the HTML-escaped sentence text and the list
    of pending (aspect, sentiment) pairs are refreshed, ``msg`` is blanked,
    and ``ui`` is displayed.
    """
    clear_output(wait=True)

    if idx >= len(df):
        # Numeric character reference for the party-popper emoji (U+1F389):
        # pure ASCII in the source, so it cannot be garbled by a bad re-encoding.
        display(widgets.HTML("<h3>Done! &#127881;</h3>"))
        return

    update_header()

    row = df.iloc[idx]

    status.value = f"<b>Row:</b> {idx+1:,} / {len(df):,}"

    meta_area.value = (
        f"<b>Rating:</b> {row['rating']} &nbsp;&nbsp; "
        f"<b>Review ID:</b> {row['review_id']} &nbsp;&nbsp; "
        f"<b>Sentence ID:</b> {row['sentence_id']} &nbsp;&nbsp; "
        f"<b>Business:</b> {row['gmap_id']}"
    )

    # Escape the review text so markup inside it is shown literally.
    # "&" must be replaced first, otherwise the other entities get double-escaped.
    safe_text = (
        row["sentence_text"]
        .replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
    )

    text_area.value = f"""
    <div style='font-size:20px; padding:12px;
                border:1px solid #ddd; border-radius:8px;
                background:#fafafa;'>
        {safe_text}
    </div>
    """

    if pending_pairs:
        # &rarr; is the right-arrow (U+2192), written as an entity for the
        # same encoding-safety reason as the emoji above.
        pairs_html = "<ul>" + "".join(
            [f"<li><b>{a}</b> &rarr; {s}</li>" for a, s in pending_pairs]
        ) + "</ul>"
    else:
        pairs_html = "<i>No labels added yet.</i>"

    pairs_box.value = f"<b>Current Labels:</b>{pairs_html}"
    # Any previous notice is stale once the view has been redrawn.
    msg.value = ""

    display(ui)

def autosave():
    """Overwrite ``progress_file`` with the current value of ``idx``."""
    with open(progress_file, "w") as handle:
        handle.write(f"{idx}")

def append_labels(rows_df):
    """Append ``rows_df`` to ``labels_file`` and to the in-memory ``labels``.

    The CSV header is written only when the file does not exist yet.
    """
    global labels
    write_header = not os.path.exists(labels_file)
    rows_df.to_csv(labels_file, mode="a", header=write_header, index=False)
    labels = pd.concat((labels, rows_df), ignore_index=True)

def on_add(_):
    """Stage the selected (aspect, sentiment) pair for the current sentence.

    With the "(choose)" placeholder still selected, only an error notice is
    shown. Otherwise the pair is appended to ``pending_pairs``, the dropdown
    is reset to the placeholder and the UI is redrawn.
    """
    chosen = aspect_dd.value
    if chosen != "(choose)":
        pending_pairs.append((chosen, sent_dd.value))
        aspect_dd.value = "(choose)"
        render()
        return
    msg.value = "<b style='color:red'>Select an aspect first.</b>"

def on_clear(_):
    """Drop every staged pair for the current sentence and redraw the UI."""
    del pending_pairs[:]
    render()

def on_skip(_):
    """Move past the current sentence without writing any label rows.

    Pushes an undo entry with ``rows=None``, adds the sentence's key to
    ``labeled_keys``, discards staged pairs, advances ``idx`` to the next
    sentence not in ``labeled_keys``, then saves progress and redraws.
    """
    global idx
    current = idx
    history.append(dict(idx=current, rows=None))
    labeled_keys.add(df.iloc[current]["key"])
    pending_pairs.clear()
    idx = find_next_unlabeled(current + 1)
    autosave()
    render()

def on_save(_):
    """Persist the staged pairs for the current sentence and advance.

    With nothing staged this behaves exactly like Skip. Otherwise one label
    row per staged pair is appended via ``append_labels``, an undo entry
    holding those rows is pushed, the sentence's key is added to
    ``labeled_keys``, and the UI moves on to the next unlabeled sentence.
    """
    global idx

    current = df.iloc[idx]

    if len(pending_pairs) == 0:
        on_skip(None)
        return

    # Split the staged tuples into one column of aspects and one of sentiments.
    aspects, sentiments = zip(*pending_pairs)
    n_pairs = len(pending_pairs)
    out = pd.DataFrame({
        "review_id": [current["review_id"]] * n_pairs,
        "sentence_id": [current["sentence_id"]] * n_pairs,
        "aspect": list(aspects),
        "sentiment": list(sentiments),
    })

    append_labels(out)

    history.append(dict(idx=idx, rows=out))
    labeled_keys.add(current["key"])
    pending_pairs.clear()

    idx = find_next_unlabeled(idx + 1)
    autosave()
    render()

def on_undo(_):
    """Undo the most recent Skip or Save and return to that sentence.

    Pops the last entry from ``history``, moves ``idx`` back to it and removes
    its key from ``labeled_keys``. If that entry had saved rows, every row for
    the sentence is deleted from ``labels_file`` and ``labels`` is reloaded
    from the rewritten file. With an empty history only a notice is shown.
    """
    global idx, labels

    if not history:
        # Deliberately no render() here: render() blanks msg.value, which
        # would erase this notice before the user could read it.
        msg.value = "<b style='color:red'>Nothing to undo.</b>"
        return

    last = history.pop()
    idx = last["idx"]
    key = df.iloc[idx]["key"]

    labeled_keys.discard(key)

    if last["rows"] is not None and os.path.exists(labels_file):
        lab = pd.read_csv(labels_file)
        # read_csv infers dtypes, so numeric review ids come back as integers
        # while df["review_id"] holds strings; comparing them directly never
        # matches. Build the same "<review_id>::<sentence_id>" string key that
        # labeled_keys uses and compare on that instead.
        lab_keys = (
            lab["review_id"].astype(str) + "::" + lab["sentence_id"].astype(str)
        )
        lab = lab[lab_keys != key]
        lab.to_csv(labels_file, index=False)
        labels = lab

    pending_pairs.clear()
    autosave()
    render()

# Attach each button to its handler, then draw the UI for the first time.
for _button, _handler in (
    (add_btn, on_add),
    (clear_btn, on_clear),
    (skip_btn, on_skip),
    (save_btn, on_save),
    (undo_btn, on_undo),
):
    _button.on_click(_handler)

render()