In [3]:
import pandas as pd
import numpy as np
from openai import OpenAI

In [13]:
# The API key is deliberately NOT stored in the notebook: with no api_key argument,
# OpenAI() reads it from the OPENAI_API_KEY environment variable.
# SECURITY: the key that was previously hardcoded in this cell has been exposed to anyone
# with access to the notebook or its history and should be revoked and replaced.
client = OpenAI()

# Appending tags to reviews

In [14]:
# Load the combined review dataset; the path is relative to the notebook's working directory.
all_combined_reviews_df = pd.read_csv("./data/all_combined_reviews.csv")

In [15]:
# List the available columns; the prompt-building cell refers to them by name.
all_combined_reviews_df.columns

Index(['review_text', 'rating', 'has_photo', 'author_name',
       'user_review_count', 'business_name', 'category', 'source'],
      dtype='object')

In [16]:
# Work on a copy so the frame loaded from disk is never modified.
df = all_combined_reviews_df.copy()

# Upper bound (in characters) on the review text placed in a prompt.
MAX_REVIEW_CHARS = 2000


def s(col):
    """Return column `col` of `df` as stripped strings, with missing values shown as "NA"."""
    filled = df[col].fillna("NA")
    return filled.astype(str).str.strip()


# Missing photo flags are treated as "no".
has_photo_str = np.where(df["has_photo"].fillna(False), "yes", "no")

# Collapse every whitespace run to a single space, then truncate to the cap.
review_text_clean = (
    s("review_text")
    .str.replace(r"\s+", " ", regex=True)
    .str[:MAX_REVIEW_CHARS]
)

# Tagged fields in the order they appear in the prompt string.
_tagged_fields = [
    ("[Business] ", s("business_name")),
    ("[Category] ", s("category")),
    ("[Rating] ", s("rating")),
    ("[Author] ", s("author_name")),
    ("[User Review Count] ", s("user_review_count")),
    ("[Has Photo] ", pd.Series(has_photo_str, index=df.index)),
    ("[Source] ", s("source")),
    ("[Review] ", review_text_clean),
]

# Join the fields as "<tag> <value> | <tag> <value> | ...".
_first_tag, _first_values = _tagged_fields[0]
_combined = _first_tag + _first_values
for _tag, _values in _tagged_fields[1:]:
    _combined = _combined + (" | " + _tag) + _values

# Drop the trailing "[Review]" field entirely when the review text was missing.
df["comprehensive_review"] = _combined.str.replace(
    r"\s+\|\s+\[Review\]\s+NA$", "", regex=True
)

# Sequential 1-based ID stored as an unpadded string ("1", "2", "3", ...).
df["review_id"] = pd.Series(range(1, len(df) + 1), index=df.index).astype(str)

# Quick look at the first few generated rows.
print(df[["review_id", "comprehensive_review"]].head())

  review_id                               comprehensive_review
0         1  [Business] Bass Pro Shops | [Category] ['Sport...
1         2  [Business] Hooters | [Category] ['American res...
2         3  [Business] Dollar Tree | [Category] ['Dollar s...
3         4  [Business] Half Price Books | [Category] ['Boo...
4         5  [Business] McDonald's | [Category] ['Fast food...


In [17]:
def classify_review(comprehensive_review: str) -> str:
    """Ask the chat model to label one review against the three content policies.

    Uses the module-level `client` to send a single-message chat completion
    to "gpt-4o-mini" with JSON-object response formatting.

    Args:
        comprehensive_review: The review to label. The prompt describes a
            tagged single-line format ([Business] ... | [Review] ...), and
            the text is inserted verbatim after "NOW LABEL THIS:".

    Returns:
        The model's reply as a raw JSON *string* (not a parsed dict). The
        prompt asks for the boolean keys is_ad, is_relevant, is_rant and
        is_legit; parse the result with json.loads to get a dict.

    Raises:
        ValueError: if the response carries no message content.
        Exceptions raised by the client call itself propagate unchanged.
    """
    prompt = f"""
You label customer reviews for policy compliance.

INPUT FORMAT (tagged string):
[Business] <name> | [Category] <one or more categories, possibly a Python-style list> | [Rating] <float/NA> | [Author] <name/NA> | [User Review Count] <int/NA> | [Has Photo] <yes/no> | [Source] <google/kaggle/singapore> | [Review] <free-text>

POLICIES:
1) No Advertisement (is_ad): promotional content, coupon codes, phone numbers, calls to action, links meant to drive traffic/sales.
2) No Irrelevant Content (is_relevant): the review must be about the specific business/location and the user’s experience with it.
3) No Rant Without Visit (is_rant): mark true if the review is a complaint/rant but the author signals no visit/experience.

OUTPUT RULES:
- Return ONLY compact JSON, no markdown, no prose.
- Keys: is_ad, is_relevant, is_rant, is_legit
- Types: booleans (true/false).
- Define is_legit = (not is_ad) AND is_relevant AND (not is_rant).

EXAMPLE:
INPUT:
[Business] Hooters | [Category] ['American restaurant', 'Bar & grill', 'Chicken wings restaurant', 'Takeout Restaurant', 'Sports bar'] | [Rating] 5.0 | [Author] Ericka Woodall | [User Review Count] 1.0 | [Has Photo] no | [Source] google | [Review] Great food, good service, great atmosphere.
OUTPUT:
{{"is_ad": false, "is_relevant": true, "is_rant": false, "is_legit": true}}

NOW LABEL THIS:
{comprehensive_review}
""".strip()

    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}  # ensures JSON output
    )

    content = resp.choices[0].message.content
    if content is None:
        # Fail loudly instead of handing None to callers that expect a JSON string.
        raise ValueError("Chat completion returned no message content")
    return content

In [7]:
# Spot-check the generated prompt string for the row labelled 1.
df.loc[1, 'comprehensive_review']

"[Business] Hooters | [Category] ['American restaurant', 'Bar & grill', 'Chicken wings restaurant', 'Takeout Restaurant', 'Sports bar'] | [Rating] 5.0 | [Author] Ericka Woodall | [User Review Count] 1.0 | [Has Photo] no | [Source] google | [Review] Great food, good service, great atmosphere."

In [24]:
# Hand-written test review: the author says they never visited, complains anyway,
# and then promotes another site with a promo code.
# Note: this is plain text, without the [Business] | ... | [Review] tags the prompt describes.
rant = "Honestly, I’ve never even been to this Hooters, but it’s probably just as bad as the one I saw on TV. These chain restaurants are all terrible — the service is always slow and the food is greasy. Anyway, if you want REAL quality dining, don’t waste your time here. Go to www.bestpizzadeals.com instead and use promo code PIZZA20 for 20% off your first order! Trust me, way better than some random sports bar."

In [25]:
# Sanity check on the hand-written example; the value shown is the raw JSON string from the model.
classify_review(rant)

'{"is_ad": true, "is_relevant": false, "is_rant": true, "is_legit": false}'

In [17]:
# Repeat of the earlier spot-check: show the prompt string for the row labelled 1.
df.loc[1, 'comprehensive_review']

"[Business] Hooters | [Category] ['American restaurant', 'Bar & grill', 'Chicken wings restaurant', 'Takeout Restaurant', 'Sports bar'] | [Rating] 5.0 | [Author] Ericka Woodall | [User Review Count] 1.0 | [Has Photo] no | [Source] google | [Review] Great food, good service, great atmosphere."

In [18]:
import pandas as pd

def run_label_batch(df, batch_size=100, start_idx=0):
    """
    Run classify_review on df['comprehensive_review'] in order,
    but only for the batch starting at start_idx.

    Rows are addressed by position (0-based), not by index label, so the
    result does not depend on which index `df` carries.

    A row whose classification raises an Exception is reported on stdout and
    recorded with the placeholder "{}". KeyboardInterrupt is not an Exception
    subclass, so interrupting the run propagates and nothing is returned.

    Args:
        df (pd.DataFrame): must contain 'comprehensive_review' column.
        batch_size (int): number of reviews to process per batch.
        start_idx (int): position (0-based, >= 0) to start processing from.

    Returns:
        (out_df, next_start_idx)
        out_df: DataFrame with ['review_id', 'comprehensive_review', 'raw_json'];
            review_id is the 1-based row position (an int).
        next_start_idx: index to use for the next batch

    Raises:
        ValueError: if start_idx is negative.
    """
    if start_idx < 0:
        # Negative positions would silently wrap around to the end of the frame.
        raise ValueError(f"start_idx must be >= 0, got {start_idx}")

    n = len(df)
    end_idx = min(start_idx + batch_size, n)

    # Look the column up once; rows are then read by position.
    review_texts = df["comprehensive_review"]

    print(f"🚀 Starting batch: rows {start_idx+1} to {end_idx} of {n}")

    review_ids = []
    reviews = []
    outputs = []

    for i in range(start_idx, end_idx):
        text = review_texts.iloc[i]

        try:
            raw_json = classify_review(text)  # returns the model's JSON string
        except Exception as e:
            # Best effort: log the failure and keep going with an empty JSON object.
            print(f"⚠️ Error at row {i+1}: {e}")
            raw_json = "{}"  # placeholder on error

        review_ids.append(i + 1)       # 1-based ID
        reviews.append(text)           # keep the full comprehensive_review
        outputs.append(raw_json)       # model output

        # Progress update every 10 reviews or at the end
        if (i + 1) % 10 == 0 or i == end_idx - 1:
            print(f"   Processed review {i+1}/{end_idx} → {raw_json}")

    out_df = pd.DataFrame({
        "review_id": review_ids,
        "comprehensive_review": reviews,
        "raw_json": outputs
    })

    print(f"✅ Finished batch: rows {start_idx+1} to {end_idx}\n")

    return out_df, end_idx

In [None]:
# Batch written to labels_batch7.csv: 0-based rows 12000–12999 (logged as rows 12001–13000).
# This run has not been completed (haven't run/finished), so the CSV below is only written
# once run_label_batch returns.
# NOTE(review): the variable is named batch1 while the file is labels_batch7.csv — confirm
# which batch number is intended.
batch1, next_idx = run_label_batch(df, batch_size=1000, start_idx=12000)
batch1.to_csv("labels_batch7.csv", index=False)

🚀 Starting batch: rows 12001 to 13000 of 20615
   Processed review 12010/13000 → {"is_ad": false, "is_relevant": false, "is_rant": false, "is_legit": false}
   Processed review 12020/13000 → {"is_ad": false, "is_relevant": false, "is_rant": false, "is_legit": false}
   Processed review 12030/13000 → {"is_ad": false, "is_relevant": false, "is_rant": false, "is_legit": false}
   Processed review 12040/13000 → {"is_ad": false, "is_relevant": false, "is_rant": false, "is_legit": false}
   Processed review 12050/13000 → {"is_ad": false, "is_relevant": false, "is_rant": false, "is_legit": false}
   Processed review 12060/13000 → {"is_ad": false, "is_relevant": false, "is_rant": false, "is_legit": false}
   Processed review 12070/13000 → {"is_ad": false, "is_relevant": false, "is_rant": false, "is_legit": false}
   Processed review 12080/13000 → {"is_ad": false, "is_relevant": false, "is_rant": false, "is_legit": false}
   Processed review 12090/13000 → {"is_ad": false, "is_relevant": false, 

KeyboardInterrupt: 

In [None]:
# Continue from the position returned by the previous completed call.
# NOTE(review): next_idx is only reassigned when run_label_batch returns; after an
# interrupted run it still holds the value from an earlier call — check it before running.
# NOTE(review): the result is not saved to disk in this cell.
batch2, next_idx = run_label_batch(df, batch_size=900, start_idx=next_idx)

In [35]:
# Inspect the stored resume position (the start index the next batch call would use).
next_idx

100