In [1]:
# ==========================================
# Expert-Rule Labeling (Cold-Start Phase)
#
# Simulates human annotation by applying keyword-based business
# rules to review-queue data. Outputs to gold standard.
# Used during cold start when no human reviewer is available.
#
# Input: NB01's review output (always contains 'gpt_label' column)
# Rule priority: negative keywords > ad keywords > news keywords > AI fallback
# ==========================================

import pandas as pd
import glob
import os
import datetime
from config import DIRS

# -- Config --
PROJECT_NAME = "cold_start"


# -- Expert rules: keyword-based label assignment --
def get_expert_label(row):
    """Apply business rules to assign a sentiment label."""
    text = str(row.get('text', '')).lower()
    orig = str(row.get('sentiment', '')).lower().strip()

    # Read GPT prediction (always 'gpt_label' from NB01 review output)
    ai_label = ''
    if 'gpt_label' in row.index and pd.notna(row.get('gpt_label')):
        ai_label = str(row['gpt_label']).lower().strip()

    # Rule 1: Negative keywords override everything
    neg_keywords = [
        'hate', 'trash', 'garbage', 'suck', 'worst', 'fail', 'broken',
        'bug', 'crash', 'slow', 'stupid', 'fuck', 'shit', 'bad',
        'issue', 'problem', 'ban', 'unconstitutional', 'boring', 'lag',
    ]
    if any(k in text for k in neg_keywords):
        return 'negative'

    # Rule 2: Ads / promotions → neutral
    ad_keywords = [
        'check out', 'preorder', 'shop', 'store', 'win', 'chance to',
        'giveaway', 'amazon', 'ebay', 'link', 'subscribe', 'discount',
        'sale', '% off',
    ]
    if any(k in text for k in ad_keywords):
        return 'neutral'

    # Rule 3: News / factual statements → neutral
    news_keywords = [
        'report', 'announced', 'released', 'update', 'sales',
        'market', 'stocks', 'official', 'statement',
    ]
    if any(k in text for k in news_keywords):
        return 'neutral'

    # Rule 4: Fallback — defer to AI prediction with conservative adjustments
    if ai_label == 'negative':
        return 'negative'
    if ai_label == 'positive':
        # Short texts with "positive" AI label are often noise
        return 'neutral' if len(text.split()) < 4 else 'positive'
    if ai_label == 'neutral':
        # If original dataset said negative, trust that over neutral AI prediction
        return 'negative' if orig == 'negative' else 'neutral'

    return ai_label if ai_label else 'neutral'


# -- Load review-queue data --
search_pattern = f"{DIRS['review']}/*{PROJECT_NAME}*.csv"
files = glob.glob(search_pattern)

if not files:
    raise FileNotFoundError(
        f"No files for [{PROJECT_NAME}] in {DIRS['review']}/. Run Notebook 01 first."
    )

latest_file = max(files, key=os.path.getctime)
print(f"Input: {os.path.basename(latest_file)}")

df = pd.read_csv(latest_file)
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# -- Apply expert rules to all rows --
print(f"Applying expert rules to {len(df)} rows...")
df['label'] = df.apply(get_expert_label, axis=1)

# -- Save to gold standard (standardized schema) --
base_name = os.path.basename(latest_file).lower()
base_name = base_name.replace('_review', '').replace('_pending', '').replace('.csv', '')

path_04 = f"{DIRS['gold']}/{base_name}_corrected.csv"
gold_df = df[['text', 'label']].copy()
gold_df['gold_origin'] = 'cold_start_corrected'
gold_df['created_at'] = datetime.datetime.now().isoformat()
gold_df.to_csv(path_04, index=False)

print("-" * 30)
print(f"Gold standard saved: {path_04}")
print(f"Total: {len(df)} | Neg: {sum(df['label']=='negative')} | "
      f"Pos: {sum(df['label']=='positive')} | Neu: {sum(df['label']=='neutral')}")
print("-" * 30)

Input: 20260218_cold_start_review.csv
Applying expert rules to 451 rows...
------------------------------
Gold standard saved: ./04_gold_standard/20260218_cold_start_corrected.csv
Total: 451 | Neg: 232 | Pos: 99 | Neu: 120
------------------------------
