In [6]:
import pandas as pd
import numpy as np

In [7]:
# Load the evaluation products CSV into `df`; later cells read its
# `combined_text` column as the text sent to the model.
df = pd.read_csv("data/eval_products_100.csv")

In [8]:
# Show the column labels of the loaded frame.
df.columns

Index(['PRODUCT_ID', 'combined_text', 'claim_strength'], dtype='object')

In [9]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,PRODUCT_ID,combined_text,claim_strength
0,901852,FANMATS NBA Miami Heat Nylon Face Basketball R...,strong
1,2251841,"Kitchen Table Runner 14""x72"", Marble Texture A...",strong
2,2185165,VKAIRE Soft Comfortable Organic Cotton Wiggly ...,strong
3,2902869,"Ambesonne Pug Duvet Cover Set of 2, Puppy on T...",weak
4,2858697,Mayabi Women's Soft Cotton Silk Dhakai Jamdani...,medium


In [10]:
# TEST CELL - Run this first to test one row
import google.generativeai as genai
import pandas as pd
import json

import os

# Never embed the API key in the notebook: it ends up in version control and
# in every shared copy. Read it from the environment instead.
# NOTE(review): the key that used to be hard-coded on this line should be
# treated as leaked and revoked/rotated.
api_key = os.environ.get('GOOGLE_API_KEY')
if not api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this cell."
    )
genai.configure(api_key=api_key)
model = genai.GenerativeModel('gemini-3-pro-preview')

# Test with your sample description
test_description = """Angel Mineral Foldable Bed Study Table Portable Laptop Table Lapdesk for Children Bed Work Office Gaming Home with Tablet Slot & Cup Holder Bed Breakfast Serving Table Angel mineral Material environmental protection„Äë- The desktop is made of MDF, solid wood particle board, safe and environmentally friendly."""

def clean_text(text):
    """Return ``text`` as a UTF-8-safe string, or ``""`` when it is missing.

    A value counts as missing when ``pd.isna(text)`` is true. Any other value
    is converted with ``str`` and round-tripped through UTF-8, silently
    dropping characters that cannot be encoded.
    """
    if not pd.isna(text):
        utf8_bytes = str(text).encode('utf-8', errors='ignore')
        return utf8_bytes.decode('utf-8')
    return ""

def test_label(description, prompt_type='zero_shot'):
    """Label a single description and print every intermediate stage.

    Debugging helper: echoes the prompt (first 500 characters), the raw model
    reply, the JSON text extracted from it, and the parsed fields.

    Args:
        description: Product text; passed through ``clean_text`` first.
        prompt_type: Accepted but not used; the prompt built below is always
            the one that is sent.

    Returns:
        Tuple ``(label, confidence)`` taken from the parsed JSON reply.
    """
    description = clean_text(description)

    prompt = f"""Analyze whether this product contains greenwashing.

Product: {description}

Respond in JSON format:
{{"label": "yes/no/uncertain", "confidence": 0.0-1.0}}"""

    print("=== PROMPT ===")
    print(prompt[:500])
    print("\n=== SENDING REQUEST ===")

    sampling = genai.GenerationConfig(temperature=0.7)
    response = model.generate_content(prompt, generation_config=sampling)

    print("\n=== RAW RESPONSE ===")
    raw_reply = response.text
    print(raw_reply)

    # Strip a Markdown code fence if the reply is wrapped in one; the more
    # specific ```json opener is tried before the bare one.
    text = raw_reply.strip()
    for fence in ('```json', '```'):
        if fence in text:
            text = text.split(fence)[1].split('```')[0].strip()
            break

    print("\n=== EXTRACTED JSON ===")
    print(text)

    result = json.loads(text)
    print("\n=== PARSED RESULT ===")
    print(f"Label: {result['label']}")
    print(f"Confidence: {result['confidence']}")

    label, confidence = result['label'], result['confidence']
    return label, confidence

# Test it: run the single-row smoke test on the sample description. The call
# prints each stage and returns the (label, confidence) pair unpacked here.
label, conf = test_label(test_description)

=== PROMPT ===
Analyze whether this product contains greenwashing.

Product: Angel Mineral Foldable Bed Study Table Portable Laptop Table Lapdesk for Children Bed Work Office Gaming Home with Tablet Slot & Cup Holder Bed Breakfast Serving Table Angel mineral Material environmental protection„Äë- The desktop is made of MDF, solid wood particle board, safe and environmentally friendly.

Respond in JSON format:
{"label": "yes/no/uncertain", "confidence": 0.0-1.0}

=== SENDING REQUEST ===

=== RAW RESPONSE ===
```json
{
  "label": "yes",
  "confidence": 0.95
}
```

=== EXTRACTED JSON ===
{
  "label": "yes",
  "confidence": 0.95
}

=== PARSED RESULT ===
Label: yes
Confidence: 0.95


In [20]:
# MAIN CELL - Run this after test succeeds
import google.generativeai as genai
import pandas as pd
import json
import time
from tenacity import retry, stop_after_attempt, wait_exponential

# Model handle used by label_greenwashing below.
# NOTE(review): no genai.configure(...) call appears in this cell, so this
# presumably relies on the API key having been configured earlier in the
# same kernel session — confirm before running on a fresh kernel.
model = genai.GenerativeModel('gemini-3-pro-preview')

# Prompt templates keyed by prompting strategy. Each template has a single
# {description} placeholder that is filled with str.format, which is why the
# literal JSON braces are doubled ({{ ... }}).
# The chain_of_thought template also asks for a "key_reason" field; the
# labeling function only reads "label" and "confidence" from the reply.
PROMPTS = {
    'zero_shot': """Analyze whether this product contains greenwashing.

Product: {description}

Respond in JSON format:
{{"label": "greenwashing/not_greenwashing/uncertain", "confidence": 0.0-1.0}}""",
    
    'one_shot': """Analyze whether products contain greenwashing.

Example:
Product: "Eco-friendly and natural ingredients"
Analysis: {{"label": "greenwashing", "confidence": 0.85}}
Reason: Vague, unsubstantiated claims

Now analyze this product:

Product: {description}

Respond in JSON format:
{{"label": "greenwashing/not_greenwashing/uncertain", "confidence": 0.0-1.0}}""",
    
    'chain_of_thought': """Analyze whether this product contains greenwashing.

Product: {description}

Evaluate across ALL dimensions:

1. SPECIFICITY & EVIDENCE
   - Are claims specific with certifications (GOTS, FSC, Fair Trade, etc.)?
   - Are there measurable metrics (percentages, quantities, comparisons)?
   - Or only vague terms (eco-friendly, sustainable, natural)?

2. TONE & LANGUAGE
   - Is language neutral and factual?
   - Or overly emotional, enthusiastic, or exaggerated?
   - Excessive exclamations, emojis, or superlatives?

3. TRANSPARENCY & SUBSTANCE
   - Is enough information provided to verify claims?
   - Are there hidden tradeoffs or misleading emphasis?
   - Does it highlight minor benefits while ignoring major impacts?

4. OVERALL CREDIBILITY
   - Do environmental benefits seem genuine and proportionate?
   - Is this substantive environmental action or just marketing?

GREENWASHING = Vague claims + emotional language + lack of evidence
NOT GREENWASHING = Specific claims + neutral tone + verifiable evidence

Respond in JSON format:
{{"label": "greenwashing/not_greenwashing/uncertain", "confidence": 0.0-1.0, "key_reason": "brief explanation"}}"""
}

def clean_text(text):
    """Normalise a description before it is placed in a prompt.

    Returns an empty string when ``pd.isna(text)`` is true; otherwise returns
    ``str(text)`` re-encoded through UTF-8 with unencodable characters dropped.
    """
    return "" if pd.isna(text) else str(text).encode('utf-8', errors='ignore').decode('utf-8')

In [21]:
import os

@retry(
    reraise=True,  # surface the real error instead of tenacity's opaque RetryError
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
)
def label_greenwashing(description, prompt_type):
    """Ask the model to label one product description for greenwashing.

    The call is attempted up to three times with exponential back-off; any
    exception raised inside the function (API failure, unparsable reply,
    missing field, non-numeric confidence) triggers a retry.

    Args:
        description: Product text; cleaned with ``clean_text`` before use.
        prompt_type: Key selecting the template in ``PROMPTS``.

    Returns:
        Tuple ``(label, confidence)`` where ``label`` is the value of the
        reply's "label" field and ``confidence`` is its "confidence" field
        converted to ``float``.

    Raises:
        Exception: The exception from the last failed attempt, once all
            attempts are exhausted.
    """
    description = clean_text(description)
    prompt = PROMPTS[prompt_type].format(description=description)

    response = model.generate_content(
        prompt,
        generation_config=genai.GenerationConfig(temperature=0.7),
    )
    text = response.text.strip()

    # The reply is often wrapped in a Markdown code fence; keep only the
    # fenced payload so json.loads sees bare JSON.
    if '```json' in text:
        text = text.split('```json')[1].split('```')[0].strip()
    elif '```' in text:
        text = text.split('```')[1].split('```')[0].strip()

    result = json.loads(text)
    # Coerce so a quoted number ("0.95") is stored as a number and a
    # non-numeric confidence is treated as a malformed reply (and retried).
    return result['label'], float(result['confidence'])

# Check which files already exist.
# The check must look in the same directory the results are written to;
# looking elsewhere makes every prompt type appear unfinished, so a re-run
# repeats (and pays for) labeling that is already on disk.
output_dir = 'data/benchmark'
# Create the directory up front so a missing folder fails now rather than
# after a full labeling pass.
os.makedirs(output_dir, exist_ok=True)

all_prompt_types = ['zero_shot', 'one_shot', 'chain_of_thought']
output_paths = {
    pt: os.path.join(output_dir, f'greenwashing_{pt}.csv') for pt in all_prompt_types
}
completed_prompts = [pt for pt in all_prompt_types if os.path.exists(output_paths[pt])]
remaining_prompts = [pt for pt in all_prompt_types if pt not in completed_prompts]

print(f"Already completed: {completed_prompts}")
print(f"Remaining to process: {remaining_prompts}\n")

for prompt_type in remaining_prompts:
    print(f"\n=== Processing {prompt_type} ===")

    df_result = df.copy()
    total_rows = len(df_result)  # report progress against the real row count
    results = []

    for idx, desc in enumerate(df_result['combined_text']):
        try:
            label, conf = label_greenwashing(desc, prompt_type)
            results.append((label, conf))
            print(f"Processed {idx+1}/{total_rows} - Label: {label}")
            # Pause between successful calls to stay under the API rate limit.
            time.sleep(2)
        except Exception as e:
            # Keep one result per row so the columns line up; failed rows are
            # marked 'error' and can be retried later.
            print(f"Error at {idx+1}: {str(e)}")
            results.append(('error', 0.0))

    df_result['greenwashing'] = [r[0] for r in results]
    df_result['confidence'] = [r[1] for r in results]

    df_result.to_csv(output_paths[prompt_type], index=False)
    print(f"Saved to {output_paths[prompt_type]}")

if not remaining_prompts:
    print("All prompt types already completed!")

Already completed: []
Remaining to process: ['zero_shot', 'one_shot', 'chain_of_thought']


=== Processing zero_shot ===
Processed 1/100 - Label: not_greenwashing
Processed 2/100 - Label: greenwashing
Processed 3/100 - Label: not_greenwashing
Processed 4/100 - Label: greenwashing
Processed 5/100 - Label: greenwashing
Processed 6/100 - Label: not_greenwashing
Processed 7/100 - Label: greenwashing
Processed 8/100 - Label: greenwashing
Processed 9/100 - Label: not_greenwashing
Processed 10/100 - Label: greenwashing
Processed 11/100 - Label: greenwashing
Processed 12/100 - Label: greenwashing
Processed 13/100 - Label: not_greenwashing
Processed 14/100 - Label: greenwashing
Processed 15/100 - Label: greenwashing
Processed 16/100 - Label: greenwashing
Processed 17/100 - Label: not_greenwashing
Processed 18/100 - Label: greenwashing
Processed 19/100 - Label: greenwashing
Processed 20/100 - Label: not_greenwashing
Processed 21/100 - Label: not_greenwashing
Processed 22/100 - Label: greenwashin

## Retrying error rows

In [11]:
# RETRY CELL - Re-label only the error cases from chain_of_thought
import google.generativeai as genai
import pandas as pd
import json
import time
from tenacity import retry, stop_after_attempt, wait_exponential

# Initialize model.
# NOTE(review): this cell does not call genai.configure(...), so it presumably
# depends on the API key having been configured earlier in the same kernel
# session — confirm before running on a fresh kernel.
model = genai.GenerativeModel('gemini-3-pro-preview')

# Chain of thought prompt.
# Kept character-for-character identical (blank lines included) to the
# 'chain_of_thought' template used for the main labeling run, so that retried
# rows are labeled with the same prompt as the rest of the file they are
# merged back into. The doubled braces are literal braces for str.format.
COT_PROMPT = """Analyze whether this product contains greenwashing.

Product: {description}

Evaluate across ALL dimensions:

1. SPECIFICITY & EVIDENCE
   - Are claims specific with certifications (GOTS, FSC, Fair Trade, etc.)?
   - Are there measurable metrics (percentages, quantities, comparisons)?
   - Or only vague terms (eco-friendly, sustainable, natural)?

2. TONE & LANGUAGE
   - Is language neutral and factual?
   - Or overly emotional, enthusiastic, or exaggerated?
   - Excessive exclamations, emojis, or superlatives?

3. TRANSPARENCY & SUBSTANCE
   - Is enough information provided to verify claims?
   - Are there hidden tradeoffs or misleading emphasis?
   - Does it highlight minor benefits while ignoring major impacts?

4. OVERALL CREDIBILITY
   - Do environmental benefits seem genuine and proportionate?
   - Is this substantive environmental action or just marketing?

GREENWASHING = Vague claims + emotional language + lack of evidence
NOT GREENWASHING = Specific claims + neutral tone + verifiable evidence

Respond in JSON format:
{{"label": "greenwashing/not_greenwashing/uncertain", "confidence": 0.0-1.0, "key_reason": "brief explanation"}}"""

def clean_text(text):
    """Turn a possibly-missing value into prompt-safe text.

    Values for which ``pd.isna`` is true become ``""``. Everything else is
    stringified and passed through a UTF-8 encode/decode that ignores
    characters which cannot be encoded.
    """
    is_missing = pd.isna(text)
    if is_missing:
        return ""
    as_string = str(text)
    encoded = as_string.encode('utf-8', errors='ignore')
    return encoded.decode('utf-8')

@retry(
    reraise=True,  # surface the real error instead of tenacity's opaque RetryError
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
)
def label_greenwashing(description):
    """Label one product description using the chain-of-thought prompt.

    The call is attempted up to three times with exponential back-off; any
    exception raised inside the function (API failure, unparsable reply,
    missing field, non-numeric confidence) triggers a retry.

    Args:
        description: Product text; cleaned with ``clean_text`` before use.

    Returns:
        Tuple ``(label, confidence)`` where ``label`` is the value of the
        reply's "label" field and ``confidence`` is its "confidence" field
        converted to ``float``.

    Raises:
        Exception: The exception from the last failed attempt, once all
            attempts are exhausted.
    """
    description = clean_text(description)
    prompt = COT_PROMPT.format(description=description)

    response = model.generate_content(
        prompt,
        generation_config=genai.GenerationConfig(temperature=0.7)
    )
    text = response.text.strip()

    # Clean JSON response: the reply is often wrapped in a Markdown code
    # fence; keep only the fenced payload so json.loads sees bare JSON.
    if '```json' in text:
        text = text.split('```json')[1].split('```')[0].strip()
    elif '```' in text:
        text = text.split('```')[1].split('```')[0].strip()

    result = json.loads(text)
    # Coerce so a quoted number ("0.95") comes back as a float (callers format
    # it numerically) and a non-numeric confidence is retried as malformed.
    return result['label'], float(result['confidence'])

In [12]:
# Load the chain_of_thought results.
# This is the CSV the retry step below updates in place and writes back to
# the same path; rows that still need a label carry 'error' in 'greenwashing'.
print("Loading chain_of_thought results...")
df_cot = pd.read_csv('data/benchmark/greenwashing_chain_of_thought.csv')

Loading chain_of_thought results...


In [14]:
# Identify indices where the previous run failed
error_indices = df_cot[df_cot['greenwashing'] == 'error'].index.tolist()

if len(error_indices) == 0:
    print("No errors found! All cases already labeled.")
else:
    # Retry each error case
    successful_retries = 0
    failed_retries = []
    total_errors = len(error_indices)

    try:
        # Number the attempts with enumerate: deriving the position from the
        # success count makes the counter stall as soon as one retry fails.
        for position, idx in enumerate(error_indices, start=1):
            try:
                desc = df_cot.loc[idx, 'combined_text']
                print(f"Retrying index {idx} ({position}/{total_errors})...")

                label, conf = label_greenwashing(desc)
                # Validate before touching the frame: a non-numeric confidence
                # must count as a failure without leaving a half-updated row.
                conf = float(conf)

                # Update the dataframe
                df_cot.loc[idx, 'greenwashing'] = label
                df_cot.loc[idx, 'confidence'] = conf

                print(f"  ✓ Success - Label: {label}, Confidence: {conf:.2f}")
                successful_retries += 1

                # Sleep to avoid rate limiting
                time.sleep(3)

            except Exception as e:
                print(f"  ✗ Failed - Error: {str(e)}")
                failed_retries.append(idx)
    finally:
        # Save updated results even if the loop is interrupted, so labels
        # already obtained are not lost.
        df_cot.to_csv('data/benchmark/greenwashing_chain_of_thought.csv', index=False)

    print(f"\n=== RETRY SUMMARY ===")
    print(f"Total errors found: {total_errors}")
    print(f"Successfully retried: {successful_retries}")
    print(f"Still failed: {len(failed_retries)}")

    if failed_retries:
        print(f"Failed indices: {failed_retries}")

    # Show remaining errors
    remaining_errors = len(df_cot[df_cot['greenwashing'] == 'error'])
    print(f"\nRemaining errors in dataset: {remaining_errors}")

    if remaining_errors == 0:
        print("✓ All cases successfully labeled!")

Retrying index 50 (1/50)...
  ✓ Success - Label: greenwashing, Confidence: 0.95
Retrying index 51 (2/50)...
  ✓ Success - Label: greenwashing, Confidence: 0.95
Retrying index 52 (3/50)...
  ✓ Success - Label: greenwashing, Confidence: 0.95
Retrying index 53 (4/50)...
  ✓ Success - Label: greenwashing, Confidence: 0.95
Retrying index 54 (5/50)...
  ✓ Success - Label: not_greenwashing, Confidence: 0.95
Retrying index 55 (6/50)...
  ✓ Success - Label: greenwashing, Confidence: 0.95
Retrying index 56 (7/50)...
  ✓ Success - Label: not_greenwashing, Confidence: 0.85
Retrying index 57 (8/50)...
  ✓ Success - Label: not_greenwashing, Confidence: 0.95
Retrying index 58 (9/50)...
  ✓ Success - Label: not_greenwashing, Confidence: 0.85
Retrying index 59 (10/50)...
  ✓ Success - Label: greenwashing, Confidence: 0.95
Retrying index 60 (11/50)...
  ✓ Success - Label: not_greenwashing, Confidence: 0.90
Retrying index 61 (12/50)...
  ✓ Success - Label: not_greenwashing, Confidence: 0.85
Retrying inde