In [29]:
import pandas as pd
import re
from bs4 import BeautifulSoup

In [30]:
def has_environmental_claim(text):
    """
    Detect actual environmental claims, not just keywords
    """
    if pd.isna(text) or len(str(text)) < 20:
        return False
    
    text_lower = str(text).lower()
    
    # Stage 1: Must have environmental context words
    environmental_phrases = [
        # Explicit environmental claims
        'eco-friendly', 'eco friendly', 'environmentally friendly',
        'sustainable', 'sustainability',
        'organic', 'certified organic',
        'recycled', 'recyclable',
        'biodegradable', 'compostable',
        'carbon neutral', 'carbon footprint', 'carbon offset',
        'zero waste', 'plastic-free', 'plastic free',
        'fair trade',
        'green product', 'green living',
        'earth friendly', 'earth-friendly',
        'planet friendly', 'planet-friendly',
        
        # Certifications (strong signal)
        'gots', 'fsc certified', 'usda organic', 'grs certified',
        'ecocert', 'oeko-tex', 'fair trade certified',
        
        # Materials with environmental context
        'organic cotton', 'recycled polyester', 'recycled plastic',
        'bamboo fiber', 'hemp fiber',
        'post-consumer recycled', 'pre-consumer recycled'
    ]
    
    # Check if any phrase exists
    return any(phrase in text_lower for phrase in environmental_phrases)

In [31]:
# Read in chunks (for large files)
chunk_size = 50000
filtered_data = []

print("Applying stricter environmental claim filter...")

Applying stricter environmental claim filter...


In [32]:
for i, chunk in enumerate(pd.read_csv('amazon_products.csv', chunksize=chunk_size)):
    # Combine text fields
    chunk['combined_text'] = (
        chunk['TITLE'].fillna('') + ' ' +
        chunk['DESCRIPTION'].fillna('')
    )
    
    # Apply stricter filter
    env_mask = chunk['combined_text'].apply(has_environmental_claim)
    filtered = chunk[env_mask]
    
    if len(filtered) > 0:
        filtered_data.append(filtered)
    
    print(f"Chunk {i+1}: Found {len(filtered)} environmental products")

Chunk 1: Found 906 environmental products
Chunk 2: Found 890 environmental products
Chunk 3: Found 934 environmental products
Chunk 4: Found 914 environmental products
Chunk 5: Found 871 environmental products
Chunk 6: Found 912 environmental products
Chunk 7: Found 889 environmental products
Chunk 8: Found 925 environmental products
Chunk 9: Found 924 environmental products
Chunk 10: Found 869 environmental products
Chunk 11: Found 908 environmental products
Chunk 12: Found 864 environmental products
Chunk 13: Found 894 environmental products
Chunk 14: Found 926 environmental products
Chunk 15: Found 855 environmental products
Chunk 16: Found 949 environmental products
Chunk 17: Found 886 environmental products
Chunk 18: Found 935 environmental products
Chunk 19: Found 914 environmental products
Chunk 20: Found 902 environmental products
Chunk 21: Found 926 environmental products
Chunk 22: Found 979 environmental products
Chunk 23: Found 897 environmental products
Chunk 24: Found 903 

In [33]:
df_env = pd.concat(filtered_data, ignore_index=True)
print(f"\nTotal after stricter filter: {len(df_env)} products")


Total after stricter filter: 40658 products


In [34]:
def is_false_positive(text):
    """
    Detect false positives (not actual environmental claims)
    """
    if pd.isna(text):
        return True
    
    text_lower = str(text).lower()
    
    # False positive patterns
    false_patterns = [
        # Color descriptions
        r'\bgreen\s+(color|shirt|dress|paint|dye)\b',
        r'\bnatural\s+(color|shade|tone)\b',
        
        # Features/modes (not claims)
        r'\beco\s+mode\b',
        r'\bgreen\s+screen\b',
        r'\bnatural\s+light\b',
        
        # Wood/material descriptions (not claims)
        r'\bnatural\s+wood\b',
        r'\bnatural\s+stone\b',
        
        # Too short (likely just keywords)
        r'^.{0,30}$'  # Less than 30 characters
    ]
    
    # Check for false patterns
    for pattern in false_patterns:
        if re.search(pattern, text_lower):
            return True
    
    return False

In [35]:
# Remove false positives
df_env = df_env[~df_env['combined_text'].apply(is_false_positive)]
print(f"After removing false positives: {len(df_env)} products")

After removing false positives: 40089 products


In [36]:
# Remove very short texts (likely incomplete data)
df_env['text_length'] = df_env['combined_text'].str.len()
df_env = df_env[df_env['text_length'] >= 50]  # At least 50 characters

print(f"After length filter: {len(df_env)} products")

After length filter: 39708 products


In [37]:
# Remove duplicates
df_env = df_env.drop_duplicates(subset=['combined_text'], keep='first')
print(f"After removing duplicates: {len(df_env)} products")

After removing duplicates: 39487 products


In [38]:
def categorize_claim_strength(text):
    """
    Categorize how strong the environmental claim is
    """
    text_lower = str(text).lower()
    
    # Strong claims (have certifications or metrics)
    strong_indicators = ['certified', 'gots', 'fsc', 'usda', 'fair trade', r'\d+%']
    if any(re.search(ind, text_lower) for ind in strong_indicators):
        return 'strong'
    
    # Medium claims (specific materials/processes)
    medium_indicators = ['organic cotton', 'recycled polyester', 'bamboo fiber', 
                        'post-consumer', 'carbon neutral']
    if any(ind in text_lower for ind in medium_indicators):
        return 'medium'
    
    # Weak claims (just buzzwords)
    return 'weak'


In [39]:
# Categorize all claims
df_env['claim_strength'] = df_env['combined_text'].apply(categorize_claim_strength)

print("\nClaim strength distribution:")
print(df_env['claim_strength'].value_counts())

# Stratified sampling: Get balanced mix
sample_sizes = {
    'strong': 1500,   # More strong claims (easier to label as SPECIFIC)
    'medium': 1500,   # Medium claims (mixed)
    'weak': 1000      # Some weak claims (likely VAGUE)
}



Claim strength distribution:
claim_strength
weak      24941
strong    13569
medium      977
Name: count, dtype: int64


In [40]:
# More strong/medium, fewer weak
sample_config = {
    'strong': 2000,   # Plenty available (13,569)
    'medium': 977,    # Take all (only 977 available)
    'weak': 1000      # Reduced from 24,941
}

samples = []
for strength, target_n in sample_config.items():
    subset = df_env[df_env['claim_strength'] == strength]
    n = min(len(subset), target_n)
    sample = subset.sample(n=n, random_state=42)
    samples.append(sample)
    print(f"{strength}: {n}")

df_sample = pd.concat(samples, ignore_index=True)
print(f"\nTotal: {len(df_sample)}")

strong: 2000
medium: 977
weak: 1000

Total: 3977


In [41]:
def clean_html_text(text):
    """
    Remove HTML tags and clean text
    """
    if pd.isna(text):
        return ""
    
    # Remove HTML tags using BeautifulSoup
    soup = BeautifulSoup(str(text), 'html.parser')
    text_clean = soup.get_text()
    
    # Clean up extra whitespace
    text_clean = re.sub(r'\s+', ' ', text_clean)
    text_clean = text_clean.strip()
    
    return text_clean

In [42]:
print(f"\nTotal sampled: {len(df_sample):,}")
print("\nNew distribution:")
print(df_sample['claim_strength'].value_counts())


Total sampled: 3,977

New distribution:
claim_strength
strong    2000
weak      1000
medium     977
Name: count, dtype: int64


In [43]:
df_sample.columns.tolist()
df_sample.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH,combined_text,text_length,claim_strength
0,1064258,L'ovedbaby Unisex-Baby Organic Cotton Footed O...,[Infant footie with snap buttons from neck to ...,"OR444s-0/3M Size: 0 - 3 Months, Color: Sage Fe...",2364,100.0,L'ovedbaby Unisex-Baby Organic Cotton Footed O...,464,strong
1,2171982,K3 Deluxe Virgin Plastic Bucket (Balti) 25 (23...,[K3 Deluxe Virgin Plastic Bucket (Balti) 25 li...,<b>Sturdy and Light Weight Made from 100% virg...,424,1614.173227,K3 Deluxe Virgin Plastic Bucket (Balti) 25 (23...,1370,strong
2,2400935,"40-Piece Flatware Set For 8, EIUBUIE 18/10 Sta...",[üëç„ÄêFINEST STAINLESS STEEL„Äë18/10 stainless-stee...,<b>EIUBUIE 40 Piece High Quality 18/10 Stainle...,8184,1043.0,"40-Piece Flatware Set For 8, EIUBUIE 18/10 Sta...",2160,strong
3,2924934,TUFFPAULIN 15FT X 15FT 120 GSM Silver Tarpauli...,[Actual size will be approx. half foot shorter...,TUFFPAULIN tarpaulin are made from superior qu...,10249,18000.0,TUFFPAULIN 15FT X 15FT 120 GSM Silver Tarpauli...,1801,strong
4,2518472,"Shrayati Areca Leaf Round Bowls, 5 Inch, Pack ...",[Biodegradability - Every small step towards t...,Shrayati Areca Leaf Round bowls are made from ...,1426,492.125984,"Shrayati Areca Leaf Round Bowls, 5 Inch, Pack ...",627,strong


In [44]:
# Clean text
df_sample['combined_text'] = df_sample['combined_text'].apply(clean_html_text)
df_sample['combined_text'] = df_sample['combined_text'].str.replace(r'\s+', ' ', regex=True)
df_sample['combined_text'] = df_sample['combined_text'].str.strip()

In [45]:
EVAL_SIZE = 100
RANDOM_SEED = 42

df_eval_products = (
    df_sample
    .drop_duplicates(subset='PRODUCT_ID')
    .sample(n=EVAL_SIZE, random_state=RANDOM_SEED)
)


In [46]:
df_sample = df_sample[
    ~df_sample['PRODUCT_ID'].isin(df_eval_products['PRODUCT_ID'])
]


In [47]:
assert set(df_sample['PRODUCT_ID']).isdisjoint(
       set(df_eval_products['PRODUCT_ID'])
)


In [49]:
print(len(df_sample))
print(len(df_eval_products))

3877
100


In [58]:
df_eval_products[['PRODUCT_ID', 'combined_text', 'claim_strength']].to_csv("data/eval_products_100.csv", index=False)


In [51]:
import re
import pandas as pd

def split_into_sentences(text):
    if pd.isna(text):
        return []

    # Normalize bullet points and line breaks
    text = re.sub(r'[\r\n‚Ä¢\-]+', '. ', str(text))

    # Split on sentence boundaries
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Merge and filter short sentences
    return [s.strip() for s in sentences if len(s.strip()) >= 20]

# Create sentence-level rows
rows = []

for _, row in df_sample.iterrows():  # iterate over products
    sentences = split_into_sentences(row['combined_text'])
    for sent in sentences:
        rows.append({
            'sentence': sent,
            'claim_strength': row['claim_strength']
        })

# Make dataframe
df_sentences = pd.DataFrame(rows)

print(f"Total sentence-level samples: {len(df_sentences):,}")
df_sentences.sample(5, random_state=42)


Total sentence-level samples: 37,207


Unnamed: 0,sentence,claim_strength
22225,"Due to manual measurement, please kindly allow 1.",strong
19034,Window panels set of 2.,strong
5553,We use flannel cotton that‚Äôs lightweight yet a...,strong
12032,> Heavy Duty: Our Commercial Grade Balls weigh...,strong
11898,"Drop us a line, we'll be happy to assist .",strong


In [52]:
# AUTO-LABEL
print("\n" + "="*50)
print("AUTO-LABELING")
print("="*50)

def auto_label(text, strength):
    """
    Auto-label with claim strength as hint
    """
    if pd.isna(text) or len(str(text)) < 20:
        return 'SKIP'
    
    text_lower = str(text).lower()
    
    # SPECIFIC indicators
    certifications = [
        'certified', 'certification',
        'gots', 'fsc', 'usda organic', 'grs', 
        'fair trade', 'fairtrade',
        'ecocert', 'oeko-tex',
        'rainforest alliance',
        'certified by', 'approved by', 'verified by'
    ]
    has_cert = any(cert in text_lower for cert in certifications)
    
    # Has percentages or measurements
    has_metrics = bool(re.search(r'\d+\s*%|\d+\s*grams?|\d+\s*kg', text))
    
    # Specific materials/processes with details
    specific_details = [
        'organic cotton', 'recycled polyester', 'recycled plastic',
        'post-consumer recycled', 'pre-consumer recycled',
        'carbon neutral', 'carbon offset', 'carbon footprint'
    ]
    has_specific = any(detail in text_lower for detail in specific_details)
    
    # VAGUE indicators (just buzzwords)
    vague_only = [
        'eco-friendly', 'eco friendly',
        'environmentally friendly',
        'sustainable', 'sustainability',
        'green', 'green product',
        'natural', 'nature',
        'earth friendly', 'planet friendly'
    ]
    has_vague_only = any(term in text_lower for term in vague_only)
    
    # DECISION LOGIC with strength hints
    
    # Strong claims: likely SPECIFIC
    if strength == 'strong':
        if has_cert or has_metrics:
            return 'SPECIFIC'
        elif has_specific:
            return 'SPECIFIC'
        else:
            return 'UNCERTAIN'
    
    # Weak claims: likely VAGUE
    elif strength == 'weak':
        if has_cert or has_metrics:
            return 'SPECIFIC'  # Even weak claims can have certs
        elif has_vague_only and not has_specific:
            return 'VAGUE'
        else:
            return 'UNCERTAIN'
    
    # Medium claims: mixed
    else:  # medium
        if has_cert or has_metrics:
            return 'SPECIFIC'
        elif has_specific:
            return 'SPECIFIC'
        elif has_vague_only:
            return 'VAGUE'
        else:
            return 'UNCERTAIN'


AUTO-LABELING


In [53]:
# Apply labeling
df_sentences['label'] = df_sentences.apply(
    lambda row: auto_label(row['sentence'], row['claim_strength']),
    axis=1
)

print("\nSentence-level label distribution:")
print(df_sentences['label'].value_counts())



Sentence-level label distribution:
label
UNCERTAIN    32460
SPECIFIC      3755
VAGUE          992
Name: count, dtype: int64


In [54]:
df_labeled = df_sentences[df_sentences['label'].isin(['SPECIFIC', 'VAGUE'])].copy()

print(f"\nConfident sentence-level labels: {len(df_labeled):,}")
print(df_labeled['label'].value_counts())



Confident sentence-level labels: 4,747
label
SPECIFIC    3755
VAGUE        992
Name: count, dtype: int64


In [55]:
label_map = {
    'SPECIFIC': 1,   # VERIFIABLE
    'VAGUE': 0       # NON-VERIFIABLE
}

df_labeled['label_id'] = df_labeled['label'].map(label_map)


In [56]:
#Show label breakdown by strength
print("\nLabel breakdown by claim strength:")
print(pd.crosstab(df_labeled['claim_strength'], df_labeled['label']))


Label breakdown by claim strength:
label           SPECIFIC  VAGUE
claim_strength                 
medium               942    278
strong              2783      0
weak                  30    714


In [57]:
df_labeled[['sentence', 'label_id']].to_csv(
    'data/training_sentences.csv',
    index=False
)

print(f"‚úì Saved {len(df_labeled):,} sentence-level training examples")


‚úì Saved 4,747 sentence-level training examples
