In [7]:
import pandas as pd

df = pd.read_csv("../data/training_sentences.csv")

df["label"] = df["label_id"].map({
    0: "VAGUE",
    1: "SPECIFIC"
})

print(df["label"].value_counts())


label
SPECIFIC    3755
VAGUE        992
Name: count, dtype: int64


In [8]:
import re

LINGUISTIC_PATTERNS = {
    "certification": r'\b(certified|gots|fsc|usda|fair trade|ecocert|oeko-tex)\b',
    "quantification": r'\b\d+\s*(%|kg|g|grams|liters)\b',
    "process_transparency": r'\b(made from|manufactured using|sourced from|produced with)\b',
    "carbon_metrics": r'\b(carbon neutral|carbon footprint|carbon offset)\b',

    "vague_buzzwords": r'\b(eco-friendly|environmentally friendly|sustainable|green|natural)\b',
    "emotional_language": r'\b(care for|protect|love|help save)\b',
    "hedging_language": r'\b(aims to|designed to|helps to|working towards)\b'
}


In [9]:
def extract_patterns(text):
    text = str(text).lower()
    return {
        pattern: int(bool(re.search(regex, text)))
        for pattern, regex in LINGUISTIC_PATTERNS.items()
    }

pattern_df = df["sentence"].apply(extract_patterns).apply(pd.Series)

df_analysis = pd.concat([df, pattern_df], axis=1)


In [10]:
feature_cols = list(LINGUISTIC_PATTERNS.keys())

df_analysis[feature_cols] = df_analysis[feature_cols].astype(int)


In [11]:
pattern_summary = (
    df_analysis
    .groupby("label")[feature_cols]
    .mean()
    .T
)

pattern_summary["difference"] = (
    pattern_summary["SPECIFIC"] - pattern_summary["VAGUE"]
)

pattern_summary.sort_values("difference", ascending=False)


label,SPECIFIC,VAGUE,difference
certification,0.094541,0.0,0.094541
process_transparency,0.096937,0.055444,0.041494
quantification,0.042344,0.017137,0.025206
carbon_metrics,0.008522,0.0,0.008522
hedging_language,0.008256,0.011089,-0.002833
emotional_language,0.011185,0.019153,-0.007968
vague_buzzwords,0.142743,0.742944,-0.600201


In [12]:
from scipy.stats import chi2_contingency

results = []

for feature in feature_cols:
    contingency = pd.crosstab(
        df_analysis[feature],
        df_analysis["label"]
    )

    chi2, p, _, _ = chi2_contingency(contingency)
    results.append((feature, p))

stats_df = pd.DataFrame(results, columns=["feature", "p_value"])
stats_df.sort_values("p_value")


Unnamed: 0,feature,p_value
4,vague_buzzwords,0.0
0,certification,1.5216650000000002e-23
2,process_transparency,5.473354e-05
1,quantification,0.0002699947
3,carbon_metrics,0.006949439
5,emotional_language,0.0682538
6,hedging_language,0.5112726
