In [20]:
import sys
sys.path.append("../src")

In [21]:
import pandas as pd
from pathlib import Path
import hashlib
import re
from labels import LABELS, label2id

In [22]:
# Directory that the cells below read the per-company CSVs from and write the
# training-ready file to.
processed_dir = Path("../data/processed")

# Combined clause table; built from processed_dir so the location is spelled
# out only once.
df = pd.read_csv(processed_dir / "all_clauses_clean.csv")

### Merge per-company CSVs and add source_policy from filename

In [23]:
# Collect every per-company labeled clause CSV. sorted() keeps the merge order
# deterministic, which also fixes which row wins when duplicates are dropped
# with keep-first semantics later on.
files = sorted(processed_dir.glob("*_clauses_labeled.csv"))
print("Found files:", len(files))

# Fail fast with a readable message: pd.concat([]) would otherwise raise an
# opaque "No objects to concatenate" error further down.
if not files:
    raise FileNotFoundError(
        f"No '*_clauses_labeled.csv' files found in {processed_dir.resolve()}"
    )

dfs = []
for fp in files:
    df_i = pd.read_csv(fp)

    # Every file must provide the clause text.
    if "clause" not in df_i.columns:
        raise ValueError(f"{fp.name} missing 'clause' column. Columns: {df_i.columns}")

    # Some files name the label column "labels"; normalize it to "label".
    if "label" not in df_i.columns and "labels" in df_i.columns:
        df_i = df_i.rename(columns={"labels": "label"})

    # A file without any label column would be merged with NaN labels, and
    # those rows would then vanish silently in any filter on the label value.
    if "label" not in df_i.columns:
        raise ValueError(
            f"{fp.name} missing 'label' column. Columns: {list(df_i.columns)}"
        )

    # Tag each row with the file it came from,
    # e.g. "google_privacy_clauses_labeled".
    df_i["source_policy"] = fp.stem
    dfs.append(df_i)

all_df = pd.concat(dfs, ignore_index=True)
print(all_df.columns)
print("Total rows:", len(all_df))
print("Unique policies:", all_df["source_policy"].nunique())

Found files: 65
Index(['clause', 'label', 'source_policy'], dtype='object')
Total rows: 17755
Unique policies: 65


### Clean / restrict to your final 7 labels

In [24]:
# Only rows whose label is one of these seven names are kept.
KEEP = {"none","data_sharing","tracking","refunds","location","arbitration","ai_decisions"}

# Build the boolean mask first, then take an independent copy of the kept rows
# so later column assignments do not write into a view of the merged frame.
is_kept_label = all_df["label"].isin(KEEP)
all_df = all_df.loc[is_kept_label].copy()

# Class balance after filtering (displayed as the cell output).
all_df["label"].value_counts()

label
none            15366
data_sharing     1178
tracking          798
refunds           138
location          119
ai_decisions       69
arbitration        63
Name: count, dtype: int64

### Add clause_key (for leakage-proof splitting) + deduplicate

In [25]:
import re, hashlib

# Typographic punctuation folded to ASCII. The code points are written as
# escapes so the table cannot be corrupted when the file is re-saved or decoded
# with the wrong encoding.
_TYPOGRAPHIC_TO_ASCII = str.maketrans({
    "\u201c": '"',  # left double quotation mark
    "\u201d": '"',  # right double quotation mark
    "\u2018": "'",  # left single quotation mark
    "\u2019": "'",  # right single quotation mark / apostrophe
    "\u2013": "-",  # en dash
    "\u2014": "-",  # em dash
})


def normalize_clause(text: str) -> str:
    """Return a canonical form of a clause for duplicate detection.

    The value is coerced with ``str()``, lower-cased and stripped, runs of
    whitespace are collapsed to a single space, and curly quotes / en and em
    dashes are replaced by their ASCII equivalents.

    Args:
        text: Raw clause text. Non-string values are converted with ``str()``.

    Returns:
        The normalized clause string.
    """
    text = str(text).lower().strip()
    text = re.sub(r"\s+", " ", text)
    return text.translate(_TYPOGRAPHIC_TO_ASCII)

def clause_hash(text: str) -> str:
    """Return the SHA-1 hex digest of the normalized clause text.

    The text is passed through ``normalize_clause`` and encoded as UTF-8
    before hashing, so clauses that normalize to the same string share a key.
    """
    canonical = normalize_clause(text)
    digest = hashlib.sha1(canonical.encode("utf-8"))
    return digest.hexdigest()

# One key per row: the hash of the normalized clause text.
all_df["clause_key"] = all_df["clause"].map(clause_hash)

# Keep only the first row for each key. Rows are in merge order, so the
# surviving row's label and source_policy come from whichever file was
# concatenated first.
is_repeat = all_df.duplicated(subset=["clause_key"])
all_df = all_df.loc[~is_repeat].copy()

n_rows_after = len(all_df)
n_policies_after = all_df["source_policy"].nunique()
print("After dedup rows:", n_rows_after)
print("Policies:", n_policies_after)

After dedup rows: 12037
Policies: 48


### Recreate label_id consistent with your labels.py

In [26]:
# NOTE(review): LABELS and label2id are also imported from labels.py at the top
# of the notebook; this cell rebinds both names. Confirm the list below matches
# labels.py so the ids written here agree with the ones used for training.
LABELS = ["none","data_sharing","tracking","refunds","location","arbitration","ai_decisions"]
label2id = {name: idx for idx, name in enumerate(LABELS)}

label_ids = all_df["label"].map(label2id)

# A label missing from LABELS maps to NaN, which would silently turn the whole
# column into floats and write empty ids to the training file. Stop instead.
unmapped = all_df.loc[label_ids.isna(), "label"].unique()
if len(unmapped) > 0:
    raise ValueError(f"Labels not present in LABELS: {sorted(map(str, unmapped))}")

# Pin the dtype so the ids are always integers.
all_df["label_id"] = label_ids.astype("int64")

# final columns for training
final_df = all_df[["clause", "label", "label_id", "source_policy", "clause_key"]].copy()
final_df.head()

Unnamed: 0,clause,label,label_id,source_policy,clause_key
0,"Conditions of Use Last updated: May 30, 2025 W...",none,0,Amazon_COU_clauses_labeled,6b3563a887eb2587b72f0706440733326e50dfe2
1,Amazon.com Services LLC and/or its affiliates ...,none,0,Amazon_COU_clauses_labeled,2cdbad074598ad0cb1e0dbfcc0d68aef9cb3eb2a
2,"By using the Amazon Services, you agree, on be...",none,0,Amazon_COU_clauses_labeled,1a70b6f161b2f65af5978cbad8afb04856570811
3,"We offer a wide range of Amazon Services, and ...",none,0,Amazon_COU_clauses_labeled,4125d9f1472e5afa6ae2c8c46d406bc28935709f
4,"When you use an Amazon Service (for example, Y...",none,0,Amazon_COU_clauses_labeled,13bab015ac9909c43f312c16f66b3db56e880ff6


### Save a new training-ready file

In [27]:
# Write the training-ready table next to the other processed files, without
# the DataFrame index column.
out_path = processed_dir.joinpath("fineprint_train_ready_dedup_with_source.csv")
final_df.to_csv(out_path, index=False)

# Summarize what was written: location, class balance, and policy coverage.
print("Saved:", out_path)
label_counts = final_df["label"].value_counts()
print(label_counts)
n_policies = final_df["source_policy"].nunique()
print("Unique policies:", n_policies)

Saved: ../data/processed/fineprint_train_ready_dedup_with_source.csv
label
none            10443
data_sharing      794
tracking          487
refunds           131
location           73
arbitration        59
ai_decisions       50
Name: count, dtype: int64
Unique policies: 48
