Auto-detects each dataset’s raw schema.

Maps each raw schema onto one of two standardized column sets, one for NLP corpora and one for transactions:

* Text corpora: text, label, intent, slots, conversation_id, turn_id, user_id, timestamp, source

* Transactions: invoice, item, description, qty, timestamp, price, user_id, country, source


Prints:

* Found columns per dataset

* Mapping used per dataset

* Head() of standardized frame

Visualizes:

* Label distribution (if present)

* Basic text length histograms (if text present)

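Saves each standardized frame as Parquet: audit_outputs/standardized/{dataset}.parquet (the Opendatabay adapters append _tx or _nlp to the dataset name, depending on which schema was detected)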

Quick schema cheat-sheet (expected fields by source)
Relational strategies in customer service

Text-like: text/utterance/message

Label-like: label/strategy/category

Optional: conversation_id, turn_id, user_id, timestamp

3K conversations dataset for chatbot

Text-like: text/utterance/message

Optional: intent/label/slots, conversation_id/dialogue_id, turn_id, user/speaker, timestamp

Customer support on Twitter

Typical: text, created_at/timestamp, author_id/user_id, conversation/thread IDs; sometimes label/intent absent

Stanford Sentiment Treebank

Text-like: sentence/text/phrase; label-like: label (binary, or a 5-class index / probability bins)

SNIPS NLU

utterance, intent, slots

Topic modeling with BERT input

A single text/content column; unlabeled docs

Reuters-21578

text/body, topics (multi-label), optional title/date/split

E-commerce data (Online Retail)

InvoiceNo, StockCode, Description, Quantity, InvoiceDate, UnitPrice, CustomerID, Country

Opendatabay consumer datasets

Either transaction-like (invoice, item, qty, price, timestamp, customer, country) or text/review-like (text, rating/label)

In [1]:
# Setup
import os, re, json, glob, warnings
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

# Silence all warnings for the rest of the session.
warnings.filterwarnings(action="ignore")

# Apply the seaborn "whitegrid" theme to every plot produced below.
sns.set(style="whitegrid")

# Output folder for the standardized parquet files and the plots.
os.makedirs("audit_outputs/standardized", exist_ok=True)


In [2]:
# Shared helpers

def savefig(path):
    """Tighten the layout of the current figure, write it to ``path``, close it.

    Args:
        path: Destination file for the image (saved at 160 dpi).
    """
    fig = plt.gcf()
    fig.tight_layout()
    fig.savefig(path, dpi=160)
    # Close the figure so repeated plotting does not accumulate open figures.
    plt.close(fig)

def print_header(title):
    """Print ``title`` framed by two 90-character ``=`` rules.

    A blank line is emitted before the first rule.
    """
    rule = "=" * 90
    print("", rule, title, rule, sep="\n")

def show_schema(df, name):
    """Print the shape, column names and first three rows of a raw frame.

    Args:
        df: Raw DataFrame as loaded from disk.
        name: Dataset name used in the printed header.
    """
    print_header(f"{name} | raw schema")
    print(f"Shape: {df.shape}")
    print("Columns:", list(df.columns))
    preview = df.head(3)
    # `display` is only defined inside IPython/Jupyter sessions; fall back to
    # plain printing so the adapters also work when run as a script.
    try:
        render = display
    except NameError:
        render = print
    render(preview)

def plot_label_dist(df, label_col, name):
    """Save a bar chart of value counts for ``df[label_col]``.

    The image is written to
    ``audit_outputs/standardized/{name}_label_distribution.png``.

    Args:
        df: Frame holding the label column.
        label_col: Name of the column whose values are counted.
        name: Dataset name, used in the plot title and the file name.
    """
    target = f"audit_outputs/standardized/{name}_label_distribution.png"
    freq = df[label_col].value_counts()

    plt.figure(figsize=(10, 4))
    sns.barplot(x=freq.index, y=freq.values, palette="Blues_d")
    plt.title(f"{name}: {label_col} distribution")
    plt.xticks(rotation=45, ha="right")

    savefig(target)
    print(f"Saved label distribution plot -> {target}")

def plot_text_lengths(df, text_col, name):
    """Save a histogram of whitespace-token counts for ``df[text_col]``.

    Missing values are treated as empty strings (zero tokens). The image is
    written to ``audit_outputs/standardized/{name}_text_lengths.png``.

    Args:
        df: Frame holding the text column.
        text_col: Name of the text column.
        name: Dataset name, used in the plot title and the file name.
    """
    target = f"audit_outputs/standardized/{name}_text_lengths.png"
    texts = df[text_col].fillna("").astype(str)
    token_counts = texts.str.split().map(len)

    plt.figure(figsize=(8, 4))
    sns.histplot(token_counts, bins=60, kde=False)
    plt.title(f"{name}: token length histogram")

    savefig(target)
    print(f"Saved text length plot -> {target}")


In [3]:
# Unified schema and mapping utilities

# Standardized columns (and their order) for text / NLP corpora.
NLP_TARGET = [
    "text",
    "label",
    "intent",
    "slots",
    "conversation_id",
    "turn_id",
    "user_id",
    "timestamp",
    "source",
]

# Standardized columns (and their order) for transaction tables.
TX_TARGET = [
    "invoice",
    "item",
    "description",
    "qty",
    "timestamp",
    "price",
    "user_id",
    "country",
    "source",
]

def map_first_present(d, candidates, default=None):
    """Return the first name in ``candidates`` that is a column of ``d``.

    Args:
        d: DataFrame whose columns are searched.
        candidates: Column names to try, in priority order.
        default: Value returned when none of the candidates is present.

    Returns:
        The matching column name, or ``default``.
    """
    present = d.columns
    return next((name for name in candidates if name in present), default)

def ensure_cols(df, cols):
    """Return a copy of ``df`` restricted to ``cols``, in that order.

    Columns listed in ``cols`` but absent from ``df`` are added and filled
    with NaN. The input frame is not modified.
    """
    padded = df.copy()
    absent = [name for name in cols if name not in padded.columns]
    for name in absent:
        padded[name] = np.nan
    return padded[cols]


In [4]:
# Adapters for each dataset
# Note: Paths assume you’ve downloaded/unzipped raw files under data/raw/{shortname}/. Adjust file patterns as needed.

In [5]:
# Relational strategies in customer service

def load_relational_strategies(base="data/raw/relational_strategies"):
    """Load, standardize and persist the relational-strategies corpus.

    Reads every ``*.csv`` and ``*.jsonl`` file directly under ``base``,
    concatenates them, maps the first matching raw column onto each NLP target
    field, writes the label / text-length plots when those fields are
    populated, and saves the result to
    ``audit_outputs/standardized/relational_strategies.parquet``.

    Args:
        base: Directory containing the raw files.

    Returns:
        DataFrame with exactly the ``NLP_TARGET`` columns: one row per raw
        record, or an empty frame when no input file is found.
    """
    # Try CSV/JSONL variants; flexible mapping
    files = glob.glob(os.path.join(base, "*.csv")) + glob.glob(os.path.join(base, "*.jsonl"))
    if not files:
        print(f"No files found under {base}")
        return pd.DataFrame(columns=NLP_TARGET)
    frames = []
    for f in files:
        if f.endswith(".csv"):
            df = pd.read_csv(f, encoding_errors="ignore")
        else:
            df = pd.read_json(f, lines=True)
        df["source"] = "relational_strategies"
        frames.append(df)
    raw = pd.concat(frames, ignore_index=True)
    show_schema(raw, "relational_strategies")

    # Candidate columns, in priority order
    text_cand   = ["text","utterance","message","content","body"]
    label_cand  = ["label","strategy","tag","category"]
    intent_cand = ["intent","intent_label"]
    slots_cand  = ["slots","entities"]
    conv_cand   = ["conversation_id","dialogue_id","thread_id","conv_id"]
    turn_cand   = ["turn_id","turn","message_id","seq"]
    user_cand   = ["user_id","author_id","customer_id","uid"]
    time_cand   = ["timestamp","created_at","time","date"]

    cols = {
        "text": map_first_present(raw, text_cand),
        "label": map_first_present(raw, label_cand),
        "intent": map_first_present(raw, intent_cand),
        "slots": map_first_present(raw, slots_cand),
        "conversation_id": map_first_present(raw, conv_cand),
        "turn_id": map_first_present(raw, turn_cand),
        "user_id": map_first_present(raw, user_cand),
        "timestamp": map_first_present(raw, time_cand),
    }
    print_header("relational_strategies | column mapping")
    print(cols)

    # Build on raw's index so unmapped fields become all-NaN columns of the
    # right length. Assigning scalars into an empty frame would instead yield
    # zero rows whenever none of the candidates is present.
    std = pd.DataFrame(
        {k: raw[v] if v is not None else np.nan for k, v in cols.items()},
        index=raw.index,
    )
    std["source"] = "relational_strategies"
    std = ensure_cols(std, NLP_TARGET)

    if std["label"].notna().any():
        plot_label_dist(std, "label", "relational_strategies")
    if std["text"].notna().any():
        plot_text_lengths(std, "text", "relational_strategies")

    out_path = "audit_outputs/standardized/relational_strategies.parquet"
    std.to_parquet(out_path, index=False)
    print("Saved standardized:", out_path)
    return std


In [6]:
# 3K conversations dataset for chatbot

def load_conversations_3k(base="data/raw/conversations_3k"):
    """Load, standardize and persist the 3K chatbot-conversations dataset.

    Reads every ``*.csv``, ``*.jsonl`` and ``*.json`` file directly under
    ``base``, concatenates them, maps the first matching raw column onto each
    NLP target field, writes the label / text-length plots when those fields
    are populated, and saves the result to
    ``audit_outputs/standardized/conversations_3k.parquet``.

    Args:
        base: Directory containing the raw files.

    Returns:
        DataFrame with exactly the ``NLP_TARGET`` columns: one row per raw
        record, or an empty frame when no input file is found.
    """
    files = [
        path
        for pattern in ("*.csv", "*.jsonl", "*.json")
        for path in glob.glob(os.path.join(base, pattern))
    ]
    if not files:
        print(f"No files found under {base}")
        return pd.DataFrame(columns=NLP_TARGET)
    frames = []
    for f in files:
        if f.endswith(".csv"):
            frames.append(pd.read_csv(f, encoding_errors="ignore"))
        elif f.endswith(".jsonl"):
            frames.append(pd.read_json(f, lines=True))
        else:
            frames.append(pd.read_json(f))
    raw = pd.concat(frames, ignore_index=True)
    raw["source"] = "conversations_3k"
    show_schema(raw, "conversations_3k")

    # Candidate columns, in priority order
    text_cand   = ["text","utterance","message","content"]
    label_cand  = ["label","category","tag"]
    intent_cand = ["intent","intent_label"]
    slots_cand  = ["slots","entities"]
    conv_cand   = ["conversation_id","dialogue_id","thread_id","conv_id","dialogueID"]
    turn_cand   = ["turn_id","turn","message_id","seq","index"]
    user_cand   = ["user_id","author_id","customer_id","uid","speaker_id","speaker"]
    time_cand   = ["timestamp","created_at","time","date"]

    cols = {
        "text": map_first_present(raw, text_cand),
        "label": map_first_present(raw, label_cand),
        "intent": map_first_present(raw, intent_cand),
        "slots": map_first_present(raw, slots_cand),
        "conversation_id": map_first_present(raw, conv_cand),
        "turn_id": map_first_present(raw, turn_cand),
        "user_id": map_first_present(raw, user_cand),
        "timestamp": map_first_present(raw, time_cand),
    }
    print_header("conversations_3k | column mapping")
    print(cols)

    # An explicit index is required: without it, pandas raises ValueError when
    # every value is a scalar, i.e. when no candidate column is present.
    std = pd.DataFrame(
        {k: raw[v] if v is not None else np.nan for k, v in cols.items()},
        index=raw.index,
    )
    std["source"] = "conversations_3k"
    std = ensure_cols(std, NLP_TARGET)

    if std["label"].notna().any():
        plot_label_dist(std, "label", "conversations_3k")
    if std["text"].notna().any():
        plot_text_lengths(std, "text", "conversations_3k")

    out_path = "audit_outputs/standardized/conversations_3k.parquet"
    std.to_parquet(out_path, index=False)
    print("Saved standardized:", out_path)
    return std


In [7]:
# Customer support on Twitter

def load_twitter_support(base="data/raw/twitter_support"):
    """Load, standardize and persist the customer-support-on-Twitter dataset.

    Reads every ``*.csv`` file directly under ``base``, concatenates them,
    maps the first matching raw column onto each NLP target field, writes the
    label / text-length plots when those fields are populated, and saves the
    result to ``audit_outputs/standardized/twitter_support.parquet``.

    Args:
        base: Directory containing the raw CSV files.

    Returns:
        DataFrame with exactly the ``NLP_TARGET`` columns: one row per raw
        record, or an empty frame when no input file is found.
    """
    files = glob.glob(os.path.join(base, "*.csv"))
    if not files:
        print(f"No files found under {base}")
        return pd.DataFrame(columns=NLP_TARGET)
    raw = pd.concat([pd.read_csv(f, encoding_errors="ignore") for f in files], ignore_index=True)
    raw["source"] = "twitter_support"
    show_schema(raw, "twitter_support")

    # Known-ish fields in popular releases, in priority order
    text_cand   = ["text","tweet_text","body"]
    time_cand   = ["created_at","timestamp","time"]
    user_cand   = ["author_id","user_id","userid"]
    conv_cand   = ["conversation_id","in_response_to_tweet_id","thread_id"]
    turn_cand   = ["turn_id","seq"]
    label_cand  = ["label","category","tag"]
    intent_cand = ["intent","intent_label"]
    slots_cand  = ["slots","entities"]

    cols = {
        "text": map_first_present(raw, text_cand),
        "label": map_first_present(raw, label_cand),
        "intent": map_first_present(raw, intent_cand),
        "slots": map_first_present(raw, slots_cand),
        "conversation_id": map_first_present(raw, conv_cand),
        "turn_id": map_first_present(raw, turn_cand),
        "user_id": map_first_present(raw, user_cand),
        "timestamp": map_first_present(raw, time_cand),
    }
    print_header("twitter_support | column mapping")
    print(cols)

    # An explicit index is required: without it, pandas raises ValueError when
    # every value is a scalar, i.e. when no candidate column is present.
    std = pd.DataFrame(
        {k: raw[v] if v is not None else np.nan for k, v in cols.items()},
        index=raw.index,
    )
    std["source"] = "twitter_support"
    std = ensure_cols(std, NLP_TARGET)

    if std["label"].notna().any():
        plot_label_dist(std, "label", "twitter_support")
    if std["text"].notna().any():
        plot_text_lengths(std, "text", "twitter_support")

    out_path = "audit_outputs/standardized/twitter_support.parquet"
    std.to_parquet(out_path, index=False)
    print("Saved standardized:", out_path)
    return std


In [8]:
# Stanford Sentiment Treebank (SST)

def load_sst(base="data/raw/sst"):
    """Load, standardize and persist the Stanford Sentiment Treebank files.

    Reads every ``*.tsv`` (tab-separated) and ``*.csv`` file directly under
    ``base``, concatenates them, maps a text-like and a label-like column onto
    the NLP target schema (all other fields stay NaN), writes the label /
    text-length plots when those fields are populated, and saves the result
    to ``audit_outputs/standardized/sst.parquet``.

    Args:
        base: Directory containing the raw files.

    Returns:
        DataFrame with exactly the ``NLP_TARGET`` columns: one row per raw
        record, or an empty frame when no input file is found.
    """
    # Support tsv/csv with sentence or text and label
    files = glob.glob(os.path.join(base, "*.tsv")) + glob.glob(os.path.join(base, "*.csv"))
    if not files:
        print(f"No files found under {base}")
        return pd.DataFrame(columns=NLP_TARGET)
    frames = []
    for f in files:
        if f.endswith(".tsv"):
            frames.append(pd.read_csv(f, sep="\t", encoding_errors="ignore"))
        else:
            frames.append(pd.read_csv(f, encoding_errors="ignore"))
    raw = pd.concat(frames, ignore_index=True)
    raw["source"] = "sst"
    show_schema(raw, "sst")

    text_cand   = ["sentence","text","phrase"]
    label_cand  = ["label","sentiment","sentiment_values","class"]
    cols = {
        "text": map_first_present(raw, text_cand),
        "label": map_first_present(raw, label_cand),
        "intent": None, "slots": None,
        "conversation_id": None, "turn_id": None,
        "user_id": None, "timestamp": None
    }
    print_header("sst | column mapping")
    print(cols)

    # Build on raw's index so unmapped fields become all-NaN columns of the
    # right length. Assigning scalars into an empty frame would instead yield
    # zero rows whenever neither text nor label is found.
    std = pd.DataFrame(
        {k: raw[v] if v is not None else np.nan for k, v in cols.items()},
        index=raw.index,
    )
    std["source"] = "sst"
    std = ensure_cols(std, NLP_TARGET)

    if std["label"].notna().any():
        plot_label_dist(std, "label", "sst")
    if std["text"].notna().any():
        plot_text_lengths(std, "text", "sst")

    out_path = "audit_outputs/standardized/sst.parquet"
    std.to_parquet(out_path, index=False)
    print("Saved standardized:", out_path)
    return std


In [9]:
# SNIPS NLU benchmark

def load_snips(base="data/raw/snips"):
    """Load, standardize and persist the SNIPS NLU benchmark files.

    Reads every ``*.jsonl`` and ``*.json`` file directly under ``base``,
    concatenates them, maps text / label / intent / slots columns onto the NLP
    target schema (conversation, turn, user and timestamp stay NaN), plots the
    intent distribution and text lengths when populated, and saves the result
    to ``audit_outputs/standardized/snips.parquet``.

    Args:
        base: Directory containing the raw files.

    Returns:
        DataFrame with exactly the ``NLP_TARGET`` columns: one row per raw
        record, or an empty frame when no input file is found.
    """
    files = glob.glob(os.path.join(base, "*.jsonl")) + glob.glob(os.path.join(base, "*.json"))
    if not files:
        print(f"No files found under {base}")
        return pd.DataFrame(columns=NLP_TARGET)
    frames = []
    for f in files:
        if f.endswith(".jsonl"):
            frames.append(pd.read_json(f, lines=True))
        else:
            frames.append(pd.read_json(f))
    raw = pd.concat(frames, ignore_index=True)
    raw["source"] = "snips"
    show_schema(raw, "snips")

    text_cand   = ["utterance","text","sentence"]
    intent_cand = ["intent","intentName","intent_name"]
    slots_cand  = ["slots","entities"]
    label_cand  = ["label","category"]  # often no direct "label", intent is target
    cols = {
        "text": map_first_present(raw, text_cand),
        "label": map_first_present(raw, label_cand),
        "intent": map_first_present(raw, intent_cand),
        "slots": map_first_present(raw, slots_cand),
        "conversation_id": None,
        "turn_id": None,
        "user_id": None,
        "timestamp": None
    }
    print_header("snips | column mapping")
    print(cols)

    # An explicit index is required: without it, pandas raises ValueError when
    # every value is a scalar, i.e. when no candidate column is present.
    std = pd.DataFrame(
        {k: raw[v] if v is not None else np.nan for k, v in cols.items()},
        index=raw.index,
    )
    std["source"] = "snips"
    std = ensure_cols(std, NLP_TARGET)

    if std["intent"].notna().any():
        # The intent column is plotted under the temporary name "_intent";
        # that name appears in the plot title, the file uses "snips_intent".
        plot_label_dist(std.rename(columns={"intent":"_intent"}), "_intent", "snips_intent")
    if std["text"].notna().any():
        plot_text_lengths(std, "text", "snips")

    out_path = "audit_outputs/standardized/snips.parquet"
    std.to_parquet(out_path, index=False)
    print("Saved standardized:", out_path)
    return std


In [10]:
# Topic modeling with BERT (inputs)

def load_topic_inputs(base="data/raw/topic_inputs"):
    """Load, standardize and persist the unlabeled topic-modeling documents.

    Uses the ``*.csv`` files directly under ``base`` when any exist;
    otherwise reads the ``*.txt`` files, producing one row per line with the
    stripped line as ``text``. Only the text field is mapped; every other NLP
    target field stays NaN. The result is saved to
    ``audit_outputs/standardized/topic_inputs.parquet``.

    Args:
        base: Directory containing the raw files.

    Returns:
        DataFrame with exactly the ``NLP_TARGET`` columns: one row per raw
        record, or an empty frame when no input file is found.
    """
    # Treat as unlabeled docs; expect a CSV with a text-like column or TXT files
    csvs = glob.glob(os.path.join(base, "*.csv"))
    txts = glob.glob(os.path.join(base, "*.txt"))
    frames = []
    if csvs:
        # CSV files take precedence: TXT files are ignored when both exist.
        frames.extend(pd.read_csv(f, encoding_errors="ignore") for f in csvs)
    elif txts:
        rows = []
        for f in txts:
            with open(f, "r", encoding="utf-8", errors="ignore") as fh:
                rows.extend({"text": line.strip()} for line in fh)
        frames.append(pd.DataFrame(rows))
    else:
        print(f"No files found under {base}")
        return pd.DataFrame(columns=NLP_TARGET)
    raw = pd.concat(frames, ignore_index=True)
    raw["source"] = "topic_inputs"
    show_schema(raw, "topic_inputs")

    text_cand = ["text","content","body","headline","headline_text"]
    cols = {
        "text": map_first_present(raw, text_cand),
        "label": None, "intent": None, "slots": None,
        "conversation_id": None, "turn_id": None,
        "user_id": None, "timestamp": None
    }
    print_header("topic_inputs | column mapping")
    print(cols)

    # An explicit index is required: text is the only mappable field here, so
    # without it a missing text column makes every value a scalar and pandas
    # raises ValueError.
    std = pd.DataFrame(
        {k: raw[v] if v is not None else np.nan for k, v in cols.items()},
        index=raw.index,
    )
    std["source"] = "topic_inputs"
    std = ensure_cols(std, NLP_TARGET)

    if std["text"].notna().any():
        plot_text_lengths(std, "text", "topic_inputs")

    out_path = "audit_outputs/standardized/topic_inputs.parquet"
    std.to_parquet(out_path, index=False)
    print("Saved standardized:", out_path)
    return std


In [None]:
# Reuters-21578 (prefer preprocessed CSV)

def _flatten_reuters_topics(labels):
    """Split comma-separated topic cells into one Series of topic names.

    Brackets and quotes left over from stringified lists (for example
    ``"['grain', 'wheat']"``) are stripped and empty fragments are dropped.
    """
    topics = []
    for cell in labels.dropna().astype(str):
        for fragment in cell.split(","):
            topic = fragment.strip(" \t[]'\"")
            if topic:
                topics.append(topic)
    return pd.Series(topics, dtype="object")


def load_reuters(base="data/raw/reuters21578"):
    """Load, standardize and persist a CSV extraction of Reuters-21578.

    Reads every ``*.csv`` file directly under ``base``, concatenates them,
    maps text, topics (as ``label``) and date (as ``timestamp``) onto the NLP
    target schema, plots the 20 most frequent topics and the text lengths
    when populated, and saves the result to
    ``audit_outputs/standardized/reuters21578.parquet``.

    Args:
        base: Directory containing the pre-extracted CSV files.

    Returns:
        DataFrame with exactly the ``NLP_TARGET`` columns: one row per raw
        record, or an empty frame when no CSV file is found.
    """
    # Prefer CSV pre-extractions. If only SGM files exist, add a separate parser later.
    files = glob.glob(os.path.join(base, "*.csv"))
    if not files:
        print(f"No CSV files found under {base}. If you have SGM, preprocess to CSV first.")
        return pd.DataFrame(columns=NLP_TARGET)
    raw = pd.concat([pd.read_csv(f, encoding_errors="ignore") for f in files], ignore_index=True)
    raw["source"] = "reuters21578"
    show_schema(raw, "reuters21578")

    text_cand   = ["text","body","BODY"]
    topics_cand = ["topics","TOPICS"]
    date_cand   = ["date","DATE"]
    cols = {
        "text": map_first_present(raw, text_cand),
        "label": map_first_present(raw, topics_cand),  # multi-label possible; keep as is
        "intent": None, "slots": None,
        "conversation_id": None, "turn_id": None,
        "user_id": None, "timestamp": map_first_present(raw, date_cand)
    }
    print_header("reuters21578 | column mapping")
    print(cols)

    # An explicit index is required: without it, pandas raises ValueError when
    # every value is a scalar, i.e. when no candidate column is present.
    std = pd.DataFrame(
        {k: raw[v] if v is not None else np.nan for k, v in cols.items()},
        index=raw.index,
    )
    std["source"] = "reuters21578"
    std = ensure_cols(std, NLP_TARGET)

    if std["label"].notna().any():
        counts = _flatten_reuters_topics(std["label"]).value_counts().head(20)
        # Nothing to draw if every cell reduced to empty fragments.
        if not counts.empty:
            plt.figure(figsize=(10,4))
            sns.barplot(x=counts.index, y=counts.values, palette="Greens_d")
            plt.xticks(rotation=45, ha="right")
            plt.title("reuters21578: top topics")
            savefig("audit_outputs/standardized/reuters21578_topics.png")
            print("Saved topics plot -> audit_outputs/standardized/reuters21578_topics.png")

    if std["text"].notna().any():
        plot_text_lengths(std, "text", "reuters21578")

    out_path = "audit_outputs/standardized/reuters21578.parquet"
    std.to_parquet(out_path, index=False)
    print("Saved standardized:", out_path)
    return std


In [None]:
# E-commerce data (Online Retail)

def load_ecommerce(base="data/raw/ecommerce"):
    """Load, standardize and persist the Online Retail e-commerce data.

    Reads every ``*.csv`` and ``*.xlsx`` file directly under ``base``,
    concatenates them, maps the first matching raw column onto each
    transaction target field, coerces ``qty`` and ``price`` to numeric
    (unparseable values become NaN) while printing their summaries, and saves
    the result to ``audit_outputs/standardized/ecommerce.parquet``.

    Args:
        base: Directory containing the raw files.

    Returns:
        DataFrame with exactly the ``TX_TARGET`` columns: one row per raw
        record, or an empty frame when no input file is found.
    """
    files = glob.glob(os.path.join(base, "*.csv")) + glob.glob(os.path.join(base, "*.xlsx"))
    if not files:
        print(f"No files found under {base}")
        return pd.DataFrame(columns=TX_TARGET)
    frames = []
    for f in files:
        if f.endswith(".csv"):
            frames.append(pd.read_csv(f, encoding_errors="ignore"))
        else:
            frames.append(pd.read_excel(f))
    raw = pd.concat(frames, ignore_index=True)
    raw["source"] = "ecommerce"
    show_schema(raw, "ecommerce")

    # Candidate columns, in priority order
    invoice_cand = ["InvoiceNo","invoice"]
    item_cand    = ["StockCode","item","SKU","product_id"]
    desc_cand    = ["Description","item_desc","Product_Description","description"]
    qty_cand     = ["Quantity","qty"]
    time_cand    = ["InvoiceDate","timestamp","Date","Transaction_Date"]
    price_cand   = ["UnitPrice","price","Price","Purchase_Amount"]
    user_cand    = ["CustomerID","user_id","Customer_ID"]
    country_cand = ["Country","country","Location"]

    cols = {
        "invoice": map_first_present(raw, invoice_cand),
        "item": map_first_present(raw, item_cand),
        "description": map_first_present(raw, desc_cand),
        "qty": map_first_present(raw, qty_cand),
        "timestamp": map_first_present(raw, time_cand),
        "price": map_first_present(raw, price_cand),
        "user_id": map_first_present(raw, user_cand),
        "country": map_first_present(raw, country_cand),
    }
    print_header("ecommerce | column mapping")
    print(cols)

    # An explicit index is required: without it, pandas raises ValueError when
    # every value is a scalar, i.e. when no candidate column is present.
    std = pd.DataFrame(
        {k: raw[v] if v is not None else np.nan for k, v in cols.items()},
        index=raw.index,
    )
    std["source"] = "ecommerce"
    std = ensure_cols(std, TX_TARGET)

    print_header("ecommerce | quick numeric sanity")
    # Both columns always exist after ensure_cols, so no presence check is needed.
    for c in ["qty","price"]:
        std[c] = pd.to_numeric(std[c], errors="coerce")
        print(c, "summary:", std[c].describe())

    out_path = "audit_outputs/standardized/ecommerce.parquet"
    std.to_parquet(out_path, index=False)
    print("Saved standardized:", out_path)
    return std


In [None]:
# Opendatabay consumer dataset A (schema discovery)

def load_opendatabay_a(base="data/raw/opendatabay_a"):
    """Load Opendatabay dataset A and standardize it as TX or NLP data.

    Reads every ``*.csv`` and ``*.json`` file directly under ``base`` and
    concatenates them. If any lower-cased column name is a known transaction
    marker, the frame is mapped onto the transaction schema and saved to
    ``audit_outputs/standardized/opendatabay_a_tx.parquet``; otherwise it is
    mapped onto the NLP schema and saved to
    ``audit_outputs/standardized/opendatabay_a_nlp.parquet``.

    Args:
        base: Directory containing the raw files.

    Returns:
        DataFrame with the ``TX_TARGET`` or ``NLP_TARGET`` columns, one row
        per raw record; a column-less empty frame when no file is found.
    """
    files = glob.glob(os.path.join(base, "*.csv")) + glob.glob(os.path.join(base, "*.json"))
    if not files:
        print(f"No files found under {base}")
        return pd.DataFrame()
    frames = []
    for f in files:
        if f.endswith(".csv"):
            frames.append(pd.read_csv(f, encoding_errors="ignore"))
        else:
            frames.append(pd.read_json(f))
    raw = pd.concat(frames, ignore_index=True)
    raw["source"] = "opendatabay_a"
    show_schema(raw, "opendatabay_a")

    # Heuristic: if looks like transactions, map TX schema; else, try NLP schema
    tx_markers = {
        "invoice","invoiceno","stockcode","quantity","unitprice","customerid",
        "country","transaction_date","purchase_amount",
        # Space-separated spellings that the TX candidate lists below map;
        # without them such files were routed to the NLP branch.
        "item purchased","transaction date","purchase amount (usd)","customer id",
    }
    # str() guards against non-string column labels (e.g. integers from JSON arrays).
    lower_cols = {str(c).lower() for c in raw.columns}
    looks_tx = not tx_markers.isdisjoint(lower_cols)
    if looks_tx:
        invoice_cand = ["InvoiceNo","invoice"]
        item_cand    = ["StockCode","item","SKU","product_id","Item Purchased"]
        desc_cand    = ["Description","item_desc","Product_Description","description","Item Purchased"]
        qty_cand     = ["Quantity","qty"]
        time_cand    = ["InvoiceDate","timestamp","Date","Transaction_Date","Transaction Date"]
        price_cand   = ["UnitPrice","price","Price","Purchase_Amount","Purchase Amount (USD)"]
        user_cand    = ["CustomerID","user_id","Customer_ID","Customer ID"]
        country_cand = ["Country","country","Location"]

        cols = {
            "invoice": map_first_present(raw, invoice_cand),
            "item": map_first_present(raw, item_cand),
            "description": map_first_present(raw, desc_cand),
            "qty": map_first_present(raw, qty_cand),
            "timestamp": map_first_present(raw, time_cand),
            "price": map_first_present(raw, price_cand),
            "user_id": map_first_present(raw, user_cand),
            "country": map_first_present(raw, country_cand),
        }
        mapping_title = "opendatabay_a | TX column mapping"
        target_cols = TX_TARGET
        out_path = "audit_outputs/standardized/opendatabay_a_tx.parquet"
    else:
        text_cand   = ["text","review","message","content"]
        label_cand  = ["label","rating","category"]
        intent_cand = ["intent"]
        time_cand   = ["timestamp","date","created_at"]
        user_cand   = ["user_id","customer_id","author_id"]
        cols = {
            "text": map_first_present(raw, text_cand),
            "label": map_first_present(raw, label_cand),
            "intent": map_first_present(raw, intent_cand),
            "slots": None,
            "conversation_id": None, "turn_id": None,
            "user_id": map_first_present(raw, user_cand),
            "timestamp": map_first_present(raw, time_cand),
        }
        mapping_title = "opendatabay_a | NLP column mapping"
        target_cols = NLP_TARGET
        out_path = "audit_outputs/standardized/opendatabay_a_nlp.parquet"

    print_header(mapping_title)
    print(cols)
    # An explicit index is required: without it, pandas raises ValueError when
    # every value is a scalar, i.e. when no candidate column is present.
    std = pd.DataFrame(
        {k: raw[v] if v is not None else np.nan for k, v in cols.items()},
        index=raw.index,
    )
    std["source"] = "opendatabay_a"
    std = ensure_cols(std, target_cols)
    std.to_parquet(out_path, index=False)
    print("Saved standardized:", out_path)
    return std


In [None]:
# Opendatabay consumer dataset B (schema discovery)

def load_opendatabay_b(base="data/raw/opendatabay_b"):
    """Load Opendatabay dataset B and standardize it as TX or NLP data.

    Reads every ``*.csv`` and ``*.json`` file directly under ``base`` and
    concatenates them. If any lower-cased column name is a known transaction
    marker, the frame is mapped onto the transaction schema and saved to
    ``audit_outputs/standardized/opendatabay_b_tx.parquet``; otherwise it is
    mapped onto the NLP schema and saved to
    ``audit_outputs/standardized/opendatabay_b_nlp.parquet``.

    Args:
        base: Directory containing the raw files.

    Returns:
        DataFrame with the ``TX_TARGET`` or ``NLP_TARGET`` columns, one row
        per raw record; a column-less empty frame when no file is found.
    """
    files = glob.glob(os.path.join(base, "*.csv")) + glob.glob(os.path.join(base, "*.json"))
    if not files:
        print(f"No files found under {base}")
        return pd.DataFrame()
    frames = []
    for f in files:
        if f.endswith(".csv"):
            frames.append(pd.read_csv(f, encoding_errors="ignore"))
        else:
            frames.append(pd.read_json(f))
    raw = pd.concat(frames, ignore_index=True)
    raw["source"] = "opendatabay_b"
    show_schema(raw, "opendatabay_b")

    # Heuristic: if looks like transactions, map TX schema; else, try NLP schema
    tx_markers = {
        "invoice","invoiceno","stockcode","quantity","unitprice","customerid",
        "country","transaction_date","purchase_amount",
        # Space-separated spellings that the TX candidate lists below map;
        # without them such files were routed to the NLP branch.
        "item purchased","transaction date","purchase amount (usd)","customer id",
    }
    # str() guards against non-string column labels (e.g. integers from JSON arrays).
    lower_cols = {str(c).lower() for c in raw.columns}
    looks_tx = not tx_markers.isdisjoint(lower_cols)
    if looks_tx:
        invoice_cand = ["InvoiceNo","invoice"]
        item_cand    = ["StockCode","item","SKU","product_id","Item Purchased"]
        desc_cand    = ["Description","item_desc","Product_Description","description","Item Purchased"]
        qty_cand     = ["Quantity","qty"]
        time_cand    = ["InvoiceDate","timestamp","Date","Transaction_Date","Transaction Date"]
        price_cand   = ["UnitPrice","price","Price","Purchase_Amount","Purchase Amount (USD)"]
        user_cand    = ["CustomerID","user_id","Customer_ID","Customer ID"]
        country_cand = ["Country","country","Location"]

        cols = {
            "invoice": map_first_present(raw, invoice_cand),
            "item": map_first_present(raw, item_cand),
            "description": map_first_present(raw, desc_cand),
            "qty": map_first_present(raw, qty_cand),
            "timestamp": map_first_present(raw, time_cand),
            "price": map_first_present(raw, price_cand),
            "user_id": map_first_present(raw, user_cand),
            "country": map_first_present(raw, country_cand),
        }
        mapping_title = "opendatabay_b | TX column mapping"
        target_cols = TX_TARGET
        out_path = "audit_outputs/standardized/opendatabay_b_tx.parquet"
    else:
        text_cand   = ["text","review","message","content"]
        label_cand  = ["label","rating","category"]
        intent_cand = ["intent"]
        time_cand   = ["timestamp","date","created_at"]
        user_cand   = ["user_id","customer_id","author_id"]
        cols = {
            "text": map_first_present(raw, text_cand),
            "label": map_first_present(raw, label_cand),
            "intent": map_first_present(raw, intent_cand),
            "slots": None,
            "conversation_id": None, "turn_id": None,
            "user_id": map_first_present(raw, user_cand),
            "timestamp": map_first_present(raw, time_cand),
        }
        mapping_title = "opendatabay_b | NLP column mapping"
        target_cols = NLP_TARGET
        out_path = "audit_outputs/standardized/opendatabay_b_nlp.parquet"

    print_header(mapping_title)
    print(cols)
    # An explicit index is required: without it, pandas raises ValueError when
    # every value is a scalar, i.e. when no candidate column is present.
    std = pd.DataFrame(
        {k: raw[v] if v is not None else np.nan for k, v in cols.items()},
        index=raw.index,
    )
    std["source"] = "opendatabay_b"
    std = ensure_cols(std, target_cols)
    std.to_parquet(out_path, index=False)
    print("Saved standardized:", out_path)
    return std


In [None]:
# Run all adapters and print/save results
# Adjust the base paths to your actual files.

# (dataset name, loader function, raw-data directory) for every adapter.
adapters = [
    ("relational_strategies", load_relational_strategies, "data/raw/relational_strategies"),
    ("conversations_3k", load_conversations_3k, "data/raw/conversations_3k"),
    ("twitter_support", load_twitter_support, "data/raw/twitter_support"),
    ("sst", load_sst, "data/raw/sst"),
    ("snips", load_snips, "data/raw/snips"),
    ("topic_inputs", load_topic_inputs, "data/raw/topic_inputs"),
    ("reuters21578", load_reuters, "data/raw/reuters21578"),
    ("ecommerce", load_ecommerce, "data/raw/ecommerce"),
    ("opendatabay_a", load_opendatabay_a, "data/raw/opendatabay_a"),
    ("opendatabay_b", load_opendatabay_b, "data/raw/opendatabay_b"),
]

# One summary row per adapter that finished without raising.
summaries = []
for name, fn, base in adapters:
    print_header(f"Adapter: {name}")
    try:
        df = fn(base)
        # Files in the output folder whose name starts with the dataset name.
        outputs = [
            p
            for p in os.listdir("audit_outputs/standardized")
            if p.startswith(name)
        ]
        summaries.append(
            {"dataset": name, "rows": len(df), "cols": list(df.columns), "path": outputs}
        )
    except Exception as e:
        # Best effort: report the failure and continue with the next adapter.
        print(f"Error in {name}: {e}")

pd.DataFrame(summaries)
