In [32]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
from pathlib import Path

# Resolve the script filename to an absolute path (relative names are resolved
# against the current working directory); change the filename here if needed.
file_path = Path("CodeAttemptFNAL_SeqVer.py").resolve()
# Echo the resolved path so the reader can confirm which file is referenced.
print(file_path)

/home/nathan/git/fnal-proj/notebooks/CodeAttemptFNAL_SeqVer.py


##### Note: The code is split into two parts: the first gathers the data into a usable form, and the second transforms it. Section headers appear below the code they describe. #####

## Methods ##


(Garble Methods)



add — returns a stable token for an original and increments its count.

original_from_token — reverse-maps a token back to the original string.

record_from_token — fetches the UserRecord associated with a token.

export_to_json — saves the mapper’s current state to disk.

load_from_json — restores the mapper’s state from a saved JSON file.




(Helper Methods)
is_valid_user — checks that a user value is non-null and non-empty.

is_valid_ipv4 — validates IPv4 dotted-quad format and range.

to_jagged_array — builds [[token, count, valid], ...] rows from a mapping; the original values are not included.

dump_json — writes a Python object to a pretty-printed JSON file.




(Data Methods)
load_dataframe — selects required columns from all Parquet files in DATA_DIR.

build_obfuscations — iterates rows to create user/IP token maps with counts/validity.

make_summary_payload — returns anonymized users/IPs jagged arrays, per-user IP correlation rows (these include the original user and IP values), plus meta.

failed_users_payload — returns anonymized records for users with failed jobs plus meta.

In [33]:
import sys
import re
import argparse
import ast
import json
import string
import secrets
from dataclasses import dataclass, asdict #json helper
from typing import Dict, Optional, List, Tuple #typing helper
#transform data
import duckdb
import pandas as pd
#data I/O


from textwrap import fill, indent
#For readable texts

In [34]:
# Directory scanned for input *.parquet files (default for --data-dir).
DATA_DIR = "../data"
# Directory that receives generated outputs (default for --output-dir).
OUTPUT_DIR = "./Output"
HUMAN_WRAP = 100 # wrap width for text output (default for --wrap)

### Import stuff ###

In [35]:
# Default dataframe column names; the CLI flags --user-col, --ip-col,
# --starts-col and --completions-col default to these values.
USER_COL   = "User"
IP_COL     = "JobsubClientIpAddress"
# NOTE(review): FAILED_COL is not referenced by any code visible in this notebook section.
FAILED_COL = "DAG_NodesFailed"  # “boolean-ish”
NUM_STARTS_COL     = "NumJobStarts"
NUM_COMPLETIONS_COL= "NumJobCompletions"

### Config / Column Names ###

In [36]:
# Character classes and class names for obfuscation.
# NOTE(review): GarbleTokenMapper issues sequential "<prefix><n>" tokens and does
# not read any of these; presumably leftovers from a random-token design — confirm
# nothing later in the file uses them before removing.
DIGITS = string.digits
LOWER = string.ascii_lowercase
UPPER = string.ascii_uppercase
DEFAULT_PUNCT = "!#$%&()*+,-.:;<=>?@[]^_{|}~"
CHAR_TYPE_CHOICES = ["digit", "lower", "upper", "punct"]
#obfuscation types

@dataclass #compact record
class UserRecord:
    """Per-original bookkeeping entry stored by GarbleTokenMapper."""
    token: str   # token issued for the original value
    count: int   # how many times the original has been passed to add()
    valid: bool  # validity flag supplied when the original was first added

In [37]:
class GarbleTokenMapper:
    """Issue stable, sequential tokens for original strings and count their uses.

    Each distinct original receives the token ``f"{prefix}{n}"``; ``n`` begins
    at ``start`` and increases for every new original. Adding the same original
    again returns the same token and increments its count. State can be saved
    to and restored from JSON.
    """

    def __init__(
        self,
        prefix: str = "",
        start: int = 1,
        # legacy args kept; ignored
        token_len: int = 8,

        allow_punctuation: bool = False,
        punct_chars: Optional[str] = None,
    ):
        """Create an empty mapper.

        Args:
            prefix: Text prepended to every token number (``None`` becomes ``""``).
            start: Number used for the first issued token.
            token_len, allow_punctuation, punct_chars: Accepted for backward
                compatibility only; they have no effect.
        """
        self.prefix = str(prefix or "")
        self.start = int(start)
        # original -> UserRecord
        self._by_orig: Dict[str, "UserRecord"] = {}
        # token -> original
        self._token_to_orig: Dict[str, str] = {}
        # every token currently in use; consulted so a token is never re-issued
        self._seen_tokens = set()
        # last number handed out (the next token uses counter + 1)
        self._counter = self.start - 1

    @staticmethod
    def _extract_trailing_int(s: str) -> Optional[int]:
        """Return the integer formed by the trailing digits of ``s``, or None."""
        m = re.search(r"(\d+)$", str(s))
        return int(m.group(1)) if m else None

    def _next_token(self) -> str:
        """Advance the counter and return the next token that is not in use.

        Skipping tokens already present protects against state restored from a
        file whose stored counter lags behind the tokens it contains.
        """
        while True:
            self._counter += 1
            token = f"{self.prefix}{self._counter}"
            if token not in self._seen_tokens:
                return token

    def add(self, original: str, valid: bool = True) -> str:
        """Return the token for ``original``, creating it on first sight.

        A known original has its count incremented and keeps its stored
        ``valid`` flag; a new original is recorded with count 1 and ``valid``.
        """
        key = str(original)
        rec = self._by_orig.get(key)
        if rec is not None:
            rec.count += 1
            return rec.token

        token = self._next_token()
        self._seen_tokens.add(token)
        self._by_orig[key] = UserRecord(token=token, count=1, valid=bool(valid))
        self._token_to_orig[token] = key
        return token

    def original_from_token(self, token: str) -> Optional[str]:
        """Return the original string for ``token``, or None if unknown."""
        return self._token_to_orig.get(str(token))

    def record_from_token(self, token: str) -> Optional["UserRecord"]:
        """Return the UserRecord for ``token``, or None if unknown."""
        orig = self._token_to_orig.get(str(token))
        return self._by_orig.get(orig) if orig is not None else None

    def export_to_json(self, filepath: str) -> None:
        """Write all entries plus prefix/start/counter to ``filepath`` as JSON.

        The parent directory is created when it does not exist.
        """
        entries = []
        for orig, rec in self._by_orig.items():
            e = asdict(rec)
            e["original"] = orig
            entries.append(e)
        state = {
            "entries": entries,
            "config": {"prefix": self.prefix, "start": self.start, "counter": self._counter},
        }
        os.makedirs(os.path.dirname(filepath) or ".", exist_ok=True)
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(state, f, indent=2)

    def load_from_json(self, filepath: str) -> None:
        """Replace the mapper's state with the contents of ``filepath``.

        The file is parsed completely before anything is changed, so a
        malformed file (for example an entry without ``"original"`` or
        ``"token"``, which raises KeyError) leaves the current state intact.
        ``prefix`` and ``start`` keep their current values unless the file's
        ``"config"`` section provides them.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)

        entries = data.get("entries", [])
        cfg = data.get("config", {})

        # keep existing prefix/start unless given (in file)
        prefix = str(cfg.get("prefix", self.prefix))
        start = int(cfg.get("start", self.start))

        by_orig: Dict[str, "UserRecord"] = {}
        token_to_orig: Dict[str, str] = {}
        max_num = start - 1
        for e in entries:
            orig = str(e["original"])
            token = str(e["token"])
            by_orig[orig] = UserRecord(
                token=token,
                count=int(e.get("count", 0)),
                valid=bool(e.get("valid", True)),
            )
            token_to_orig[token] = orig
            n = self._extract_trailing_int(token)
            if n is not None:
                max_num = max(max_num, n)

        # without a stored counter, continue after the largest trailing number seen
        counter = int(cfg.get("counter", max_num))

        # Everything parsed: commit. The containers are updated in place so
        # their identity is preserved.
        self.prefix = prefix
        self.start = start
        self._by_orig.clear()
        self._by_orig.update(by_orig)
        self._token_to_orig.clear()
        self._token_to_orig.update(token_to_orig)
        self._seen_tokens.clear()
        self._seen_tokens.update(token_to_orig)
        self._counter = counter


### Token Mapper (Part 2) ###

In [38]:
def export_to_json(self, filepath: str) -> None:
    """Write the entries of a mapper-like object to ``filepath`` as JSON.

    Stand-alone, entries-only variant of ``GarbleTokenMapper.export_to_json``:
    the output is ``{"entries": [...]}`` with no ``"config"`` section.

    Args:
        self: Object exposing ``_by_orig`` (original -> dataclass record).
        filepath: Destination path; its parent directory is created if missing.
    """
    entries = []
    for orig, rec in self._by_orig.items():
        e = asdict(rec)
        e["original"] = orig
        entries.append(e)
    # Create the target directory first so a fresh output location does not fail.
    os.makedirs(os.path.dirname(filepath) or ".", exist_ok=True)
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump({"entries": entries}, f, indent=2)

#### Json "export" ####

In [39]:
# Shape check only: four dot-separated groups of 1-3 digits. The pattern does
# not check that each group is within 0-255.
_ipv4_re = re.compile(r"^\d{1,3}(\.\d{1,3}){3}$")

def is_valid_user(u) -> bool:
    """Return True when ``u`` is not missing and is non-empty after stripping."""
    return (not pd.isna(u)) and str(u).strip() != ""

In [40]:
def is_valid_ipv4(ip) -> bool:
    """Return True when ``ip`` is a dotted-quad IPv4 address with octets 0-255.

    Missing values (None/NaN) are invalid. Surrounding whitespace is ignored.
    Each of the four octets must be 1-3 ASCII digits.
    """
    if pd.isna(ip):
        return False

    s = str(ip).strip()
    # ASCII only: str.isdigit() and int() also accept non-ASCII Unicode digits,
    # which must not count as a valid address.
    if not s.isascii():
        return False

    octets = s.split(".")
    if len(octets) != 4:
        return False

    for octet in octets:
        if not (1 <= len(octet) <= 3 and octet.isdigit()):
            return False

    return all(int(octet) <= 255 for octet in octets)


In [41]:
def to_jagged_array(ob_dict: Dict[str, Dict[str, object]]) -> List[List[object]]:
    """Return one ``[id, count, valid]`` row per entry of ``ob_dict``.

    The mapping's keys are not part of the output; rows follow the mapping's
    iteration order.
    """
    rows: List[List[object]] = []
    for entry in ob_dict.values():
        rows.append([entry["id"], entry["count"], entry["valid"]])
    return rows

In [42]:
def dump_json(obj, path: str):
    """Serialize ``obj`` to ``path`` as 2-space-indented JSON (UTF-8).

    The parent directory of ``path`` is created when it does not exist.
    """
    parent_dir = os.path.dirname(path) or "."
    os.makedirs(parent_dir, exist_ok=True)
    with open(path, "w", encoding="utf-8") as handle:
        json.dump(obj, handle, indent=2)

#### Helpers/Secondary ####

In [43]:
def load_dataframe(data_dir: str) -> pd.DataFrame:
    """Load the desired columns from every ``*.parquet`` file in ``data_dir``.

    Columns from the desired list that are absent from the schema DuckDB
    reports for the file set are skipped with a printed warning.

    Args:
        data_dir: Directory containing the parquet files.

    Returns:
        A DataFrame holding the desired columns that are present.

    Raises:
        FileNotFoundError: ``data_dir`` is not a directory, or it contains no
            ``*.parquet`` files.
        RuntimeError: None of the desired columns exist in the files.
    """
    if not os.path.isdir(data_dir):
        raise FileNotFoundError(f"DATA_DIR does not exist or is not a directory: {data_dir}")

    # Fail early with a clear message instead of an engine error on an empty glob.
    if not any(Path(data_dir).glob("*.parquet")):
        raise FileNotFoundError(f"No .parquet files found in DATA_DIR: {data_dir}")

    pattern = f"{data_dir}/*.parquet"
    # The pattern is embedded in a SQL string literal; double any single quote
    # so a path containing one cannot terminate the literal.
    sql_pattern = pattern.replace("'", "''")

    desired_cols = [
        "User",
        "RequestMemory",
        "CumulativeSlotTime",
        "JobsubClientIpAddress",
        "MATCH_EXP_JOB_Site",
        "DAG_NodesFailed",
        "NumJobCompletions",
        "NumJobStarts",
        "Cmd",
        "Environment",
    ]

    # Learn the available columns without reading any rows.
    # NOTE(review): this relies on DuckDB's default multi-file schema handling;
    # if files differ in schema, consider read_parquet(..., union_by_name=true) — confirm.
    schema_df = duckdb.sql(f"SELECT * FROM read_parquet('{sql_pattern}') LIMIT 0").df()
    available = set(schema_df.columns)

    present = [c for c in desired_cols if c in available]
    missing = [c for c in desired_cols if c not in available]
    if missing:
        print(f"[load_dataframe] Warning: missing columns not found in any file: {missing}")

    if not present:
        raise RuntimeError("None of the desired columns are present in the parquet files.")

    q = ", ".join([f'"{c}"' for c in present])
    query = f"SELECT {q} FROM read_parquet('{sql_pattern}')"
    return duckdb.sql(query).df()


#### Loading Data ####

In [44]:
def build_obfuscations(
    df: pd.DataFrame,
    user_col: str = USER_COL,
    ip_col: str = IP_COL,
) -> Tuple[Dict[str, Dict[str, object]], Dict[str, Dict[str, object]], GarbleTokenMapper, GarbleTokenMapper]:
    """Tokenize every user and IP value in ``df`` and count occurrences.

    Args:
        df: Source rows.
        user_col: Column holding user values.
        ip_col: Column holding client IP values.

    Returns:
        ``(users_dict, ips_dict, user_mapper, ip_mapper)``. Each dict maps the
        stringified original to ``{"id": token, "count": n, "valid": flag}``;
        the mappers are the GarbleTokenMapper instances that issued the tokens
        (prefixes ``"UR"`` and ``"IP"``).
    """
    user_mapper = GarbleTokenMapper(prefix="UR", start=1)
    ip_mapper   = GarbleTokenMapper(prefix="IP", start=1)

    # Walk the two columns directly rather than df.iterrows(): iterrows builds
    # a Series per row (slow) and may coerce values to a common row dtype.
    # A missing column is treated as all-None, i.e. the same value
    # ``row.get(col)`` yields for an absent label.
    users = df[user_col] if user_col in df.columns else [None] * len(df)
    ips = df[ip_col] if ip_col in df.columns else [None] * len(df)

    for u, ip in zip(users, ips):
        user_mapper.add(str(u), valid=is_valid_user(u))
        ip_mapper.add(str(ip), valid=is_valid_ipv4(ip))

    users_dict = {
        orig: {"id": rec.token, "count": rec.count, "valid": rec.valid}
        for orig, rec in user_mapper._by_orig.items()
    }
    ips_dict = {
        orig: {"id": rec.token, "count": rec.count, "valid": rec.valid}
        for orig, rec in ip_mapper._by_orig.items()
    }
    return users_dict, ips_dict, user_mapper, ip_mapper


#### Transform ####

In [45]:
def make_output_json(
    df: pd.DataFrame,
    users_dict: Dict[str, Dict[str, object]],
    ips_dict: Dict[str, Dict[str, object]],
) -> str:
    """
    Return the JSON text (2-space indent) holding the users/ips jagged arrays
    and a small meta section with row and distinct-value counts.
    """
    meta = {
        "total_rows": int(len(df)),
        "distinct_users": int(len(users_dict)),
        "distinct_ips": int(len(ips_dict)),
    }
    return json.dumps(
        {
            "users": to_jagged_array(users_dict),
            "ips": to_jagged_array(ips_dict),
            "meta": meta,
        },
        indent=2,
    )


#### Generic User json (Below) ####

In [46]:
def make_summary_payload(
    df: pd.DataFrame,
    users_dict: Dict[str, Dict[str, object]],
    ips_dict: Dict[str, Dict[str, object]],
    user_col: str = USER_COL,
    ip_col: str = IP_COL,
) -> Dict[str, object]:
    """Build the summary payload: token arrays, user/IP correlations, and meta.

    Returns a dict with:
        ``users`` / ``ips``: ``[token, count, valid]`` rows (no originals).
        ``user_ip_correlations``: one row per user,
            ``[original_user, user_token, mode_ip_original, mode_ip_token,
            count, valid]``.
        ``meta``: total rows and distinct user/IP counts.

    NOTE(review): the correlation rows contain the original user and IP values,
    so this payload as a whole is not anonymized — confirm that is intended.
    """

    def _token_rows(mapping):
        # [token, count, valid] per entry; originals (the keys) are left out
        return [[entry["id"], entry["count"], entry["valid"]] for entry in mapping.values()]

    users_jagged_anon = _token_rows(users_dict)
    ips_jagged_anon = _token_rows(ips_dict)

    def _pick_mode_ip(series: pd.Series) -> Optional[str]:
        # Most frequent non-missing IP of one user's rows, or None if there is none.
        observed = series.dropna().astype(str)
        return None if observed.empty else observed.value_counts().idxmax()

    pairs = df[[user_col, ip_col]].copy()
    # users_dict is keyed by stringified users, so group on the same form
    pairs[user_col] = pairs[user_col].astype(str)
    top_ip_for_user = pairs.groupby(user_col)[ip_col].apply(_pick_mode_ip)

    def _correlation_row(orig_user, udata):
        key_user = str(orig_user)
        ip_orig = top_ip_for_user.get(key_user, None)
        if ip_orig is None:
            ip_token = None
        else:
            ip_token = ips_dict.get(ip_orig, {}).get("id")
        return [
            key_user,                 # original user
            udata["id"],              # garbled user
            ip_orig,                  # user's (mode) IP original
            ip_token,                 # garbled IP
            int(udata["count"]),      # frequency (user count)
            bool(udata["valid"]),     # user validity
        ]

    user_ip_correlations = [
        _correlation_row(orig_user, udata) for orig_user, udata in users_dict.items()
    ]

    meta = {
        "total_rows": int(len(df)),
        "distinct_users": int(len(users_dict)),
        "distinct_ips": int(len(ips_dict)),
    }
    return {
        "users": users_jagged_anon,
        "ips": ips_jagged_anon,
        "user_ip_correlations": user_ip_correlations,
        "meta": meta,
    }


In [47]:
def _s(x):
    #in case of NaN/none
    return "" if pd.isna(x) else str(x)

In [48]:
def _parse_env(env_raw):

    def _sort_key(pair):
        #Case sens.
        return pair[0].lower()

    s = _s(env_raw).strip()
    if not s:
        return []

    # Json
    try:
        obj = json.loads(s)
        if isinstance(obj, dict):
            pairs = []
            for k, v in obj.items():
                val = "" if v is None else str(v)
                pairs.append((str(k), val))
            return sorted(pairs, key=_sort_key)

        # Json arrays of KEY=VAL or dicts.
        if isinstance(obj, list):
            pairs = []
            for item in obj:
                if isinstance(item, dict):
                    for k, v in item.items():
                        val = "" if v is None else str(v)
                        pairs.append((str(k), val))
                elif isinstance(item, str) and "=" in item:
                    k, v = item.split("=", 1)
                    pairs.append((k.strip(), v.strip()))
                else:
                    pairs.append(("ITEM", str(item)))
            return sorted(pairs, key=_sort_key)
    except Exception:
        pass

    # Dict.
    if (s.startswith("{") and s.endswith("}")) or (s.startswith("dict(") and s.endswith(")")):
        try:
            s_jsonish = s.replace("'", "\"")
            obj = json.loads(s_jsonish)
            if isinstance(obj, dict):
                pairs = []
                for k, v in obj.items():
                    val = "" if v is None else str(v)
                    pairs.append((str(k), val))
                return sorted(pairs, key=_sort_key)
        except Exception:
            pass

    # KEY=VAL pars.
    candidates = []
    for delim in [";", ",", "\n"]:
        if delim in s:
            candidates = [p for p in s.split(delim)]
            break
    if not candidates:
        # space-separated tokens; keep tokens that look like KEY=VAL
        candidates = s.split()

    pairs = []
    for token in candidates:
        token = token.strip()
        if not token:
            continue
        if "=" in token:
            k, v = token.split("=", 1)
            pairs.append((k.strip(), v.strip()))
    if pairs:
        return sorted(pairs, key=_sort_key)

    # Fallback
    return [("ENV", s)]


In [49]:
def _wrap_block(text, width):
    return fill(_s(text), width=width, replace_whitespace=False)

In [50]:
def _format_env_block(env_pairs, width, indent_spaces=2):
    if not env_pairs:
        return "  (none)"
    lines = []
    for k, v in env_pairs:
        # "KEY=VALUE" with wrapping of the value
        if v:
            wrapped_v = fill(v, width=width - (len(k) + 1 + indent_spaces),
                             subsequent_indent=" " * (len(k) + 1))
            lines.append(f"{k}={wrapped_v}")
        else:
            lines.append(f"{k}=")
    return indent("\n".join(lines), " " * indent_spaces)

### Helpers ###

In [51]:
# Experiment keywords that build_sensitive_mappers_for_df always tokenizes,
# whether or not they occur in the data. The __main__ cell replaces this set
# with the --experiments argument.
KNOWN_EXPERIMENTS = {"uboone", "icarus", "pip2", "nova", "dune"}

def _extract_user_handle(user_val: str) -> str:
    """Return local-part before '@' if it's an email-like string; else a stripped token."""
    s = _s(user_val)
    if "@" in s:
        return s.split("@", 1)[0].strip()
    return s.strip()

In [52]:
def build_sensitive_mappers_for_df(
    df: pd.DataFrame,
    *,
    user_col: str = USER_COL,
    env_col: str = "Environment",
    user_prefix: str = "UR_",
    exp_prefix: str = "EX_",
) -> tuple[GarbleTokenMapper, GarbleTokenMapper, dict[str, str], dict[str, str]]:
    """Build token mappers for user handles and experiment names found in ``df``.

    User handles come from ``user_col`` (the text before '@'). Experiments are
    every non-empty ``EXPERIMENT`` value parsed from ``env_col`` plus all of
    ``KNOWN_EXPERIMENTS``, which are included even when unseen.

    Tokens are assigned in ``(length, text)`` order so numbering is
    deterministic for a given set of values.

    Returns:
        ``(user_mapper, exp_mapper, user_handle_map, experiment_map)`` where the
        two maps are ``{original: token}`` dicts.
    """
    user_mapper = GarbleTokenMapper(prefix=user_prefix, start=1)
    exp_mapper  = GarbleTokenMapper(prefix=exp_prefix, start=1)

    handles = set()
    if user_col in df.columns:
        for u in df[user_col].dropna().astype(str):
            h = _extract_user_handle(u)
            if h:
                handles.add(h)

    # Start from the known set: every known experiment is tokenized regardless
    # of the data, so there is no need to search each row's text for them.
    experiments = set(KNOWN_EXPERIMENTS)
    if env_col in df.columns:
        for raw in df[env_col].dropna().astype(str):
            # explicit EXPERIMENT=... entries contribute their value
            for k, v in _parse_env(raw):
                if k.upper() == "EXPERIMENT" and v:
                    experiments.add(v.strip())

    def _by_length_then_text(s):
        return (len(s), s)

    # Build sequential tokens
    user_handle_map = {
        h: user_mapper.add(h, valid=True)
        for h in sorted(handles, key=_by_length_then_text)
    }
    experiment_map = {
        ex: exp_mapper.add(ex, valid=True)
        for ex in sorted(experiments, key=_by_length_then_text)
    }

    return user_mapper, exp_mapper, user_handle_map, experiment_map

In [53]:
def compile_greedy_sub_regex(literals: list[str]) -> re.Pattern:
    """Compile one alternation pattern matching any of ``literals`` verbatim.

    Longer literals are listed first so they win over their own substrings.
    Empty strings are dropped (an empty alternative would match at every
    position) and duplicates are collapsed. With nothing left to match, the
    returned pattern never matches.
    """
    usable = [s for s in dict.fromkeys(literals or []) if s]
    if not usable:
        # Match nothing
        return re.compile(r"(?!x)x")
    # longest first so longer alternatives get tried before their substrings
    escaped = [re.escape(s) for s in sorted(usable, key=len, reverse=True)]
    return re.compile("(" + "|".join(escaped) + ")")

In [54]:
def greedy_replace(text: str, mapping: dict[str, str], pattern: re.Pattern) -> str:
    """Replace every ``pattern`` match in ``text`` with its ``mapping`` entry.

    Matches without a mapping entry are left unchanged. Falsy ``text`` (empty
    string or None) is returned as-is.
    """
    if not text:
        return text
    return pattern.sub(lambda m: mapping.get(m.group(0), m.group(0)), text)

In [55]:
def garble_user_email(email: str, user_handle_map: dict[str, str], pat: re.Pattern) -> str:
    """Tokenize the part of ``email`` before the first '@'; the domain is kept.

    Without an '@', the whole string is treated as text that may contain handles.
    """
    text = _s(email)
    local, at_sign, domain = text.partition("@")
    if not at_sign:
        # treat whole string as a handle container
        return greedy_replace(text, user_handle_map, pat)
    garbled_local = greedy_replace(local, user_handle_map, pat)
    return f"{garbled_local}@{domain}"

def garble_row_fields(
    row: pd.Series,
    *,
    user_col: str = USER_COL,
    cmd_col: str = "Cmd",
    env_col: str = "Environment",
    user_handle_map: dict[str, str],
    experiment_map: dict[str, str],
    pat_user: re.Pattern,
    pat_user_anywhere: re.Pattern,   # same as pat_user (optional separate), used across other fields
    pat_exp: re.Pattern,
) -> dict:
    """Return ``row`` as a dict with user handles and experiment names tokenized.

    Only ``user_col``, ``cmd_col`` and ``env_col`` are rewritten; every other
    field is copied unchanged.
    NOTE(review): other identifying columns (e.g. the client IP) therefore pass
    through as-is — confirm that is intended.
    """
    out = row.to_dict()

    # 1) User email (local-part)
    if user_col in row.index:
        out[user_col] = garble_user_email(_s(row[user_col]), user_handle_map, pat_user)

    # 2) Cmd and 3) Environment: replace handles first, then experiments,
    # anywhere in the text. Missing values are left as they are.
    for col in (cmd_col, env_col):
        if col in row.index and pd.notna(row[col]):
            text = _s(row[col])
            text = greedy_replace(text, user_handle_map, pat_user_anywhere)
            out[col] = greedy_replace(text, experiment_map, pat_exp)

    return out


#### Hide sensitive contents in txt output ####

In [56]:
def write_cmd_env_report(
    df: pd.DataFrame,
    out_path: str | Path,
    *,
    group_by: str | None = None,   
    human_wrap: int = HUMAN_WRAP,
    include_meta: bool = True,
    meta_cols: tuple[str, ...] = ("User", "JobsubClientIpAddress",
                                  "CumulativeSlotTime", "DAG_NodesFailed",
                                  "NumJobStarts", "NumJobCompletions"),
    cmd_col: str = "Cmd",
    env_col: str = "Environment",
) -> Path:
    """Write a plain-text report of each job's command and environment.

    Args:
        df: Rows to report, one job per row.
        out_path: Destination file; parent directories are created.
        group_by: Optional column to group jobs by. If the column is absent a
            warning is printed and the report is written ungrouped.
        human_wrap: Wrap width for long values.
        include_meta: Whether to print the ``meta_cols`` values for each job.
        meta_cols: Columns listed in each job's meta block.
        cmd_col: Column holding the command text.
        env_col: Column holding the environment text.

    Returns:
        The path that was written. Columns missing from ``df`` are skipped
        with a printed warning.
    """
    p = Path(out_path)
    p.parent.mkdir(parents=True, exist_ok=True)

    # Report missing columns in declaration order (a set would print them in
    # arbitrary order from run to run).
    wanted = [cmd_col, env_col] + (list(meta_cols) if include_meta else [])
    missing = [c for c in dict.fromkeys(wanted) if c not in df.columns]
    if missing:
        print(f"[write_cmd_env_report] Warning: missing columns {missing}; proceeding with what exists.")

    def format_one(idx, row) -> str:
        parts = []

        # Header line
        header = f"— Job #{idx} —"
        parts.append(header)

        # Meta block (compact)
        if include_meta:
            for c in meta_cols:
                if c in row.index:
                    val = _s(row[c])
                    if c == cmd_col or c == env_col:
                        # don't duplicate
                        continue
                    # keep meta short
                    if len(val) > human_wrap:
                        val = _wrap_block(val, human_wrap)
                    parts.append(f"{c}: {val}")

        # Cmd
        if cmd_col in row.index:
            parts.append("Cmd:")
            parts.append(indent(_wrap_block(row[cmd_col], human_wrap), "  "))

        # Environment
        if env_col in row.index:
            parts.append("Environment:")
            env_pairs = _parse_env(row[env_col])
            parts.append(_format_env_block(env_pairs, width=human_wrap, indent_spaces=2))

        return "\n".join(parts)

    lines_out = []

    title = "Job Command & Environment Report"
    meta_summary = f"Total rows: {len(df)}"
    lines_out += [title, meta_summary, "=" * max(28, len(title)), ""]

    can_group = bool(group_by) and group_by in df.columns
    if group_by and not can_group:
        # Make the fallback visible instead of silently ignoring the request.
        print(f"[write_cmd_env_report] Warning: group_by column {group_by!r} not found; writing ungrouped report.")

    if can_group:
        for gval, gdf in df.groupby(group_by, dropna=False):
            header = f"## {group_by}: {_s(gval)}  (jobs: {len(gdf)})"
            lines_out += [header, "-" * len(header)]
            # job numbering restarts at 1 within each group
            for i, (_, row) in enumerate(gdf.iterrows(), start=1):
                lines_out.append(format_one(i, row))
                lines_out.append("")  # blank line between jobs
            lines_out.append("")      # blank line between groups
    else:
        for i, (_, row) in enumerate(df.iterrows(), start=1):
            lines_out.append(format_one(i, row))
            lines_out.append("")

    txt = "\n".join(lines_out).rstrip() + "\n"
    p.write_text(txt, encoding="utf-8")
    return p

### main writer ###

## Cmd and environment ##

#### Failed User json (Below) ####

In [57]:
def failed_users_payload(
    df: pd.DataFrame,
    user_mapper: GarbleTokenMapper,
    user_col: str = USER_COL,
    starts_col: str = NUM_STARTS_COL,
    completions_col: str = NUM_COMPLETIONS_COL,
) -> Dict[str, object]:
    """Summarize users that have failed jobs, identified by token only.

    A row counts as a failure when its completions value is 0 and its starts
    value is greater than 0. Values that are missing or non-numeric in either
    column do not count as failures.

    Args:
        df: Job rows.
        user_mapper: Mapper supplying user tokens. Users it already knows are
            looked up without changing their counts; unknown users are added.
        user_col, starts_col, completions_col: Column names to read.

    Returns:
        ``{"failed_users": [{"token", "failure_count", "valid"}, ...],
        "meta": {...}}``. If a required column is missing the list is empty
        and the meta section carries an explanatory note.
    """
    if not {user_col, starts_col, completions_col} <= set(df.columns):
        return {
            "failed_users": [],
            "meta": {
                "distinct_failed_users": 0,
                "total_failure_rows": 0,
                "note": "Required columns missing; cannot compute failed users.",
            },
        }

    # to_numeric(errors="coerce") tolerates NaN and stray text, on which
    # astype("int") would raise.
    completions = pd.to_numeric(df[completions_col], errors="coerce")
    starts = pd.to_numeric(df[starts_col], errors="coerce")
    mask_fail = completions.eq(0) & starts.gt(0)
    failed_df = df.loc[mask_fail, [user_col]]

    # count failure rows per user
    # NOTE(review): groupby drops rows whose user is missing, so those rows are
    # not part of total_failure_rows — confirm that is intended.
    fail_counts = failed_df.groupby(user_col)[user_col].count().rename("failure_count")
    total_failure_rows = int(fail_counts.sum()) if not fail_counts.empty else 0

    records = []
    for orig_user, fcount in fail_counts.items():
        key = str(orig_user)
        valid = is_valid_user(orig_user)
        # Look the token up directly: add() would bump the user's count for a
        # known user, inflating the counts gathered while building the mapper.
        known = user_mapper._by_orig.get(key)
        token = known.token if known is not None else user_mapper.add(key, valid=valid)
        records.append({
            "token": token,
            "failure_count": int(fcount),
            "valid": valid,
        })

    return {
        "failed_users": records,
        "meta": {
            "distinct_failed_users": int(len(records)),
            "total_failure_rows": total_failure_rows,
        },
    }

In [58]:
def _canonicalize_site(df: pd.DataFrame, site_col: str, requested: str, case_insensitive: bool):

    if site_col not in df.columns:
        return False, None, f"Missing column: {site_col}"

    series = df[site_col].dropna().astype(str).map(lambda s: s.strip())
    uniques = series.unique().tolist()
    if not uniques:
        return False, None, "No sites found in data."

    req = str(requested).strip()
    if not req:
        return False, None, "Empty site argument."

    # exact match first
    if req in uniques:
        return True, req, "exact"

    # case-insensitive match
    if case_insensitive:
        # collect all ci-matches (could be more than one if data is messy)
        matches = [u for u in uniques if u.casefold() == req.casefold()]
        if len(matches) == 1:
            return True, matches[0], "case-insensitive"
        elif len(matches) > 1:
            # ambiguous: pick first deterministically and note ambiguity
            matches_sorted = sorted(matches)
            return True, matches_sorted[0], f"ambiguous ({len(matches)} ci-matches)"
    return False, None, "not found"

In [59]:
def site_jobs_payload(
    df: pd.DataFrame,
    site_name: str,                       
    *,
    site_col: str = "MATCH_EXP_JOB_Site",
    case_insensitive: bool = True,
    garble: bool = True,
    user_col: str = USER_COL,
    cmd_col: str = "Cmd",
    env_col: str = "Environment",
    user_prefix: str = "UR_",
    exp_prefix: str = "EX_",
) -> Dict[str, object]:
    """Return the jobs that ran at one site, optionally with sensitive text tokenized.

    Args:
        df: Job rows.
        site_name: Requested site; resolved against the values in ``site_col``.
        site_col: Column holding the site name.
        case_insensitive: Allow a case-insensitive site match.
        garble: Tokenize user handles and experiment names in the user, Cmd
            and Environment fields.
        user_col, cmd_col, env_col: Column names to garble.
        user_prefix, exp_prefix: Token prefixes for user handles and
            experiments (the defaults match the previously fixed prefixes).

    Returns:
        ``{"jobs_at_site": [row dicts], "meta": {...}}``. An unresolved site
        yields an empty job list with a note in ``meta``.

    NOTE(review): with ``garble=True`` the meta section includes the
    original -> token maps, which allows the tokens to be reversed — confirm
    this payload is meant for internal use only.
    """
    # Validate & canonicalize
    is_valid, canonical_site, match_note = _canonicalize_site(
        df, site_col=site_col, requested=site_name, case_insensitive=case_insensitive
    )

    meta_common = {
        "requested_site": str(site_name),
        "canonical_site": canonical_site,
        "is_valid_site": bool(is_valid),
        "site_column": site_col,
        "match_note": match_note,
        "garbled": bool(garble),
    }

    if not is_valid:
        return {
            "jobs_at_site": [],
            "meta": {
                **meta_common,
                "total_jobs_at_site": 0,
                "columns_included": [],
                "note": "Requested site is not valid; returning empty result.",
            },
        }

    # Filter rows for the (canonical) site, comparing stripped string values
    series = df[site_col].astype(str).map(lambda s: s.strip())
    mask = series == canonical_site
    df_site = df.loc[mask].copy()

    if not garble:
        return {
            "jobs_at_site": df_site.to_dict(orient="records"),
            "meta": {
                **meta_common,
                "total_jobs_at_site": int(len(df_site)),
                "columns_included": list(df_site.columns),
            },
        }

    # --- Garble using the helpers defined earlier ---
    user_mapper, exp_mapper, user_handle_map, experiment_map = build_sensitive_mappers_for_df(
        df_site, user_col=user_col, env_col=env_col,
        user_prefix=user_prefix, exp_prefix=exp_prefix,
    )
    pat_user_local = compile_greedy_sub_regex(list(user_handle_map.keys()))
    pat_user_anywhere = pat_user_local  # the same pattern serves both roles
    pat_exp = compile_greedy_sub_regex(list(experiment_map.keys()))

    garbled_rows = [
        garble_row_fields(
            row,
            user_col=user_col,
            cmd_col=cmd_col,
            env_col=env_col,
            user_handle_map=user_handle_map,
            experiment_map=experiment_map,
            pat_user=pat_user_local,
            pat_user_anywhere=pat_user_anywhere,
            pat_exp=pat_exp,
        )
        for _, row in df_site.iterrows()
    ]

    return {
        "jobs_at_site": garbled_rows,
        "meta": {
            **meta_common,
            "total_jobs_at_site": int(len(df_site)),
            "columns_included": list(df_site.columns),
            "maps": {
                "user_handles": user_handle_map,   # {original_handle: user token}
                "experiments": experiment_map,     # {original_exp: experiment token}
            },
            "token_prefixes": {"user": user_prefix, "experiment": exp_prefix},
        },
    }

#### sites JSON ####

#### Full output ####

In [60]:
def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser with the notebook's defaults.

    The automatic help option is disabled and ``-h/--help`` is registered
    explicitly at the end.
    """
    parser = argparse.ArgumentParser(add_help=False)  # avoid clashing with ipykernel flags
    add = parser.add_argument

    # IO + basics
    add("--data-dir", default=DATA_DIR, help="Directory containing parquet files")
    add("--output-dir", default=OUTPUT_DIR, help="Directory to write outputs")
    add("--wrap", type=int, default=HUMAN_WRAP, help="Wrap width for text reports")

    # Columns: one plain string option per configurable column name
    column_options = (
        ("--user-col", USER_COL),
        ("--ip-col", IP_COL),
        ("--site-col", "MATCH_EXP_JOB_Site"),
        ("--cmd-col", "Cmd"),
        ("--env-col", "Environment"),
        ("--starts-col", NUM_STARTS_COL),
        ("--completions-col", NUM_COMPLETIONS_COL),
    )
    for flag, default_value in column_options:
        add(flag, default=default_value)

    # Site selection
    add("--site", default="FermiGrid", help="Requested site to export")
    add("--case-insensitive", action="store_true", default=True,
        help="Case-insensitive site match (default ON)")
    add("--case-sensitive", dest="case_insensitive", action="store_false",
        help="Turn OFF case-insensitive site match")
    add("--all-sites", action="store_true", help="Emit one JSON per site")

    # Garbling
    add("--garble", action="store_true", default=True, help="Turn ON garbling (default ON)")
    add("--no-garble", dest="garble", action="store_false", help="Turn OFF garbling")
    add("--user-prefix", default="UR_", help="Prefix for user-handle tokens")
    add("--exp-prefix", default="EX_", help="Prefix for experiment tokens")
    add("--experiments", default="uboone,icarus,pip2,nova,dune",
        help="Comma-separated experiment keywords")

    # Report controls
    add("--report-file", default="cmd_env_report.txt", help="Output TXT report filename")
    add("--report-group-by", default=None, help="Optional column to group report by (e.g., User)")
    add("--include-meta", action="store_true", default=True)
    add("--no-meta", dest="include_meta", action="store_false")

    # Explicit help flag, registered by hand because add_help is off
    add("-h", "--help", action="help", help="Show this help message and exit")
    return parser

def parse_args() -> argparse.Namespace:
    """Parse the process arguments, ignoring any flags the parser does not define."""
    # parse_known_args tolerates unknown flags (like Jupyter's -f)
    known, _unknown = build_arg_parser().parse_known_args()
    return known

def change_filename(s: str) -> str:
    """Turn an arbitrary value into a filename-safe fragment.

    Falsy input is treated as an empty string. Surrounding whitespace is
    removed, every run of characters other than word characters or hyphens
    becomes a single underscore, repeated underscores are collapsed, and
    leading/trailing underscores are dropped. If nothing remains, the
    fallback name "site" is returned.
    """
    if not s:
        return "site"

    cleaned = str(s).strip()
    # Anything that is not a word character or "-" is replaced, one "_" per run.
    cleaned = re.sub(r"[^\w\-]+", "_", cleaned)
    # Underscores already present in the input may now sit next to new ones.
    cleaned = re.sub(r"__+", "_", cleaned)
    cleaned = cleaned.strip("_")

    if cleaned:
        return cleaned
    return "site"

### arguments w/ defaults ###

In [61]:
if __name__ == "__main__":
    # Script entry point: parse arguments, load the data, build the user/IP
    # obfuscation maps, write the JSON payloads and the text report into the
    # output directory, then print a short preview and a job-failure figure.
    args = parse_args()

    # Runtime copies of the configurable values, taken from the parsed
    # arguments so the module-level defaults are left untouched.
    DATA_DIR_RUNTIME = args.data_dir
    OUTPUT_DIR_RUNTIME = args.output_dir
    HUMAN_WRAP_RUNTIME = args.wrap

    USER_COL_RUNTIME = args.user_col
    IP_COL_RUNTIME = args.ip_col
    SITE_COL_RUNTIME = args.site_col
    CMD_COL_RUNTIME = args.cmd_col
    ENV_COL_RUNTIME = args.env_col
    STARTS_COL_RUNTIME = args.starts_col
    COMPLETIONS_COL_RUNTIME = args.completions_col

    # Rebind the module-level experiment set from --experiments. This code runs
    # at module scope, so a plain assignment already targets the global name;
    # a `global` statement here would be a no-op.
    KNOWN_EXPERIMENTS = {x.strip() for x in args.experiments.split(",") if x.strip()}

    # Ensure the output directory exists before anything is written
    os.makedirs(OUTPUT_DIR_RUNTIME, exist_ok=True)

    # Load data
    df = load_dataframe(DATA_DIR_RUNTIME)

    # Build obfuscations
    users_dict, ips_dict, user_mapper, ip_mapper = build_obfuscations(
        df, user_col=USER_COL_RUNTIME, ip_col=IP_COL_RUNTIME
    )

    # Summary payload; the serialised text is only used for the preview below
    summary_obj = make_summary_payload(df, users_dict, ips_dict,
                                       user_col=USER_COL_RUNTIME, ip_col=IP_COL_RUNTIME)
    summary_json = json.dumps(summary_obj, indent=2)

    # Jagged-array payloads, each wrapped with a small "meta" section
    users_jagged = to_jagged_array(users_dict)
    ips_jagged   = to_jagged_array(ips_dict)
    users_jagged_obj = {
        "users": users_jagged,
        "meta": {
            "distinct_users": int(len(users_dict)),
            "total_rows": int(len(df)),
        },
    }
    ips_jagged_obj = {
        "ips": ips_jagged,
        "meta": {
            "distinct_ips": int(len(ips_dict)),
            "total_rows": int(len(df)),
        },
    }

    # Failed users payload
    failed_obj = failed_users_payload(
        df,
        user_mapper=user_mapper,
        user_col=USER_COL_RUNTIME,
        starts_col=STARTS_COL_RUNTIME,
        completions_col=COMPLETIONS_COL_RUNTIME,
    )

    # Write the four standard JSON outputs
    dump_json(summary_obj, os.path.join(OUTPUT_DIR_RUNTIME, "summary.json"))
    dump_json(users_jagged_obj, os.path.join(OUTPUT_DIR_RUNTIME, "users_jagged.json"))
    dump_json(ips_jagged_obj, os.path.join(OUTPUT_DIR_RUNTIME, "ips_jagged.json"))
    dump_json(failed_obj, os.path.join(OUTPUT_DIR_RUNTIME, "failed_users.json"))

    def write_site_payload_for(site_req: str):
        """Build the per-site jobs payload for `site_req` and write it as JSON.

        The output file is named after the payload's canonical site name when
        one is set, otherwise after the requested name.
        """
        payload = site_jobs_payload(
            df,
            site_name=site_req,
            site_col=SITE_COL_RUNTIME,
            case_insensitive=args.case_insensitive,
            garble=args.garble,
            user_col=USER_COL_RUNTIME,
            cmd_col=CMD_COL_RUNTIME,
            env_col=ENV_COL_RUNTIME,
        )
        # Prefer canonical if valid
        name_for_file = payload["meta"]["canonical_site"] or site_req
        out_fname = f"jobs_at_{change_filename(name_for_file)}.json"
        out_path  = os.path.join(OUTPUT_DIR_RUNTIME, out_fname)
        dump_json(payload, out_path)
        print(f"Wrote: {out_path}  (valid={payload['meta']['is_valid_site']}, "
              f"request={site_req}, canonical={payload['meta']['canonical_site']})")

    if args.all_sites and SITE_COL_RUNTIME in df.columns:
        site_names = df[SITE_COL_RUNTIME].dropna().astype(str).str.strip()
        # Drop names that are empty after stripping: an empty string is not a
        # site and would otherwise be exported under the fallback filename.
        sites = site_names[site_names != ""].unique().tolist()
        for s in sorted(sites, key=str.casefold):
            write_site_payload_for(s)
    else:
        write_site_payload_for(args.site)

    # Text output report
    report_path = Path(OUTPUT_DIR_RUNTIME) / args.report_file
    write_cmd_env_report(
        df,
        report_path,
        group_by=args.report_group_by,
        human_wrap=HUMAN_WRAP_RUNTIME,
        include_meta=args.include_meta,
        # NOTE(review): these column names are hard-coded, so the --user-col /
        # --ip-col / --starts-col / --completions-col overrides do not apply
        # here — confirm whether that is intended.
        meta_cols=("User", "JobsubClientIpAddress",
                   "CumulativeSlotTime", "DAG_NodesFailed",
                   "NumJobStarts", "NumJobCompletions"),
        cmd_col=CMD_COL_RUNTIME,
        env_col=ENV_COL_RUNTIME,
    )
    print("Wrote:", report_path)

    # NOTE(review): users_dict / ips_dict are keyed by the original (raw)
    # identifiers, so this preview prints real user names and IP addresses.
    # Clear the cell output before sharing or committing the notebook.
    print("\n=== Small samples ===")
    print("users_dict sample:", json.dumps(dict(list(users_dict.items())[:3]), indent=2))
    print("ips_dict sample  :", json.dumps(dict(list(ips_dict.items())[:3]), indent=2))
    print("\nsummary.json preview:\n", summary_json[:800], "...\n")

    def column_total(col_name: str) -> int:
        """Return the integer sum of a column of `df`; 0 if the column is absent.

        Values are coerced with pd.to_numeric so that missing or non-numeric
        cells count as 0 instead of raising (a plain astype("int") fails on
        NaN), and both totals below are computed the same way.
        """
        if col_name not in df.columns:
            return 0
        numeric = pd.to_numeric(df[col_name], errors="coerce").fillna(0)
        return int(numeric.sum())

    # Overall failure figure: job starts that did not end in a completion
    total_starts = column_total(STARTS_COL_RUNTIME)
    total_completions = column_total(COMPLETIONS_COL_RUNTIME)
    n_job_failures = total_starts - total_completions
    # Guard against division by zero when there are no starts at all
    job_failure_frac = (n_job_failures / total_starts) if total_starts else 0.0
    print(f"Job failure fraction %: {job_failure_frac:.3%}, job failure abs number: {n_job_failures}")


Wrote: ./Output/jobs_at_FermiGrid.json  (valid=True, request=FermiGrid, canonical=FermiGrid)
Wrote: Output/cmd_env_report.txt

=== Small samples ===
users_dict sample: {
  "uboonepro@fnal.gov": {
    "id": "UR1",
    "count": 99239,
    "valid": true
  },
  "icaruspro@fnal.gov": {
    "id": "UR2",
    "count": 47080,
    "valid": true
  },
  "gputnam@fnal.gov": {
    "id": "UR3",
    "count": 12693,
    "valid": true
  }
}
ips_dict sample  : {
  "131.225.240.146": {
    "id": "IP1",
    "count": 86225,
    "valid": true
  },
  "131.225.240.90": {
    "id": "IP2",
    "count": 47080,
    "valid": true
  },
  "131.225.240.140": {
    "id": "IP3",
    "count": 12693,
    "valid": true
  }
}

summary.json preview:
 {
  "users": [
    [
      "UR1",
      99239,
      true
    ],
    [
      "UR2",
      47080,
      true
    ],
    [
      "UR3",
      12693,
      true
    ],
    [
      "UR4",
      3652,
      true
    ],
    [
      "UR5",
      15298,
      true
    ],
    [
      "UR

#### Main ####