In [24]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
from pathlib import Path

from Library import anon_fnal as fnal # importing my stuff!
# Resolve the script's file name to an absolute path and print it as a quick
# sanity check of where the notebook is running. Change the name below if your
# copy of the script is called something else.
file_path = Path("CodeAttemptFNAL_SeqVer.py").resolve()
print(file_path)

/home/nathan/git/FNAL-BatchQueues/notebooks/CodeAttemptFNAL_SeqVer.py


In [25]:
from Library import anon_fnal
import importlib
# Re-execute the library module and rebind `fnal` to the reloaded module object.
# Presumably here so that edits to anon_fnal are picked up without restarting
# the kernel.
fnal = importlib.reload(anon_fnal)

##### Note: The code is split into two parts. The first part gathers the data and puts it into a usable form; the second part transforms it. Also, each section header appears below the code it describes. #####

## Methods ##


(Garble Methods)



add — returns a stable token for an original and increments its count.

original_from_token — reverse-maps a token back to the original string.

record_from_token — fetches the UserRecord associated with a token.

export_to_json — saves the mapper’s current state to disk.

load_from_json — restores the mapper’s state from a saved JSON file.




(Helper Methods)
is_valid_user — checks that a user value is non-null and non-empty.

is_valid_ipv4 — validates IPv4 dotted-quad format and range.

to_jagged_array — builds [[original, token, count, valid], ...] for non-anonymous use.

dump_json — writes a Python object to a pretty-printed JSON file.




(Data Methods)
load_dataframe — selects required columns from all Parquet files in DATA_DIR.

build_obfuscations — iterates rows to create user/IP token maps with counts/validity.

make_summary_payload — returns anonymized users/IPs jagged arrays plus meta.

failed_users_payload — returns anonymized records for users with failed jobs plus meta.

In [26]:
import sys
import re
import argparse
import json
from typing import Dict, Optional, List, Tuple #typing helper
#transform data
#data I/O


#For readable texts

In [27]:
# Default input and output directories. They are used as the defaults of the
# --data-dir and --output-dir command-line options.
DATA_DIR = "../data"
OUTPUT_DIR = "./Output"


### Import stuff ###

In [28]:
def parse_args(argv=None) -> argparse.Namespace:
    """Parse command-line options using the parser from ``build_arg_parser()``.

    Args:
        argv: Argument strings to parse. When ``None`` the arguments are taken
            from ``sys.argv[1:]``, unless ``sys.argv[0]`` contains
            ``"ipykernel"``; in that case an empty list is parsed, so the
            launcher's own arguments are not handed to this parser and every
            option takes its default.

    Returns:
        The populated ``argparse.Namespace``.
    """
    if argv is None:
        # Guard the lookup: if the host left sys.argv empty, indexing [0] would
        # raise IndexError. A missing program name is treated as "not a kernel".
        program = sys.argv[0] if sys.argv else ""
        argv = [] if "ipykernel" in program else sys.argv[1:]

    return build_arg_parser().parse_args(argv)


In [29]:
def build_arg_parser() -> argparse.ArgumentParser:
    """Create the argument parser holding every option this script accepts.

    The automatic ``-h/--help`` option is switched off (to avoid clashing with
    ipykernel flags) and registered by hand as the last option instead.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(add_help=False)
    add = parser.add_argument

    # Input / output locations
    add("--data-dir", default=DATA_DIR, help="Directory containing parquet files")
    add("--output-dir", default=OUTPUT_DIR, help="Directory to write outputs")

    # Wrap width for the text report; the default comes from the library
    add("--wrap", type=int, default=fnal.HUMAN_WRAP, help="Wrap width for text reports")

    # Column names: library defaults where the library defines one, literals otherwise
    column_defaults = (
        ("--user-col", fnal.USER_COL),
        ("--ip-col", fnal.IP_COL),
        ("--site-col", "MATCH_EXP_JOB_Site"),
        ("--cmd-col", "Cmd"),
        ("--env-col", "Environment"),
        ("--starts-col", fnal.NUM_STARTS_COL),
        ("--completions-col", fnal.NUM_COMPLETIONS_COL),
    )
    for flag, default in column_defaults:
        add(flag, default=default)

    # Site selection; --case-sensitive writes False into the same destination
    # that --case-insensitive defaults to True
    add("--site", default="FermiGrid", help="Requested site to export")
    add("--case-insensitive", action="store_true", default=True,
        help="Case-insensitive site match (default ON)")
    add("--case-sensitive", dest="case_insensitive", action="store_false",
        help="Turn OFF case-insensitive site match")
    add("--all-sites", action="store_true", help="Emit one JSON per site")

    # Garbling; --no-garble writes False into the "garble" destination
    add("--garble", action="store_true", default=True, help="Turn ON garbling (default ON)")
    add("--no-garble", dest="garble", action="store_false", help="Turn OFF garbling")
    add("--user-prefix", default="UR_", help="Prefix for user-handle tokens")
    add("--exp-prefix", default="EX_", help="Prefix for experiment tokens")
    add("--experiments", default="uboone,icarus,pip2,nova,dune",
        help="Comma-separated experiment keywords")

    # Text report controls; --no-meta writes False into "include_meta"
    add("--report-file", default="cmd_env_report.txt", help="Output TXT report filename")
    add("--report-group-by", default=None, help="Optional column to group report by (e.g., User)")
    add("--include-meta", action="store_true", default=True)
    add("--no-meta", dest="include_meta", action="store_false")

    # Manual replacement for the disabled automatic help option
    add("-h", "--help", action="help", help="Show this help message and exit")
    return parser


In [30]:
if __name__ == "__main__":
    # Entry point: parse the options, load the data, write the JSON outputs and
    # the text report into the output directory, then print a short console
    # summary.
    args = parse_args()

    # Runtime overrides
    DATA_DIR_RUNTIME = args.data_dir
    OUTPUT_DIR_RUNTIME = args.output_dir
    HUMAN_WRAP_RUNTIME = args.wrap

    USER_COL_RUNTIME = args.user_col
    IP_COL_RUNTIME = args.ip_col
    SITE_COL_RUNTIME = args.site_col
    CMD_COL_RUNTIME = args.cmd_col
    ENV_COL_RUNTIME = args.env_col
    STARTS_COL_RUNTIME = args.starts_col
    COMPLETIONS_COL_RUNTIME = args.completions_col

    # Override known experiments inside library module: --experiments is split
    # on commas, each name is stripped and empty entries are dropped.
    fnal.KNOWN_EXPERIMENTS = set(
        x.strip() for x in args.experiments.split(",") if x.strip()
    )

    # Ensure output dir exists (must happen before any output is written)
    os.makedirs(OUTPUT_DIR_RUNTIME, exist_ok=True)

    # Load data
    df = fnal.load_dataframe(DATA_DIR_RUNTIME)

    # Dump selected jobs
    selected_out = Path(OUTPUT_DIR_RUNTIME) / "selected_jobs.json"
    fnal.dump_selected_job_fields(df, selected_out)

    # Build obfuscations
    users_dict, ips_dict, user_mapper, ip_mapper = fnal.build_obfuscations(
        df,
        user_col=USER_COL_RUNTIME,
        ip_col=IP_COL_RUNTIME
    )

    # Build summary payload; the serialized text is only used for the console
    # preview further down, the file itself is written via fnal.dump_json.
    summary_obj = fnal.make_summary_payload(
        df,
        users_dict,
        ips_dict,
        user_col=USER_COL_RUNTIME,
        ip_col=IP_COL_RUNTIME
    )
    summary_json = json.dumps(summary_obj, indent=2)

    # Jagged array outputs
    users_jagged = fnal.to_jagged_array(users_dict)
    ips_jagged = fnal.to_jagged_array(ips_dict)

    users_jagged_obj = {
        "users": users_jagged,
        "meta": {
            "distinct_users": len(users_dict),
            "total_rows": len(df),
        },
    }
    ips_jagged_obj = {
        "ips": ips_jagged,
        "meta": {
            "distinct_ips": len(ips_dict),
            "total_rows": len(df),
        },
    }

    # Failed users
    failed_obj = fnal.failed_users_payload(
        df,
        user_mapper=user_mapper,
        user_col=USER_COL_RUNTIME,
        starts_col=STARTS_COL_RUNTIME,
        completions_col=COMPLETIONS_COL_RUNTIME,
    )

    # Write JSON outputs
    fnal.dump_json(summary_obj, os.path.join(OUTPUT_DIR_RUNTIME, "summary.json"))
    fnal.dump_json(users_jagged_obj, os.path.join(OUTPUT_DIR_RUNTIME, "users_jagged.json"))
    fnal.dump_json(ips_jagged_obj, os.path.join(OUTPUT_DIR_RUNTIME, "ips_jagged.json"))
    fnal.dump_json(failed_obj, os.path.join(OUTPUT_DIR_RUNTIME, "failed_users.json"))

    # Helper to make safe filenames
    def change_filename(name: str) -> str:
        """Return ``str(name)`` with each run of characters outside
        ``A-Za-z0-9_.-`` replaced by a single underscore."""
        return re.sub(r"[^A-Za-z0-9_.-]+", "_", str(name))

    # Write individual site payload(s)
    def write_site_payload_for(site_req: str):
        """Build the jobs payload for ``site_req``, write it to
        ``jobs_at_<name>.json`` in the output directory and print the path.

        The file name uses the payload's canonical site name when that is
        truthy and the requested name otherwise.
        """
        payload = fnal.site_jobs_payload(
            df,
            site_name=site_req,
            site_col=SITE_COL_RUNTIME,
            case_insensitive=args.case_insensitive,
            garble=args.garble,
            user_col=USER_COL_RUNTIME,
            cmd_col=CMD_COL_RUNTIME,
            env_col=ENV_COL_RUNTIME,
        )
        name_for_file = payload["meta"]["canonical_site"] or site_req
        out_fname = f"jobs_at_{change_filename(name_for_file)}.json"
        out_path = os.path.join(OUTPUT_DIR_RUNTIME, out_fname)
        fnal.dump_json(payload, out_path)

        print(
            f"Wrote: {out_path}  "
            f"(valid={payload['meta']['is_valid_site']}, "
            f"request={site_req}, canonical={payload['meta']['canonical_site']})"
        )

    # Emit site payloads
    site_col_present = SITE_COL_RUNTIME in df.columns
    if args.all_sites and not site_col_present:
        # Without this message the request for all sites would be dropped
        # silently and only the single --site export below would run.
        print(
            f"Warning: --all-sites requested but column {SITE_COL_RUNTIME!r} "
            f"is not in the data; exporting only --site={args.site}",
            file=sys.stderr,
        )

    if args.all_sites and site_col_present:
        # Distinct non-null site names, compared after stripping whitespace
        sites = (
            df[SITE_COL_RUNTIME]
            .dropna()
            .astype(str)
            .map(str.strip)
            .unique()
            .tolist()
        )
        # Case-insensitive ordering so the files are written in a stable order
        for s in sorted(sites, key=str.casefold):
            write_site_payload_for(s)
    else:
        write_site_payload_for(args.site)

    # Write text report
    # NOTE(review): meta_cols is hard-coded and does not follow --user-col,
    # --ip-col, --starts-col or --completions-col; confirm whether the report
    # should use the runtime column names instead.
    report_path = Path(OUTPUT_DIR_RUNTIME) / args.report_file
    fnal.write_cmd_env_report(
        df,
        report_path,
        group_by=args.report_group_by,
        human_wrap=HUMAN_WRAP_RUNTIME,
        include_meta=args.include_meta,
        meta_cols=(
            "User",
            "JobsubClientIpAddress",
            "CumulativeSlotTime",
            "DAG_NodesFailed",
            "NumJobStarts",
            "NumJobCompletions",
        ),
        cmd_col=CMD_COL_RUNTIME,
        env_col=ENV_COL_RUNTIME,
    )
    print("Wrote:", report_path)

    # Console samples and stats
    # NOTE(review): these samples are keyed by the original user and IP values,
    # so the un-garbled identities end up in the console / saved notebook
    # output; confirm this is intended before sharing the output.
    print("\n=== Small samples ===")
    print("users_dict sample:", json.dumps(dict(list(users_dict.items())[:3]), indent=2))
    print("ips_dict sample:", json.dumps(dict(list(ips_dict.items())[:3]), indent=2))

    print("\nsummary.json preview:\n", summary_json[:800], "...\n")

    # Totals fall back to 0 when the column is missing.
    # NOTE(review): only the completions column is cast with astype("int");
    # confirm whether the starts column needs the same cast.
    total_starts = int(df[STARTS_COL_RUNTIME].sum()) if STARTS_COL_RUNTIME in df else 0
    total_completions = (
        int(df[COMPLETIONS_COL_RUNTIME].astype("int").sum())
        if COMPLETIONS_COL_RUNTIME in df
        else 0
    )
    # Failures are counted as starts that did not end in a completion; the
    # value is not clamped, so it is negative if completions exceed starts.
    n_job_failures = total_starts - total_completions
    # Guard against division by zero when there were no starts at all
    job_failure_frac = (n_job_failures / total_starts) if total_starts else 0.0

    print(
        f"Job failure fraction %: {job_failure_frac:.3%}, "
        f"job failure abs number: {n_job_failures}"
    )


Wrote: ./Output/jobs_at_FermiGrid.json  (valid=True, request=FermiGrid, canonical=FermiGrid)
Wrote: Output/cmd_env_report.txt

=== Small samples ===
users_dict sample: {
  "uboonepro@fnal.gov": {
    "id": "UR1",
    "count": 86007,
    "valid": true
  },
  "icaruspro@fnal.gov": {
    "id": "UR2",
    "count": 40223,
    "valid": true
  },
  "gputnam@fnal.gov": {
    "id": "UR3",
    "count": 9852,
    "valid": true
  }
}
ips_dict sample: {
  "131.225.240.146": {
    "id": "IP1",
    "count": 74541,
    "valid": true
  },
  "131.225.240.90": {
    "id": "IP2",
    "count": 40223,
    "valid": true
  },
  "131.225.240.140": {
    "id": "IP3",
    "count": 9852,
    "valid": true
  }
}

summary.json preview:
 {
  "users": [
    [
      "UR1",
      86007,
      true
    ],
    [
      "UR2",
      40223,
      true
    ],
    [
      "UR3",
      9852,
      true
    ],
    [
      "UR4",
      3034,
      true
    ],
    [
      "UR5",
      10432,
      true
    ],
    [
      "UR6",
 

#### Main ####