## Perlmutter Anonymization Code ##

#### Anonymizes these fields: User, Account ####

#### Imports ####

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Anonymize CSV job data.

- Reads input files
- Writes anonymized copies
- Replaces values with sequential tokens:
    User    -> UR_1, UR_2, ...
    Account -> AC_1, AC_2, ...
  Global mapping across ALL processed files.
- Writes JSON mapping for tracing.
"""

import os
import json
from pathlib import Path
from dataclasses import dataclass, asdict
from typing import Dict, Optional

import pandas as pd

#### Token Mapper ####

In [2]:

@dataclass
class TokenRecord:
    """Bookkeeping entry for one original value that has been given a token."""
    # Anonymized replacement string issued for the original value.
    token: str
    # How many times the original value has been passed to the mapper.
    count: int
    # Flag supplied by the caller when the value was first registered.
    valid: bool


class GarbleTokenMapper:
    """Assign stable, sequential replacement tokens to original strings.

    Each distinct (whitespace-stripped) original value receives a token of
    the form ``<prefix><n>``, where ``n`` starts at ``start`` and grows by
    one for every new value. Repeated values get the same token back and
    have their usage count incremented.
    """

    def __init__(self, prefix: str = "UR_", start: int = 1):
        """Create an empty mapper.

        Args:
            prefix: Text placed in front of the running number.
            start: Number used for the first token issued.
        """
        self.prefix = str(prefix)
        self.start = int(start)
        # The counter stores the last number handed out, so it begins one
        # below ``start``; ``_next_token`` increments before formatting.
        self._counter = self.start - 1
        self._by_orig: Dict[str, TokenRecord] = {}
        self._token_to_orig: Dict[str, str] = {}

    def _next_token(self) -> str:
        """Advance the counter and return the freshly minted token."""
        self._counter = self._counter + 1
        return self.prefix + str(self._counter)

    def add(self, original: str, *, valid: bool = True) -> str:
        """Return the token for ``original``, registering it if unseen.

        ``None`` and values that are empty after stripping are returned
        unchanged and are not recorded.

        Args:
            original: Value to anonymize; it is converted with ``str`` and
                stripped before lookup.
            valid: Flag stored on the record when the value is first seen;
                it is not updated on later calls.

        Returns:
            The token for the value, or ``original`` itself when it is
            ``None`` or blank.
        """
        if original is None:
            return original

        normalized = str(original).strip()
        if normalized == "":
            return original

        known = self._by_orig.get(normalized)
        if known is not None:
            known.count += 1
            return known.token

        token = self._next_token()
        self._by_orig[normalized] = TokenRecord(
            token=token, count=1, valid=bool(valid)
        )
        self._token_to_orig[token] = normalized
        return token

    def to_jsonable(self) -> dict:
        """Return the mapper state as plain dicts and lists.

        Each entry holds the record fields plus an ``"original"`` key with
        the value the token stands for.
        """
        entries = [
            {**asdict(record), "original": original}
            for original, record in self._by_orig.items()
        ]
        return {
            "prefix": self.prefix,
            "start": self.start,
            "counter": self._counter,
            "entries": entries,
        }



#### Core Processing ####

In [3]:

def _anonymize_column(values, mapper):
    """Return a list with each non-missing, non-blank value replaced by its token."""
    replaced = []
    for value in values:
        # Missing cells (NaN / None / pd.NA) must stay missing. Passing them
        # through str() would yield the literal "nan"/"None", which would
        # then be tokenized as if it were a real identity.
        if pd.isna(value):
            replaced.append(value)
            continue
        text = str(value).strip()
        # Blank strings are kept (stripped) and never reach the mapper.
        replaced.append(mapper.add(text, valid=True) if text else text)
    return replaced


def anonymize_dataframe(df: pd.DataFrame,
                        user_mapper: GarbleTokenMapper,
                        account_mapper: GarbleTokenMapper) -> pd.DataFrame:
    """Return a copy of ``df`` with ``User`` and ``Account`` values tokenized.

    The input frame is not modified. Columns that are absent are skipped.
    Missing cells are left as-is, and cells that are empty after stripping
    become the empty string; neither is registered with a mapper.

    Args:
        df: Frame to anonymize.
        user_mapper: Mapper supplying tokens for the ``User`` column.
        account_mapper: Mapper supplying tokens for the ``Account`` column.

    Returns:
        A new DataFrame with the two columns replaced where present.
    """
    out_df = df.copy()

    for column, mapper in (("User", user_mapper), ("Account", account_mapper)):
        if column in out_df.columns:
            out_df[column] = _anonymize_column(out_df[column], mapper)

    return out_df


def process_file(input_path: Path,
                 output_path: Path,
                 user_mapper: GarbleTokenMapper,
                 account_mapper: GarbleTokenMapper) -> None:
    """Anonymize one CSV file and write the result to ``output_path``.

    Args:
        input_path: CSV file to read.
        output_path: Destination file; its parent directory is created if
            it does not exist.
        user_mapper: Mapper used for the ``User`` column.
        account_mapper: Mapper used for the ``Account`` column.
    """
    # These two columns are always read as text rather than type-inferred.
    text_columns = {"Reservation": str, "FailedNode": str}
    raw = pd.read_csv(input_path, dtype=text_columns, low_memory=False)

    anonymized = anonymize_dataframe(raw, user_mapper, account_mapper)

    destination_dir = output_path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    # The row index is not written to the output file.
    anonymized.to_csv(output_path, index=False)



def build_mapping_json(user_mapper: GarbleTokenMapper,
                       account_mapper: GarbleTokenMapper) -> dict:
    """Bundle both mappers' state into one JSON-serializable dict.

    Returns:
        A dict with ``"users"`` and ``"accounts"`` (each mapper's
        ``to_jsonable()`` output) and a ``"meta"`` section holding a
        human-readable description.
    """
    description = "Mapping from original User/Account to anonymized names (UR_n / AC_n)"

    mapping = {}
    mapping["users"] = user_mapper.to_jsonable()
    mapping["accounts"] = account_mapper.to_jsonable()
    mapping["meta"] = {"description": description}
    return mapping


#### Directory ####

In [4]:
def main() -> None:
    """Anonymize every CSV/TXT file under the input root and write the mapping.

    The input root is ``<script_dir>/../perlmutter_data``. Each immediate
    sub-directory is scanned (non-recursively) for ``.csv`` / ``.txt``
    files, which are anonymized into the same relative location under
    ``<script_dir>/PelmutterOutput``. One pair of mappers is shared across
    all files, so tokens are consistent globally. The mapping is finally
    written as JSON into the output root.

    Raises:
        FileNotFoundError: If the input root does not exist. This is
            checked before any output directory is created.
    """
    try:
        script_dir = Path(__file__).resolve().parent
    except NameError:
        # __file__ is not defined in interactive / notebook sessions.
        script_dir = Path.cwd()

    input_root = (script_dir.parent / "perlmutter_data").resolve()

    # Fail fast: validate the input before touching the filesystem so a
    # missing input root does not leave an empty output tree behind.
    if not input_root.exists():
        raise FileNotFoundError(f"Input root does not exist: {input_root}")

    # NOTE(review): "PelmutterOutput" looks like a misspelling of
    # "Perlmutter"; kept unchanged because it is a runtime path that other
    # tooling may already depend on.
    output_root = script_dir / "PelmutterOutput"
    output_root.mkdir(exist_ok=True, parents=True)

    user_mapper = GarbleTokenMapper(prefix="UR_", start=1)
    account_mapper = GarbleTokenMapper(prefix="AC_", start=1)

    # Sorted traversal keeps token numbering reproducible between runs.
    for year_dir in sorted(input_root.iterdir()):
        if not year_dir.is_dir():
            continue

        rel_year = year_dir.relative_to(input_root)
        out_year_dir = output_root / rel_year
        out_year_dir.mkdir(parents=True, exist_ok=True)

        for in_file in sorted(year_dir.iterdir()):
            if not in_file.is_file():
                continue
            if in_file.suffix.lower() not in {".csv", ".txt"}:
                continue

            rel_path = in_file.relative_to(input_root)
            out_file = output_root / rel_path

            process_file(in_file, out_file, user_mapper, account_mapper)

    # NOTE(review): this file contains the original User/Account values and
    # is written next to the anonymized data; confirm it is excluded before
    # the output directory is shared.
    mapping = build_mapping_json(user_mapper, account_mapper)
    mapping_path = output_root / "perlmutter_user_account_map.json"
    with mapping_path.open("w", encoding="utf-8") as f:
        json.dump(mapping, f, indent=2)

    print(f"Anonymized files written under: {output_root}")
    print(f"Mapping written to: {mapping_path}")
    print("Script directory:", script_dir)
    print("Expected input root:", input_root)

if __name__ == "__main__":
    main()


Anonymized files written under: /home/nathan/git/FNAL-BatchQueues/notebooks/PelmutterOutput
Mapping written to: /home/nathan/git/FNAL-BatchQueues/notebooks/PelmutterOutput/perlmutter_user_account_map.json
Script directory: /home/nathan/git/FNAL-BatchQueues/notebooks
Expected input root: /home/nathan/git/FNAL-BatchQueues/perlmutter_data
