In [9]:
import hashlib
import json 
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
import os

In [8]:
# Source JSON read by anonymeze_date().
consolidated_file = "ticket_data/consolidated.json"
# Destination JSON written by anonymeze_date() after hashing.
anonymezed_file = "ticket_data/consolidated_anon.json"
# JSON file holding the raw-value -> hash mapping kept between runs.
hash_map_file = "ticket_data/hash_map.json"

# Columns whose non-null values are replaced by their salted hash.
sensitive_cols = [
    "customer.document",
    "customer.email",
    "customer.cellPhoneNumber",
]

In [10]:
# Load variables from a .env file (if any) into the process environment,
# then read the salt that is mixed into every hash.
load_dotenv()
SALT = os.environ.get("SALT")

# Fail fast: hashing without a salt (unset or empty) must never happen.
if SALT is None or SALT == "":
    raise ValueError("The SALT variable was not found")

In [11]:
def generate_hash(value: str, salt:str) -> str:
    """Return the salted SHA-256 hex digest of ``value``.

    Args:
        value: Value to hash; it is converted with ``str()`` before hashing.
        salt: Secret string appended to the value before hashing.

    Returns:
        The 64-character hexadecimal digest of ``str(value) + salt`` encoded
        as UTF-8, or ``None`` when ``value`` is missing (``None``/NaN).
    """
    # Missing values are passed through as None instead of being hashed.
    if value is None or pd.isna(value):
        return None

    hasher = hashlib.sha256()
    hasher.update((str(value) + salt).encode("utf-8"))
    return hasher.hexdigest()

In [12]:
def load_hash_map():
    """Load the persisted hash map, or start a new one.

    Returns:
        The object decoded from the JSON document stored at
        ``hash_map_file``, or an empty dict when that path does not exist.
    """
    # First run: nothing persisted yet, so start from an empty mapping.
    if not os.path.exists(hash_map_file):
        return {}

    with open(hash_map_file, "r", encoding="utf-8") as stored:
        return json.loads(stored.read())

In [13]:
def save_hash_map(hash_map: dict):
    """Write ``hash_map`` to ``hash_map_file`` as UTF-8 JSON.

    The file is overwritten. Output is indented by two spaces and non-ASCII
    characters are written as-is rather than escaped.

    Args:
        hash_map: Mapping to serialize.
    """
    dump_options = {"indent": 2, "ensure_ascii": False}
    with open(hash_map_file, mode="w", encoding="utf-8") as target:
        json.dump(hash_map, target, **dump_options)

In [16]:
def anonymeze_date():
    """Replace the sensitive columns of the consolidated file with hashes.

    Reads ``consolidated_file``, replaces every non-null value of the columns
    listed in ``sensitive_cols`` with ``generate_hash(str(value), SALT)``,
    writes the resulting records to ``anonymezed_file`` and persists the
    updated raw-value -> hash mapping through ``save_hash_map``. Values that
    already appear in the loaded hash map reuse their stored hash. Null
    values are kept as they are.

    A warning is printed for every configured sensitive column that is not
    present in the data, because such a column is left untouched.

    NOTE(review): the hash map is keyed by the ORIGINAL values, so the file at
    ``hash_map_file`` contains the raw sensitive data in plain text and must
    be protected accordingly.
    """
    sensitive_columns = sensitive_cols

    # Keep the sensitive columns exactly as parsed. With the default dtype
    # inference pandas converts numeric-looking strings to numbers, so an
    # identifier such as "00123" would be hashed as "123" (or "123.0" when
    # the column also holds nulls). An empty mapping would switch inference
    # off for every column, hence the fallback to True (the default).
    raw_dtypes = {col: object for col in sensitive_columns}
    df = pd.read_json(consolidated_file, dtype=raw_dtypes or True)

    hash_map = load_hash_map()

    def pseudonymize(val):
        """Return the hash for ``val``, registering it in ``hash_map`` if new."""
        if val is None or pd.isna(val):
            return val  # keep nulls untouched
        key = str(val)
        if key not in hash_map:
            hash_map[key] = generate_hash(key, SALT)
        return hash_map[key]

    missing_columns = []
    for col in sensitive_columns:
        if col not in df.columns:
            missing_columns.append(col)
            continue
        df[col] = [pseudonymize(val) for val in df[col]]

    # A missing column usually means the input layout is not the expected
    # one; report it instead of silently producing a non-anonymized file.
    if missing_columns:
        print(f"Warning: sensitive columns not found, nothing hashed for: {missing_columns}")

    # Save the anonymized file
    df.to_json(anonymezed_file, orient="records", indent=2, force_ascii=False)

    # Update the hash map
    save_hash_map(hash_map)

    print(f"File anonymized save in: {anonymezed_file}")
    print(f"hash map updated in: {hash_map_file}")

In [None]:
# Entry point: run the anonymization when this code executes as the main
# module (which is the case inside a notebook kernel).
if __name__ == "__main__":
    anonymeze_date()

File anonymized save in: ticket_data/consolidated_anon.json
hash map updated in: ticket_data/hash_map.json
