In [None]:
import json
import pandas as pd
from pathlib import Path

# Raise the per-cell character limit to 800 so long text values are not
# cut off when DataFrames are previewed in this notebook.
pd.set_option("display.max_colwidth", 800)

### Load the provided dataset containing simulated generative AI outputs

In [None]:
# Location of the JSON Lines dataset, relative to the notebook's working directory.
DATA_PATH = Path("simulated_genai_outputs.jsonl")

# TODO: Load the JSONL file into a list of Python dicts called `records`.
# - Each line is a JSON object
# - Use json.loads(...)
# - Skip blank lines safely
# Placeholder: stays empty until the TODO above is implemented.
records = []
# NOTE(review): later cells reference a DataFrame named `df` (e.g. df.sample(...),
# df["text"]) that no TODO explicitly creates — presumably it should be built
# from `records` here (e.g. pd.DataFrame(records)); confirm with the exercise author.

### Review a sample of the outputs to understand the scenarios and context

In [None]:
# TODO: Display a random sample of 5 rows with these columns:
# - output_id
# - timestamp
# - text
# Hint: df.sample(...)[[...]]

### Establish a baseline leakage assessment

In [None]:
import re

# Regex library of common leakage patterns.
# Keys are category names; values are compiled regexes. Code that iterates this
# dict sees the patterns in the insertion order below.
PATTERNS = {
    "email": re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"),
    "phone_us": re.compile(r"\b(?:\+1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b"),
    "ssn_us": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
    "credit_card_like": re.compile(r"\b(?:\d[ -]*?){13,19}\b"),
    "date_mmddyyyy": re.compile(r"\b(0?[1-9]|1[0-2])[/-](0?[1-9]|[12]\d|3[01])[/-](19|20)\d{2}\b"),
    "employee_id_like": re.compile(r"\bEMP-\d{4,}\b", re.IGNORECASE),
    "case_id_like": re.compile(r"\bCASE-\d{4}-\d{2}-\d{3,}\b", re.IGNORECASE),
    # No leading \b: "#" is a non-word character, so \b in front of it only
    # succeeds when the "#" is glued to a preceding word character
    # ("Order#A1B2C3") and silently misses the usual "Order #A1B2C3".
    "order_id_like": re.compile(r"#[A-Z0-9-]{5,}\b"),
    # The trailing character class also accepts port / query / fragment
    # characters (: ? = & % # ~) so "host:8443/path?x=1" is matched as a single
    # URL instead of leaving the tail behind after redaction.
    "internal_url": re.compile(r"\bhttps?://[A-Za-z0-9.-]*\b(?:internal|corp|local)\b[A-Za-z0-9./+_:?=&%#~-]*\b", re.IGNORECASE),
    "hostname_internal": re.compile(r"\b[A-Za-z0-9-]+\.(?:internal|local|corp)\.[A-Za-z]{2,}\b", re.IGNORECASE),
    "api_key_like": re.compile(r"\b(?:sk-[A-Za-z0-9]{8,}|AKIA[0-9A-Z]{16})\b"),
    # No trailing \b: it forced the match to back off any trailing punctuation,
    # so "password: hunter2!" left the "!" unredacted and a symbol-only secret
    # ("password: !!!") was not detected at all.
    "password_like": re.compile(r"\b(?:temp(?:orary)?\s*)?password\s*:\s*\S+", re.IGNORECASE),
}

# TODO: Implement a function `detect_signals(text, patterns)` that:
# - Iterates over pattern dict items
# - Counts matches for each pattern (one result key per pattern name)
# - Adds a "total_signals" field equal to the sum of all category counts
# - Returns the resulting dict (later cells expand it into DataFrame columns)
#
# def detect_signals(text: str, patterns: dict) -> dict:
#     ...


# TODO Apply detect_signals to df["text"] and expand into columns
# - signals = df["text"].apply(...).apply(pd.Series)
# - df_base = pd.concat([...], axis=1)
# df_base = ...


# TODO Display a baseline preview table with:
# ["output_id", "total_signals"] + list(PATTERNS.keys())
# Hint: df_base[[...]].head()


# ------------------------------------------------------------
# Create a baseline summary table
# ------------------------------------------------------------
# TODO Build `summary_base` that:
# - keeps output_id, timestamp, total_signals, and each pattern column
# - sorts by total_signals descending, then output_id ascending
# - resets index
# summary_base = ...


# TODO: Display summary_base

### Document your baseline findings

In [None]:
# Minimum number of detected signals for an output to be counted as risky.
RISK_THRESHOLD = 1

# Split the baseline summary by signal count.
risky = summary_base.loc[summary_base["total_signals"].ge(RISK_THRESHOLD)]
clean = summary_base.loc[summary_base["total_signals"].eq(0)]

print(
    f"Total outputs: {len(summary_base)}",
    f"Risky outputs (>= {RISK_THRESHOLD} signal): {len(risky)}",
    f"Clean outputs (0 signals): {len(clean)}\n",
    sep="\n",
)

# Per-category totals across all outputs, most frequent category first.
category_totals = summary_base[list(PATTERNS)].sum().sort_values(ascending=False)
category_totals

### Apply lightweight privacy defenses to reduce leakage risk

In [None]:
# Rule-based redaction: placeholder token substituted for each pattern category.
REDACTIONS = dict(
    email="[REDACTED_EMAIL]",
    phone_us="[REDACTED_PHONE]",
    ssn_us="[REDACTED_SSN]",
    credit_card_like="[REDACTED_CARD]",
    date_mmddyyyy="[REDACTED_DATE]",
    employee_id_like="[REDACTED_EMPLOYEE_ID]",
    case_id_like="[REDACTED_CASE_ID]",
    order_id_like="[REDACTED_ORDER_ID]",
    internal_url="[REDACTED_INTERNAL_URL]",
    hostname_internal="[REDACTED_INTERNAL_HOST]",
    api_key_like="[REDACTED_SECRET]",
    password_like="[REDACTED_PASSWORD]",
)

def redact_text(text: str, patterns: dict, replacements: dict) -> str:
    """Replace every match of each pattern in ``text`` with its redaction token.

    Args:
        text: The text to redact. Non-string values (``None``, or the float
            ``NaN`` that pandas uses for missing cells) are treated as empty.
        patterns: Mapping of category name -> compiled regex.
        replacements: Mapping of category name -> replacement token. Categories
            without an entry fall back to ``"[REDACTED]"``.

    Returns:
        The redacted string; ``""`` when there is no text to redact.
    """
    # NaN is a truthy float, so a plain `text or ""` guard would hand it to
    # re.sub and raise TypeError. Treat anything that is not a string as empty.
    if not isinstance(text, str):
        return ""

    redacted = text
    # Patterns are applied in the dict's insertion order, each on the output of
    # the previous one; callers that need specific patterns to win over broader
    # ones must order `patterns` accordingly.
    for name, rx in patterns.items():
        token = replacements.get(name, "[REDACTED]")
        redacted = rx.sub(token, redacted)
    return redacted

# Work on a copy so the baseline frame keeps its original columns.
df_mitigated = df_base.copy()
df_mitigated["text_mitigated"] = df_mitigated["text"].map(
    lambda raw: redact_text(raw, PATTERNS, REDACTIONS)
)
# Side-by-side preview of the original and redacted text.
df_mitigated.loc[:, ["output_id", "text", "text_mitigated"]].head(3)


### Re-evaluate the same outputs after defenses are applied

In [None]:
# Run the same detector on the redacted text and expand each result dict into columns.
after_signals = (
    df_mitigated["text_mitigated"]
    .apply(lambda redacted: detect_signals(redacted, PATTERNS))
    .apply(pd.Series)
)
# Prefix the new counts with "after_" so they can sit next to the baseline columns.
df_after = pd.concat([df_mitigated, after_signals.add_prefix("after_")], axis=1)

compare_cols = [
    "output_id",
    "total_signals",
    "after_total_signals",
    *PATTERNS,
    *(f"after_{k}" for k in PATTERNS),
]

comparison = df_after[compare_cols].copy()
# Negative delta means fewer signals after mitigation.
comparison["delta_total"] = comparison["after_total_signals"].sub(comparison["total_signals"])

comparison.sort_values(by=["delta_total", "output_id"]).head(10)

In [None]:
# Overall signal counts before and after mitigation.
before_total = int(df_after["total_signals"].sum())
after_total = int(df_after["after_total_signals"].sum())

print(
    f"Total signals BEFORE: {before_total}",
    f"Total signals AFTER:  {after_total}",
    f"Net change (AFTER - BEFORE): {after_total - before_total}",
    sep="\n",
)

# Per-category totals; the "after_" prefix is stripped so both series share an index.
by_category = pd.DataFrame({
    "before": df_after[list(PATTERNS)].sum(),
    "after": (
        df_after[[f"after_{k}" for k in PATTERNS]]
        .sum()
        .rename(lambda s: s.replace("after_", ""))
    ),
}).assign(delta=lambda tbl: tbl["after"] - tbl["before"])
by_category.sort_values(by="delta")

### Assess defense effectiveness and document results

In [None]:
def classify_effectiveness(before: int, after: int) -> str:
    """Label how well a defense worked from signal counts before and after it.

    Args:
        before: Number of signals detected before mitigation.
        after: Number of signals detected after mitigation.

    Returns:
        One of "N/A (no signals)", "Effective", "Partially effective",
        "Ineffective", or "Regression".
    """
    if before > 0:
        if after == 0:
            return "Effective"
        if 0 < after < before:
            return "Partially effective"
        if after >= before:
            return "Ineffective"
        # Only reachable with a negative `after`; falls through to the catch-all label.
        return "Regression"

    if before == 0 and after == 0:
        return "N/A (no signals)"

    # Signals appeared where there were none before (not expected, but guarded).
    return "Regression"

# One row per pattern category with its before/after totals and verdict.
effectiveness = []
for k in PATTERNS:
    b, a = (int(by_category.loc[k, col]) for col in ("before", "after"))
    effectiveness.append(
        dict(
            category=k,
            before=b,
            after=a,
            effectiveness=classify_effectiveness(b, a),
        )
    )

effectiveness_df = pd.DataFrame(effectiveness).sort_values(by=["effectiveness", "category"])
effectiveness_df

In [None]:
# Print the three outputs with the most baseline signals, original vs. redacted.
top = df_after.sort_values(by="total_signals", ascending=False).head(3)

for _, row in top.iterrows():
    print(f"\n{'=' * 100}")
    print(f"OUTPUT_ID: {row['output_id']} | timestamp: {row['timestamp']} | baseline signals: {int(row['total_signals'])}")
    print("-" * 100)
    # sep="\n" reproduces the line break that separate print calls would emit.
    print("BEFORE:\n", row["text"], sep="\n")
    print("\nAFTER (mitigated):\n", row["text_mitigated"], sep="\n")

### Design a structured incident response protocol and write-up 

In [None]:
# ------------------------------------------------------------
# Incident response protocol for GenAI leakage
# ------------------------------------------------------------
# TODO: Create a structured incident response protocol.
# Represent it as a Python dict named INCIDENT_RESPONSE_PROTOCOL that includes:
# - detection_and_escalation (signals, severity tiers, escalation criteria, owners)
# - containment (immediate actions, short-term actions)
# - notification_and_documentation (notify list, record list)
# - post_incident_review (root cause, remediation, follow-up)
#
# INCIDENT_RESPONSE_PROTOCOL = { ... }


In [None]:
# ------------------------------------------------------------
# Instruction Step 10: Security-focused write-up
# ------------------------------------------------------------
# TODO: Create 4-6 bullet points for:
# - technical controls you implemented
# - procedural safeguards you designed
#
# Then print a markdown-friendly summary that can be pasted into a markdown cell.
#
# Hint: Build lists, then join them into a formatted string.