In [8]:
import pandas as pd
pd.set_option("display.max_colwidth", 800)

In [9]:
# -----------------------------------
# Scenario: News summarization
# -----------------------------------
scenario = {
    "use_case": "News summarization",
    "industry": "Media / Information Services",
    "system_description": (
        "A generative AI system designed to produce concise summaries of news articles "
        "for internal briefings and limited external sharing."
    ),
    "key_risk": "Copyright, licensing, and data provenance across mixed data sources"
}

pd.DataFrame([scenario])

Unnamed: 0,use_case,industry,system_description,key_risk
0,News summarization,Media / Information Services,A generative AI system designed to produce concise summaries of news articles for internal briefings and limited external sharing.,"Copyright, licensing, and data provenance across mixed data sources"


In [10]:
# -----------------------------------
# Load the dataset manifest from CSV
# -----------------------------------
manifest = pd.read_csv("training_data_manifest.csv")

manifest

Unnamed: 0,source_id,source_name,source_type,owner_or_provider,license_type,usage_rights,redistribution_allowed,allowed_actions,provenance_confidence,contains_pii,notes
0,SRC-001,Licensed News Feed A,licensed_feed,Vendor A,commercial_license,summarize_internal_only,False,summarize; no_quotes; no_fulltext,high,unlikely,"Contract allows internal summaries, prohibits redistribution and verbatim excerpts."
1,SRC-002,Company Knowledge Base (internal),internal_docs,Client,owned,internal_use,True,summarize; quote_with_attribution,high,possible,Internal policies may include names or emails; requires PII scanning.
2,SRC-003,Public Web Articles (scraped),web_scrape,Various,unknown,unknown,unknown,unknown,low,possible,High risk source with unclear rights and potential takedown risk.
3,SRC-004,Archived News Dataset (legacy),legacy_dataset,Unknown,missing,missing,unknown,unknown,low,unlikely,Legacy dataset without documentation; origin and rights unclear.
4,SRC-005,Creative Commons Blog Posts (CC BY),open_license,Various,CC BY,reuse_with_attribution,True,summarize; quote_with_attribution,medium,possible,Attribution required; verify each item is legitimately CC BY.


In [11]:
# -----------------------------------
# What reviewers look at first
# -----------------------------------
# These are the columns reviewers tend to scan immediately
review_view = manifest[
    [
        "source_id",
        "source_name",
        "source_type",
        "license_type",
        "usage_rights",
        "redistribution_allowed",
        "provenance_confidence",
        "contains_pii"
    ]
]

review_view

Unnamed: 0,source_id,source_name,source_type,license_type,usage_rights,redistribution_allowed,provenance_confidence,contains_pii
0,SRC-001,Licensed News Feed A,licensed_feed,commercial_license,summarize_internal_only,False,high,unlikely
1,SRC-002,Company Knowledge Base (internal),internal_docs,owned,internal_use,True,high,possible
2,SRC-003,Public Web Articles (scraped),web_scrape,unknown,unknown,unknown,low,possible
3,SRC-004,Archived News Dataset (legacy),legacy_dataset,missing,missing,unknown,low,unlikely
4,SRC-005,Creative Commons Blog Posts (CC BY),open_license,CC BY,reuse_with_attribution,True,medium,possible


In [12]:
# -----------------------------------
# Flag common licensing and lineage risks
# -----------------------------------
def flag_risks(df):
    df = df.copy()

    def is_unknown(value):
        return str(value).lower() in ["unknown", "missing", "", "nan"]

    df["risk_license_unclear"] = (
        df["license_type"].apply(is_unknown)
        | df["usage_rights"].apply(is_unknown)
    )

    df["risk_low_provenance"] = df["provenance_confidence"].str.lower().eq("low")

    # This system supports limited external sharing
    intended_external_use = True
    df["risk_redistribution_conflict"] = df["redistribution_allowed"].apply(
        lambda x: x is False and intended_external_use
    )

    df["risk_possible_pii"] = df["contains_pii"].str.lower().eq("possible")

    def overall_risk(row):
        hits = sum(
            [
                row["risk_license_unclear"],
                row["risk_low_provenance"],
                row["risk_redistribution_conflict"],
                row["risk_possible_pii"],
            ]
        )
        if hits >= 3:
            return "high"
        elif hits == 2:
            return "medium"
        elif hits == 1:
            return "low"
        return "none"

    df["overall_risk"] = df.apply(overall_risk, axis=1)
    return df

manifest_with_risk = flag_risks(manifest)

manifest_with_risk[
    [
        "source_id",
        "source_name",
        "overall_risk",
        "risk_license_unclear",
        "risk_low_provenance",
        "risk_redistribution_conflict",
        "risk_possible_pii",
    ]
].sort_values(by="overall_risk", ascending=False)


Unnamed: 0,source_id,source_name,overall_risk,risk_license_unclear,risk_low_provenance,risk_redistribution_conflict,risk_possible_pii
0,SRC-001,Licensed News Feed A,none,False,False,False,False
3,SRC-004,Archived News Dataset (legacy),medium,True,True,False,False
1,SRC-002,Company Knowledge Base (internal),low,False,False,False,True
4,SRC-005,Creative Commons Blog Posts (CC BY),low,False,False,False,True
2,SRC-003,Public Web Articles (scraped),high,True,True,False,True


In [13]:
# -----------------------------------
# Reviewer-style summary table
# -----------------------------------
review_summary = pd.DataFrame(
    [
        {
            "Reviewer focus": "Unclear or missing licenses",
            "Why it matters": "Data with unknown rights creates legal exposure and blocks safe reuse.",
            "Example in this system": "Scraped web articles and legacy datasets",
        },
        {
            "Reviewer focus": "Low provenance confidence",
            "Why it matters": "If origin can't be traced, risk cannot be defended.",
            "Example in this system": "Legacy archive with no documentation",
        },
        {
            "Reviewer focus": "Redistribution conflicts",
            "Why it matters": "Internal-only data influencing externally shared outputs is a red flag.",
            "Example in this system": "Licensed news feed restricted to internal summaries",
        },
        {
            "Reviewer focus": "Potential PII exposure",
            "Why it matters": "Personal data can leak into training pairs or summaries.",
            "Example in this system": "Internal knowledge base and CC blog content",
        },
    ]
)

review_summary


Unnamed: 0,Reviewer focus,Why it matters,Example in this system
0,Unclear or missing licenses,Data with unknown rights creates legal exposure and blocks safe reuse.,Scraped web articles and legacy datasets
1,Low provenance confidence,"If origin can't be traced, risk cannot be defended.",Legacy archive with no documentation
2,Redistribution conflicts,Internal-only data influencing externally shared outputs is a red flag.,Licensed news feed restricted to internal summaries
3,Potential PII exposure,Personal data can leak into training pairs or summaries.,Internal knowledge base and CC blog content
