In [21]:
import json
import pandas as pd
from pathlib import Path
pd.set_option("display.max_colwidth", 800)

### Load the dataset and metadata files

In [22]:
csv_path = "news_articles.csv"
json_path = "dataset_metadata.json"

df_articles = pd.read_csv(csv_path)
with open(json_path, "r") as f:
    metadata = json.load(f)

print("Articles shape:", df_articles.shape)
print("Metadata keys:", list(metadata.keys()))
df_articles.head(3)

Articles shape: (5, 6)
Metadata keys: ['dataset_name', 'description', 'version', 'creation_date', 'source_origin', 'collection_method', 'ownership', 'license', 'intended_usage', 'restrictions', 'provenance_notes', 'known_risks', 'audit_readiness']


Unnamed: 0,article_id,title,source,author,publication_date,article_text
0,1,City Council Approves New Climate Initiative,Synthetic Daily News,Staff Writer,2030-03-01,"The city council voted unanimously to approve a new climate initiative aimed at reducing carbon emissions over the next decade. Officials stated that the plan focuses on renewable energy adoption, public transportation improvements, and community education."
1,2,Tech Company Releases Annual Earnings Report,Fictional Business Journal,Jane Doe,2030-03-05,"The technology firm reported steady growth in its annual earnings report, citing increased demand for cloud services and enterprise software solutions. Analysts noted moderate expectations for the coming fiscal year."
2,3,Local Schools Expand Computer Science Programs,Synthetic Education Weekly,Michael Reyes,2030-03-10,"Several school districts announced plans to expand computer science programs, emphasizing coding, data literacy, and responsible technology use. Educators believe the changes will better prepare students for future careers."


### Review the datasetâ€™s stated purpose and intended GenAI use

In [23]:
dataset_name = metadata.get("dataset_name", "Unknown dataset")
description = metadata.get("description", "")
intended_usage = metadata.get("intended_usage", [])
provenance_notes = metadata.get("provenance_notes", "")

print("Dataset:", dataset_name)
print("\nDescription:\n", description)
print("\nIntended usage:")
for item in intended_usage:
    print("-", item)

print("\nProvenance notes:\n", provenance_notes)

# TODO: Manually fill this in
purpose_note= """
        This dataset would be ingested into a generative AI training pipeline as a corpus of
        news-style articles used to teach a model how to identify key facts and structure
        concise summaries.

        During preprocessing, articles would be cleaned, tokenized, and filtered to remove
        metadata fields not needed for training.

        The processed text could then be used to fine-tune a language model specifically
        for news summarization tasks.

        Downstream outputs would include short, abstractive summaries of articles that
        preserve key points while reducing length.

        These summaries could be used in internal tools, demos, or educational applications.
    """

print("\nNotebook note to complete:\n", purpose_note)

Dataset: Synthetic News Articles Dataset

Description:
 A fully synthetic dataset of news-style articles created for instructional purposes, including data provenance, licensing, and generative AI governance exercises.

Intended usage:
- Educational exercises on data provenance and licensing
- Training or fine-tuning generative AI models for summarization
- Demonstrations of AI governance and risk assessment workflows

Provenance notes:
 All articles are fictional and do not originate from real publishers or copyrighted sources.

Notebook note to complete:
 
        This dataset would be ingested into a generative AI training pipeline as a corpus of
        news-style articles used to teach a model how to identify key facts and structure
        concise summaries.

        During preprocessing, articles would be cleaned, tokenized, and filtered to remove
        metadata fields not needed for training.

        The processed text could then be used to fine-tune a language model specifi

### Inspect and extract key provenance and licensing fields from metadata

In [24]:
def get_nested(d, path, default=None):
    """
    Safely retrieve a nested field from a dict using a list path.
    Example: get_nested(metadata, ['license', 'license_name'])
    """
    cur = d
    for key in path:
        if isinstance(cur, dict) and key in cur:
            cur = cur[key]
        else:
            return default
    return cur

fields = {
    "source_origin": metadata.get("source_origin"),
    "collection_method": metadata.get("collection_method"),
    "ownership.owner": get_nested(metadata, ["ownership", "owner"]),
    "ownership.contact": get_nested(metadata, ["ownership", "contact"]),
    "license.license_name": get_nested(metadata, ["license", "license_name"]),
    "license.license_type": get_nested(metadata, ["license", "license_type"]),
    "license.terms_summary": get_nested(metadata, ["license", "terms_summary"]),
    "license.derivative_use_allowed": get_nested(metadata, ["license", "derivative_use_allowed"]),
    "license.genai_training_allowed": get_nested(metadata, ["license", "genai_training_allowed"]),
    "license.commercial_use_allowed": get_nested(metadata, ["license", "commercial_use_allowed"]),
    "intended_usage": metadata.get("intended_usage"),
}

pd.DataFrame(
    [{"field": k, "value": v} for k, v in fields.items()]
)

Unnamed: 0,field,value
0,source_origin,Synthetic content generated by instructional authors
1,collection_method,Manually authored synthetic news articles designed to resemble real-world reporting without referencing real publications or events
2,ownership.owner,Course Content Team
3,ownership.contact,content@example.com
4,license.license_name,Synthetic Data Educational License
5,license.license_type,Custom
6,license.terms_summary,"This dataset is composed entirely of synthetic content and may be used for educational, research, and internal commercial purposes, including training and evaluating generative AI models. Redistribution of the dataset as a standalone product is not permitted."
7,license.derivative_use_allowed,True
8,license.genai_training_allowed,True
9,license.commercial_use_allowed,True


### Evaluate whether the license permits GenAI training and derivative outputs

In [25]:
license_obj = metadata.get("license", {})

license_name = license_obj.get("license_name")
license_type = license_obj.get("license_type")
terms_summary = (license_obj.get("terms_summary") or "").lower()

derivative_allowed = license_obj.get("derivative_use_allowed")
genai_training_allowed = license_obj.get("genai_training_allowed")
commercial_allowed = license_obj.get("commercial_use_allowed")

# Basic heuristics to detect ambiguous or restrictive language
restrictive_markers = ["non-commercial", "no derivatives", "research only", "no ai", "no machine learning", "no training"]
ambiguous_markers = ["may", "should", "typically", "as appropriate", "upon request", "case-by-case", "contact us"]

found_restrictive = [m for m in restrictive_markers if m in terms_summary]
found_ambiguous = [m for m in ambiguous_markers if m in terms_summary]

print("License name:", license_name)
print("License type:", license_type)
print("Derivative allowed:", derivative_allowed)
print("GenAI training allowed:", genai_training_allowed)
print("Commercial use allowed:", commercial_allowed)
print("\nRestrictive markers found:", found_restrictive)
print("Ambiguous markers found:", found_ambiguous)

License name: Synthetic Data Educational License
License type: Custom
Derivative allowed: True
GenAI training allowed: True
Commercial use allowed: True

Restrictive markers found: []
Ambiguous markers found: ['may']


### Identify provenance and licensing risks (gaps, conflicts, missing details)

In [26]:
def is_missing(x):
    if x is None:
        return True
    if isinstance(x, str) and x.strip() == "":
        return True
    if isinstance(x, (list, dict)) and len(x) == 0:
        return True
    return False

risk_flags = []

# Provenance checks
if is_missing(metadata.get("source_origin")):
    risk_flags.append("Missing source_origin in metadata.")
if is_missing(metadata.get("collection_method")):
    risk_flags.append("Missing collection_method in metadata.")
if is_missing(get_nested(metadata, ["ownership", "owner"])) and is_missing(get_nested(metadata, ["ownership", "contact"])):
    risk_flags.append("Ownership details are missing or incomplete.")

# License checks
if is_missing(get_nested(metadata, ["license", "license_name"])) and is_missing(get_nested(metadata, ["license", "license_type"])):
    risk_flags.append("License identifier (name/type) is missing.")
if is_missing(get_nested(metadata, ["license", "terms_summary"])):
    risk_flags.append("License terms_summary is missing (hard to audit).")

# Conflicts: intended usage mentions GenAI training but license says no training (or doesn't explicitly allow)
intended_usage_text = " ".join(metadata.get("intended_usage", [])).lower()
mentions_training = any(k in intended_usage_text for k in ["train", "fine-tun", "generative"])
if mentions_training and genai_training_allowed is False:
    risk_flags.append("Conflict: intended usage includes GenAI training, but genai_training_allowed is False.")
if mentions_training and genai_training_allowed is None:
    risk_flags.append("Potential conflict: intended usage includes GenAI training, but metadata does not explicitly allow training.")

# Terms summary language indicators
if found_restrictive:
    risk_flags.append(f"Restrictive license language detected: {found_restrictive}")
if found_ambiguous and (genai_training_allowed is None or derivative_allowed is None):
    risk_flags.append(f"Ambiguous license language and missing explicit flags: {found_ambiguous}")

pd.DataFrame({"risk_flag": risk_flags}) if risk_flags else pd.DataFrame({"risk_flag": ["No obvious risks detected from metadata fields."]})


Unnamed: 0,risk_flag
0,No obvious risks detected from metadata fields.


### Build the data provenance audit table (flags + evidence)

In [27]:
def license_clarity_label(license_obj, found_restrictive, found_ambiguous):
    # Prefer explicit boolean flags if present
    genai = license_obj.get("genai_training_allowed")
    deriv = license_obj.get("derivative_use_allowed")
    terms = (license_obj.get("terms_summary") or "").strip()

    if not terms and genai is None and deriv is None:
        return "unclear"

    if found_restrictive:
        return "restricted"

    # If explicitly allowed (and no restrictive markers), treat as clear
    if genai is True and deriv is True:
        return "clear"

    # If missing explicit flags and language is ambiguous, call unclear
    if (genai is None or deriv is None) and found_ambiguous:
        return "unclear"

    # Default
    return "unclear"

def provenance_completeness_label(metadata):
    required = [
        metadata.get("source_origin"),
        metadata.get("collection_method"),
        get_nested(metadata, ["ownership", "owner"]),
        get_nested(metadata, ["license", "license_name"]) or get_nested(metadata, ["license", "license_type"]),
    ]
    missing_count = sum(1 for x in required if is_missing(x))
    if missing_count == 0:
        return "complete"
    if missing_count <= 2:
        return "partial"
    return "insufficient"

def suitability_for_genai(metadata, license_clarity):
    # Conservative logic: restricted => no; unclear => conditional; clear => yes
    if license_clarity == "restricted":
        return "no"
    if license_clarity == "unclear":
        return "conditional"
    return "yes"

license_clarity = license_clarity_label(license_obj, found_restrictive, found_ambiguous)
provenance_completeness = provenance_completeness_label(metadata)
suitability = suitability_for_genai(metadata, license_clarity)

evidence = {
    "license_name": license_obj.get("license_name"),
    "license_type": license_obj.get("license_type"),
    "genai_training_allowed": license_obj.get("genai_training_allowed"),
    "derivative_use_allowed": license_obj.get("derivative_use_allowed"),
    "terms_summary_excerpt": (license_obj.get("terms_summary") or "")[:220],
    "source_origin": metadata.get("source_origin"),
    "collection_method": metadata.get("collection_method"),
    "ownership_owner": get_nested(metadata, ["ownership", "owner"]),
}

audit_table = pd.DataFrame([{
    "dataset_name": metadata.get("dataset_name"),
    "license_clarity": license_clarity,                     # clear / unclear / restricted
    "provenance_completeness": provenance_completeness,     # complete / partial / insufficient
    "suitability_for_genai_training": suitability,          # yes / conditional / no
    "key_risks": "; ".join(risk_flags) if risk_flags else "None identified from metadata checks",
    "evidence_refs": json.dumps(evidence, indent=2)
}])

audit_table


Unnamed: 0,dataset_name,license_clarity,provenance_completeness,suitability_for_genai_training,key_risks,evidence_refs
0,Synthetic News Articles Dataset,clear,complete,yes,None identified from metadata checks,"{\n ""license_name"": ""Synthetic Data Educational License"",\n ""license_type"": ""Custom"",\n ""genai_training_allowed"": true,\n ""derivative_use_allowed"": true,\n ""terms_summary_excerpt"": ""This dataset is composed entirely of synthetic content and may be used for educational, research, and internal commercial purposes, including training and evaluating generative AI models. Redistribution of the dataset as"",\n ""source_origin"": ""Synthetic content generated by instructional authors"",\n ""collection_method"": ""Manually authored synthetic news articles designed to resemble real-world reporting without referencing real publications or events"",\n ""ownership_owner"": ""Course Content Team""\n}"


### Document remediation steps required before using the dataset

In [32]:
remediation_steps = []

if license_clarity != "clear":
    remediation_steps.append("Clarify license terms in writing, explicitly addressing GenAI training and derivative outputs.")

if provenance_completeness != "complete":
    remediation_steps.append("Fill provenance gaps: add full source origin details, collection method specifics, and ownership contact.")

if found_restrictive:
    remediation_steps.append("If restricted, either remove restricted content, obtain re-licensing/permission, or choose an alternative dataset.")

if "Conflict" in " ".join(risk_flags):
    remediation_steps.append("Resolve conflicts between intended usage and license terms before any training use.")

if not remediation_steps:
    remediation_steps.append("No remediation required based on current metadata review.")

pd.DataFrame({"remediation_step": remediation_steps})


Unnamed: 0,remediation_step
0,No remediation required based on current metadata review.


### Write a short audit conclusion (approve / conditionally approve / reject)

In [33]:
def recommendation_from_flags(license_clarity, provenance_completeness, suitability):
    if suitability == "no":
        return "rejected"
    if suitability == "conditional" or provenance_completeness in ["partial", "insufficient"]:
        return "conditionally approved"
    return "approved"

recommendation = recommendation_from_flags(license_clarity, provenance_completeness, suitability)

conclusion = f"""
Audit Conclusion
- Dataset: {metadata.get('dataset_name')}
- License clarity: {license_clarity}
- Provenance completeness: {provenance_completeness}
- Suitability for GenAI training: {suitability}
- Recommendation: {recommendation}

Rationale: TODO KESHA TO FIGURE OUT THE ERROR HERSELF
# - Key risks: {'; '.join(risk_flags) if risk_flags else 'No major risks detected based on metadata checks.'}
# - Required remediation: {'; '.join(remediation_steps) if remediation_steps else 'None'}
"""

print(conclusion)



Audit Conclusion
- Dataset: Synthetic News Articles Dataset
- License clarity: clear
- Provenance completeness: complete
- Suitability for GenAI training: yes
- Recommendation: approved

Rationale: TODO KESHA TO FIGURE OUT THE ERROR HERSELF
# - Key risks: No major risks detected based on metadata checks.
# - Required remediation: No remediation required based on current metadata review.

