In [None]:
import json
import pandas as pd
from pathlib import Path
# Raise pandas' per-column display width to 800 characters so long text values
# are not cut off with "..." when DataFrames are rendered in later cells.
pd.set_option("display.max_colwidth", 800)

### Load the dataset and metadata files

In [None]:
csv_path = "news_articles.csv"
json_path = "dataset_metadata.json"

# Load the article table. Later cells read `metadata` (and may read
# `df_articles`) directly, so both names must be assigned here for the
# notebook to survive Restart Kernel -> Run All.
df_articles = pd.read_csv(csv_path)

# Read the JSON explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
with Path(json_path).open(encoding="utf-8") as metadata_file:
    metadata = json.load(metadata_file)

# Every later cell calls metadata.get(...), which only works when the top
# level of the JSON document is an object. Fail here with a clear message
# instead of an AttributeError several cells further down.
if not isinstance(metadata, dict):
    raise TypeError(
        f"Expected a JSON object at the top level of {json_path}, "
        f"got {type(metadata).__name__}."
    )

# Print the shape of the dataset and list the top-level metadata keys
print("Dataset shape:", df_articles.shape)
print("Top-level metadata keys:", list(metadata.keys()))

### Review the dataset’s stated purpose and intended GenAI use

In [None]:
# Pull the descriptive fields out of the metadata dictionary. Each .get()
# supplies a neutral fallback that is used only when the key is absent.
dataset_name = metadata.get("dataset_name", "Unknown dataset")
description = metadata.get("description", "")
intended_usage = metadata.get("intended_usage", [])
provenance_notes = metadata.get("provenance_notes", "")

print("Dataset:", dataset_name)
print("\nDescription:\n", description)
print("\nIntended usage:")
# NOTE(review): this loop assumes intended_usage is a list of entries; a plain
# string would be printed one character per bullet -- confirm against the
# metadata file.
for item in intended_usage:
    print("-", item)

print("\nProvenance notes:\n", provenance_notes)

# TODO (Written response):
# In 3–5 sentences, explain how this dataset would be used in a
# generative AI pipeline for news summarization.

# Placeholder for the written response: replace the blank text between the
# triple quotes with your answer.
purpose_note= """
        
    """

print("\nNotebook note to complete:\n", purpose_note)

### Inspect and extract key provenance and licensing fields from metadata

In [None]:
def get_nested(d, path, default=None):
    """
    Walk `d` along the keys listed in `path` and return the value at the end.

    `default` is returned as soon as a step cannot be taken, i.e. when the
    current value is not a dict or does not contain the next key. An empty
    `path` returns `d` itself.
    Example: get_nested(metadata, ['license', 'license_name'])
    """
    node = d
    for step in path:
        # Guard clause: stop at the first level that cannot be descended into.
        if not isinstance(node, dict) or step not in node:
            return default
        node = node[step]
    return node

# Collect the provenance and licensing fields into a single dict so they can
# be reviewed side by side. get_nested returns None for anything that is
# absent, so a gap in the metadata shows up as an empty value in the table
# instead of raising a KeyError.
fields = {
    "source_origin": get_nested(metadata, ["source_origin"]),
    "collection_method": get_nested(metadata, ["collection_method"]),
    "ownership_owner": get_nested(metadata, ["ownership", "owner"]),
    "ownership_contact": get_nested(metadata, ["ownership", "contact"]),
    "license_name": get_nested(metadata, ["license", "license_name"]),
    "license_type": get_nested(metadata, ["license", "license_type"]),
    "license_terms_summary": get_nested(metadata, ["license", "terms_summary"]),
    "genai_training_allowed": get_nested(metadata, ["license", "genai_training_allowed"]),
    "derivative_use_allowed": get_nested(metadata, ["license", "derivative_use_allowed"]),
    "commercial_use_allowed": get_nested(metadata, ["license", "commercial_use_allowed"]),
}

# One row per field; the bare expression is rendered as the cell output.
pd.DataFrame(
    [{"field": k, "value": v} for k, v in fields.items()]
)

### Evaluate whether the license permits GenAI training and derivative outputs

In [None]:
# metadata.get("license", {}) only falls back to {} when the key is absent.
# A JSON null or a plain string (e.g. "license": "CC-BY-4.0") would make the
# .get() calls below raise AttributeError. Treat any non-dict value as
# "no structured license details", which is also how get_nested handles it.
license_raw = metadata.get("license")
license_obj = license_raw if isinstance(license_raw, dict) else {}

license_name = license_obj.get("license_name")
license_type = license_obj.get("license_type")
# str(...) keeps the marker search working even if terms_summary was provided
# as a non-string value (e.g. a list of clauses); strings pass through as-is.
terms_summary = str(license_obj.get("terms_summary") or "").lower()

derivative_allowed = license_obj.get("derivative_use_allowed")
genai_training_allowed = license_obj.get("genai_training_allowed")
commercial_allowed = license_obj.get("commercial_use_allowed")

# Basic heuristics to detect ambiguous or restrictive language.
# These are plain case-insensitive substring checks against terms_summary,
# so treat the results as hints for a human reviewer, not as a legal reading.
restrictive_markers = [
    "non-commercial",
    "no derivatives",
    "research only",
    "no ai",
    "no machine learning",
    "no training",
]
ambiguous_markers = [
    "may",
    "should",
    "typically",
    "as appropriate",
    "upon request",
    "case-by-case",
    "contact us",
]

found_restrictive = [m for m in restrictive_markers if m in terms_summary]
found_ambiguous = [m for m in ambiguous_markers if m in terms_summary]

print("License name:", license_name)
print("License type:", license_type)
print("Derivative allowed:", derivative_allowed)
print("GenAI training allowed:", genai_training_allowed)
print("Commercial use allowed:", commercial_allowed)
print("\nRestrictive markers found:", found_restrictive)
print("Ambiguous markers found:", found_ambiguous)

### Identify provenance and licensing risks (gaps, conflicts, missing details)

In [None]:
def is_missing(x):
    """
    Report whether a metadata value should be treated as absent.

    Returns True for None, for strings that are empty or whitespace-only,
    and for empty lists or dicts. Every other value (including 0 and False)
    counts as present.
    """
    if x is None:
        return True
    if isinstance(x, str):
        return x.strip() == ""
    if isinstance(x, (list, dict)):
        return len(x) == 0
    return False

risk_flags = []

# Provenance checks
if is_missing(metadata.get("source_origin")):
    risk_flags.append("Missing source_origin in metadata.")
if is_missing(metadata.get("collection_method")):
    risk_flags.append("Missing collection_method in metadata.")
# Flag when EITHER ownership field is absent. The message promises
# "missing or incomplete", and requiring both to be absent would let a
# dataset with no named owner pass silently as long as a contact exists.
if is_missing(get_nested(metadata, ["ownership", "owner"])) or is_missing(get_nested(metadata, ["ownership", "contact"])):
    risk_flags.append("Ownership details are missing or incomplete.")

# License checks
# Either a license name or a license type is accepted as an identifier, so
# this only fires when both are absent.
if is_missing(get_nested(metadata, ["license", "license_name"])) and is_missing(get_nested(metadata, ["license", "license_type"])):
    risk_flags.append("License identifier (name/type) is missing.")
if is_missing(get_nested(metadata, ["license", "terms_summary"])):
    risk_flags.append("License terms_summary is missing (hard to audit).")

# Conflicts: intended usage mentions GenAI training but license says no training (or doesn't explicitly allow)
# Normalise intended_usage before joining: a JSON null or non-string items
# would make " ".join(...) raise, and a plain string would be joined one
# character at a time, which hides every keyword from the search below.
intended_usage_raw = metadata.get("intended_usage") or []
if isinstance(intended_usage_raw, str):
    intended_usage_raw = [intended_usage_raw]
intended_usage_text = " ".join(str(entry) for entry in intended_usage_raw).lower()
mentions_training = any(k in intended_usage_text for k in ["train", "fine-tun", "generative"])
if mentions_training and genai_training_allowed is False:
    risk_flags.append("Conflict: intended usage includes GenAI training, but genai_training_allowed is False.")
if mentions_training and genai_training_allowed is None:
    risk_flags.append("Potential conflict: intended usage includes GenAI training, but metadata does not explicitly allow training.")

# Terms summary language indicators
if found_restrictive:
    risk_flags.append(f"Restrictive license language detected: {found_restrictive}")
# Ambiguous wording only matters when an explicit flag is not there to settle it.
if found_ambiguous and (genai_training_allowed is None or derivative_allowed is None):
    risk_flags.append(f"Ambiguous license language and missing explicit flags: {found_ambiguous}")

# Show the flags as a one-column table, or a single placeholder row when none were raised.
pd.DataFrame({"risk_flag": risk_flags or ["No obvious risks detected from metadata fields."]})


### Build the data provenance audit table (flags + evidence)

In [None]:
def license_clarity_label(license_obj, found_restrictive, found_ambiguous):
    """
    Classify how clearly the license covers GenAI training and derivatives.

    Parameters:
        license_obj: dict holding the license metadata; the keys read are
            "genai_training_allowed", "derivative_use_allowed" and
            "terms_summary".
        found_restrictive: restrictive phrases detected in the terms summary
            (any truthy value counts as "found").
        found_ambiguous: ambiguous phrases detected in the terms summary.
            Kept for interface compatibility; it does not change the label,
            because every case that is neither restricted nor explicitly
            allowed is already "unclear".

    Returns:
        "restricted" when restrictive language was found or either explicit
            flag is False,
        "clear" when both explicit flags are True and nothing is restrictive,
        "unclear" in every other case, including when no license information
            is available at all.
    """
    # Prefer explicit boolean flags if present
    genai = license_obj.get("genai_training_allowed")
    deriv = license_obj.get("derivative_use_allowed")
    terms = (license_obj.get("terms_summary") or "").strip()

    # Nothing to go on: no terms text and neither flag set.
    if not terms and genai is None and deriv is None:
        return "unclear"

    # An explicit False is a prohibition, not an ambiguity. Without this
    # check a dataset that forbids GenAI training outright would be labelled
    # "unclear" and end up conditionally approved downstream.
    if found_restrictive or genai is False or deriv is False:
        return "restricted"

    # If explicitly allowed (and no restrictive markers), treat as clear
    if genai is True and deriv is True:
        return "clear"

    # At least one flag is missing, so permission cannot be confirmed.
    return "unclear"

def provenance_completeness_label(metadata):
    """
    Grade how complete the provenance metadata is.

    Four items are checked: source_origin, collection_method, the ownership
    owner, and a license identifier (license_name, falling back to
    license_type). Returns "complete" when none are missing, "partial" when
    one or two are missing, and "insufficient" when three or more are missing.
    """
    checked_values = (
        metadata.get("source_origin"),
        metadata.get("collection_method"),
        get_nested(metadata, ["ownership", "owner"]),
        # Either identifier is enough; `or` falls back to the type when the name is empty.
        get_nested(metadata, ["license", "license_name"]) or get_nested(metadata, ["license", "license_type"]),
    )
    gaps = len([value for value in checked_values if is_missing(value)])
    if gaps > 2:
        return "insufficient"
    return "partial" if gaps else "complete"

def suitability_for_genai(metadata, license_clarity):
    """
    Translate a license clarity label into a GenAI training suitability verdict.

    Conservative mapping: "restricted" -> "no", "unclear" -> "conditional",
    anything else -> "yes". `metadata` is accepted but not consulted.
    """
    verdict_by_label = (("restricted", "no"), ("unclear", "conditional"))
    for label, verdict in verdict_by_label:
        if license_clarity == label:
            return verdict
    return "yes"

# Derive the three audit labels from the metadata and the marker lists
# computed in the license evaluation cell.
license_clarity = license_clarity_label(license_obj, found_restrictive, found_ambiguous)
provenance_completeness = provenance_completeness_label(metadata)
suitability = suitability_for_genai(metadata, license_clarity)

# Evidence backing the labels. Insertion order is kept deliberately because
# it is the order in which the keys appear in the JSON dump below.
evidence = {
    key: license_obj.get(key)
    for key in ("license_name", "license_type", "genai_training_allowed", "derivative_use_allowed")
}
# Only the first 220 characters of the terms are kept, to keep the table readable.
evidence["terms_summary_excerpt"] = (license_obj.get("terms_summary") or "")[:220]
evidence.update(
    source_origin=metadata.get("source_origin"),
    collection_method=metadata.get("collection_method"),
    ownership_owner=get_nested(metadata, ["ownership", "owner"]),
)

# Single-row audit table: one row per audited dataset.
audit_table = pd.DataFrame(
    [
        {
            "dataset_name": metadata.get("dataset_name"),
            # clear / unclear / restricted
            "license_clarity": license_clarity,
            # complete / partial / insufficient
            "provenance_completeness": provenance_completeness,
            # yes / conditional / no
            "suitability_for_genai_training": suitability,
            "key_risks": "; ".join(risk_flags) if risk_flags else "None identified from metadata checks",
            "evidence_refs": json.dumps(evidence, indent=2),
        }
    ]
)

audit_table


### Document remediation steps required before using the dataset

In [None]:
# TODO:
# Create a list called remediation_steps

# TODO:
# Add remediation steps if:
# - license clarity is not "clear"
# - provenance completeness is not "complete"
# - restrictive license language was found
# - conflicts were detected

# TODO:
# If no remediation is required, add a single entry explaining why

# pd.DataFrame({"remediation_step": remediation_steps})


### Write a short audit conclusion (approve / conditionally approve / reject)

In [None]:
def recommendation_from_flags(license_clarity, provenance_completeness, suitability):
    """
    Turn the audit labels into a final recommendation string.

    Returns "rejected" when suitability is "no"; "conditionally approved"
    when suitability is "conditional" or provenance completeness is
    "partial" or "insufficient"; otherwise "approved". `license_clarity` is
    accepted but not consulted directly.
    """
    if suitability == "no":
        return "rejected"
    needs_conditions = (
        suitability == "conditional"
        or provenance_completeness in ("partial", "insufficient")
    )
    return "conditionally approved" if needs_conditions else "approved"

# TODO:
# Generate the final recommendation using the function above

# TODO:
# Create a multi-line audit conclusion string that includes:
# - dataset name
# - license clarity
# - provenance completeness
# - suitability for GenAI training
# - final recommendation
# - key risks
# - required remediation steps

# Print the conclusion
