### Group: SAXA 4
### Bassma Ali · Andrew Singh · Andy Oliver · Destiny Floyd-McGuiness . Vahid Dabbaghi Sadr

### 1. Setup & Imports

In [14]:
# ============================================================
# Notebook 09 — LLM Integration
# Purpose:
#   - Load inference artifacts
#   - Build prompt template
#   - Test OpenAI LLM responses
#   - Wrap everything into a clean function for Streamlit
# ============================================================

import joblib
import pandas as pd
import json
from pathlib import Path

# Optional: Jupyter-friendly display
pd.set_option("display.max_colwidth", 200)

In [19]:
from dotenv import load_dotenv
import os
load_dotenv("openai_api_key.env")

key = os.getenv("OPENAI_API_KEY")
print(key[:8] + "..." if key else "Key not found")

from openai import OpenAI
client = OpenAI()

sk-proj-...


In [20]:
response = client.responses.create(
    model="gpt-4o-mini",
    input="Reply with OK"
)
print(response.output_text)

OK


### 2. Load Final Artifacts (Model, Vectorizer, Encoder, Context Table)

In [21]:
# Paths (adjust if needed)
MODEL_PATH = Path("log_reg_final.joblib")
VEC_PATH   = Path("vectorizer_final.joblib")
ENC_PATH   = Path("label_encoder_final.joblib")
CTX_PATH   = Path("context_table_agency_scores.csv")

# Load artifacts
log_reg_model = joblib.load(MODEL_PATH)
tfidf_vectorizer = joblib.load(VEC_PATH)
label_encoder = joblib.load(ENC_PATH)

context_df = pd.read_csv(CTX_PATH)

context_df.head()

Unnamed: 0,3_agency,4_bureau,n_cases,pct_rights_impacting,pct_safety_impacting,mean_governance,mean_transparency,mean_fairness
0,Board of Governors of the Federal Reserve System,Division of Board Members,1,0.0,0.0,,,
1,Board of Governors of the Federal Reserve System,Division of Consumer and Community Affairs,1,0.0,0.0,0.333,0.0,
2,Board of Governors of the Federal Reserve System,Division of Information Technology,6,0.0,0.0,0.611,0.0,
3,Board of Governors of the Federal Reserve System,Division of International Finance,2,0.0,0.0,,,
4,Board of Governors of the Federal Reserve System,Division of Monetary Affairs,2,0.0,0.0,0.333,0.0,


### 3. Import Your Existing Cleaning + Inference Functions

In [22]:
# ------------------------------------------------------------
# TEXT CLEANING
# ------------------------------------------------------------
def basic_clean(text: str) -> str:
    import re
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"\s+", " ", text)
    return text.strip()


# ------------------------------------------------------------
# PREDICT FUNCTION
# ------------------------------------------------------------
def predict_impact(text: str):
    cleaned = basic_clean(text)
    X_vec = tfidf_vectorizer.transform([cleaned])
    probs = log_reg_model.predict_proba(X_vec)[0]
    idx = probs.argmax()
    pred_label = label_encoder.inverse_transform([idx])[0]

    result = {
        "cleaned_input": cleaned,
        "predicted_class": pred_label,
        "predicted_index": int(idx),
        "confidence": round(float(probs[idx]), 4),
        "class_probabilities": {label_encoder.classes_[i]: round(float(p),4) for i,p in enumerate(probs)}
    }
    return result

### 4. Extract Top TF-IDF Keywords

In [23]:
import numpy as np

def extract_top_tfidf_keywords(model, vectorizer, text, top_n=8):
    cleaned = basic_clean(text)
    X = vectorizer.transform([cleaned])
    coef = model.coef_[np.argmax(model.predict_proba(X))]
    feature_names = vectorizer.get_feature_names_out()
    top_idx = np.argsort(coef)[-top_n:]
    return [feature_names[i] for i in top_idx]

### 5. Helper: Fetch Agency Governance Context

In [28]:
def get_agency_context(agency: str):
    row = context_df[context_df["3_agency"] == agency].head(1)
    if row.empty:
        return None
    return row.to_dict(orient="records")[0]

### 6. LLM Prompt Template (First Draft)

In [29]:
BASE_PROMPT = """
You are an AI policy advisor assisting a federal reviewer.

Your job is to generate a short, factual, governance-aware note explaining the ML classifier’s prediction.

=== CLASSIFIER OUTPUT ===
Cleaned text:
{cleaned_input}

Predicted impact type: {predicted_class}
Confidence: {confidence}

Class probabilities:
{class_probabilities}

Top keywords (signal words):
{keywords}

=== AGENCY CONTEXT ===
Agency: {agency}
Governance score: {governance_score}
Transparency score: {transparency_score}
Fairness score: {fairness_score}
Total use cases reported: {n_cases}

=== TASK ===
Write a concise policy note (5–7 sentences) that:
1. Explains what the classifier detected.
2. Connects the prediction to the agency’s governance maturity.
3. Highlights whether the narrative shows possible gaps (rights, safety, both, neither).
4. Uses neutral civil-service language.
5. Provides one practical next step for the reviewer.

Be precise, avoid exaggeration, and keep the tone analytical and brief.
"""


### Step 7 — LLM API wrapper (Responses API)

In [30]:
from openai import OpenAI

# Create client (reads OPENAI_API_KEY from environment)
client = OpenAI()

def generate_policy_note(
    cleaned_input: str,
    predicted_class: str,
    confidence: float,
    class_probabilities: dict,
    keywords: list,
    agency_context: dict,
    model_name: str = "gpt-4o-mini"
) -> str:
    """
    Calls OpenAI to generate a short policy note using BASE_PROMPT.
    ML decides; LLM explains.
    """

    # Defensive defaults if optional parts are missing
    if agency_context is None:
        agency_context = {
            "agency": "Unknown",
            "governance_score": "NA",
            "transparency_score": "NA",
            "fairness_score": "NA",
            "n_cases": "NA"
        }

    prompt_filled = BASE_PROMPT.format(
        cleaned_input=cleaned_input,
        predicted_class=predicted_class,
        confidence=round(confidence, 4),
        class_probabilities=class_probabilities,
        keywords=keywords,
        agency=agency_context.get("agency", "Unknown"),
        governance_score=agency_context.get("governance_score", "NA"),
        transparency_score=agency_context.get("transparency_score", "NA"),
        fairness_score=agency_context.get("fairness_score", "NA"),
        n_cases=agency_context.get("n_cases", "NA"),
    )

    response = client.responses.create(
        model=model_name,
        input=[
            {
                "role": "user",
                "content": prompt_filled
            }
        ],
        temperature=0.2,
        max_output_tokens=220
    )

    # Responses API helper to extract text
    return response.output_text.strip()


### Step 8 — End-to-end single example test

In [31]:
sample_text = """
The system uses biometric identification and continuous monitoring to verify individuals
and determine eligibility for services. It processes sensitive personal information and may
affect rights if misclassified.
"""

# 1) ML prediction
pred = predict_impact(sample_text)

# 2) Optional keywords
keywords = extract_top_tfidf_keywords(log_reg_model, tfidf_vectorizer, pred["cleaned_input"], top_n=8)

# 3) Pull agency context (replace with a real agency name from your dataset)
agency_name = "Department of Agriculture"
agency_ctx = get_agency_context(agency_name)

# 4) LLM explanation
policy_note = generate_policy_note(
    cleaned_input=pred["cleaned_input"],
    predicted_class=pred["predicted_class"],
    confidence=pred["confidence"],
    class_probabilities=pred["class_probabilities"],
    keywords=keywords,
    agency_context=agency_ctx
)

print("Predicted class:", pred["predicted_class"])
print("Confidence:", pred["confidence"])
print("Keywords:", keywords)
print("\n--- Policy Note ---\n")
print(policy_note)

Predicted class: neither
Confidence: 0.4807
Keywords: ['development', 'text', 'quality', 'learning', 'research', 'documents', 'data', 'tool']

--- Policy Note ---

The machine learning classifier analyzed the provided text and identified that the system employs biometric identification and continuous monitoring for service eligibility verification. The predicted impact type is categorized as "neither," with a confidence level of 48.07%. This suggests that the classifier did not find strong evidence to indicate significant implications for rights or safety, although there is a notable probability (29.95%) associated with potential rights impacts. Given the agency's governance maturity is currently unknown, this prediction highlights a possible gap in transparency and fairness assessments, particularly concerning the handling of sensitive personal information. A practical next step for the reviewer would be to conduct a thorough evaluation of the agency's governance practices related to 

### Step 9 — Mini “batch mode” helper

In [32]:
def score_row_with_llm(text: str, agency: str) -> dict:
    """
    Convenience wrapper for one row:
    ML prediction + keywords + agency context + LLM note.
    Returns a single dict you can attach to a dataframe or Streamlit output.
    """
    pred = predict_impact(text)
    keywords = extract_top_tfidf_keywords(
        log_reg_model,
        tfidf_vectorizer,
        pred["cleaned_input"],
        top_n=8
    )
    agency_ctx = get_agency_context(agency)

    note = generate_policy_note(
        cleaned_input=pred["cleaned_input"],
        predicted_class=pred["predicted_class"],
        confidence=pred["confidence"],
        class_probabilities=pred["class_probabilities"],
        keywords=keywords,
        agency_context=agency_ctx
    )

    return {
        **pred,
        "keywords": keywords,
        "agency_context": agency_ctx,
        "llm_policy_note": note
    }

In [34]:
# test score_row_with_llm() 
sample_text = """
The system uses biometric identification and continuous monitoring to verify individuals
and determine eligibility for services. It processes sensitive personal information and
may affect rights if misclassified.
"""

out = score_row_with_llm(sample_text, agency="Department of Agriculture")
out


{'cleaned_input': 'the system uses biometric identification and continuous monitoring to verify individuals and determine eligibility for services. it processes sensitive personal information and may affect rights if misclassified.',
 'predicted_class': 'neither',
 'predicted_index': 1,
 'confidence': 0.4807,
 'class_probabilities': {'both': 0.1391,
  'neither': 0.4807,
  'rights': 0.2995,
  'safety': 0.0808},
 'keywords': ['development',
  'text',
  'quality',
  'learning',
  'research',
  'documents',
  'data',
  'tool'],
 'agency_context': {'3_agency': 'Department of Agriculture',
  '4_bureau': 'APHIS: Animal and Plant Health Inspection Service',
  'n_cases': 11,
  'pct_rights_impacting': 0.0,
  'pct_safety_impacting': 0.0,
  'mean_governance': 0.03,
  'mean_transparency': 0.0,
  'mean_fairness': nan},
 'llm_policy_note': 'The machine learning classifier analyzed the provided text and identified that the system employs biometric identification and continuous monitoring for service e

In [None]:
# reminder for github streamlit
#*.env
#openai_api_key.env