# In [2]:
import os
import openai
from dotenv import load_dotenv

# Read variables defined in a local .env file
load_dotenv()

# Hand the OpenAI key found in the environment to the client library
openai.api_key = os.environ.get("OPENAI_API_KEY")

# In [None]:
import fitz  # PyMuPDF
import json
import os
import openai
import pandas as pd

# Constants
# Keys carried by every persona record, in the order the row values are listed.
_PERSONA_FIELDS = ("name", "industry", "resources", "legal_team", "familiarity")

# One tuple per simulated stakeholder; values line up with _PERSONA_FIELDS.
_PERSONA_ROWS = (
    ("Compliance Officer - Large Tech", "Technology", "High",
     "In-house counsel + external", "High"),
    ("Small Business Owner", "Retail", "Low",
     "None", "Low"),
    ("Healthcare Administrator", "Healthcare", "Medium",
     "Limited in-house counsel", "Medium"),
    ("Financial Services Compliance Manager", "Financial Services", "High",
     "Extensive in-house and specialized external", "Very High"),
    ("Manufacturing Plant Manager", "Manufacturing", "Medium",
     "Occasional external consultation", "Low"),
    ("Nonprofit Director", "Nonprofit", "Very Low",
     "Pro bono when available", "Low"),
    ("EdTech Startup Founder", "Education Technology", "Medium",
     "Outsourced as needed", "Medium"),
    ("Government Agency Director", "Public Sector", "Medium",
     "Department legal counsel", "High"),
    ("Pharmaceutical Research Director", "Pharmaceuticals", "High",
     "Specialized regulatory counsel", "Very High"),
    ("Real Estate Developer", "Real Estate", "Medium",
     "External firm on retainer", "Medium"),
    ("E-commerce Platform Manager", "E-commerce", "Medium-High",
     "Small in-house team", "Medium"),
    ("Hospitality Chain Owner", "Hospitality", "Medium",
     "Occasional consultation", "Low"),
)

# List of persona dicts consumed by the prompt builder and simulation loop.
PERSONAS = [dict(zip(_PERSONA_FIELDS, row)) for row in _PERSONA_ROWS]

# Prompt template filled in with str.format().
# Placeholders: {persona_name}, {industry}, {resources}, {legal_team},
# {familiarity} and {regulation_excerpt}.
# The doubled braces ({{ and }}) are str.format escapes: they come out as the
# literal braces of the JSON schema shown to the model.
PROMPT_TEMPLATE = """
You are simulating the behavior of a {persona_name} working in the {industry} sector.
They have {resources} resources and their legal team is: {legal_team}. Their familiarity with regulatory compliance is: {familiarity}.

Given the regulation text below, estimate the following:
1. Break down the compliance process into specific subtasks
2. Estimate hours required for each subtask (both first-year and ongoing)
3. Identify key dependencies between subtasks
4. Flag subtasks that may require specialized expertise

Regulation text:
\"\"\"{regulation_excerpt}\"\"\"

Structure your response as JSON with the following schema:
{{
  "persona": "{persona_name}",
  "first_year_total_hours": <number>,
  "annual_ongoing_hours": <number>,
  "subtasks": [
    {{
      "id": "task1",
      "name": "Task name",
      "description": "Brief description of what this task involves",
      "first_year_hours": <number>,
      "ongoing_annual_hours": <number>,
      "requires_expertise": ["legal", "technical", "hr", etc.],
      "dependencies": ["task3", "task5"],
      "risk_level": "high/medium/low",
      "notes": "Any specific considerations for this persona"
    }},
    ...
  ],
  "key_risks": [
    "Risk description 1",
    "Risk description 2"
  ],
  "confidence_level": "high/medium/low"
}}

Ensure the subtasks are meaningful, specific, and appropriate for the regulation requirements.
"""

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The ``get_text()`` output of each page, joined with newlines.
    """
    # Open the document as a context manager so it is closed even when text
    # extraction raises; the original left the document open.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)

def create_prompt(persona, excerpt, max_chars=3000):
    """Fill PROMPT_TEMPLATE with one persona's attributes and the regulation text.

    Args:
        persona: Dict with the keys "name", "industry", "resources",
            "legal_team" and "familiarity".
        excerpt: Regulation text to embed in the prompt.
        max_chars: Maximum number of characters of ``excerpt`` to include.
            Defaults to 3000 (the previously hard-coded limit). Pass ``None``
            to include the whole text.

    Returns:
        The formatted prompt string.

    Raises:
        KeyError: If ``persona`` lacks one of the required keys.
    """
    return PROMPT_TEMPLATE.format(
        persona_name=persona["name"],
        industry=persona["industry"],
        resources=persona["resources"],
        legal_team=persona["legal_team"],
        familiarity=persona["familiarity"],
        # Slicing with None keeps the full string, so None disables truncation.
        regulation_excerpt=excerpt[:max_chars],
    )

def query_openai(prompt, model="gpt-4"):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text to submit.
        model: Name of the chat model to query.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    # Written against the openai>=1.0.0 interface; the key is re-read from the
    # environment on every call.
    openai.api_key = os.environ.get("OPENAI_API_KEY")

    conversation = [{"role": "user", "content": prompt}]
    completion = openai.chat.completions.create(
        model=model,
        messages=conversation,
        temperature=0.3,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

def _parse_json_reply(raw_output):
    """Parse a model reply as JSON, tolerating text around the JSON object."""
    try:
        return json.loads(raw_output)
    except json.JSONDecodeError:
        # Chat models often wrap JSON in a Markdown code fence or add a
        # sentence before/after it; retry on the outermost {...} span.
        start = raw_output.find("{")
        end = raw_output.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(raw_output[start:end + 1])


def simulate_all_personas(reg_text):
    """Query the model once per persona and collect the parsed results.

    Args:
        reg_text: Regulation text passed to ``create_prompt`` for every persona.

    Returns:
        A list with one entry per persona in ``PERSONAS``, in order. Each
        entry is the parsed JSON reply, or
        ``{"persona": <name>, "error": <message>}`` when the query or the
        parsing failed.
    """
    results = []
    for persona in PERSONAS:
        prompt = create_prompt(persona, reg_text)
        try:
            raw_output = query_openai(prompt)
            results.append(_parse_json_reply(raw_output))
        except Exception as e:
            # Deliberately broad: one failed persona (API error, malformed
            # reply) is recorded and must not abort the remaining ones.
            results.append({"persona": persona["name"], "error": str(e)})
    return results

def generate_subtask_report(outputs, output_path="subtask_comparison.csv"):
    """Write a per-subtask, per-persona summary of estimated hours to CSV.

    Args:
        outputs: Iterable of per-persona result dicts. Entries without a
            "subtasks" list (for example error records) are skipped.
        output_path: Destination CSV file. Defaults to
            "subtask_comparison.csv", the previously hard-coded path.

    Returns:
        None. When no subtasks are found, no file is written.
    """
    hour_columns = ["first_year_hours", "ongoing_annual_hours"]

    # Flatten every persona's subtasks into one list of row dicts.
    all_subtasks = []
    for output in outputs:
        if not isinstance(output, dict):
            continue
        subtasks = output.get("subtasks")
        if not isinstance(subtasks, list):
            continue
        # Model replies are not guaranteed to echo the persona name back.
        persona = output.get("persona", "Unknown persona")
        all_subtasks.extend(
            {**task, "persona": persona}
            for task in subtasks
            if isinstance(task, dict)
        )

    subtask_df = pd.DataFrame(all_subtasks)
    if subtask_df.empty:
        return

    # The grouping and aggregation columns may be absent when the model
    # omitted them from every subtask; add them so groupby does not raise.
    for column in ["name", *hour_columns]:
        if column not in subtask_df.columns:
            subtask_df[column] = None

    # Hours may come back as strings ("40") or placeholders ("n/a"); coerce to
    # numbers so the sum adds values instead of concatenating or failing.
    for column in hour_columns:
        subtask_df[column] = pd.to_numeric(subtask_df[column], errors="coerce")

    subtask_summary = (
        subtask_df.groupby(["name", "persona"])[hour_columns].sum().reset_index()
    )

    subtask_summary.to_csv(output_path, index=False)
    print(f"Subtask comparison saved to {output_path}")

if __name__ == "__main__":
    # Regulation PDF to analyse; point this at your own file
    pdf_path = "data/A6453.pdf"
    reg_text = extract_text_from_pdf(pdf_path)
    outputs = simulate_all_personas(reg_text)

    # Flat CSV: one row per persona result (parsed reply or error record)
    df = pd.DataFrame(outputs)
    df.to_csv("compliance_estimates.csv", index=False)
    print("Estimates saved to compliance_estimates.csv")

    # Hours per subtask and persona
    generate_subtask_report(outputs)

    # Complete nested results, serialized first and then written in one go
    serialized = json.dumps(outputs, indent=2)
    with open("detailed_compliance_analysis.json", "w") as json_file:
        json_file.write(serialized)
    print("Detailed analysis saved to detailed_compliance_analysis.json")