In [5]:
import re
import json
import text_processing
import secrets 

from typing import List, Dict
import openai  # Assuming OpenAI API, but adaptable to other models

# Input locations shared by the cells below: the privacy ontology (taxonomy)
# JSON and one annotated example document.

taxonomy_file_path = 'privacy_ontology.json'  # Replace with actual path
document_file_path = 'annotations/Actual_Budget/Accounts_&_Transactions.txt'

# Parse the annotated example document with the project-local text_processing
# helper. Later cells index the same call's result with [0] and iterate over it,
# so it is presumably a list of per-file dicts -- TODO confirm against
# text_processing.process_input.
example_data = text_processing.process_input(document_file_path)

### Defining a prompt template


In [6]:
import json
from typing import Dict, List, Union
import tiktoken  # Library for token counting with OpenAI models


# Calculate token count for the prompt
def count_tokens(prompt: str, model: str = "gpt-3.5-turbo", tokenizer=None) -> int:
    """Calculates the token count for a given prompt string.

    Args:
        prompt: Text whose tokens are counted.
        model: OpenAI model name used to pick the tiktoken encoding. Ignored
            when ``tokenizer`` is supplied.
        tokenizer: Optional tokenizer object exposing ``encode(text)`` and
            returning a sequence of tokens (for example a Hugging Face
            tokenizer). When given, it is used instead of tiktoken so that
            non-OpenAI models can be measured with their own vocabulary.

    Returns:
        The number of tokens ``prompt`` encodes to.
    """
    # A caller-supplied tokenizer takes precedence; this path never touches tiktoken.
    if tokenizer is not None:
        return len(tokenizer.encode(prompt))

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken has no mapping for this model name; fall back to a general
        # encoding so the count is an estimate rather than a crash.
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(prompt))

# Load the ontology from a JSON file
def load_privacy_ontology(ontology_path: str) -> Dict[str, Union[List[str], Dict]]:
    """Loads categories for actions, data types, and purposes from a JSON ontology file.

    Args:
        ontology_path: Path to the ontology JSON file.

    Returns:
        The parsed JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Read explicitly as UTF-8 so non-ASCII category names load the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(ontology_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Recursive function to display data types with subcategories
def format_data_types(data_types: Union[Dict, List], level: int = 0) -> str:
    """Formats data types and their subcategories recursively for prompt clarity.

    Args:
        data_types: Either a list of leaf items or a dict mapping a category
            name to its (possibly nested) subcategories.
        level: Current nesting depth; each level indents by two spaces.

    Returns:
        A newline-terminated text block. A list becomes one comma-separated
        line; a dict becomes a ``category:`` line per key followed by the
        formatted children one level deeper. Any other value yields ``""``.
    """
    indent = "  " * level  # Indentation for hierarchy

    if isinstance(data_types, list):
        # Base case. Items are stringified so that non-string JSON scalars
        # (numbers, booleans, null) do not make str.join raise TypeError.
        return indent + ", ".join(str(item) for item in data_types) + "\n"

    if isinstance(data_types, dict):
        # Recursive case: one header line per category, then its children.
        pieces = []
        for category, subcategories in data_types.items():
            pieces.append(f"{indent}{category}:\n")
            pieces.append(format_data_types(subcategories, level + 1))
        return "".join(pieces)

    # Values that are neither list nor dict contribute no text.
    return ""

# Create the annotation prompt using the ontology
def create_annotation_prompt(example_file: Dict, target_text: str, ontology: Dict[str, Union[List[str], Dict]]) -> str:
    """Builds an annotation prompt from the ontology, one annotated example, and a target text.

    The prompt consists of, in order: task instructions, the allowed
    categories taken from ``ontology``, the example file with each section's
    annotations and rationale, and finally ``target_text`` with a closing
    instruction.

    Args:
        example_file: Dict with keys ``file_name``, ``full_cleaned_text`` and
            ``sections``. Each section needs ``section_text_with_tags``; its
            ``metadata`` (``actions`` / ``data_types`` / ``purposes``) and
            ``rationale`` are optional.
        target_text: The new text the model should annotate.
        ontology: Parsed ontology. Reads ``"Actions"``, ``"Data Types"`` and
            ``"Purpose"`` (``"Purposes"`` is accepted as a fallback spelling).

    Returns:
        The complete prompt string.

    Raises:
        KeyError: If ``example_file`` or a section lacks a required key.
    """
    def _joined(values) -> str:
        # Missing/None label collections render as an empty string; labels are
        # stringified so a non-string entry cannot break the join.
        return ", ".join(str(value) for value in (values or []))

    # Start building the prompt with an instructional message
    instructions = (
        "You are a helpful assistant trained to annotate text files with metadata about behaviors, "
        "actions, data types, and purposes. For each section in a file, annotate the following:\n\n"
        "1. Actions: Actions that are performed or expected in this section.\n"
        "2. Data Types: Types of data referenced in this section. Data types may include specific subcategories.\n"
        "3. Purposes: Purposes or intentions related to these actions and data types.\n\n"
        "After providing your annotations, explain your rationale for these annotations. "
        "Place a <R> tag between your annotations and your rationale.\n\n"
    )

    # The prompt header says "Purposes" while the original lookup used the
    # singular key; accept either so a plural-keyed ontology is not rendered
    # as an empty list.
    purposes = ontology.get("Purpose") or ontology.get("Purposes") or []

    parts = [
        instructions,
        # Add guidance from ontology
        "Use only the categories listed below when annotating the sections:\n\n",
        "Actions:\n" + _joined(ontology.get("Actions")) + "\n\n",
        "Data Types:\n" + format_data_types(ontology.get("Data Types", {})) + "\n",
        "Purposes:\n" + _joined(purposes) + "\n\n",
        # Add an example from the provided file
        "Here is an example of annotated sections:\n\n",
        f"--- File: {example_file['file_name']} ---\n",
        f"Full Cleaned Text:\n{example_file['full_cleaned_text']}\n\n",
    ]

    for section in example_file["sections"]:
        # Tolerate sections without metadata or with only some label kinds.
        metadata = section.get("metadata") or {}
        parts.append(f"Section Text:\n{section['section_text_with_tags']}\n")
        parts.append(f"Actions: {_joined(metadata.get('actions'))}\n")
        parts.append(f"Data Types: {_joined(metadata.get('data_types'))}\n")
        parts.append(f"Purposes: {_joined(metadata.get('purposes'))}\n")
        parts.append("<R>\n")
        parts.append(f"Rationale: {section.get('rationale', 'Explain your reasoning here.')}\n\n")

    # Add target text for annotation
    parts.append("--- New File Text ---\n")
    parts.append(f"{target_text}\n\n")
    parts.append(
        "Annotate the sections of the above text with actions, data types, and purposes as demonstrated, "
        "using only the categories from the list provided. For each section, provide your annotations "
        "followed by your rationale, and place a <R> tag between your annotations and your rationale.\n"
    )

    return "".join(parts)

# Demonstration: build one prompt from the ontology, a single example file and
# a placeholder target text, then print it together with its token count.

# Load the privacy ontology
ontology_path = "privacy_ontology.json"
privacy_ontology = load_privacy_ontology(ontology_path)

# Take the first entry returned for the example document.
# NOTE(review): this parses document_file_path again; the setup cell already
# stored the same call's result in example_data.
example_file = text_processing.process_input(document_file_path)[0]

# Placeholder target text to be annotated (for demonstration only)
new_text_to_annotate = """
This is an example section text for a new file. The assistant will identify and annotate actions, 
data types, and purposes within this text.
"""

# Generate the prompt
prompt_example = create_annotation_prompt(example_file, new_text_to_annotate, privacy_ontology)

# Print the prompt for review; the token count uses count_tokens' default model.
print(prompt_example)
print(f"\nToken Count: {count_tokens(prompt_example)} tokens")


You are a helpful assistant trained to annotate text files with metadata about behaviors, actions, data types, and purposes. For each section in a file, annotate the following:

1. Actions: Actions that are performed or expected in this section.
2. Data Types: Types of data referenced in this section. Data types may include specific subcategories.
3. Purposes: Purposes or intentions related to these actions and data types.

After providing your annotations, explain your rationale for these annotations. Place a <R> tag between your annotations and your rationale.

Use only the categories listed below when annotating the sections:

Actions:
sub, Collect, Process, Share

Data Types:
sub:
  anonymize:
  aggregate:
Personal Data:
  Patterns:
    Minimal-Information-Asymmetry, Awareness Feed, User data confinement pattern
  Synonyms:
    Personally Identifiable Information, PII, Personal Information, Your Data, Your Information
  Contact Data:
    Phone Number:
      Synonyms:
        mobile

### Annotating with LLMs 

In [8]:
import openai
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
from typing import Dict, List, Union
import tiktoken  # Library for token counting with OpenAI models
from secrets_file import openai_api_key
import csv
import torch

def send_prompt(prompt: str, model_name: str, use_openai: bool = True, **kwargs):
    """Sends ``prompt`` to an LLM and returns the generated text.

    Args:
        prompt: Full prompt text.
        model_name: OpenAI model name, or a Hugging Face model id when
            ``use_openai`` is False.
        use_openai: True to call the OpenAI chat completion endpoint, False to
            load and run a Hugging Face causal LM locally.
        **kwargs: Optional ``temperature`` (default 0 for OpenAI, 0.7 for
            Hugging Face) and ``max_tokens`` (default 1500 for OpenAI, 150 new
            tokens for Hugging Face).

    Returns:
        The model's completion with surrounding whitespace stripped.

    Notes:
        The OpenAI branch uses the ``openai.ChatCompletion`` interface, which
        only exists in pre-1.0 releases of the ``openai`` package.
        The Hugging Face branch reloads tokenizer and model on every call.
    """
    if use_openai:
        # Set OpenAI API key
        openai.api_key = openai_api_key
        response = openai.ChatCompletion.create(
            model=model_name,
            messages=[{"role": "system", "content": prompt}],
            temperature=kwargs.get('temperature', 0),
            max_tokens=kwargs.get('max_tokens', 1500)
        )
        # Guard against a missing/None content field so .strip() cannot raise.
        content = response.choices[0].message.content
        return (content or "").strip()

    # Hugging Face path (AutoTokenizer / AutoModelForCausalLM come from the
    # cell-level import).
    # Load tokenizer with specific settings
    tokenizer = AutoTokenizer.from_pretrained(model_name, revision='refs/pr/9', from_slow=True, legacy=False)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    # Load model
    model = AutoModelForCausalLM.from_pretrained(model_name, revision='refs/pr/9')
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Encode prompt
    inputs = tokenizer(prompt, return_tensors="pt", padding=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    input_length = inputs['input_ids'].shape[1]

    # temperature only has an effect when sampling is enabled; a temperature
    # of 0 (or None) means deterministic greedy decoding.
    temperature = kwargs.get('temperature', 0.7)
    do_sample = temperature is not None and temperature > 0
    generation_args = {
        # Same budget as the former max_length = input_length + max_tokens.
        "max_new_tokens": kwargs.get('max_tokens', 150),
        "do_sample": do_sample,
        "num_return_sequences": 1,
        "no_repeat_ngram_size": 2,
        "pad_token_id": tokenizer.eos_token_id,
        # early_stopping is a beam-search option and is omitted: no beams are used.
    }
    if do_sample:
        generation_args["temperature"] = temperature

    # Generate output
    outputs = model.generate(**inputs, **generation_args)

    # Decode only the newly generated ids. Slicing the decoded string by
    # len(prompt) is unreliable because decoding need not reproduce the prompt
    # character-for-character.
    generated_ids = outputs[0][input_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# Function to create a few-shot prompt with multiple example files and annotate a new target file
def annotate_with_few_shot_prompt(example_directory: str, target_file_path: str, ontology_path: str, model_name: str = "gpt-4", use_openai: bool = True):
    """Builds a prompt from annotated examples, sends it to an LLM, and logs the result.

    Args:
        example_directory: Path handed to ``text_processing.process_input`` to
            obtain the annotated example files.
        target_file_path: Text file whose content should be annotated.
        ontology_path: Path to the ontology JSON.
        model_name: OpenAI model name or Hugging Face model id.
        use_openai: Selects the backend used by ``send_prompt`` and the
            tokenizer used for the printed token count.

    Side effects:
        Prints the prompt, its token count and the model output, and appends a
        ``[prompt, output]`` row to ``llm_output.csv`` in the working directory.

    Raises:
        ValueError: If no example files are found.
    """
    # Load the privacy ontology
    ontology = load_privacy_ontology(ontology_path)

    # Load all example files
    example_files = text_processing.process_input(example_directory)
    if not example_files:
        # Without examples the prompt would be empty; fail before spending an LLM call.
        raise ValueError(f"No example files found in {example_directory!r}")

    # Read the target as UTF-8, matching the encoding used for the CSV log below.
    with open(target_file_path, 'r', encoding='utf-8') as target_file:
        target_text = target_file.read()

    # NOTE(review): create_annotation_prompt emits the instructions, the
    # ontology and the target text on every call, so with N examples all three
    # are repeated N times in the final prompt.
    prompt = "".join(
        create_annotation_prompt(example_file, target_text, ontology) + "\n\n"
        for example_file in example_files
    )

    # Print prompt and token count for review
    print(prompt)
    if use_openai:
        try:
            # Count with the encoding of the model that will actually be called.
            token_count = count_tokens(prompt, model_name)
        except KeyError:
            # Model name unknown to tiktoken: fall back to count_tokens' default model.
            token_count = count_tokens(prompt)
    else:
        # Count with the Hugging Face model's own tokenizer.
        tokenizer = AutoTokenizer.from_pretrained(model_name, revision='refs/pr/9', from_slow=True, legacy=False)
        token_count = len(tokenizer.encode(prompt))
    print(f"\nToken Count: {token_count} tokens")

    # Send the prompt to the chosen LLM
    annotated_data = send_prompt(
        prompt=prompt,
        model_name=model_name,
        use_openai=use_openai,
        temperature=0,
        max_tokens=1500
    )

    print("\nAnnotations from LLM:\n", annotated_data)

    # Append the prompt and response to the CSV log
    with open('llm_output.csv', mode='a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([prompt, annotated_data])

example_directory = document_file_path  # Reuses the single example document path from the setup cell (a file, despite the name)
target_file_path = "input/activities.txt"       # New file to be annotated
ontology_path = "privacy_ontology.json"         # Path to the ontology JSON

# Choose the model and specify whether to use OpenAI or Hugging Face
model_name = "gpt-4o"      # OpenAI model name passed through to send_prompt
use_openai = True         # Set to False to use a Hugging Face model

# For Hugging Face models, set use_openai to False and specify the model name
# model_name = "Mamba2"     # Example Hugging Face model


# Annotate the target file: prints the prompt, token count and model output,
# and appends a row to llm_output.csv.
annotate_with_few_shot_prompt(example_directory, target_file_path, ontology_path, model_name, use_openai)


You are a helpful assistant trained to annotate text files with metadata about behaviors, actions, data types, and purposes. For each section in a file, annotate the following:

1. Actions: Actions that are performed or expected in this section.
2. Data Types: Types of data referenced in this section. Data types may include specific subcategories.
3. Purposes: Purposes or intentions related to these actions and data types.

After providing your annotations, explain your rationale for these annotations. Place a <R> tag between your annotations and your rationale.

Use only the categories listed below when annotating the sections:

Actions:
sub, Collect, Process, Share

Data Types:
sub:
  anonymize:
  aggregate:
Personal Data:
  Patterns:
    Minimal-Information-Asymmetry, Awareness Feed, User data confinement pattern
  Synonyms:
    Personally Identifiable Information, PII, Personal Information, Your Data, Your Information
  Contact Data:
    Phone Number:
      Synonyms:
        mobile