In [5]:
import re
import json
import text_processing
import secrets 

from typing import List, Dict
import openai  # Assuming OpenAI API, but adaptable to other models

# Locations of the notebook's inputs: the raw target document, one annotated
# example document, and the privacy taxonomy JSON (replace with actual paths).
target_file_path = "input/Direct_Messaging_README.txt"
example_file_path = "annotations/Actual_Budget/Accounts_&_Transactions.txt"
taxonomy_file_path = "privacy_ontology.json"

# Run the annotated example through the project's text_processing helper.
example_data = text_processing.process_input(example_file_path)

### Building prompts


In [6]:
import json
from typing import Dict, List, Union
import tiktoken  # Library for token counting with OpenAI models
import prompt_templates

# Load the privacy ontology that constrains the categories the model may use.
ontology_path = "privacy_ontology.json"
privacy_ontology = prompt_templates.load_privacy_ontology(ontology_path)

# Use the first entry returned for the processed example file as the
# demonstration shown in the prompt.
example_file = text_processing.process_input(example_file_path)[0]

# Processed (annotated) view of the target file.
# NOTE(review): example_usage() indexes this same call with [0]; here the full
# return value is kept -- confirm which form downstream cells expect.
target_file_annotations = text_processing.process_input(target_file_path)

# Raw text of the target file. A context manager closes the handle
# deterministically, and the explicit encoding matches how example_usage()
# reads the same file.
with open(target_file_path, 'r', encoding='utf-8') as target_file:
    new_text_to_annotate = target_file.read()

# Generate the prompt from the example, the new text and the ontology.
prompt_example = prompt_templates.create_annotation_prompt(example_file, new_text_to_annotate, privacy_ontology)

# Print the prompt and its token count so they can be reviewed before any
# model is called.
print(prompt_example)
print(f"\nToken Count: {prompt_templates.count_tokens(prompt_example)} tokens")


You are a helpful assistant trained to annotate text files with metadata about behaviors, actions, data types, and purposes. For each section in a file, annotate the following:

1. Actions: Actions that are performed or expected in this section.
2. Data Types: Types of data referenced in this section. Data types may include specific subcategories.
3. Purposes: Purposes or intentions related to these actions and data types.

After providing your annotations, explain your rationale for these annotations. Place a <R> tag between your annotations and your rationale.

Use only the categories listed below when annotating the sections:

Actions:
sub, Collect, Process, Share

Data Types:
sub:
  anonymize:
  aggregate:
Patterns:
  Minimal-Information-Asymmetry, Awareness Feed, User data confinement pattern
Synonyms:
  Personally Identifiable Information, PII, Personal Information, Your Data, Your Information
Contact Data:
  Phone Number:
    Synonyms:
      mobile number
  Email address:
  User

### Annotating with LLMs 

In [12]:
import os
import csv
import json
import model_routing
import text_processing
import prompt_templates
from getpass import getpass

# Set API Key: prompt only when the variable is missing or empty.
# `os.environ[...]` raises KeyError for an unset variable and never yields
# None, so the lookup has to go through `.get()` for the prompt to be reachable.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('Enter your OPENAI API key: ')

def save_results_to_csv(models, prompts, target_annotations, model_responses, output_file):
    """
    Save the model outputs, prompts, and file annotations to a CSV file.

    One row is written per (prompt, model) pair. Every row has the columns
    'Prompt', 'Model', 'Target Annotations' and then one
    'Model Response N' column per run. The number of response columns is the
    largest run count found across all models and prompts; rows with fewer
    runs are padded with the literal text 'No Response'.

    Args:
        models (list): List of model names.
        prompts (list): List of prompt strings.
        target_annotations (dict): Processed annotations for the target file;
            JSON-encoded into every row.
        model_responses (dict): Maps each model name to a list with one entry
            per prompt (same order as ``prompts``); each entry is the sequence
            of responses from the repeated runs for that prompt.
        output_file (str): Path to the output CSV file.

    Raises:
        KeyError: If a model in ``models`` has no entry in ``model_responses``.
        IndexError: If a model has fewer per-prompt entries than ``prompts``.
    """
    # Work out the header before touching the output file, so that a bad
    # `model_responses` structure cannot truncate an existing CSV. `default=0`
    # keeps empty input (no models / no prompts) from raising ValueError; a
    # header-only file is written instead.
    num_runs = max(
        (len(runs) for model in models for runs in model_responses[model]),
        default=0,
    )
    fieldnames = ['Prompt', 'Model', 'Target Annotations'] + [
        f'Model Response {i + 1}' for i in range(num_runs)
    ]

    # The annotations are the same for every row, so serialise them once.
    target_json = json.dumps(target_annotations, ensure_ascii=False)

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for i, prompt in enumerate(prompts):
            for model in models:
                runs = model_responses[model][i]
                row = {
                    'Prompt': prompt,
                    'Model': model,
                    'Target Annotations': target_json,
                }

                for j in range(num_runs):
                    column = f'Model Response {j + 1}'
                    if j < len(runs):
                        # Real responses are always JSON-encoded, so a response
                        # whose text happens to be "No Response" stays
                        # distinguishable from the padding marker below.
                        row[column] = json.dumps(runs[j], ensure_ascii=False)
                    else:
                        row[column] = 'No Response'

                writer.writerow(row)


def example_usage(target_file_path, example_file_path, ontology_path, output_csv,
                  models=None, num_runs=2):
    """
    Run the annotation process and save results to a CSV file.

    Builds a single annotation prompt from the example file, the raw target
    text and the privacy ontology, sends it to each model `num_runs` times via
    `model_routing.run_multi_model_prompts`, and writes the responses next to
    the target file's processed annotations with `save_results_to_csv`.

    Errors raised while calling the models or writing the CSV are caught,
    reported with a full traceback, and not re-raised, so the notebook keeps
    running. Errors while loading inputs or building the prompt propagate.

    Args:
        target_file_path (str): Path to the target file for annotation.
        example_file_path (str): Path to an example processed file.
        ontology_path (str): Path to the privacy ontology file.
        output_csv (str): Path to save the CSV results.
        models (list, optional): Model identifiers to query. Defaults to
            ['openai:gpt-4o-mini'] when omitted.
        num_runs (int, optional): Number of runs per model. Defaults to 2.
    """
    # Build a fresh list on every call so no mutable default is shared.
    test_models = list(models) if models is not None else ['openai:gpt-4o-mini']

    # Load the privacy ontology
    privacy_ontology = prompt_templates.load_privacy_ontology(ontology_path)

    # Process target file and example file; only the first element of each
    # result is used.
    target_file_annotations = text_processing.process_input(target_file_path)[0]
    example_file_annotations = text_processing.process_input(example_file_path)[0]

    # Read the raw text of the target file
    with open(target_file_path, 'r', encoding='utf-8') as file:
        new_text_to_annotate = file.read()

    # Generate a prompt
    prompt_example = prompt_templates.create_annotation_prompt(
        example_file_annotations,
        new_text_to_annotate,
        privacy_ontology
    )

    # List of prompts to test
    test_prompts = [prompt_example]

    try:
        # Get model responses
        model_responses = model_routing.run_multi_model_prompts(
            models=test_models,
            prompts=test_prompts,
            num_runs=num_runs
        )

        # Save results to CSV with added annotation data
        save_results_to_csv(
            models=test_models,
            prompts=test_prompts,
            target_annotations=target_file_annotations,
            model_responses=model_responses,
            output_file=output_csv
        )

    except Exception as e:
        # Deliberate best-effort: report the failure in the cell output
        # instead of aborting the notebook run.
        print(f"An error occurred: {e}")
        import traceback
        traceback.print_exc()
# Inputs for this run, defined here so the cell does not rely on variables
# left over from earlier cells.
taxonomy_file_path = 'privacy_ontology.json'  # Replace with actual path
example_file_path = 'annotations/Actual_Budget/Accounts_&_Transactions.txt'
target_file_path = 'input/Direct_Messaging_README.txt'

output_csv = 'LLMAnnotation_12_5_24.csv'

# Run the updated example. The ontology path passed is the one defined above;
# the previous `ontology_path` name only existed if the prompt-building cell
# had already been executed in the same kernel.
example_usage(target_file_path, example_file_path, taxonomy_file_path, output_csv)


Model: openai:gpt-4o-mini - Completed a run
Model: openai:gpt-4o-mini - Completed a run


In [8]:
# NOTE: nothing in this cell executes. The whole cell is a single string
# expression built from three adjacent triple-quoted literals (the `''''''`
# lines close one literal and immediately open the next), so the code inside
# is inert text and the notebook merely echoes the string as the cell output.
# It looks like an earlier draft of the save_results_to_csv / example_usage
# cell -- TODO(review): confirm it is superseded and delete the cell.
'''import model_routing
import secrets_file
import os
import csv
from getpass import getpass
import text_processing  # Assuming this includes your text annotation processing functions
import prompt_templates

# Set API Key
os.environ['OPENAI_API_KEY'] = getpass('Enter your OPENAI API key: ')

import json

def save_results_to_csv(models, prompts, target_annotations, model_responses, output_file):
    """
    Save the model outputs, prompts, and file annotations to a CSV file.

    Args:
        models (list): List of model names.
        prompts (list): List of prompt strings.
        target_annotations (dict): Processed annotations for the target file.
        model_responses (dict): Dictionary of model responses for each prompt.
        output_file (str): Path to the output CSV file.
    """
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Prompt', 'Model', 'Target Annotations'] + [f'Model Response {i + 1}' for i in range(len(models))]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for i, prompt in enumerate(prompts):
            for model in models:
                row = {
                    'Prompt': prompt,
                    'Model': model,
                    'Target Annotations': json.dumps(target_annotations),
                }
                row.update({f'Model Response {j + 1}': model_responses[model][i][j] for j in range(len(model_responses[model][i]))})
                writer.writerow(row)

def example_usage(target_file_path, example_file_path, ontology_path, output_file):
    """
    Run the annotation process and save results to a JSON file.
    
    Args:
        target_file_path (str): Path to the target file for annotation.
        example_file_path (str): Path to an example processed file.
        ontology_path (str): Path to the privacy ontology file.
        output_file (str): Path to save the results JSON file.
    """
    # Load the privacy ontology
    privacy_ontology = prompt_templates.load_privacy_ontology(ontology_path)

    # Process target file and example file
    target_file_annotations = text_processing.process_input(target_file_path)[0]
    example_file_annotations = text_processing.process_input(example_file_path)[0]

    # Read the raw text of the target file
    with open(target_file_path, 'r', encoding='utf-8') as file:
        new_text_to_annotate = file.read()

    # Generate a prompt
    prompt_example = prompt_templates.create_annotation_prompt(
        example_file_annotations, 
        new_text_to_annotate, 
        privacy_ontology
    )

    # List of models to test
    test_models = [
        'openai:gpt-4o-mini',
    ]

    # List of prompts to test
    test_prompts = [prompt_example]

    try:
        model_responses = model_routing.run_multi_model_prompts(
            models=test_models, 
            prompts=test_prompts, 
            num_runs=2,  # Number of runs per model
            output_file=output_csv  # We'll handle the output directly
        )

        # Include the processed target file annotations
        results = {
            'target_annotations': target_file_annotations,
            'model_responses': model_responses
        }

        # Save the results to a JSON file
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)

        print(f"Results saved to: {output_file}")

    except Exception as e:
        print(f"An error occurred: {e}")
        import traceback
        traceback.print_exc()

''''''#Define paths
target_file_path = 'path/to/target_file.txt'
example_file_path = 'path/to/example_file.txt'
ontology_path = 'privacy_ontology.json'
output_csv = 'model_comparison.csv'
''''''
output_csv = 'LLMAnnotation_12_5_24.csv'


# Run the updated example
example_usage(target_file_path, example_file_path, ontology_path, output_csv)
'''

'import model_routing\nimport secrets_file\nimport os\nimport csv\nfrom getpass import getpass\nimport text_processing  # Assuming this includes your text annotation processing functions\nimport prompt_templates\n\n# Set API Key\nos.environ[\'OPENAI_API_KEY\'] = getpass(\'Enter your OPENAI API key: \')\n\nimport json\n\ndef save_results_to_csv(models, prompts, target_annotations, model_responses, output_file):\n    """\n    Save the model outputs, prompts, and file annotations to a CSV file.\n\n    Args:\n        models (list): List of model names.\n        prompts (list): List of prompt strings.\n        target_annotations (dict): Processed annotations for the target file.\n        model_responses (dict): Dictionary of model responses for each prompt.\n        output_file (str): Path to the output CSV file.\n    """\n    with open(output_file, \'w\', newline=\'\', encoding=\'utf-8\') as csvfile:\n        fieldnames = [\'Prompt\', \'Model\', \'Target Annotations\'] + [f\'Model Response