## Processing input privacy policies for story prompting 

: Take in input privacy policy :

-> Output sections, with most similar section and processed behaviors and stories within that similar section formatted for the following prompt. 


In [2]:
# Import the process_privacy_policy function from your script
from processing.prompt_processing import process_privacy_policy

# Specify the file path and section size
file_path = "input/com.aisecurity privacy_policy.txt"

max_section_size = 1000

# Process input policy in sections
sections, section_info = process_privacy_policy(file_path, max_section_size)

print(sections)
print(len(section_info))



Privacy Policy for AI Security - FreePrivacyPolicy.com
Privacy Policy for AI Security
Privacy Policy
Last updated: May 17, 2023
This Privacy Policy describes Our policies and procedures on the collection, use and disclosure of Your information when You use the Service and tells You about Your privacy rights and how the law protects You. We use Your Personal data to provide and improve the Service. By using the Service, You agree to the collection and use of information in accordance with this Privacy Policy. This Privacy Policy has been created with the help of the
Free Privacy Policy Generator . Interpretation and Definitions
Interpretation
The words of which the initial letter is capitalized have meanings defined under the following conditions. The following definitions shall have the same meaning regardless of whether they appear in singular or in plural. Definitions
For the purposes of this Privacy Policy:
Account
means a unique account created for You to access our Service or part

## If we want to save all the input texts to be processed for story prompting (i.e. Multiple policies at once)

In [7]:
import json
from processing.prompt_processing import SectionInformation


import os

def generate_section_info(directory, split_char="_"):
    section_info_mapping = {}
    for file_name in os.listdir(directory):
        full_path = os.path.join(directory, file_name)
        if os.path.isfile(full_path) and file_name.endswith(".txt"):
            # Process the privacy policy file
            sections, section_info = process_privacy_policy(full_path, max_section_size)
            section_info_mapping[file_name] = {
                'sections': sections,
                'section_info': section_info
            }
    return section_info_mapping

# Define the input directory and max section size
input_directory = "input/policies_descriptions"
max_section_size = 1000

# Call the function to generate section info for each policy
section_info_mapping = generate_section_info(input_directory)

# Print the section info mapping
for file_name, info in section_info_mapping.items():
    print(f"File: {file_name}")
    print(f"Number of sections: {len(info['sections'])}")
    print(f"Section info: {info['section_info']}")

print(section_info_mapping)

def serialize_section_info(section_info):
    if isinstance(section_info, SectionInformation):
        return {
            'section_text': section_info.section_text,
            'cleaned_annotated_text': section_info.cleaned_annotated_text,
            'details': section_info.details,
            'privacy_stories': section_info.privacy_stories,
            'privacy_behaviors': section_info.privacy_behaviors
        }
    raise TypeError(f"Object of type '{type(section_info)}' is not JSON serializable")

output_directory = "input/policies_descriptions"

output_file_path = os.path.join(output_directory, "section_info_mapping.json")

# Write the section_info_mapping object to the output file as JSON
with open(output_file_path, "w") as output_file:
    json.dump(section_info_mapping, output_file, indent=4, default=serialize_section_info)

print(f"Section info mapping saved to {output_file_path}")

Privacy Policy
Skip to content
Search site
Menu
Search Informa
Home
About Us
What We Do
Where We Are
History
Strategy
Market Trends
Purpose & Principles
Executive Management
Board of Directors
Policies
Divisions
Informa Connect
Informa Markets
Informa Tech
Taylor & Francis
Global Support
Investors
Investor Days
Investment Case
Shareholder Centre
Annual Reports
Results & Presentations
Press Releases & News
Financial Calendar
Financial Summary
Debt Summary
Tax Approach
Advisors & Analysts
Analyst Research
Pension Schemes
Corporate Governance
Investor Relations Contacts
Sustainability
FasterForward
Climate Impact
Sustainable Products
Community Impact
Recognition
Sustainability Reports
Talent
Life at Informa
Diversity & Inclusion
Vacancies
Graduates
Media
Media Contacts
News
Contact
Global Contacts
Office Locator
Home
Privacy Policy
EU Appendix
UAE Appendix
Singapore Appendix
Japan Appendix
South Korea Appendix
China Appendix
Canada Appendix
Hong Kong Appendix
US Appendix
Brazilian Appendi

KeyboardInterrupt: 

### Building prompts using the annotated policy information 

In [3]:
# print(sections_behaviors[0].comparison_sections[1].privacy_behaviors)

from processing.prompt_processing import ontology_to_string, load_ontology, ontology_file

privacy_taxonomy = load_ontology(ontology_file)

privacy_behavior_taxonomy = ontology_to_string(privacy_taxonomy) 

behaviors_strings = simple_stories_strings = stories_strings = []

build_string = (
        f"User: As an annotator of privacy policies, your task involves identifying and categorizing privacy-related behaviors "
        f"in policy texts. Each behavior should be annotated with the most specific label possible, based on the provided privacy behavior taxonomy. "
        f"This taxonomy serves as your reference for classifying privacy behaviors mentioned in the policies.\n"
        f"Privacy Behavior Taxonomy:\n{privacy_behavior_taxonomy}\n"
)

print(len(sections))
for i, section in enumerate(section_info):
    section_story_information = sections[i]
    section_behavior_information = section_info[i]

    # Iterate through all comparison sections safely
    for j, nearest_match in enumerate(section_story_information.comparison_sections):
        # Ensure there's a corresponding match in behaviors before accessing
        if j < len(section_behavior_information.comparison_sections):
            nearest_match_behaviors = section_behavior_information.comparison_sections[j]

    # print(nearest_match.cleaned_annotated_text)
    # Constructing the prompts
    # sections.append(section_story_information.cleaned_annotated_text)
    # print(nearest_match.privacy_stories)
    # print(nearest_match_behaviors.privacy_behaviors)

    behaviors_strings.append(
        f"User: Privacy policy: {nearest_match.cleaned_annotated_text} "
        f"User: Write the privacy behaviors found within this text\n"
        f"System: Privacy Behaviors: {', '.join(nearest_match_behaviors.privacy_behaviors)}\n"
        f"User: Privacy Policy:\n{section_story_information.cleaned_annotated_text}\n"
        f"User: Write the privacy behaviors found within this text and those which are connected "
        f"reflect a critical understanding of the text. Be mindful of avoiding assumptions or hallucinations that are not supported by the text.\n"
        f"System: Privacy Behaviors:"
    )
    simple_stories_strings.append(
        f"User: This taxonomy serves as your reference for understanding and classifying various privacy practices mentioned in the policies.\n"
        f"Privacy Behavior Taxonomy:\n{privacy_behavior_taxonomy}\n"
        f"User: Privacy Policy:\n{section_story_information.cleaned_annotated_text}\n"
        f"User: Write the privacy behaviors found within this text and those which are connected "
        f"reflect a critical understanding of the text. Be mindful of avoiding assumptions or hallucinations that are not supported by the text.\n"
        f"System: Privacy Behaviors:"
    )
    stories_strings.append(
        f"User: This taxonomy serves as your reference for understanding and classifying various privacy practices mentioned in the policies.\n"
        f"Privacy Behavior Taxonomy:\n{privacy_behavior_taxonomy}\n"
        f"User: Privacy policy: {nearest_match.cleaned_annotated_text} "
        f"User: Write the privacy behaviors found within this text\n"
        f"System: Privacy Behaviors: {', '.join(nearest_match_behaviors.privacy_behaviors)}\n"
        f"User: Write the privacy stories found within this text, connecting related actions, data types and purposes together in the format of we (action) (data type) for the purpose of (purpose)\n"
        f"System: Privacy stories: {', '.join(nearest_match.privacy_stories)}\n"
        f"User: Privacy Policy:\n{section_story_information.cleaned_annotated_text}\n"
        f"User: Write the privacy behaviors found within this text and those which are connected "
        f"reflect a critical understanding of the text. Be mindful of avoiding assumptions or hallucinations that are not supported by the text.\n"
        f"System: Privacy Behaviors:"
    )
    system_string_1 = (
    "This taxonomy serves as your reference for understanding and classifying various privacy practices mentioned in the policies.\n\n"
    "Privacy Behavior Taxonomy:\n" + privacy_behavior_taxonomy + "\n"
    "Privacy policy:\n" + nearest_match.cleaned_annotated_text + "\n"
    "Privacy Stories including privacy behaviors from the privacy behavior taxonomy:\n" + "\n".join(nearest_match.privacy_stories) + "\n"
    "Privacy Policy:\n" + section_story_information.cleaned_annotated_text + "\n"
    "write the privacy behaviors found within the privacy behavior taxonomy from the the input section text "
    "reflect a critical understanding of the text. Be mindful of avoiding assumptions or hallucinations that are not supported by the text."
    "\nPrivacy Behaviors:"
    )



''' To copy prompt to clipboad
import pyperclip 
behaviors = ''
for behavior in behaviors_strings:
    behaviors += behaviors
pyperclip.copy(behaviors)
print(behaviors)'''

# print(behaviors_strings)

# Build the template
# annotator_template = system_string_1

level_1_prompt = simple_stories_strings
level_2_prompt = behaviors_strings
level_3_prompt = stories_strings
for prompt in stories_strings: print('\n','*-'*30,'\n',prompt)

2

 *-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*- 
 User: Privacy policy: [#ns Privacy Policy for AI Security - FreePrivacyPolicy.com
Privacy Policy for AI Security
Privacy Policy
Last updated: May 17, 2023
This Privacy Policy describes Our policies and procedures on the collection, use and disclosure of Your information when You use the Service and tells You about Your privacy rights and how the law protects You.] {#as We  use Your  Personal data to (functionality) provide and improve the Service. /} [#ns By using the Service, You agree to the collection and use of information in accordance with this Privacy Policy. This Privacy Policy has been created with the help of the
Free Privacy Policy Generator . Interpretation and Definitions
Interpretation
The words of which the initial letter is capitalized have meanings defined under the following conditions. The following definitions shall have the same meaning regardless of whether they appear in singular or in plural. Def

### Load LLMs from apis 

In [4]:
import json
from processing.prompt_processing import ontology_to_string, load_ontology, ontology_file

# Load the privacy taxonomy
privacy_taxonomy = load_ontology(ontology_file)
privacy_behavior_taxonomy = ontology_to_string(privacy_taxonomy)

# Define the output directory
output_directory = "input/United States"

# Define the output file path
output_file_path = os.path.join(output_directory, "section_info_mapping.json")

# Load the section_info_mapping from the JSON file
with open(output_file_path, "r") as input_file:
    section_info_mapping = json.load(input_file)

# Function to generate prompts for a section
def generate_prompts(section_story_info, section_behavior_info):
    behaviors_strings = []
    simple_stories_strings = []
    stories_strings = []

    nearest_match = section_story_info
    nearest_match_behaviors = section_behavior_info

    behaviors_strings.append(
        f"User: Privacy policy: {nearest_match} "
        f"User: Write the privacy behaviors found within this text\n"
        f"System: Privacy Behaviors: {', '.join(nearest_match_behaviors)}\n"
        f"User: Privacy Policy:\n{section_story_info}\n"
        f"User: Write the privacy behaviors found within this text and those which are connected "
        f"reflect a critical understanding of the text. Be mindful of avoiding assumptions or hallucinations that are not supported by the text.\n"
        f"System: Privacy Behaviors:"
    )
    simple_stories_strings.append(
        f"User: This taxonomy serves as your reference for understanding and classifying various privacy practices mentioned in the policies.\n"
        f"Privacy Behavior Taxonomy:\n{privacy_behavior_taxonomy}\n"
        f"User: Privacy Policy:\n{section_story_info}\n"
        f"User: Write the privacy behaviors found within this text and those which are connected "
        f"reflect a critical understanding of the text. Be mindful of avoiding assumptions or hallucinations that are not supported by the text.\n"
        f"System: Privacy Behaviors:"
    )
    stories_strings.append(
        f"User: This taxonomy serves as your reference for understanding and classifying various privacy practices mentioned in the policies.\n"
        f"Privacy Behavior Taxonomy:\n{privacy_behavior_taxonomy}\n"
        f"User: Privacy policy: {nearest_match} "
        f"User: Write the privacy behaviors found within this text\n"
        f"System: Privacy Behaviors: {', '.join(nearest_match_behaviors)}\n"
        f"User: Write the privacy stories found within this text, connecting related actions, data types and purposes together in the format of we (action) (data type) for the purpose of (purpose)\n"
        f"System: Privacy stories: {', '.join([])}\n"  # Assuming no privacy stories
        f"User: Privacy Policy:\n{section_story_info}\n"
        f"User: Write the privacy behaviors found within this text and those which are connected "
        f"reflect a critical understanding of the text. Be mindful of avoiding assumptions or hallucinations that are not supported by the text.\n"
        f"System: Privacy Behaviors:"
    )

    return behaviors_strings, simple_stories_strings, stories_strings

# Iterate through the section_info_mapping and generate prompts
for file_name, info_dict in section_info_mapping.items():
    info_dict['behaviors_strings'] = []
    info_dict['simple_stories_strings'] = []
    info_dict['stories_strings'] = []

    for section_story_info, section_behavior_info in zip(info_dict['sections'], info_dict['section_info']):
        behaviors_strings, simple_stories_strings, stories_strings = generate_prompts(section_story_info, section_behavior_info)

        info_dict['behaviors_strings'].append(behaviors_strings)
        info_dict['simple_stories_strings'].append(simple_stories_strings)
        info_dict['stories_strings'].append(stories_strings)

# Write the updated section_info_mapping to the output file
with open(output_file_path, "w") as output_file:
    json.dump(section_info_mapping, output_file, indent=4)

print(f"Section info mapping with prompts saved to {output_file_path}")

NameError: name 'os' is not defined

In [6]:
import os
from utils.secrets import DEEPINFRA_API_TOKEN, OPENAI_API_KEY

#First add utils/secrets.py folder with this info 

os.environ["DEEPINFRA_API_TOKEN"] = DEEPINFRA_API_TOKEN
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY


from langchain_community.llms import DeepInfra, Ollama
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
# from langchain_openai import OpenAI

mixtral = DeepInfra(model_id="mistralai/Mixtral-8x7B-Instruct-v0.1")
mixtral.model_kwargs = {
    "temperature": 0.7,
    "repetition_penalty": 1.2,
    "max_new_tokens": 250,
    "top_p": 0.9,
}

# gpt4 = OpenAI(model_name="gpt-3.5-turbo-instruct",openai_api_key="OPENAI_API_TOKEN")

#For running locally with ollama 

Ollama_phi3 = Ollama(model="phi3")


### Story prompt Ollama or DeepInfra models and save results to excel file


In [None]:
import os
import json
import time
import pandas as pd 

llm = Ollama_phi3

if llm == Ollama_phi3: model_name = "phi3"
if llm == mixtral: model_name = "mixtral"


marker = time
output_dir = 'output'

max_section_size = 1000

def level_3_llm(prompt, iterations, app, level, file_path_xlsx="phi_priv_stories.xlsx"):

    file_path_xlsx = os.path.join(output_dir, file_path_xlsx)

    # Check if the xlsx file exists to append data or to create a new DataFrame
    if os.path.exists(file_path_xlsx):
        df_existing = pd.read_xlsx(file_path_xlsx)
    else:
        df_existing = pd.DataFrame()

    # Prepare columns for the DataFrame according to the requirements
    columns = ['Section', 'App / level / model'] + [f'Prompt {i}' for i in range(1, 4)] + [f'Response {i}' for i in range(1, 4)] + ['Completion Objects']

    # Initialize a list to store new conversation data
    new_data = []
    original_prompt = prompt  
    combined_response_text = "" 
    section = 1
    stored_prompt = ""

    for prompt in prompts:
        try: prompt_data = {'App / level / model': f"{app}\n{level}\n{model_name}", 'Section': f"Section {section} Size ~{max_section_size}: {sections[section - 1]}"}
        except:prompt_data = {'App / level / model': f"{app}\n{level}\n{model_name}", 'Section': f"Section {section} Size ~{max_section_size}"}

        completion_objects = []

        for iteration in range(1, iterations + 1):
            response = llm(prompt)
            
            # Extract the response text
            response_text = response
            print(f"{iteration}\n", response_text)
            prompt_data[f'Prompt {iteration}'] = prompt 
            prompt_data[f'Response {iteration}'] = response_text
            completion_objects.append(json.dumps(response, default=str))
            combined_response_text += response_text
            # Append the response and new task
            if level == 3: 
                # response_text will refer to the previous response 
                if iteration == 1:
                    prompt = (level_3_prompt[section-1] + "\n" + response_text + 
                    "\nUser: Review the privacy behaviors identified . Ensure that "
                    "each behavior accurately reflects the content of the input section of the privacy policy."
                    "\nUser write out the privacy stories in the format of we (action) (data type) for the purpose of (purpose) "
                    "for all related behaviors within this policy\n System: Privacy stories: "
                    )
                    stored_prompt = prompt
                if iteration == 2: 
                    prompt = (stored_prompt + response_text + 
                    "\nUser: Review the privacy stories identified. Ensure that each story and behavior " 
                    "accurately reflects the content of the input section of the privacy policy and is found " 
                    "within the privacy behavior taxonomy. \nUser write out privacy stories inthe format of " 
                    "we (action) (data type) for the purpose of (purpose) for all related behaviors within this "
                    "policy for which you are confident\n System: Privacy stories: "
                    )
                '''to json iteration
                if iteration == 3:
                    prompt = "Here is a list of privacy stories generated from a language model. "
                    "Privacy stories follow the template of we {action} {data type} for ...{purpose}" 
                    "Parse this response and output the stories in json format "
                    "Where each story is mapped to its extracted action, data type, and purpose" 
                    "Privacy stories: " + response_text
                '''
                # Simple questions for CoVe
                if iteration == 3:
                    prompt = ("Here is a list of privacy stories which describe the behaviors an application "
                            "may make with private data, each story follows the format  "
                            "We {action} {data type} {purpose} :" + str(response_text)
                            + "\nGenerate a list of question for each story in the format: "
                            "Does this text indicate that the application {action} {data type} for the {purpose}?"
                            )
        else: prompt +="\n" + response_text + "\nUser: Review the privacy behaviors identified . Ensure that each behavior accurately reflects the content of the input section of the privacy policy. \nUser write out the privacy stories in the format of we (action) (data type) for the purpose of (purpose) for all related behaviors within this policy\n System: Privacy stories: "
            # Here include logic to break the loop based on certain condition if response_text meets certain criteria
        prompt_data['Completion Objects'] = ' '.join(completion_objects)
        new_data.append(prompt_data)
        section += 1
        
        response_data_with_prompt = {
            "original_prompt": original_prompt,
            "final_combined_text": prompt
        }
        response_json = json.dumps(response_data_with_prompt, indent=4)
        
        # filename by timestamp
        filename = f"phi_priv_stories.json"
        file_path = os.path.join(output_dir, filename)

        with open(file_path, 'w') as file:
            file.write(response_json)

    df_new = pd.DataFrame(new_data, columns=columns)

    # Combine new DataFrame with existing data if present
    df_combined = pd.concat([df_existing, df_new], ignore_index=True)

    # Write combined DataFrame to the xlsx file, replacing it
    df_combined.to_xlsx(file_path_xlsx, index=False)

        
    return response_text # Return the last response text


iterations = 4 #Leave at 4 to complete all steps 
level = 3
level_2_prompt = behaviors_strings
prompts = level_1_prompt
app_id = 'com.aisecurity'
file_path = f'{app_id}_privacy_stories.xlsx'
stories = level_3_llm(prompts, iterations, app_id, level, file_path)
 #print(stories)


### Story prompt openAI and save results to excel file

In [7]:
import openai
from openai import OpenAI
client = OpenAI()
import os
import json
import time
import pandas as pd 


marker = time
output_dir = 'output'

max_section_size = 1000

models = ["gpt-4-turbo-2024-04-09","gpt-4-0125-preview","gpt-4-0125-preview","gpt-4-turbo-preview",
          "gpt-4-1106-preview","gpt-4-32k","gpt-3.5-turbo-0125","gpt-3.5-turbo-1106",]
model_name = "gpt-4o-mini"
def prompt_gpt(prompt, iterations, app, level, file_path_xlsx="privacy_stories_1_1.xlsx"):

    file_path_xlsx = os.path.join(output_dir, file_path_xlsx)

    # Check if the xlsx file exists to append data or to create a new DataFrame
    if os.path.exists(file_path_xlsx):
        df_existing = pd.read_xlsx(file_path_xlsx)
    else:
        df_existing = pd.DataFrame()

    # Prepare columns for the DataFrame according to the requirements
    columns = ['Section', 'App / level / model'] + [f'Prompt {i}' for i in range(1, 4)] + [f'Response {i}' for i in range(1, 4)] + ['Completion Objects']


    # Initialize a list to store new conversation data
    new_data = []
    original_prompt = prompt  
    combined_response_text = "" 
    section = 1
    stored_prompt = ""

    for prompt in prompts:
        try: prompt_data = {'App / level / model': f"{app}\n{level}\n{model_name}", 'Section': f"Section {section} Size ~{max_section_size}: {sections[section - 1]}"}
        except:prompt_data = {'App / level / model': f"{app}\n{level}\n{model_name}", 'Section': f"Section {section} Size ~{max_section_size}"}

        completion_objects = []

        for iteration in range(1, iterations + 1):
            response = client.chat.completions.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": build_string},
                    {"role": "user", "content": prompt}
                ],
                max_tokens= 1000
            )
            
            # Extract the response text
            response_text = response.choices[0].message.content.strip()
            print(response_text)
            prompt_data[f'Prompt {iteration}'] = build_string + prompt 
            prompt_data[f'Response {iteration}'] = response_text
            completion_objects.append(json.dumps(response, default=str))
            combined_response_text += response_text
            # Append the response and new task
            if level == 3: 
                # response_text will refer to the previous response 
                if iteration == 1:
                    prompt = (level_3_prompt[section-1] + "\n" + response_text + 
                    "\nUser: Review the privacy behaviors identified . Ensure that "
                    "each behavior accurately reflects the content of the input section of the privacy policy."
                    "\nUser write out the privacy stories in the format of we (action) (data type) for the purpose of (purpose) "
                    "for all related behaviors within this policy\n System: Privacy stories: "
                    )
                    stored_prompt = prompt
                if iteration == 2: 
                    prompt = (stored_prompt + response_text + 
                    "\nUser: Review the privacy stories identified. Ensure that each story and behavior " 
                    "accurately reflects the content of the input section of the privacy policy and is found " 
                    "within the privacy behavior taxonomy. \nUser write out privacy stories inthe format of " 
                    "we (action) (data type) for the purpose of (purpose) for all related behaviors within this "
                    "policy for which you are confident\n System: Privacy stories: "
                    )
                '''to json iteration
                if iteration == 3:
                    prompt = "Here is a list of privacy stories generated from a language model. "
                    "Privacy stories follow the template of we {action} {data type} for ...{purpose}" 
                    "Parse this response and output the stories in json format "
                    "Where each story is mapped to its extracted action, data type, and purpose" 
                    "Privacy stories: " + response_text
                '''
                # Simple questions for CoVe
                if iteration == 3:
                    prompt = ("Here is a list of privacy stories which describe the behaviors an application "
                            "may make with private data, each story follows the format  "
                            "We {action} {data type} {purpose} :" + str(response_text)
                            + "\nGenerate a list of question for each story in the format: "
                            "Does this text indicate that the application {action} {data type} for the {purpose}?"
                            )
        else: prompt +="\n" + response_text + "\nUser: Review the privacy behaviors identified . Ensure that each behavior accurately reflects the content of the input section of the privacy policy. \nUser write out the privacy stories in the format of we (action) (data type) for the purpose of (purpose) for all related behaviors within this policy\n System: Privacy stories: "
            # Here include logic to break the loop based on certain condition if response_text meets certain criteria
        prompt_data['Completion Objects'] = ' '.join(completion_objects)
        new_data.append(prompt_data)
        section += 1
        
        response_data_with_prompt = {
            "original_prompt": original_prompt,
            "final_combined_text": prompt
        }
        response_json = json.dumps(response_data_with_prompt, indent=4)
        
        # filename by timestamp
        filename = f"privacy_stories_1_1.json"
        file_path = os.path.join(output_dir, filename)

        with open(file_path, 'w') as file:
            file.write(response_json)

    df_new = pd.DataFrame(new_data, columns=columns)

    # Combine new DataFrame with existing data if present
    df_combined = pd.concat([df_existing, df_new], ignore_index=True)

    # Write combined DataFrame to the xlsx file, replacing it
    df_combined.to_xlsx(file_path_xlsx, index=False)

        
    return response_text # Return the last response text




iterations = 4 #Leave at 4 to complete all steps 
level = 3
level_2_prompt = behaviors_strings
prompts = level_1_prompt
app_id = 'com.aisecurity'
file_path = f'{app_id}_privacy_stories.xlsx'
stories = prompt_gpt(prompts, iterations, app_id, level, file_path)


1. **Actions:**
   - Collect: 
     - Personal Data: "We may ask You to provide Us with certain personally identifiable information that can be used to contact or identify You."
     - Usage Data: "Usage Data is collected automatically when using the Service."
     - Mobile Device Data: "We may collect certain information automatically... the type of mobile device You use, Your mobile device unique ID, the IP address of Your mobile device..."

   - Use:
     - "The Company may use Personal Data for the following purposes: To provide and maintain our Service, including to monitor the usage of our Service."
     - "To manage Your Account: to manage Your registration as a user of the Service."
     - "To contact You: To contact You by email, telephone calls, SMS..."

   - Share:
     - With Service Providers: "We may share Your personal information with Service Providers to monitor and analyze the use of our Service."
     - For business transfers: "We may share or transfer Your personal 

### Using LangChain prompt abstractions to complete the above 

In [None]:
import openai
from openai import OpenAI
client = OpenAI()
import os
import json
import time
import pandas as pd
from langchain import PromptTemplate
from langchain.chains.router.multi_prompt import MultiPromptChain

from langchain.chains.router.llm_router import LLMRouterChain, RouterOutputParser
from langchain.chains.router.multi_prompt_prompt import MULTI_PROMPT_ROUTER_TEMPLATE


marker = time
output_dir = 'output'

max_section_size = 1000

models = ["gpt-4-turbo-2024-04-09", "gpt-4-0125-preview", "gpt-4-0125-preview", "gpt-4-turbo-preview", "gpt-4-1106-preview", "gpt-4-32k", "gpt-3.5-turbo-0125", "gpt-3.5-turbo-1106"]
model_name = "gpt-4-turbo-2024-04-09"

from langchain import PromptTemplate
from langchain.chains.base import Chain

class StoryTemplateChain(Chain):
    def _call(self, inputs):
        # Implement the logic for all templates here
        build_string = inputs.get("build_string", "")
        prompt = inputs.get("prompt", "")
        level_3_prompt = inputs.get("level_3_prompt", "")
        response_1 = inputs.get("response_1", "")
        stored_prompt = inputs.get("stored_prompt", "")
        response_2 = inputs.get("response_2", "")

        # Combine the inputs based on the context
        if build_string and prompt:
            # Template 1 logic
            return build_string + "\n" + prompt
        elif level_3_prompt and response_1:
            # Template 2 logic
            return level_3_prompt + "\n" + response_1 + "\nUser: Review the privacy behaviors identified. Ensure that each behavior accurately reflects the content of the input section of the privacy policy. User write out the privacy stories in the format of we (action) (data type) for the purpose of (purpose) for all related behaviors within this policy\nSystem: Privacy stories: "
        elif stored_prompt and response_2:
            # Template 3 logic
            return stored_prompt + "\n" + response_2 + "\nUser: Review the privacy stories identified. Ensure that each story and behavior accurately reflects the content of the input section of the privacy policy and is found within the privacy behavior taxonomy. User write out privacy stories in the format of we (action) (data type) for the purpose of (purpose) for all related behaviors within this policy for which you are confident\nSystem: Privacy stories: "
        else:
            # Default behavior
            return "No valid inputs provided"

    @property
    def input_keys(self):
        return ["build_string", "prompt", "level_3_prompt", "response_1", "stored_prompt", "response_2"]

    @property
    def output_keys(self):
        return ["final_response"]

# Define the combined template
combined_template = PromptTemplate(
    input_variables=["build_string", "prompt", "level_3_prompt", "response_1", "stored_prompt", "response_2"],
    template="{build_string}\n{prompt}\n{level_3_prompt}\n{response_1}\n{stored_prompt}\n{response_2}"
)

# Create a MultiPromptChain
sequence_chain = MultiPromptChain(
    default_chain=StoryTemplateChain(),
    destination_chains={
        "combined_template": StoryTemplateChain(),
    },
    router_chain=None,  # Set real router chain
    silent_errors=False 
)

def level_3_gpt(prompts, iterations, app, level):
    xlsx_filename = "privacy_stories_1_1.xlsx"
    file_path_xlsx = os.path.join(output_dir, xlsx_filename)

    # Check if the xlsx file exists to append data or to create a new DataFrame
    if os.path.exists(file_path_xlsx):
        df_existing = pd.read_xlsx(file_path_xlsx)
    else:
        df_existing = pd.DataFrame()

    # Prepare columns for the DataFrame according to the requirements
    columns = ['Section', 'App / level / model'] + [f'Prompt {i}' for i in range(1, iterations + 1)] + [f'Response {i}' for i in range(1, iterations + 1)] + ['Completion Objects']

    # Initialize a list to store new conversation data
    new_data = []
    section = 1

    for prompt in prompts:
        try:
            prompt_data = {'App / level / model': f"{app}\n{level}\n{model_name}", 'Section': f"Section {section} Size ~{max_section_size}: {sections[section - 1]}"}
        except:
            prompt_data = {'App / level / model': f"{app}\n{level}\n{model_name}", 'Section': f"Section {section} Size ~{max_section_size}"}

        completion_objects = []

        for iteration in range(1, iterations + 1):
            # Execute the MultiPromptChain
            response = sequence_chain({"build_string": build_string, "prompt": prompt, "level_3_prompt": level_3_prompt[section - 1]})
            response_text = response["final_response"]
            print(response_text)
            prompt_data[f'Prompt {iteration}'] = build_string + prompt
            prompt_data[f'Response {iteration}'] = response_text
            completion_objects.append(json.dumps(response, default=str))

        prompt_data['Completion Objects'] = ' '.join(completion_objects)
        new_data.append(prompt_data)
        section += 1

        response_data_with_prompt = {
            "original_prompt": prompt,
            "final_combined_text": response_text
        }
        response_json = json.dumps(response_data_with_prompt, indent=4)

        # Filename by timestamp
        filename = f"privacy_stories_1_1.json"
        file_path = os.path.join(output_dir, filename)

        with open(file_path, 'w') as file:
            file.write(response_json)

    df_new = pd.DataFrame(new_data, columns=columns)

    # Combine new DataFrame with existing data if present
    df_combined = pd.concat([df_existing, df_new], ignore_index=True)

    # Write combined DataFrame to the xlsx file, replacing it
    df_combined.to_xlsx(file_path_xlsx, index=False)


    return response_text  # Return the last response text

iterations = 4 #Leave at 4 to complete all steps 
level = 3
level_2_prompt = behaviors_strings
prompts = level_1_prompt
app_id = 'com.aisecurity'
file_path = f'{app_id}_privacy_stories.xlsx'
stories = level_3_llm(prompts, iterations, app_id, level, file_path)
 #print(stories)


### Extract behaviors from llm output


In [None]:
import pandas as pd
import json
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import ngrams

# Load the JSON ontology
with open('privacy_ontology2.json', 'r') as file:
    ontology = json.load(file)

df = pd.read_xlsx('output/privacy_stories_1_1.xlsx')

# Function to extract n-grams from sentence
def extract_ngrams(sentence, n):
    words = word_tokenize(sentence)
    n_grams = ngrams(words, n)
    return [' '.join(grams).lower() for grams in n_grams]

# Function to find patterns based on ontology
def find_patterns(text, ontology):
    patterns = []
    # Tokenizing the text into sentences
    sentences = sent_tokenize(text)
    for sentence in sentences:
        # Create n-grams for up to three words
        all_ngrams = []
        for n in range(1, 4):
            all_ngrams.extend(extract_ngrams(sentence, n))

        for ngram in all_ngrams:
            for main_branch, sub_branches in ontology.items():
                for key, value in sub_branches.items():
                    # Check direct matches and synonyms
                    terms_to_check = [key.lower()] + [syn.lower() for syn in value.get("Synonyms", [])]
                    if any(ngram == term for term in terms_to_check):
                        # If patterns are available, add them
                        if 'Patterns' in value:
                            patterns.extend(value['Patterns'])
                        # Check further nested branches if any
                        for nested_key, nested_value in value.items():
                            if isinstance(nested_value, dict) and 'Patterns' in nested_value:
                                patterns.extend(nested_value['Patterns'])
    return list(set(patterns))  # Remove duplicates

# Processing the columns that start with 'Response'
for column in df.columns:
    if column.startswith('Response'):
        df[f'Patterns from {column}'] = df[column].apply(lambda x: find_patterns(x, ontology) if isinstance(x, str) else [])

df.to_xlsx('stories_patterns_1_0.xlsx', index=False)
