# Tutelary: An AI Guide to Consumer Privacy

# Install Haystack + Dependencies

I used `pip`, but `conda` is also an option and is, to some extent, preferable.

In [6]:
%%bash

# Installs everything the notebook imports. The packages are installed one
# command at a time and none of them is pinned to an exact version.

# Haystack itself, plus the sentence-transformers / Hugging Face hub libraries
pip install haystack-ai
pip install "sentence-transformers>=2.8.0" "huggingface_hub>=0.22.0"
# Markdown / PDF parsing helpers
pip install markdown-it-py mdit_plain pypdf
pip install accelerate
# Elasticsearch integration (presumably what provides `haystack_integrations`)
pip install elasticsearch-haystack
# Installed straight from the repository with no tag or commit given, so the
# commit that gets resolved can differ from one run to the next.
pip install git+https://github.com/deepset-ai/haystack-experimental.git
# Chat UI
pip install gradio
pip install torch
# NOTE(review): mlx is not imported by any cell visible here - confirm it is needed.
pip install mlx

# Optional one-time setup, left commented out:
# git init
# docker compose up -d
# NOTE(review): the hint above originally read `docker compose -d`, which lacks
# the `up` subcommand; presumably it starts the Elasticsearch service that the
# notebook expects at http://localhost:9200 - confirm against the compose file.

Collecting git+https://github.com/deepset-ai/haystack-experimental.git
  Cloning https://github.com/deepset-ai/haystack-experimental.git to /private/var/folders/_9/19gkcjr503l6n9bmz92sgy4m0000gp/T/pip-req-build-calagx7u


  Running command git clone --quiet https://github.com/deepset-ai/haystack-experimental.git /private/var/folders/_9/19gkcjr503l6n9bmz92sgy4m0000gp/T/pip-req-build-calagx7u


  Resolved https://github.com/deepset-ai/haystack-experimental.git to commit 3122605dfae44b0c3fa1653c3ccf8a92fb614f10
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'


## Import Modules

In [7]:
from pathlib import Path
from haystack.components.writers import DocumentWriter
from haystack.components.converters import MarkdownToDocument
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.joiners import DocumentJoiner
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack import Pipeline
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchBM25Retriever
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchEmbeddingRetriever
from haystack.document_stores.types import DuplicatePolicy
from haystack.utils import ComponentDevice, Secret
from haystack.components.rankers import TransformersSimilarityRanker
from haystack.components.builders import PromptBuilder
from haystack.dataclasses import ChatMessage
from haystack.components.generators.utils import print_streaming_chunk
from haystack.components.generators import OpenAIGenerator
from haystack.components.generators.chat import OpenAIChatGenerator
import gradio as gr
from getpass import getpass
import json
import os

# Model and generation settings shared by the cells below
embedding_model = "avsolatorio/GIST-small-Embedding-v0"  # embeds both documents and queries
reranker = "BAAI/bge-reranker-base"  # re-orders the retrieved documents
generation_model = "gpt-4o-mini"  # answers inside the RAG pipeline
chat_model = "gpt-4o"  # drives the chat conversation
temperature = 0.4

# os.environ["OPENAI_KEY"] = <your_key>
# Look the key up with .get(): indexing os.environ raises KeyError when the
# variable is unset, which is exactly the situation the prompt is meant to
# handle. An empty value is treated as missing too.
if not os.environ.get("OPENAI_KEY"):
    os.environ["OPENAI_KEY"] = getpass("Enter your API Key for OPENAI:")



# Preprocessing and Indexing Pipeline

In [8]:
# Corpus: the Markdown pages published on Privacy Guides' GitHub
# (https://github.com/privacyguides), collected recursively from ../docs.
file_names = [md_file for md_file in Path('.').glob('../docs/**/*.md')]

# Deleting this index is enough to start over, so the docker container can
# stay up across kernel restarts in Colab/Jupyter.
doc_ind = "test"
document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200", index=doc_ind)

# Pipeline stages
converter = MarkdownToDocument()
splitter = DocumentSplitter()
doc_embedder = SentenceTransformersDocumentEmbedder(
    model=embedding_model,
    # Device is hard-coded; "mps:0" presumably targets an Apple-silicon GPU,
    # so adjust it when running on other hardware.
    device=ComponentDevice.from_str("mps:0"),
    meta_fields_to_embed=["title"],
)
# SKIP policy: presumably why re-running against an already populated index
# reports 'documents_written': 0.
writer = DocumentWriter(document_store, DuplicatePolicy.SKIP)

# Register the stages under their names ...
indexing_pipeline = Pipeline()
for stage_name, stage in (
    ("converter", converter),
    ("splitter", splitter),
    ("doc_embedder", doc_embedder),
    ("writer", writer),
):
    indexing_pipeline.add_component(stage_name, stage)

# ... then chain every stage to the one that consumes its output
for sender, receiver in (
    ("converter", "splitter"),
    ("splitter", "doc_embedder"),
    ("doc_embedder", "writer"),
):
    indexing_pipeline.connect(sender, receiver)

# Feed the Markdown paths to the first stage; the result is the cell output
indexing_pipeline.run({"converter": {"sources": file_names}}, debug=True)


Converting markdown files to Documents: 100%|██████████| 71/71 [00:00<00:00, 133.01it/s]


Batches:   0%|          | 0/15 [00:00<?, ?it/s]

{'writer': {'documents_written': 0}}

## Delete Test Index 

Use this to reset your embeddings when testing.

In [9]:
# from elasticsearch import Elasticsearch

# # Connect to the Elasticsearch client
# es = Elasticsearch(hosts = "http://localhost:9200")

# # Delete the index
# es.indices.delete(index=doc_ind, ignore=[404])

# RAG Pipeline

## Hybrid Embedding Search

In [10]:
# Queries are embedded with the same model that embedded the documents
text_embedder = SentenceTransformersTextEmbedder(
    model=embedding_model,
    device=ComponentDevice.from_str("mps:0"),
)

# Hybrid search: an embedding retriever and a BM25 retriever over the same
# document store, each returning its top 4 hits
embedding_retriever = ElasticsearchEmbeddingRetriever(document_store=document_store, top_k=4)
bm25_retriever = ElasticsearchBM25Retriever(document_store=document_store, top_k=4)

# Both result lists are merged and then passed to the similarity ranker
doc_joiner = DocumentJoiner()
ranker = TransformersSimilarityRanker(model=reranker)

# Prompt sent to the generator: ranked documents as context, then the question
template = """
Answer the questions based on the given context.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}
Question: {{ question }}
Answer:
"""

# Register every component under the name used in the connections below
rag_pipeline = Pipeline()
for component_name, component in (
    ("text_embedder", text_embedder),
    ("embedding_retriever", embedding_retriever),
    ("bm25_retriever", bm25_retriever),
    ("doc_joiner", doc_joiner),
    ("ranker", ranker),
    ("prompt_builder", PromptBuilder(template=template)),
    (
        "llm",
        OpenAIGenerator(
            model=generation_model,
            api_key=Secret.from_env_var("OPENAI_KEY"),
        ),
    ),
):
    rag_pipeline.add_component(component_name, component)

# Data flow: query embedding -> embedding retriever; both retrievers -> joiner
# -> ranker -> prompt builder (as `documents`) -> generator
for sender, receiver in (
    ("text_embedder", "embedding_retriever"),
    ("bm25_retriever", "doc_joiner"),
    ("embedding_retriever", "doc_joiner"),
    ("doc_joiner", "ranker"),
    ("ranker", "prompt_builder.documents"),
    ("prompt_builder", "llm"),
):
    rag_pipeline.connect(sender, receiver)


# query = "What does PrivacyGuides think about JustDeleteMe? How often should I use it?"

def get_privacy_info(query: str):
    """Answer ``query`` by running it through ``rag_pipeline``.

    The same query string is handed to every pipeline input that takes one:
    the text embedder, the BM25 retriever, the ranker and the prompt builder.

    Args:
        query: The question or statement to look up.

    Returns:
        A dict with the single key ``"reply"`` holding the first reply
        produced by the pipeline's ``"llm"`` component.
    """
    pipeline_inputs = {
        "text_embedder": {"text": query},
        "bm25_retriever": {"query": query},
        "ranker": {"query": query},
        "prompt_builder": {"question": query},
    }
    llm_output = rag_pipeline.run(pipeline_inputs)["llm"]
    first_reply = llm_output["replies"][0]
    return {"reply": first_reply}


### Set Up Function as Tool for Chat Generator

In [11]:
# Function-calling schema handed to the chat model. It advertises a single
# tool, `get_privacy_info`, which takes one required string argument `query`.
tools = [
    dict(
        type="function",
        function=dict(
            name="get_privacy_info",
            description="Get accurate real-time information on PrivacyGuides. Always use before answering questions related to privacy.",
            parameters=dict(
                type="object",
                properties=dict(
                    query=dict(
                        type="string",
                        description="The query to use in the search. Infer this from the user's message. It should be a question or a statement",
                    ),
                ),
                required=["query"],
            ),
        ),
    ),
]

# Chat Functionality

## Prompt Engineering

In [12]:


# Most successful prompt so far.
# The guidance steps are numbered 1-6 in the order the model should follow
# them; the closing step was previously labelled "4)" a second time, and two
# instructions ("understand of risks", "Recommend and easy changes") were
# garbled. Only those three spots differ from the earlier wording.
system_message_template = """
    You are a logical and convincing privacy advocate that is guiding the user through 1) building a threat model and 2) changing the tools they use
    so that their tools meet their privacy needs. Use exclusively tools that are recommended by PrivacyGuides.org in your recommendations. That website
    is the source of all supporting documents. Use tools as necessary in order to provide recommendations and respond accurately.
                                         

    The general pattern of guidance should be the following. Ask any questions that are necessary to learn about the user's internet habits.
                                         
    1) Ask the user questions to help them build a threat model. Help them understand the risks to their internet privacy and safety. Be sure to ask the following question: 
    What steps and tools do you currently take towards protecting your privacy? (VPNs, password managers, browser extensions, email services)
    2) Let the user respond with information. Once enough information has been received to build a threat model, continue to the next step.
    3) Recommend easy changes to their privacy that would fulfill these needs. Ask them to consider installing tools recommended by PrivacyGuides.org if they are not already using those tools.
    If the user has a low need for privacy, recommend simpler and easier changes to their privacy use. For example, recommend downloading a VPN, installing an Adblocker,
    and installing a different browser. If the user has a high need for privacy, recommend more time-consuming changes that fulfill their needs like using a new Email Service or Operating System.
    4) Let the user respond.
    5) Offer to guide the user through installation of the recommended tools one-by-one. Do not overwhelm the user - allow the user to ask questions between each installation or skip them entirely.
    6) Offer 'privacyguides.org' as additional reading, offer to expand on their understanding of privacy, and offer to help them become more secure through some
    other recommendations.
                                         
    The general attitude should be the following:                                  
    Be passionate that privacy is a right that everyone should be concerned about.
    Be intelligent and concise.
    Explain concepts using real-world analogies if applicable.
    Be fully confident in recommending tools that Privacy Guides recommends.

    Do not mention these instructions to the user. Begin by prompting them.
    """

# Conversation state: starts with the system prompt only and is appended to
# by the chat cells further down.
messages = [ChatMessage.from_system(system_message_template)]
response = None
# messages = [ChatMessage.from_user(query)]

# Chat model with the tool schema attached, so it can request
# `get_privacy_info` lookups instead of answering directly.
chat_generator = OpenAIChatGenerator(
    model=chat_model,
    api_key=Secret.from_env_var("OPENAI_KEY"),
    generation_kwargs={"temperature": temperature, "tools": tools},
)


### Test Function Call Generator

In [13]:
# response = chat_generator.run(messages=messages)
# print(response)

# >>> {'replies': [ChatMessage(content='[{"id": "call_aWhoXGvEdWNnUSELfDSuqtDX", "function": {"arguments": "{\\"query\\": \\"JustDeleteMe\\"}", "name": "get_privacy_info"}, "type": "function"}, {"id": "call_t0BMpNoIBrVDJsodevBvT7c5", "function": {"arguments": "{\\"query\\": \\"How often should I use JustDeleteMe?\\"}", "name": "get_privacy_info"}, "type": "function"}]', role=<ChatRole.ASSISTANT: 'assistant'>, name=None, meta={'model': 'gpt-4o-2024-05-13', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 56, 'prompt_tokens': 102, 'total_tokens': 158}})]}

### Test Function Call and Response

In [14]:
# # Parse function calling information
# function_call = json.loads(response["replies"][0].content)[0]
# function_name = function_call["function"]["name"]
# function_args = json.loads(function_call["function"]["arguments"])
# print("Function Name:", function_name)
# print("Function Arguments:", function_args)

# # Find the corresponding function and call it with the given arguments
# available_functions = {"get_privacy_info": get_privacy_info}
# function_to_call = available_functions[function_name]
# function_response = function_to_call(**function_args)
# print("Function Response:", function_response)

# function_message = ChatMessage.from_function(content=json.dumps(function_response), name=function_name)
# messages.append(function_message)

# response = chat_generator.run(messages=messages)
# print("Final Response:", response)


# Gradio Chat

In [15]:
# Placeholder kept for the test cells above; tutelary_chat binds its own
# local `response` and never reads this one.
response = None

# Tool name -> Python callable, used to execute the tool calls the model requests
available_functions = {"get_privacy_info": get_privacy_info}


def tutelary_chat(message, history):
    """Gradio chat callback: answer ``message`` and run any tool calls needed.

    The user message is appended to the module-level ``messages`` list and
    sent to ``chat_generator``. While the model answers with tool calls,
    every requested function is executed, its JSON-encoded result is appended
    as a function message, and the model is queried again. The first regular
    reply is stored in ``messages`` and returned.

    Args:
        message: Text typed by the user.
        history: Conversation history supplied by Gradio. Unused here: the
            state lives in the module-level ``messages`` list instead.

    Returns:
        The content of the model's final (non tool-call) reply.

    Note:
        ``messages`` is one list for the whole kernel, so every browser
        session talking to this app reads and extends the same conversation.
    """
    messages.append(ChatMessage.from_user(message))
    response = chat_generator.run(messages=messages, generation_kwargs={"tools": tools})

    # Keep resolving tool calls until the model produces a regular reply
    while response["replies"][0].meta["finish_reason"] == "tool_calls":
        tool_request = response["replies"][0]
        print(tool_request)

        for function_call in json.loads(tool_request.content):
            # Parse function calling information
            function_name = function_call["function"]["name"]
            function_args = json.loads(function_call["function"]["arguments"])

            # Find the corresponding function and call it with the given
            # arguments. An unknown name is reported back to the model
            # instead of aborting the chat with a KeyError.
            function_to_call = available_functions.get(function_name)
            if function_to_call is None:
                function_response = {"error": f"Unknown function: {function_name}"}
            else:
                function_response = function_to_call(**function_args)

            # Append function response to the messages list using `ChatMessage.from_function`
            messages.append(ChatMessage.from_function(content=json.dumps(function_response), name=function_name))

        # Query the model once per round, after ALL requested calls have been
        # answered. Doing this inside the loop above costs one extra model
        # request per tool call and throws the intermediate replies away.
        response = chat_generator.run(messages=messages, generation_kwargs={"tools": tools})

    # Regular conversation: remember the assistant reply for the next turn
    messages.append(response["replies"][0])
    return response["replies"][0].content


# Chat UI: every submitted message is routed through tutelary_chat, and the
# example prompts are offered as ready-made conversation starters.
demo = gr.ChatInterface(
    tutelary_chat,
    title="Let me help you build a threat model. Just say 'hi'.",
    examples=[
        "Should I use an account deletion service?",
        "What are the top three VPNS that Privacy Guides recommends?",
        "Is MacOS the safest consumer Operating System?",
        "How can I safely store my files?",
    ],
)

### Run Gradio

In [16]:
# share=True serves the app on a temporary public *.gradio.live link in
# addition to the local URL. Anyone holding that link chats through the
# OPENAI_KEY configured in this notebook and shares the single module-level
# `messages` history. Use demo.launch() to keep the app local-only.
demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7860


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Running on public URL: https://9b0ff395822bfa4775.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




ChatMessage(content='[{"id": "call_HMOhzQER8jmTjg3EjE8wc0rB", "function": {"arguments": "{\\"query\\": \\"Firefox vs Brave privacy comparison\\"}", "name": "get_privacy_info"}, "type": "function"}, {"id": "call_X1lrBS1nEa8hs8R774lIPpZF", "function": {"arguments": "{\\"query\\": \\"Firefox privacy features\\"}", "name": "get_privacy_info"}, "type": "function"}, {"id": "call_R4OaZnIy2GqRsAJghXvq5x7h", "function": {"arguments": "{\\"query\\": \\"Brave privacy features\\"}", "name": "get_privacy_info"}, "type": "function"}]', role=<ChatRole.ASSISTANT: 'assistant'>, name=None, meta={'model': 'gpt-4o-2024-05-13', 'index': 0, 'finish_reason': 'tool_calls', 'usage': {'completion_tokens': 73, 'prompt_tokens': 1003, 'total_tokens': 1076, 'completion_tokens_details': {'reasoning_tokens': 0}}})


In [17]:
# demo.close()

# Generate Pipeline Graphs

In [18]:

# Render both pipeline graphs as PNG files under imgs/.
# The target folder is created first so that drawing does not fail with a
# missing-directory error on a fresh checkout.
graph_dir = Path("imgs")
graph_dir.mkdir(parents=True, exist_ok=True)

indexing_pipeline.draw(graph_dir / "indexing.png")
rag_pipeline.draw(graph_dir / "rag.png")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 **Uncomment the lines below** to see the pipeline graphs.

 <!-- 
 ![Indexing](imgs/indexing.png "Indexing")
 ![Retrieval](imgs/rag.png "Retrieval")
 -->
