## What is RAG (Retrieval-Augmented Generation)?

### Importing and Installing Required Libraries

In [6]:
!pip install -U langchain langchain-openai pypdf chromadb langchain_community


Defaulting to user installation because normal site-packages is not writeable


In [7]:
import os
from dotenv import load_dotenv

from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader

from langchain_community.vectorstores import Chroma  
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage, AIMessage

load_dotenv()  


### Creating an OpenAI Client Instance

# API key is read from the environment (load_dotenv() has already run above).
openai_key = os.environ.get("OPENAI_API_KEY")

# Chat model used to build the document chain further down.
llm = ChatOpenAI(model="gpt-4", temperature=0.7, openai_api_key=openai_key)


### Data Ingestion into Chroma DB

In [8]:
# Download each PDF and split it into Document chunks with PyPDFLoader.
# `data_url` and `loader` are scratch names that are overwritten for every file.

# Lung cancer awareness survey report.
data_url = "https://www.hse.ie/eng/services/list/5/cancer/pubs/reports/national-survey-on-lung-cancer-awareness-report-january-2020.pdf"
loader = PyPDFLoader(data_url)
lung_cancer_docs = loader.load_and_split()

# Meta Q3 2024 earnings call transcript.
data_url = "https://s21.q4cdn.com/399680738/files/doc_financials/2024/q3/META-Q3-2024-Earnings-Call-Transcript.pdf"
loader = PyPDFLoader(data_url)
meta_docs = loader.load_and_split()

# Amazon Q3 2024 earnings release.
data_url = "https://s2.q4cdn.com/299287126/files/doc_financials/2024/q3/AMZN-Q3-2024-Earnings-Release.pdf"
loader = PyPDFLoader(data_url)
amazon_docs = loader.load_and_split()

def add_metadata(docs, source, month):
    """Tag every document with a ``source`` label and a ``month``.

    Each document's ``metadata`` mapping is updated in place, and the same
    ``docs`` object is returned so the call can be used in an assignment.
    """
    tags = {"source": source, "month": month}
    for document in docs:
        document.metadata.update(tags)
    return docs

# Label each document set so retrieval can later filter on `source`.
lung_cancer_docs = add_metadata(lung_cancer_docs, source="lung_cancer_doc", month="June")
meta_docs = add_metadata(meta_docs, source="meta_doc", month="October")
amazon_docs = add_metadata(amazon_docs, source="amazon_doc", month="November")

embeddings = OpenAIEmbeddings(openai_api_key=openai_key)

# One collection for the health report ...
lung_cancer_vectorstore = Chroma.from_documents(
    collection_name="lung_cancer_collection",
    documents=lung_cancer_docs,
    embedding=embeddings,
)

# ... and one shared collection for both earnings documents.
earning_calls_vectorstore = Chroma.from_documents(
    collection_name="earning_calls_collection",
    documents=meta_docs + amazon_docs,
    embedding=embeddings,
)


### Retrieval Augmented Generation with ChromaDB


In [9]:


# Prompt that tells the model to answer from the retrieved context only.
prompt_template = """Answer the following question based only on the provided context:

Question: {input}

Context: {context}
"""
prompt = ChatPromptTemplate.from_template(prompt_template)

# Chain that feeds the retrieved documents to the LLM through `prompt`.
document_chain = create_stuff_documents_chain(llm, prompt)

# Retrieval chain over the lung cancer collection (no metadata filter).
lung_cancer_retriever = lung_cancer_vectorstore.as_retriever()
lung_cancer_retrieval_chain = create_retrieval_chain(
    lung_cancer_retriever,
    document_chain,
)

# Deliberately out-of-domain question: this collection holds no Meta data.
query = "What is the revenue generated by Meta in Q3 2024?"
response = lung_cancer_retrieval_chain.invoke({"input": query})
print(response["answer"])


The context does not provide information on the revenue generated by Meta in Q3 2024.


In [10]:

# In-domain question for the lung cancer collection.
query = "What are the major causes of Lung Cancer?"
response = lung_cancer_retrieval_chain.invoke({"input": query})
print(response["answer"])


The major causes of lung cancer are smoking, working environment, hereditary or genetic factors, air pollution, toxic chemicals, asbestos, second hand smoke, and alcohol. Other less common causes include poor diet, inhaling dust, lifestyle choices, previous cancer or other illnesses, obesity, unspecified pollution, vaping, lack of exercise, drug use, stress, and radon gas.


In [11]:
# Unfiltered retriever over the earnings collection (Meta + Amazon documents),
# reusing the `document_chain` built earlier.
earning_calls_retriever = earning_calls_vectorstore.as_retriever()
earning_calls_retrieval_chain = create_retrieval_chain(earning_calls_retriever, document_chain)

query = "What is the revenue generated by Meta in Q3 2024?"
response = earning_calls_retrieval_chain.invoke({"input": query})
print(response["answer"])



The revenue generated by Meta in Q3 2024 was $40.6 billion.


In [12]:
# Same unfiltered earnings chain, now asked about Amazon.
query = "What is the revenue generated by Amazon?"
response = earning_calls_retrieval_chain.invoke({"input": query})
print(response["answer"])


The revenue generated by Amazon in the third quarter of 2024 was $158.9 billion.


In [13]:
# Rebuild the retriever so only chunks whose `source` metadata is "meta_doc"
# are searched; `earning_calls_retriever` / `earning_calls_retrieval_chain`
# are rebound to the filtered versions.
earning_calls_retriever = earning_calls_vectorstore.as_retriever(search_kwargs={"filter": {"source": "meta_doc"}})
earning_calls_retrieval_chain = create_retrieval_chain(earning_calls_retriever, document_chain)

# Amazon question against a Meta-only filter.
query = "What is the revenue generated by Amazon in Q3 2024?"
response = earning_calls_retrieval_chain.invoke({"input": query})
print(response["answer"])

The context does not provide information on the revenue generated by Amazon in Q3 2024.


In [14]:
# Same "meta_doc" filter as the previous cell (the retriever and chain are
# simply rebuilt), this time with a question the filtered documents can answer.
earning_calls_retriever = earning_calls_vectorstore.as_retriever(search_kwargs={"filter": {"source": "meta_doc"}})
earning_calls_retrieval_chain = create_retrieval_chain(earning_calls_retriever, document_chain)

query = "What is the revenue generated by Meta in Q3 2024?"
response = earning_calls_retrieval_chain.invoke({"input": query})
print(response["answer"])

The revenue generated by Meta in Q3 2024 was $40.6 billion.


## Implementing Authorization in RAG Application using Cerbos

### Setting up the Environment

In [15]:
import os
from dotenv import load_dotenv

from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader

from langchain_community.vectorstores import Chroma  
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage, AIMessage


In [45]:

from cerbos.sdk.grpc.client import CerbosClient
from cerbos.engine.v1 import engine_pb2
from cerbos.response.v1 import response_pb2
from google.protobuf.struct_pb2 import Value

In [17]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Cerbos gRPC client.
# NOTE(review): this object is not referenced again in the visible notebook;
# the helper functions below open their own CerbosClient on "localhost:3592".
# Confirm which address is intended.
cerbos_client = CerbosClient("localhost:3593", tls_verify=False)

# OpenAI chat model and embedding model share the same API key.
openai_key = os.environ.get("OPENAI_API_KEY")

llm = ChatOpenAI(model="gpt-4", temperature=0.7, openai_api_key=openai_key)
embeddings = OpenAIEmbeddings(openai_api_key=openai_key)

### Data Loading and Metadata Addition

In [18]:
# Download each PDF and split it into Document chunks with PyPDFLoader.
# `data_url` and `loader` are scratch names that are overwritten for every file.

# Lung cancer awareness survey report.
data_url = "https://www.hse.ie/eng/services/list/5/cancer/pubs/reports/national-survey-on-lung-cancer-awareness-report-january-2020.pdf"
loader = PyPDFLoader(data_url)
lung_cancer_docs = loader.load_and_split()

# Meta Q3 2024 earnings call transcript.
data_url = "https://s21.q4cdn.com/399680738/files/doc_financials/2024/q3/META-Q3-2024-Earnings-Call-Transcript.pdf"
loader = PyPDFLoader(data_url)
meta_docs = loader.load_and_split()

# Amazon Q3 2024 earnings release.
data_url = "https://s2.q4cdn.com/299287126/files/doc_financials/2024/q3/AMZN-Q3-2024-Earnings-Release.pdf"
loader = PyPDFLoader(data_url)
amazon_docs = loader.load_and_split()

def add_metadata(docs, source, month):
    """Stamp ``source`` and ``month`` onto the metadata of each document.

    The documents are modified in place; ``docs`` itself is returned.
    """
    for item in docs:
        item.metadata.update(source=source, month=month)
    return docs

# Label each document set so retrieval can later filter on `source`.
lung_cancer_docs = add_metadata(lung_cancer_docs, source="lung_cancer_doc", month="June")
meta_docs = add_metadata(meta_docs, source="meta_doc", month="October")
amazon_docs = add_metadata(amazon_docs, source="amazon_doc", month="November")

# Vector stores are created without documents here; documents are added later
# by the ingest helpers, behind a Cerbos permission check.
# NOTE(review): the collection names match those used in the earlier RAG
# section — if both sections run in one kernel these collections may already
# contain documents; confirm.
lung_cancer_vectorstore = Chroma(
    embedding_function=embeddings,
    collection_name="lung_cancer_collection",
)

earning_calls_vectorstore = Chroma(
    embedding_function=embeddings,
    collection_name="earning_calls_collection",
)


  lung_cancer_vectorstore = Chroma(


### RBAC Authorization on RAG with Cerbos

apiVersion: "api.cerbos.dev/v1"
resourcePolicy:
  resource: "vector_store"
  version: "default"
  rules:
    - actions: ["ingest"]
      effect: EFFECT_ALLOW
      roles: ["data_ingestor", "admin"]


In [19]:
def ingest_data_with_rbac(vector_store, docs, principal, resource):
    """Add ``docs`` to ``vector_store`` only if Cerbos allows the "ingest" action.

    Opens a Cerbos client on "localhost:3592", asks whether ``principal`` may
    perform "ingest" on ``resource``, prints the decision, and returns ``True``
    when the documents were added or ``False`` when access was denied.
    """
    with CerbosClient("localhost:3592", tls_verify=False) as client:
        permitted = client.is_allowed("ingest", principal, resource)

        # Guard clause: nothing is written when the check fails.
        if not permitted:
            print(f"Access denied for {principal.id}.")
            return False

        print(f"Access granted for {principal.id} to ingest data in resource {resource.id}.")
        vector_store.add_documents(docs)
        return True

In [20]:
# Define Principals with different roles 
principal_user1 = engine_pb2.Principal(
    id="user1",
    roles=["data_retriever"],  
    policy_version="default",
)


principal_user2 = engine_pb2.Principal(
    id="user2",
    roles=["data_ingestor"], 
    policy_version="default",
)


principal_user3 = engine_pb2.Principal(
    id="admin",
    roles=["admin"], 
    policy_version="default",
)


In [21]:
# Cerbos resource that stands for the lung cancer vector store; its kind
# "vector_store" matches the `resource` named in the policy shown above.
resource_rbac = engine_pb2.Resource(
    id="lung_cancer_vectorstore",
    kind="vector_store",
)

In [23]:
# Try the same ingestion once per principal.
# NOTE(review): every principal that is allowed adds `lung_cancer_docs` again,
# so the store may end up with repeated copies — confirm this is intended.
for principal in (principal_user1, principal_user2, principal_user3):
    print("=====================")
    result = ingest_data_with_rbac(
        lung_cancer_vectorstore,
        lung_cancer_docs,
        principal,
        resource_rbac,
    )

    message = (
        "Operation successfull - data ingested"
        if result
        else "You do not have permission to ingest the data"
    )
    print(message)

Access denied for user1.
You do not have permission to ingest the data
Access granted for user2 to ingest data in resource lung_cancer_vectorstore.
Operation successfull - data ingested
Access granted for admin to ingest data in resource lung_cancer_vectorstore.
Operation successfull - data ingested


### ABAC Authorization on RAG with Cerbos

apiVersion: "api.cerbos.dev/v1"
resourcePolicy:
  resource: "vector_store"
  version: "default"
  rules:

    - actions: ["ingest"]
      effect: EFFECT_ALLOW
      roles: ["data_ingestor", "admin"]


    - actions: ["ingest", "retrieve"]
      effect: EFFECT_ALLOW
      roles: ["finance", "health"]
      condition:
        match:
          expr: request.principal.attr.department == request.resource.attr.type


#### Data Ingestion with ABAC

In [24]:
def ingest_data_with_abac(vector_store, docs, principal, resource):
    """Add ``docs`` to ``vector_store`` only if Cerbos allows "department_ingest".

    Opens a Cerbos client on "localhost:3592", asks whether ``principal`` may
    perform "department_ingest" on ``resource``, prints the decision, and
    returns ``True`` when the documents were added or ``False`` otherwise.
    """
    with CerbosClient("localhost:3592", tls_verify=False) as client:
        permitted = client.is_allowed("department_ingest", principal, resource)

        # Guard clause: nothing is written when the check fails.
        if not permitted:
            print(f"Access denied for {principal.id}.")
            return False

        print(f"Access granted for {principal.id} to ingest data in resource {resource.id}.")
        vector_store.add_documents(docs)
        return True

In [25]:
# Two principals with the same role but different `department` attributes.
principal_user4 = engine_pb2.Principal(
    id="user4",
    roles=["department_data_ingestor"],
    policy_version="default",
    attr={"department": Value(string_value="finance")}
)

principal_user5 = engine_pb2.Principal(
    id="user5",
    roles=["department_data_ingestor"],
    policy_version="default",
    attr={"department": Value(string_value="health")}
)

# Two resources of kind "vector_store" that differ in their `type` attribute.
resource_abac_finance = engine_pb2.Resource(
    id="finance_vectorstore",
    kind="vector_store",
    attr={"type": Value(string_value="finance")}
)

resource_abac_health = engine_pb2.Resource(
    id="health",
    kind="vector_store",
    attr={"type": Value(string_value="health")}
)


In [26]:
# Both principals try to ingest the lung cancer documents into the health resource.
for principal in (principal_user4, principal_user5):
    print("=====================")
    result = ingest_data_with_abac(
        lung_cancer_vectorstore,
        lung_cancer_docs,
        principal,
        resource_abac_health,
    )

    message = (
        "Operation successfull - data ingested"
        if result
        else "You do not have permission to ingest the data"
    )
    print(message)

Access denied for user4.
You do not have permission to ingest the data
Access granted for user5 to ingest data in resource health.
Operation successfull - data ingested


In [27]:
# Both earnings documents go into the shared earnings collection.
docs = meta_docs + amazon_docs

# Both principals try to ingest them into the finance resource.
for principal in (principal_user4, principal_user5):
    result = ingest_data_with_abac(
        earning_calls_vectorstore,
        docs,
        principal,
        resource_abac_finance,
    )

    message = (
        "Operation successfull - data ingested"
        if result
        else "You do not have permission to ingest the data"
    )
    print(message)

Access granted for user4 to ingest data in resource finance_vectorstore.
Operation successfull - data ingested
Access denied for user5.
You do not have permission to ingest the data


#### Data Retrieval with ABAC

In [28]:
def generate_response(vector_store, query, doc_type):
    """Answer ``query`` from the documents whose ``source`` metadata equals ``doc_type``.

    Builds a prompt and a document chain around the notebook-level ``llm``,
    retrieves from ``vector_store`` with a ``{"source": doc_type}`` filter, and
    returns the ``"answer"`` entry of the retrieval chain's response.
    """
    prompt = ChatPromptTemplate.from_template("""Answer the following question based only on the provided context:
    
    Question: {input}
    
    Context: {context}
    """
    )
    combine_chain = create_stuff_documents_chain(llm, prompt)

    # Only chunks tagged with the requested source are searched.
    filtered_retriever = vector_store.as_retriever(
        search_kwargs={"filter": {"source": doc_type}}
    )
    qa_chain = create_retrieval_chain(filtered_retriever, combine_chain)

    return qa_chain.invoke({"input": query})["answer"]

In [29]:
# Sanity check without any authorization: query the Meta documents directly.
generate_response(
    earning_calls_vectorstore,
    "What is the revenue of Meta for Q3 2024?",
    "meta_doc",
)

'The revenue of Meta for Q3 2024 was $40.6 billion.'

In [60]:
def retrieve_data_with_abac(vector_store, query, principal, resource, resource_plan):
    """Answer ``query`` only if Cerbos allows ``principal`` to "retrieve" ``resource``.

    Args:
        vector_store: Store handed to ``generate_response``.
        query: Question to answer.
        principal: Cerbos principal making the request.
        resource: Cerbos resource checked with ``is_allowed("retrieve", ...)``.
        resource_plan: Resource description passed to ``plan_resources`` to
            obtain the query plan for the "retrieve" action.

    Returns:
        The generated answer when access is granted, otherwise an
        "Access denied ..." message string.
    """
    with CerbosClient("localhost:3592", tls_verify=False) as client:
        if not client.is_allowed("retrieve", principal, resource):
            return f"Access denied for {principal.id} to access the resource {resource.id}."

        print(f"Access granted for {principal.id} to access the resource {resource.id}.")

        # Use the caller-supplied plan resource. The body previously read a
        # notebook-level `plan_resource` variable and ignored this parameter,
        # so the function only worked after a later cell had been executed.
        plan = client.plan_resources(action="retrieve",
                                     principal=principal,
                                     resource=resource_plan)

        # NOTE(review): assumes the plan condition is a single comparison whose
        # first operand is the literal document type (as in the plan printed
        # in this notebook); other plan shapes would yield a different or
        # empty value here — confirm against the deployed policy.
        doc_type = plan.filter.condition.expression.operands[0].value.string_value

        return generate_response(vector_store, query, doc_type)


In [61]:
# Two principals with the same role but different `doc_type` attributes.
principal_user6 = engine_pb2.Principal(
    id="user6",
    roles=["doc_retriever"],
    policy_version="default",
    attr={"doc_type": Value(string_value="meta_doc")}
)

principal_user7 = engine_pb2.Principal(
    id="user7",
    roles=["doc_retriever"],
    policy_version="default",
    attr={"doc_type": Value(string_value="amazon_doc")}
)

# Two resources of kind "vector_store"; their `doc_type` values match the
# `source` labels written by add_metadata ("meta_doc" / "amazon_doc").
resource_abac_meta = engine_pb2.Resource(
    id="meta_docs_vectorstore",
    kind="vector_store",
    attr={"doc_type": Value(string_value="meta_doc")}
)

resource_abac_amazon = engine_pb2.Resource(
    id="amazon_docs_vectorstore",
    kind="vector_store",
    attr={"doc_type": Value(string_value="amazon_doc")}
)

In [69]:
# Resource description used for plan requests: only the kind is given, with
# no id or attributes.
plan_resource = engine_pb2.PlanResourcesInput.Resource(
    kind="vector_store",
)

# Ask Cerbos for the "retrieve" query plan of user7 and print it for inspection.
with CerbosClient("localhost:3592", tls_verify=False) as client:
    plan = client.plan_resources(action="retrieve",
                                 principal=principal_user7,
                                 resource=plan_resource)

    print(plan)

request_id: "288b4689-215d-4baf-8b47-062d901be23a"
action: "retrieve"
resource_kind: "vector_store"
filter {
  kind: KIND_CONDITIONAL
  condition {
    expression {
      operator: "eq"
      operands {
        value {
          string_value: "amazon_doc"
        }
      }
      operands {
        variable: "request.resource.attr.doc_type"
      }
    }
  }
}
cerbos_call_id: "01JE10QSD53NF54Y18CCTKC92J"



In [64]:
# The question is the same for every principal, so it is defined once.
# (The document type is derived from the Cerbos plan inside
# retrieve_data_with_abac, so no `doc_type` variable is needed here.)
query = "What is the revenue of Meta for Q3 2024?"

for principal in [principal_user6, principal_user7]:
    print("=====================")
    result = retrieve_data_with_abac(earning_calls_vectorstore,
                                     query,
                                     principal,
                                     resource_abac_meta,
                                     plan_resource)

    print(result)

Access granted for user6 to access the resource meta_docs_vectorstore.
The revenue of Meta for Q3 2024 was $40.6 billion.
Access denied for user7 to access the resource meta_docs_vectorstore.


In [71]:
# The question is the same for every principal, so it is defined once.
# (The document type is derived from the Cerbos plan inside
# retrieve_data_with_abac, so no `doc_type` variable is needed here.)
query = "What is the revenue of Amazon for Q3 2024?"

for principal in [principal_user6, principal_user7]:
    print("=====================")
    result = retrieve_data_with_abac(earning_calls_vectorstore,
                                     query,
                                     principal,
                                     resource_abac_amazon,
                                     plan_resource)

    print(result)

Access denied for user6 to access the resource amazon_docs_vectorstore.
Access granted for user7 to access the resource amazon_docs_vectorstore.
The revenue of Amazon for Q3 2024 is $158.9 billion.


### ReBAC Approach on RAG with Cerbos

In [84]:
def retrieve_data_with_abac(vector_store, query, principal, resource, resource_plan):
    """Answer ``query`` only if Cerbos allows ``principal`` to "retrieve" ``resource``.

    This cell re-defines the helper of the same name from the ABAC section.

    Args:
        vector_store: Store handed to ``generate_response``.
        query: Question to answer.
        principal: Cerbos principal making the request.
        resource: Cerbos resource checked with ``is_allowed("retrieve", ...)``.
        resource_plan: Resource description passed to ``plan_resources`` to
            obtain the query plan for the "retrieve" action.

    Returns:
        The generated answer when access is granted, otherwise an
        "Access denied ..." message string.
    """
    with CerbosClient("localhost:3592", tls_verify=False) as client:
        if not client.is_allowed("retrieve", principal, resource):
            return f"Access denied for {principal.id} to access the resource {resource.id}."

        print(f"Access granted for {principal.id} to access the resource {resource.id}.")

        # Use the caller-supplied plan resource. The body previously read a
        # notebook-level `plan_resource` variable and ignored this parameter.
        plan = client.plan_resources(action="retrieve",
                                     principal=principal,
                                     resource=resource_plan)

        # NOTE(review): assumes the plan condition is a single comparison whose
        # first operand is the literal document type; a plan with a different
        # shape would yield a different or empty value here — confirm against
        # the deployed policy.
        doc_type = plan.filter.condition.expression.operands[0].value.string_value

        return generate_response(vector_store, query, doc_type)

In [85]:
# Two "team_leader" principals, each carrying a team name and a document type.
# NOTE(review): the Cerbos policy that evaluates these attributes is not shown
# in this notebook.
principal_user8 = engine_pb2.Principal(
    id="user8",
    roles=["team_leader"],
    policy_version="default",
    attr={"team_name": Value(string_value="finance"),
         "doc_type": Value(string_value="meta_doc")}
)

principal_user9 = engine_pb2.Principal(
    id="user9",
    roles=["team_leader"],
    policy_version="default",
    attr={"team_name": Value(string_value="health"),
         "doc_type": Value(string_value="lung_cancer_doc")}
)

# Resources of kind "vector_store" tagged with the owning team and document type.
resource_rebac_finance_meta = engine_pb2.Resource(
    id="finance_vectorstore_meta",
    kind="vector_store",
    attr={"team_name": Value(string_value="finance"),
         "doc_type": Value(string_value="meta_doc")}
)

resource_rebac_finance_amazon = engine_pb2.Resource(
    id="finance_vectorstore_amazon",
    kind="vector_store",
    attr={"team_name": Value(string_value="finance"),
         "doc_type": Value(string_value="amazon_doc")}
)

# Defined for completeness; the loop below only uses the two finance resources.
resource_rebac_health = engine_pb2.Resource(
    id="health_vectorstore",
    kind="vector_store",
    attr={"team_name": Value(string_value="health"),
         "doc_type": Value(string_value="lung_cancer_doc")}
)

In [86]:
# The question is the same for every principal/resource pair, so it is defined
# once. (The document type is derived from the Cerbos plan inside
# retrieve_data_with_abac, so no `doc_type` variable is needed here.)
query = "What is the revenue of Meta for Q3 2024?"

for principal in [principal_user8, principal_user9]:
    for resource in [resource_rebac_finance_meta, resource_rebac_finance_amazon]:
        print("=====================")
        result = retrieve_data_with_abac(earning_calls_vectorstore,
                                         query,
                                         principal,
                                         resource,
                                         plan_resource)

        print(result)

Access granted for user8 to access the resource finance_vectorstore_meta.
The context does not provide information on Meta's revenue for Q3 2024.
Access denied for user8 to access the resource finance_vectorstore_amazon.
Access denied for user9 to access the resource finance_vectorstore_meta.
Access denied for user9 to access the resource finance_vectorstore_amazon.
