In [1]:
!pip install langchain-experimental openai presidio-analyzer presidio-anonymizer spacy Faker faiss-cpu tiktoken


Collecting langchain-experimental
  Using cached langchain_experimental-0.0.51-py3-none-any.whl (167 kB)
Collecting presidio-analyzer
  Using cached presidio_analyzer-2.2.353-py3-none-any.whl (85 kB)
Collecting presidio-anonymizer
  Using cached presidio_anonymizer-2.2.353-py3-none-any.whl (31 kB)
Collecting faiss-cpu
  Using cached faiss_cpu-1.7.4-cp39-cp39-win_amd64.whl (10.8 MB)
Collecting langchain<0.2.0,>=0.1.5
  Using cached langchain-0.1.7-py3-none-any.whl (815 kB)
Collecting phonenumbers<9.0.0,>=8.12
  Using cached phonenumbers-8.13.30-py2.py3-none-any.whl (2.6 MB)
Collecting pycryptodome>=3.10.1
  Using cached pycryptodome-3.20.0-cp35-abi3-win_amd64.whl (1.8 MB)
Installing collected packages: pycryptodome, phonenumbers, langchain, presidio-anonymizer, presidio-analyzer, langchain-experimental, faiss-cpu
  Attempting uninstall: langchain
    Found existing installation: langchain 0.1.4
    Uninstalling langchain-0.1.4:
      Successfully uninstalled langchain-0.1.4
Successfully

In [43]:
document_content = """Date: October 19, 2021
 Witness: John Doe
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is John Doe and on October 19, 2021, my wallet was stolen in the vicinity of Kilmarnock during a bike trip. This wallet contains some very important things to me.

 Firstly, the wallet contains my credit card with number 4111 1111 1111 1111, which is registered under my name and linked to my bank account, PL61109010140000071219812874.

 Additionally, the wallet had a driver's license - DL No: 999000680 issued to my name. It also houses my Social Security Number, 602-76-4532.

 What's more, I had my polish identity card there, with the number ABC123456.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at 9:30 AM.

 In case any information arises regarding my wallet, please reach out to me on my phone number, 999-888-7777, or through my personal email, johndoe@example.com.

 Please consider this information to be highly confidential and respect my privacy.

 The bank has been informed about the stolen credit card and necessary actions have been taken from their end. They will be reachable at their official email, support@bankname.com.
 My representative there is Victoria Cherry (her business phone: 987-654-3210).

 Thank you for your assistance,

 John Doe"""

In [44]:
from langchain.schema import Document
documents=[Document(page_content=document_content)]

In [45]:
import re

def print_color_pii(string):
    colored_string=re.sub(
       r"(<[^>]*>)",lambda m: "\033[31m"+ m.group(1)+"\033[0m", string
    )
    print(colored_string)

In [46]:
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
    
anonymizer=PresidioReversibleAnonymizer(
    add_default_faker_operators=False,
)
print_color_pii(anonymizer.anonymize(document_content))

Date: [31m<DATE_TIME>[0m
 Witness: [31m<PERSON>[0m
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is [31m<PERSON>[0m and on [31m<DATE_TIME>[0m, my wallet was stolen in the vicinity of [31m<LOCATION>[0m during a bike trip. This wallet contains some very important things to me.

 Firstly, the wallet contains my credit card with number [31m<CREDIT_CARD>[0m, which is registered under my name and linked to my bank account, [31m<IBAN_CODE>[0m.

 Additionally, the wallet had a driver's license - DL No: [31m<US_DRIVER_LICENSE>[0m issued to my name. It also houses my Social Security Number, [31m<US_SSN>[0m.

 What's more, I had my polish identity card there, with the number ABC123456.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at [31m<DATE_TIME_2>[0m.

 In case any information arises regarding my wallet, please reach out to me on my phone number, [31m<PHONE_NUMBER>[0m,

In [47]:
import pprint

pprint.pprint(anonymizer.deanonymizer_mapping)

{'CREDIT_CARD': {'<CREDIT_CARD>': '4111 1111 1111 1111'},
 'DATE_TIME': {'<DATE_TIME>': 'October 19, 2021', '<DATE_TIME_2>': '9:30 AM'},
 'EMAIL_ADDRESS': {'<EMAIL_ADDRESS>': 'johndoe@example.com',
                   '<EMAIL_ADDRESS_2>': 'support@bankname.com'},
 'IBAN_CODE': {'<IBAN_CODE>': 'PL61109010140000071219812874'},
 'LOCATION': {'<LOCATION>': 'Kilmarnock'},
 'PERSON': {'<PERSON>': 'John Doe', '<PERSON_2>': 'Victoria Cherry'},
 'PHONE_NUMBER': {'<PHONE_NUMBER>': '999-888-7777'},
 'UK_NHS': {'<UK_NHS>': '987-654-3210'},
 'US_DRIVER_LICENSE': {'<US_DRIVER_LICENSE>': '999000680'},
 'US_SSN': {'<US_SSN>': '602-76-4532'}}


In [48]:
# Define the regex pattern in a Presidio `Pattern` object:
from presidio_analyzer import Pattern, PatternRecognizer

polish_id_pattern = Pattern(
    name="polish_id_pattern",
    regex="[A-Z]{3}\d{6}",
    score=1,
)
time_pattern = Pattern(
    name="time_pattern",
    regex="(1[0-2]|0?[1-9]):[0-5][0-9] (AM|PM)",
    score=1,
)

# Define the recognizer with one or more patterns
polish_id_recognizer = PatternRecognizer(
    supported_entity="POLISH_ID", patterns=[polish_id_pattern]
)
time_recognizer = PatternRecognizer(supported_entity="TIME", patterns=[time_pattern])

In [49]:
anonymizer.add_recognizer(polish_id_recognizer)
anonymizer.add_recognizer(time_recognizer)

In [50]:
anonymizer.reset_deanonymizer_mapping()

In [51]:
print_color_pii(anonymizer.anonymize(document_content))

Date: [31m<DATE_TIME>[0m
 Witness: [31m<PERSON>[0m
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is [31m<PERSON>[0m and on [31m<DATE_TIME>[0m, my wallet was stolen in the vicinity of [31m<LOCATION>[0m during a bike trip. This wallet contains some very important things to me.

 Firstly, the wallet contains my credit card with number [31m<CREDIT_CARD>[0m, which is registered under my name and linked to my bank account, [31m<IBAN_CODE>[0m.

 Additionally, the wallet had a driver's license - DL No: [31m<US_DRIVER_LICENSE>[0m issued to my name. It also houses my Social Security Number, [31m<US_SSN>[0m.

 What's more, I had my polish identity card there, with the number [31m<POLISH_ID>[0m.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at [31m<TIME>[0m.

 In case any information arises regarding my wallet, please reach out to me on my phone number, [31m<PHONE_NUMBER>

In [52]:
pprint.pprint(anonymizer.deanonymizer_mapping)

{'CREDIT_CARD': {'<CREDIT_CARD>': '4111 1111 1111 1111'},
 'DATE_TIME': {'<DATE_TIME>': 'October 19, 2021'},
 'EMAIL_ADDRESS': {'<EMAIL_ADDRESS>': 'johndoe@example.com',
                   '<EMAIL_ADDRESS_2>': 'support@bankname.com'},
 'IBAN_CODE': {'<IBAN_CODE>': 'PL61109010140000071219812874'},
 'LOCATION': {'<LOCATION>': 'Kilmarnock'},
 'PERSON': {'<PERSON>': 'John Doe', '<PERSON_2>': 'Victoria Cherry'},
 'PHONE_NUMBER': {'<PHONE_NUMBER>': '999-888-7777'},
 'POLISH_ID': {'<POLISH_ID>': 'ABC123456'},
 'TIME': {'<TIME>': '9:30 AM'},
 'UK_NHS': {'<UK_NHS>': '987-654-3210'},
 'US_DRIVER_LICENSE': {'<US_DRIVER_LICENSE>': '999000680'},
 'US_SSN': {'<US_SSN>': '602-76-4532'}}


In [53]:
anonymizer=PresidioReversibleAnonymizer(
    add_default_faker_operators=True,
    faker_seed=42,
)

anonymizer.add_recognizer(time_recognizer)

print_color_pii(anonymizer.anonymize(document_content))

Date: 1986-04-18
 Witness: Brian Cox DVM
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is Brian Cox DVM and on 1986-04-18, my wallet was stolen in the vicinity of New Rita during a bike trip. This wallet contains some very important things to me.

 Firstly, the wallet contains my credit card with number 6584801845146275, which is registered under my name and linked to my bank account, GB78GSWK37672423884969.

 Additionally, the wallet had a driver's license - DL No: 781802744 issued to my name. It also houses my Social Security Number, 719-88-1170.

 What's more, I had my polish identity card there, with the number ABC123456.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at [31m<TIME>[0m.

 In case any information arises regarding my wallet, please reach out to me on my phone number, 001-103-413-1647x525, or through my personal email, jamesmichael@example.com.

 Please consider thi

In [54]:
from faker import Faker

fake = Faker()


def fake_polish_id(_=None):
    return fake.bothify(text="???######").upper()


fake_polish_id()

'EVM147406'

In [55]:
from faker import Faker

fake=Faker()

def fake_time(_=None):
    return fake.time(pattern="%I:%M %p")
fake_time()

'06:19 AM'

In [56]:
from presidio_anonymizer.entities import OperatorConfig

new_operators = {
    "POLISH_ID": OperatorConfig("custom", {"lambda": fake_polish_id}),
    "TIME": OperatorConfig("custom", {"lambda": fake_time}),
}

anonymizer.add_operators(new_operators)

In [57]:
anonymizer.reset_deanonymizer_mapping()
print_color_pii(anonymizer.anonymize(document_content))

Date: 1974-12-26
 Witness: Jimmy Murillo
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is Jimmy Murillo and on 1974-12-26, my wallet was stolen in the vicinity of South Dianeshire during a bike trip. This wallet contains some very important things to me.

 Firstly, the wallet contains my credit card with number 213108121913614, which is registered under my name and linked to my bank account, GB17DBUR01326773602606.

 Additionally, the wallet had a driver's license - DL No: 532311310 issued to my name. It also houses my Social Security Number, 690-84-1613.

 What's more, I had my polish identity card there, with the number ABC123456.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at 07:39 AM.

 In case any information arises regarding my wallet, please reach out to me on my phone number, 001-737-631-1656, or through my personal email, briannasmith@example.net.

 Please consider this in

In [60]:
pprint.pprint(anonymizer.deanonymizer_mapping)

{'CREDIT_CARD': {'6584801845146275': '4111 1111 1111 1111'},
 'DATE_TIME': {'1986-04-18': 'October 19, 2021'},
 'EMAIL_ADDRESS': {'blakeerik@example.com': 'support@bankname.com',
                   'jamesmichael@example.com': 'johndoe@example.com'},
 'IBAN_CODE': {'GB78GSWK37672423884969': 'PL61109010140000071219812874'},
 'LOCATION': {'New Rita': 'Kilmarnock'},
 'PERSON': {'Brian Cox DVM': 'John Doe', 'Cristian Santos': 'Victoria Cherry'},
 'PHONE_NUMBER': {'001-103-413-1647x525': '999-888-7777'},
 'TIME': {'01:04 AM': '9:30 AM'},
 'UK_NHS': {'2812140441': '987-654-3210'},
 'US_DRIVER_LICENSE': {'781802744': '999000680'},
 'US_SSN': {'719-88-1170': '602-76-4532'}}


# QA System using PII Anonymization

In [72]:
anonymizer = PresidioReversibleAnonymizer(
    # Faker seed is used here to make sure the same fake data is generated for the test purposes
    # In production, it is recommended to remove the faker_seed parameter (it will default to None)
    faker_seed=42,
)

anonymizer.add_recognizer(polish_id_recognizer)
anonymizer.add_recognizer(time_recognizer)

anonymizer.add_operators(new_operators)

In [62]:
!pip install --upgrade openai



In [74]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-base-en-v1.5"
# model_kwargs = {'device': 'cuda'}
encode_kwargs = {"normalize_embeddings": True}  # set True to compute cosine similarity
local_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    # model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    query_instruction="Represent this sentence for searching relevant passages:",
)

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [75]:
documents = [Document(page_content=document_content)]

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.split_documents(documents)

docsearch = FAISS.from_documents(chunks, local_embeddings)
retriever = docsearch.as_retriever()

In [77]:
GPT_MODEL = "gpt-3.5-turbo"

template = """Answer the question based only on the following context:
{context}

Question: {anonymized_question}
"""
prompt = ChatPromptTemplate.from_template(template)

model = GPT_MODEL
temperature=0

In [78]:
from langchain.prompts.prompt import PromptTemplate
from langchain.schema import format_document

DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")


def _combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"
):
    doc_strings = [format_document(doc, document_prompt) for doc in docs]
    return document_separator.join(doc_strings)


chain_with_deanonymization = (
    RunnableParallel({"question": RunnablePassthrough()})
    | {
        "context": itemgetter("question")
        | retriever
        | _combine_documents
        | anonymizer.anonymize,
        "anonymized_question": lambda x: anonymizer.anonymize(x["question"]),
    }
    | prompt
    | model
    | StrOutputParser()
    | RunnableLambda(anonymizer.deanonymize)
)

TypeError: Expected a Runnable, callable or dict.Instead got an unsupported type: <class 'str'>

In [64]:
GPT_MODEL = "gpt-3.5-turbo"
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

for doc in documents:
    doc.page_content=anonymizer.anonymize(doc.page_content)
    
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks=text_splitter.split_documents(documents)

embeddings=OpenAIEmbeddings(openai_api_key=api_key)
docsearch=FAISS.from_documents(chunks, embeddings)
retriever=docsearch.as_retriever()

In [67]:
GPT_MODEL = "gpt-3.5-turbo"

from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import (
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_openai import ChatOpenAI

# 6. Create anonymizer chain
template = """Answer the question based only on the following context:
{context}

Question: {anonymized_question}
"""
prompt = ChatPromptTemplate.from_template(template)

model = GPT_MODEL
temperature=0


_inputs = RunnableParallel(
    question=RunnablePassthrough(),
    # It is important to remember about question anonymization
    anonymized_question=RunnableLambda(anonymizer.anonymize),
)

anonymizer_chain = (
    _inputs
    | {
        "context": itemgetter("anonymized_question") | retriever,
        "anonymized_question": itemgetter("anonymized_question"),
    }
    | prompt
    | model
    | StrOutputParser()
)

TypeError: Expected a Runnable, callable or dict.Instead got an unsupported type: <class 'str'>

In [68]:
anonymize_chain.invoke(
    "Where did the theft occur, at what time, and who was it"
)

NameError: name 'anonymize_chain' is not defined

In [69]:
chain_with_deanonymization = anonymizer_chain |
RunnableLambda(anonymizer.deanonymize)

print(
    chain_with_deanonymization.invoke(
        "Where did the theft occur, at what time, and who was it"
    )

)

SyntaxError: invalid syntax (Temp/ipykernel_15908/2465647413.py, line 1)

In [70]:
print(
    chain_with_deanonymization.invoke(
        "What was the content of the wallet in detail")
    )

The content of the wallet included:
1. Credit card number: 4111 1111 1111 1111
2. Bank account number: PL61109010140000071219812874
3. Driver's license number: 999000680
4. Social Security Number: 602-76-4532
5. Polish identity card number: ABC123456

NameError: name 'chain_with_deanonymization' is not defined

In [71]:
print(chain_with_deanonymization.invoke("Whose phone number is it?"))


The phone number 999-888-7777 belongs to John Doe.

NameError: name 'chain_with_deanonymization' is not defined