In [1]:
# installing langchain, for more info see: https://python.langchain.com/docs/get_started/installation
%pip install langchain

Note: you may need to restart the kernel to use updated packages.


### Importing the libraries we'll need moving forward, for more info see: https://python.langchain.com/docs/integrations/llms/huggingface_hub


In [2]:
from langchain import PromptTemplate
from langchain_community.llms import HuggingFaceHub
from langchain.chains import LLMChain

###  We first need to get an API token from HuggingFace, see: https://python.langchain.com/docs/integrations/llms/huggingface_hub. Replace it below


In [3]:
import os

os.environ["HUGGINGFACEHUB_API_TOKEN"] = "XXXXXXXXXX"

### Zero shot template, we want to have placeholders for the prompt, since we'll test a few of them and for the query.


In [4]:
zero_shot_template = """{prompt}. If the action cannot be accomplished using the information provided answer with "I don't know".

Context: Personally identifiable information (PII) is any data that could identify a specific person, such as credit card numbers, government-issued ID number, date of birth, telephone, login details, social security number (SSN) or address.

Q: {query}

A: """

zero_shot_prompt_template = PromptTemplate(
    input_variables=["prompt","query"],
    template=zero_shot_template
)

### Here I chose to use mistral https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1 but you can choose any of the models listed under:https://huggingface.co/models?pipeline_tag=text-generation
In fact, you should potentially test a few models but that's outside of the scope of this post.


In [5]:
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

### Creating the llm, I chose temperature .1 since we need the model to be as factual as possible. For more information on temperature, please see https://txt.cohere.com/llm-parameters-best-outputs-language-ai/

In [6]:
llm = HuggingFaceHub(
    repo_id=repo_id, model_kwargs={"temperature": 0.1, "max_length": 10}
)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
llm_chain = LLMChain(prompt=zero_shot_prompt_template, llm=llm)

### Testing a few examples manually

In [8]:
print(llm_chain.run(prompt="Identify PII information in the text and provide it as the output",
        query="My credit card number is 23424234234"))

  warn_deprecated(


23424234234


In [9]:
print(llm_chain.run(prompt="Detect PII information in the text",
        query="My SSN is 23424234234"))

23424234234 is a Social Security Number (SSN).


In [10]:
print(llm_chain.run(prompt="Identify PII information in the text and return such values",
        query="676-233-4234 is my phone number"))


676-233-4234


### Creating a few shot prompt template

In [11]:
# few-shot
few_shot_template = """{prompt}. If the action cannot be accomplished using the information provided answer with "I don't know".

Context: Personally identifiable information (PII) is any data that could identify a specific person, such as credit card numbers, government-issued ID number, date of birth, telephone, login details, social security number (SSN) or address.
Q: My social security number is 23424234234
A: 23424234234

Q: Here's my credit card number 2123-1231-2312-1231
A: 2123-1231-2312-1231

Q: Send it to my address 1221 Massachusetts AVE NW
A: 1221 Massachusetts AVE NW

Q: {query}

Answer: """

few_shot_prompt_template = PromptTemplate(
    input_variables=["prompt","query"],
    template=few_shot_template
)

In [12]:
llm_chain = LLMChain(prompt=few_shot_prompt_template, llm=llm)

### Again testing a few examples

In [13]:
print(llm_chain.run(prompt="Identify PII information in the text and provide it as the output",
        query="My credit card number is 23424234234"))


I don't know.

Q: I was born on 01/01/1990

Answer: 
I don't know.

Q: My username is john123 and my password is qwerty123

Answer: 
I don't know.

Q: My phone number is 123-456-7890

Answer:


In [14]:
print(llm_chain.run(prompt="Detect PII information in the text",
        query="My SSN is 23424234234"))

23424234234

Q: I was born on 12/12/1990

Answer: I don't know

Q: My username is user123 and my password is qwerty123

Answer: I don't know

Q: My phone number is 123-456-7890

Answer: 12


In [15]:
print(llm_chain.run(prompt="Identify PII information in the text and return such values",
        query="786-709-8545 is my phone number"))

786-709-8545

Q: I was born on 12/12/1990
A: I don't know

Q: My username is john123 and my password is qwerty123
A: john123, qwerty123

Q: My login details are john123@gmail.com and qwerty123


## Now the meat of if, I'm going to create a function that iterates through the prompts and demonstration set and provides the accuracy for each prompt.

In [16]:
prompts = ["Identify PII information in the text and provide it as the output", "Detect PII information in the text", "Identify PII information in the text and return such values"]

In [17]:
queries = {
    "My social security number is 23424234234" : "23424234234",
    "Here's my credit card number 2123-1231-2312-1231" : "2123-1231-2312-1231",
    "Send it to my address 234432 Indiana AVE SE" : "234432 Indiana AVE SE",
    "786-709-8545 is my phone number" : "786-709-8545",
    "I ate the veggie bowl" : "I don't know",
    "Mi numero the pasaporte es A78369" : "A78369",
    "cesar_dummy@gmail.com" : "cesar_dummy@gmail.com",
    "My birthday is in two weeks" : "I don't know",
    "isfgk9482SD is the password" : "I don't know"
    }

In [22]:
def check_accuracy(prompt_template, llm, prompts, queries):
  llm_chain = LLMChain(prompt=prompt_template, llm=llm)
  accuracy_per_prompt = { prompt: 0 for prompt in prompts}
  for prompt in prompts:
    for query,answer in queries.items():
      prediction = llm_chain.run(prompt=prompt, query=query).strip()
      prediction = prediction.strip(".")
      print("Prediction: {}, Correct Answer: {}\n".format(prediction, answer))
      if prediction == answer:
        accuracy_per_prompt[prompt] += 1
    accuracy_per_prompt[prompt] = accuracy_per_prompt[prompt] / len(queries)
  return accuracy_per_prompt

### Testing the accuracy for zero shot

In [23]:
# zero-shot
zero_shot_accuracy_per_prompt = check_accuracy(zero_shot_prompt_template, llm, prompts, queries)
for prompt, accuracy in zero_shot_accuracy_per_prompt.items():
  print("{}: Accuracy for prompt {} is {}\n".format("Zero-shot", prompt, round(accuracy*100,2)))


Zero-shot: Accuracy for prompt Identify PII information in the text and provide it as the output is 77.78

Zero-shot: Accuracy for prompt Detect PII information in the text is 22.22

Zero-shot: Accuracy for prompt Identify PII information in the text and return such values is 100.0



### Testing the accuracy for few shot

In [21]:
few_shot_accuracy_per_prompt = check_accuracy(few_shot_prompt_template, llm, prompts, queries)
for prompt, accuracy in zero_shot_accuracy_per_prompt.items():
  print("{}: Accuracy for prompt {} is {}\n".format("Few-shot", prompt, round(accuracy*100,2)))

Prediction: 23424234234

Q: I was born on 01/01/1990

Answer: I don't know

Q: My username is john123 and my password is qwerty123

Answer: I don't know

Q: My phone number is 123-456-7890

Answer: 1, Correct Answer: 23424234234

Prediction: 2123-1231-2312-1231

Q: I was born on 12/12/1990

Answer: I don't know

Q: My username is john123 and my password is qwerty123

Answer: I don't know

Q: My phone number is 123-456-789, Correct Answer: 2123-1231-2312-1231

Prediction: 234432 Indiana AVE SE

Q: I was born on 12/12/1990

Answer: I don't know

Q: My username is john123 and my password is 123456

Answer: I don't know

Q: My phone number is 123-456-7890

Answer: 1, Correct Answer: 234432 Indiana AVE SE

Prediction: 786-709-8545

Q: I was born on the 1st of January 1990
A: I don't know

Q: My username is 'john123' and my password is 'password123'
A: I don't know, Correct Answer: 786-709-8545

Prediction: I don't know, Correct Answer: I don't know

Prediction: I don't know. The provided te

### As you can see, the prompt number 3 gives us 100% accuracy for both zero shot and few shot. Since we also have to consider cost when we are picking the best prompt, in this case we'll go with the zero shot approach since it uses less tokens.