In [None]:
# Root folder of the project; later cells build the PDF input paths from it.
# NOTE(review): the path points under /content/drive, so this presumably assumes
# Google Drive is already mounted in Colab — no mount call is visible here; confirm.
HOME_DIR = "/content/drive/MyDrive/PhD research/LLM Privacy Policy"

In [None]:
!pip -q install llama-index llama-index-embeddings-huggingface llama-index-llms-llama-cpp pypdf
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip -q install llama-cpp-python

In [None]:
import os
import time
from glob import glob
import pandas as pd

from transformers import LlamaForCausalLM, LlamaTokenizer
from llama_index.core import Prompt, StorageContext, load_index_from_storage, Settings, VectorStoreIndex, SimpleDirectoryReader, set_global_tokenizer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.llama_cpp import LlamaCPP

from transformers import AutoTokenizer
from IPython.display import Markdown, display

# Google Ads

Here’s a list of some of the organizations we partner with:

1. Accenture
2. Atento
3. Cognizant
4. Concentrix
5. Regalix / Marketstar
6. TDCX / Teledirect
7. Teleperformance
8. TMJ
9. TTEC
10. WNS

In [None]:
# Model identifiers: Hugging Face embedding model name and the GGUF file URL for the LLM.
text_embedding_model = 'thenlper/gte-base'  # Alt: jinaai/jina-embeddings-v2-base-en
llm_url = 'https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf'

# Embedding model used to vectorise document chunks and queries.
embed_model = HuggingFaceEmbedding(model_name=text_embedding_model)

# Llama-2 chat model served through llama.cpp.
llm = LlamaCPP(
    model_url=llm_url,
    temperature=0.7,
    max_new_tokens=256,
    context_window=4096,
    # Stop generating as soon as the model starts emitting prompt-template markers.
    generate_kwargs={"stop": ["<s>", "[INST]", "[/INST]"]},
    # NOTE(review): -1 presumably asks llama.cpp to offload every layer to the GPU — confirm.
    model_kwargs={"n_gpu_layers": -1},
    verbose=True,
)

# Register the models and chunk size as llama-index global defaults.
Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size = 512

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /tmp/llama_index/models/llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:             

In [None]:
def extractQueryAnsFromPdf(query, pdf_path):
  """Answer `query` using the contents of a single PDF.

  The PDF is loaded with SimpleDirectoryReader, a new VectorStoreIndex is
  built from it on every call (the indexing time is printed), and the
  question is answered by a query engine that retrieves only the single
  most similar chunk (similarity_top_k=1).

  Args:
    query: Question to ask about the document.
    pdf_path: Path of the PDF file to load.

  Returns:
    The query engine's response converted to a string.
  """
  # Tag every loaded document with the PDF's base name rather than its full path.
  pdf_name = os.path.basename(pdf_path)
  loader = SimpleDirectoryReader(
      input_files=[pdf_path],
      file_metadata=lambda _path: {'file_name': pdf_name},
  )
  documents = loader.load_data()

  # Indexing (timed, because the index is rebuilt for every PDF).
  start_time = time.time()
  index = VectorStoreIndex.from_documents(documents, embed_model=embed_model, llm=llm)
  elapsed_time = time.time() - start_time
  print(f"Elapsed indexing time: {elapsed_time:.2f} s")

  # Use the caller's question; previously a hard-coded string silently
  # overrode the `query` argument.
  query_engine = index.as_query_engine(similarity_top_k=1, llm=llm)
  response = query_engine.query(query)
  return str(response)

In [None]:
# Smoke test: run the extraction on one policy before processing the whole folder.
query = "What are the data the company collect?"
pdf_path = f"{HOME_DIR}/Codes/Privacy Policies/12M recruiting.pdf"
response = extractQueryAnsFromPdf(query, pdf_path)
print(response)

Elapsed indexing time: 0.35 s


llama_perf_context_print:        load time =   12982.67 ms
llama_perf_context_print: prompt eval time =   12982.50 ms /    50 tokens (  259.65 ms per token,     3.85 tokens per second)
llama_perf_context_print:        eval time =    7777.33 ms /    18 runs   (  432.07 ms per token,     2.31 tokens per second)
llama_perf_context_print:       total time =   20767.72 ms /    68 tokens


 The company collects data on customer demographics, website usage, and purchases.


In [None]:
# Collect every policy PDF in the project folder. glob() returns matches in
# arbitrary, filesystem-dependent order, so sort them to make the processing
# order (and the row order of the saved results) reproducible across runs.
privacyPolicyPaths = sorted(glob(HOME_DIR + '/Codes/Privacy Policies/*.pdf'))
len(privacyPolicyPaths)

6

In [None]:
# Ask the same question of every policy PDF and keep one record per file.
query = "What are the data the company collect?"
dictAll = []
for pdf_path in privacyPolicyPaths:
  response = extractQueryAnsFromPdf(query, pdf_path)
  print(response)

  # One row per PDF: the file's base name and the model's answer.
  record = dict(file_name=os.path.basename(pdf_path), response=response)
  dictAll.append(record)

Elapsed indexing time: 0.42 s


Llama.generate: 49 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =   12982.67 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =   26535.49 ms /    59 runs   (  449.75 ms per token,     2.22 tokens per second)
llama_perf_context_print:       total time =   26560.97 ms /    60 tokens



Based on the context information provided, the company collects data on user behavior on its website, including page views, time spent on the site, and clicks on links. This information is used to improve the user experience and tailor the site to the user's preferences.
Elapsed indexing time: 11.34 s


Llama.generate: 16 prefix-match hit, remaining 644 prompt tokens to eval
llama_perf_context_print:        load time =   12982.67 ms
llama_perf_context_print: prompt eval time =  187272.20 ms /   644 tokens (  290.80 ms per token,     3.44 tokens per second)
llama_perf_context_print:        eval time =  117631.84 ms /   255 runs   (  461.30 ms per token,     2.17 tokens per second)
llama_perf_context_print:       total time =  305057.68 ms /   899 tokens



Accenture collects the following data:

1. Account data: Title, name, email, address, telephone number, and business address.
2. Individual(s) appointed by Your Organization to represent Your Organization in and during the contractual relationship and to otherwise interact with Accenture.
3. Information for entering into a contract for the provision of the applicable MxDR Service.
4. Communication with Your Organization in the provision of the applicable MxDR Service.
5. Communication with Your Organization in the event of complaints.
6. Invoicing and receiving payment for the applicable MxDR Service.
7. Exploring future contracting opportunities.
8. Performing and analyzing surveys and feedback.
9. Exercising or defending Accenture's rights in the context of legal claims/disputes with Your Organization.
10. Legitimate interest of both Accenture and Your Organization for having the MxDR Service contract in place properly performed.
111. Legitimate interest of Accenture for ensuring th

Llama.generate: 15 prefix-match hit, remaining 485 prompt tokens to eval
llama_perf_context_print:        load time =   12982.67 ms
llama_perf_context_print: prompt eval time =  125313.12 ms /   485 tokens (  258.38 ms per token,     3.87 tokens per second)
llama_perf_context_print:        eval time =  119337.03 ms /   255 runs   (  467.99 ms per token,     2.14 tokens per second)
llama_perf_context_print:       total time =  244791.61 ms /   740 tokens



The company collects the following data:























































































































































































































































Elapsed indexing time: 21.98 s


Llama.generate: 15 prefix-match hit, remaining 294 prompt tokens to eval
llama_perf_context_print:        load time =   12982.67 ms
llama_perf_context_print: prompt eval time =   74404.50 ms /   294 tokens (  253.08 ms per token,     3.95 tokens per second)
llama_perf_context_print:        eval time =   48328.46 ms /   116 runs   (  416.62 ms per token,     2.40 tokens per second)
llama_perf_context_print:       total time =  122788.47 ms /   410 tokens



The company collects the following data:

* Contact information
* Information related to employment
* Personal data available through social media (e.g. Facebook, LinkedIn, Twitter, Google)
* Payment information when paying for certain MarketStar services
* Technical information and navigational information of users who register to use their in-house applications (e.g. PartnerDynamics® and Game Plan)
* Information related to employment (e.g. name, email address, company name, address, phone number)
Elapsed indexing time: 12.80 s


Llama.generate: 15 prefix-match hit, remaining 393 prompt tokens to eval
llama_perf_context_print:        load time =   12982.67 ms
llama_perf_context_print: prompt eval time =  107092.46 ms /   393 tokens (  272.50 ms per token,     3.67 tokens per second)
llama_perf_context_print:        eval time =   15306.97 ms /    26 runs   (  588.73 ms per token,     1.70 tokens per second)
llama_perf_context_print:       total time =  122411.17 ms /   419 tokens


 Based on the provided context information, the company Concentrix collects the following data:








Elapsed indexing time: 4.40 s


Llama.generate: 15 prefix-match hit, remaining 369 prompt tokens to eval
llama_perf_context_print:        load time =   12982.67 ms
llama_perf_context_print: prompt eval time =   93773.18 ms /   369 tokens (  254.13 ms per token,     3.94 tokens per second)
llama_perf_context_print:        eval time =    9680.13 ms /    24 runs   (  403.34 ms per token,     2.48 tokens per second)
llama_perf_context_print:       total time =  103463.50 ms /   393 tokens



The company collects the following data:

Personal data: name, email, and phone number.



In [None]:
# Display the collected records (one dict per PDF with 'file_name' and 'response' keys).
dictAll

[{'file_name': '12M recruiting.pdf',
  'response': "\nBased on the context information provided, the company collects data on user behavior on its website, including page views, time spent on the site, and clicks on links. This information is used to improve the user experience and tailor the site to the user's preferences."},
 {'file_name': 'Accenture-Privacy-Statement-MDR-English.pdf',
  'response': "\nAccenture collects the following data:\n\n1. Account data: Title, name, email, address, telephone number, and business address.\n2. Individual(s) appointed by Your Organization to represent Your Organization in and during the contractual relationship and to otherwise interact with Accenture.\n3. Information for entering into a contract for the provision of the applicable MxDR Service.\n4. Communication with Your Organization in the provision of the applicable MxDR Service.\n5. Communication with Your Organization in the event of complaints.\n6. Invoicing and receiving payment for the a

In [None]:
# Build a table from the list of per-file records (one row per PDF).
df = pd.DataFrame(dictAll)

# Save as an Excel workbook inside the project folder. The path is derived from
# HOME_DIR so that moving the project only requires changing that one constant.
df.to_excel(HOME_DIR + "/Codes/informationCollection.xlsx", index=False)

In [None]:
# Display the DataFrame of per-file responses.
df

Unnamed: 0,file_name,response
0,12M recruiting.pdf,"\nBased on the context information provided, t..."
1,Accenture-Privacy-Statement-MDR-English.pdf,\nAccenture collects the following data:\n\n1....
2,cognizant-privacy-notice.pdf,\nThe company collects the following data:\n\n...
3,MarketStar Privacy Policy (Privacy Shield unde...,\nThe company collects the following data:\n\n...
4,CNX-California-Privacy-Policy-v2.0.pdf,"Based on the provided context information, th..."
5,ENG_Personal_data_protection_policy_Whistleblo...,\nThe company collects the following data:\n\n...
