In [None]:
# Base directory of the project. The path sits under /content/drive/MyDrive,
# presumably a Google Drive mounted in Colab -- the mount step is not shown here.
# NOTE(review): the cells visible in this notebook repeat this path literally
# instead of referencing HOME_DIR.
HOME_DIR = "/content/drive/MyDrive/PhD research/LLM Privacy Policy"

In [None]:
!pip -q install llama-index llama-index-embeddings-huggingface llama-index-llms-llama-cpp pypdf
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip -q install llama-cpp-python

In [None]:
import os
import time
from glob import glob
import pandas as pd

from transformers import LlamaForCausalLM, LlamaTokenizer
from llama_index.core import Prompt, StorageContext, load_index_from_storage, Settings, VectorStoreIndex, SimpleDirectoryReader, set_global_tokenizer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.llama_cpp import LlamaCPP

from transformers import AutoTokenizer
from IPython.display import Markdown, display

# LLM Model

In [None]:
# Model identifiers: the embedding model is a Hugging Face Hub id, the LLM is a
# direct download URL of a quantized Llama-2 chat model in GGUF format.
text_embedding_model = 'thenlper/gte-base'  # Alternative: jinaai/jina-embeddings-v2-base-en
llm_url = 'https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf'

# Embedding model used to vectorize document chunks and queries.
embed_model = HuggingFaceEmbedding(model_name=text_embedding_model)

# Local llama.cpp-backed LLM.
llm = LlamaCPP(
    model_url=llm_url,
    temperature=0.7,
    max_new_tokens=256,
    context_window=4096,
    # Stop generation when the model starts emitting Llama-2 chat control tokens.
    generate_kwargs={"stop": ["<s>", "[INST]", "[/INST]"]},
    # -1 requests offloading every layer to the GPU (needs a GPU-enabled build).
    model_kwargs={"n_gpu_layers": -1},
    verbose=True,
)

# Register the models and chunk size as global LlamaIndex defaults
# (this takes the place of building a ServiceContext).
Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size = 512

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /tmp/llama_index/models/llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:             

In [None]:
def extractQueryAnsFromPdf(query, pdf_path):
  """Answer a question about a single PDF using a freshly built vector index.

  The PDF is loaded, indexed with the module-level `embed_model`/`llm`, and the
  question is answered from the single most similar chunk (similarity_top_k=1).
  The time spent building the index is printed.

  Args:
    query: Question to ask about the document. If it is None or blank, the
      question "What is the company name?" is used (the question this function
      previously always asked, regardless of `query`).
    pdf_path: Path to the PDF file to load and index.

  Returns:
    The query engine's response converted to `str`.
  """
  # Tag every loaded document with the bare file name instead of the full path.
  pdf_name = os.path.basename(pdf_path)
  loader = SimpleDirectoryReader(
      input_files=[pdf_path],
      file_metadata=lambda _path: {'file_name': pdf_name},
  )
  documents = loader.load_data()

  # Indexing (timed with a monotonic clock meant for measuring durations)
  start_time = time.perf_counter()
  index = VectorStoreIndex.from_documents(documents, embed_model=embed_model, llm=llm)
  elapsed_time = time.perf_counter() - start_time
  print(f"Elapsed indexing time: {elapsed_time:.2f} s")

  # Honor the caller's question; fall back to the historical default when empty.
  if query is not None and str(query).strip():
    query_str = query
  else:
    query_str = "What is the company name?"

  query_engine = index.as_query_engine(similarity_top_k=1, llm=llm)
  response = query_engine.query(query_str)
  return str(response)

In [None]:
# Collect every privacy-policy PDF in the project folder.
# glob() returns matches in arbitrary (filesystem-dependent) order; sorting makes
# later slices such as privacyPolicyPaths[:10] select the same files on every run.
privacyPolicyPaths = sorted(glob("/content/drive/MyDrive/PhD research/LLM Privacy Policy/Codes/automated privacy polixy/*.pdf"))
# Show how many PDFs were found.
len(privacyPolicyPaths)

27

In [None]:
# Collect one record per processed policy: source file name plus the LLM's answer.
dictAll = []
# The question is the same for every document, so define it once outside the
# loop (the original literal was misspelled "Compnay").
query = "What is the company name?"
for pdf_path in privacyPolicyPaths[:10]:  # only the first 10 PDFs are processed
  response = extractQueryAnsFromPdf(query, pdf_path)
  print(response)

  dictAll.append({'file_name': os.path.basename(pdf_path), 'companyNameResponse': response})

Elapsed indexing time: 3.88 s


llama_perf_context_print:        load time =  293879.83 ms
llama_perf_context_print: prompt eval time =  293878.64 ms /   609 tokens (  482.56 ms per token,     2.07 tokens per second)
llama_perf_context_print:        eval time =    5141.19 ms /     7 runs   (  734.46 ms per token,     1.36 tokens per second)
llama_perf_context_print:       total time =  299025.36 ms /   616 tokens



Federal Credit Union
Elapsed indexing time: 24.11 s


Llama.generate: 15 prefix-match hit, remaining 648 prompt tokens to eval
llama_perf_context_print:        load time =  293879.83 ms
llama_perf_context_print: prompt eval time =  291750.99 ms /   648 tokens (  450.23 ms per token,     2.22 tokens per second)
llama_perf_context_print:        eval time =    1504.98 ms /     2 runs   (  752.49 ms per token,     1.33 tokens per second)
llama_perf_context_print:       total time =  293259.84 ms /   650 tokens


 Jacobs
Elapsed indexing time: 85.24 s


Llama.generate: 15 prefix-match hit, remaining 647 prompt tokens to eval
llama_perf_context_print:        load time =  293879.83 ms
llama_perf_context_print: prompt eval time =  299094.20 ms /   647 tokens (  462.28 ms per token,     2.16 tokens per second)
llama_perf_context_print:        eval time =    2308.22 ms /     3 runs   (  769.41 ms per token,     1.30 tokens per second)
llama_perf_context_print:       total time =  301406.33 ms /   650 tokens


 Parsons
Elapsed indexing time: 16.68 s


Llama.generate: 16 prefix-match hit, remaining 524 prompt tokens to eval
llama_perf_context_print:        load time =  293879.83 ms
llama_perf_context_print: prompt eval time =  244744.66 ms /   524 tokens (  467.07 ms per token,     2.14 tokens per second)
llama_perf_context_print:        eval time =    5061.84 ms /     6 runs   (  843.64 ms per token,     1.19 tokens per second)
llama_perf_context_print:       total time =  249812.73 ms /   530 tokens


 Coremont LLP.
Elapsed indexing time: 20.72 s


Llama.generate: 15 prefix-match hit, remaining 512 prompt tokens to eval
llama_perf_context_print:        load time =  293879.83 ms
llama_perf_context_print: prompt eval time =  229411.23 ms /   512 tokens (  448.07 ms per token,     2.23 tokens per second)
llama_perf_context_print:        eval time =   51147.16 ms /    64 runs   (  799.17 ms per token,     1.25 tokens per second)
llama_perf_context_print:       total time =  280602.03 ms /   576 tokens



The company name is not explicitly mentioned in the provided text. However, based on the context, it is likely that the company is the website owner or operator, as they are referred to as "We" and "Our" throughout the privacy policy. Therefore, the answer to the query is " unknown".
Elapsed indexing time: 2.98 s


Llama.generate: 15 prefix-match hit, remaining 624 prompt tokens to eval
llama_perf_context_print:        load time =  293879.83 ms
llama_perf_context_print: prompt eval time =  282580.18 ms /   624 tokens (  452.85 ms per token,     2.21 tokens per second)
llama_perf_context_print:        eval time =    6934.28 ms /     5 runs   ( 1386.86 ms per token,     0.72 tokens per second)
llama_perf_context_print:       total time =  289521.22 ms /   629 tokens


 The Online payment service.
Elapsed indexing time: 17.29 s


Llama.generate: 18 prefix-match hit, remaining 606 prompt tokens to eval
llama_perf_context_print:        load time =  293879.83 ms
llama_perf_context_print: prompt eval time =  271392.27 ms /   606 tokens (  447.84 ms per token,     2.23 tokens per second)
llama_perf_context_print:        eval time =    2995.75 ms /     4 runs   (  748.94 ms per token,     1.34 tokens per second)
llama_perf_context_print:       total time =  274392.84 ms /   610 tokens


 Accenture.
Elapsed indexing time: 21.62 s


Llama.generate: 18 prefix-match hit, remaining 633 prompt tokens to eval
llama_perf_context_print:        load time =  293879.83 ms
llama_perf_context_print: prompt eval time =  284143.44 ms /   633 tokens (  448.88 ms per token,     2.23 tokens per second)
llama_perf_context_print:        eval time =   21219.62 ms /     3 runs   ( 7073.21 ms per token,     0.14 tokens per second)
llama_perf_context_print:       total time =  305373.08 ms /   636 tokens


 Deloitte
Elapsed indexing time: 8.05 s


Llama.generate: 15 prefix-match hit, remaining 486 prompt tokens to eval
llama_perf_context_print:        load time =  293879.83 ms
llama_perf_context_print: prompt eval time =  216865.87 ms /   486 tokens (  446.23 ms per token,     2.24 tokens per second)
llama_perf_context_print:        eval time =    3139.75 ms /     4 runs   (  784.94 ms per token,     1.27 tokens per second)
llama_perf_context_print:       total time =  220009.68 ms /   490 tokens


 KPMG.
Elapsed indexing time: 2.97 s


Llama.generate: 15 prefix-match hit, remaining 594 prompt tokens to eval
llama_perf_context_print:        load time =  293879.83 ms
llama_perf_context_print: prompt eval time =  268360.33 ms /   594 tokens (  451.79 ms per token,     2.21 tokens per second)
llama_perf_context_print:        eval time =   12912.48 ms /    16 runs   (  807.03 ms per token,     1.24 tokens per second)
llama_perf_context_print:       total time =  281284.25 ms /   610 tokens


 The company name is Fiat Chrysler Automobiles (FCA).


In [None]:
# Display the collected records (a list of dicts with 'file_name' and 'companyNameResponse').
dictAll

[{'file_name': 'US--EngCAOnlinesupp_8_12.pdf',
  'companyNameResponse': '\nFederal Credit Union'},
 {'file_name': 'jacobs-recruitment-privacy-notice-april-2019.pdf',
  'companyNameResponse': ' Jacobs'},
 {'file_name': 'Parsons-Code-of-Conduct_106895955_2_04-(003).pdf',
  'companyNameResponse': ' Parsons'},
 {'file_name': 'Coremont-Website-Cookies-and-Privacy-Policy-Jun-2023.pdf',
  'companyNameResponse': ' Coremont LLP.'},
 {'file_name': 'RAMP-Global-Privacy-Policy-2024.pdf',
  'companyNameResponse': '\nThe company name is not explicitly mentioned in the provided text. However, based on the context, it is likely that the company is the website owner or operator, as they are referred to as "We" and "Our" throughout the privacy policy. Therefore, the answer to the query is " unknown".'},
 {'file_name': 'Website-Privacy-Policy.pdf',
  'companyNameResponse': ' The Online payment service.'},
 {'file_name': 'Accenture-Privacy-Statement-MDR-English.pdf',
  'companyNameResponse': ' Accenture.'

In [None]:
# Build a DataFrame from the list of per-file records.
df = pd.DataFrame(dictAll)

# Write the table to an Excel workbook (.xlsx), omitting the row index.
output_path = "/content/drive/MyDrive/PhD research/LLM Privacy Policy/Codes/first10_companyName.xlsx"
df.to_excel(output_path, index=False)

In [None]:
# Display the results table built from dictAll.
df

Unnamed: 0,file_name,companyNameResponse
0,US--EngCAOnlinesupp_8_12.pdf,\nFederal Credit Union
1,jacobs-recruitment-privacy-notice-april-2019.pdf,Jacobs
2,Parsons-Code-of-Conduct_106895955_2_04-(003).pdf,Parsons
3,Coremont-Website-Cookies-and-Privacy-Policy-Ju...,Coremont LLP.
4,RAMP-Global-Privacy-Policy-2024.pdf,\nThe company name is not explicitly mentioned...
5,Website-Privacy-Policy.pdf,The Online payment service.
6,Accenture-Privacy-Statement-MDR-English.pdf,Accenture.
7,us-deloitte-privacy-notice-7-5-22.pdf,Deloitte
8,privacy-notice-and-terms-and-conditions.pdf,KPMG.
9,US--EngCAOnlinesupp_8_12 (1).pdf,The company name is Fiat Chrysler Automobiles...
