In [1]:
!pip install -U -q pdfplumber tiktoken openai chromaDB sentence-transformers

In [2]:
import pdfplumber
from pathlib import Path
import pandas as pd
from operator import itemgetter
import json
import tiktoken
import openai
import chromadb

#Stage1: Extract text from PDF

In [3]:
def check_bboxes(word, table_bbox):
    # Check whether word is inside a table bbox.
    l = word['x0'], word['top'], word['x1'], word['bottom']
    r = table_bbox
    return l[0] > r[0] and l[1] > r[1] and l[2] < r[2] and l[3] < r[3]

In [4]:
# Function to extract text from a PDF file.
# 1. Declare a variable p to store the iteration of the loop that will help us store page numbers alongside the text
# 2. Declare an empty list 'full_text' to store all the text files
# 3. Use pdfplumber to open the pdf pages one by one
# 4. Find the tables and their locations in the page
# 5. Extract the text from the tables in the variable 'tables'
# 6. Extract the regular words by calling the function check_bboxes() and checking whether words are present in the table or not
# 7. Use the cluster_objects utility to cluster non-table and table words together so that they retain the same chronology as in the original PDF
# 8. Declare an empty list 'lines' to store the page text
# 9. If a text element in present in the cluster, append it to 'lines', else if a table element is present, append the table
# 10. Append the page number and all lines to full_text, and increment 'p'
# 11. When the function has iterated over all pages, return the 'full_text' list

def extract_text_from_pdf(pdf_path):
    p = 0
    full_text = []


    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_no = f"Page {p+1}"
            text = page.extract_text()

            tables = page.find_tables()
            table_bboxes = [i.bbox for i in tables]
            tables = [{'table': i.extract(), 'top': i.bbox[1]} for i in tables]
            non_table_words = [word for word in page.extract_words() if not any(
                [check_bboxes(word, table_bbox) for table_bbox in table_bboxes])]
            lines = []

            for cluster in pdfplumber.utils.cluster_objects(non_table_words + tables, itemgetter('top'), tolerance=5):

                if 'text' in cluster[0]:
                    try:
                        lines.append(' '.join([i['text'] for i in cluster]))
                    except KeyError:
                        pass

                elif 'table' in cluster[0]:
                    lines.append(json.dumps(cluster[0]['table']))


            full_text.append([page_no, " ".join(lines)])
            p +=1

    return full_text

In [14]:
# Define the directory containing the PDF files
pdf_directory = Path("")

# Initialize an empty list to store the extracted texts and document names
data = []

# Loop through all files in the directory
for pdf_path in pdf_directory.glob("*.pdf"):
    print(pdf_path)
    # Process the PDF file
    print(f"...Processing {pdf_path.name}")

    # Call the function to extract the text from the PDF
    extracted_text = extract_text_from_pdf(pdf_path)

    # Convert the extracted list to a PDF, and add a column to store document names
    extracted_text_df = pd.DataFrame(extracted_text, columns=['Page No.', 'Page_Text'])
    extracted_text_df['Document Name'] = pdf_path.name

    # Append the extracted text and document name to the list
    data.append(extracted_text_df)

    # Print a message to indicate progress
    print(f"Finished processing {pdf_path.name}")

# Print a message to indicate all PDFs have been processed
print("All PDFs have been processed.")

Automated Service Suggestions.pdf
...Processing Automated Service Suggestions.pdf
Finished processing Automated Service Suggestions.pdf
Tag-based discovery in Service Mapping.pdf
...Processing Tag-based discovery in Service Mapping.pdf
Finished processing Tag-based discovery in Service Mapping.pdf
Pattern-based discovery in Service Mapping.pdf
...Processing Pattern-based discovery in Service Mapping.pdf
Finished processing Pattern-based discovery in Service Mapping.pdf
Choose the right method for discovery and mapping application services.pdf
...Processing Choose the right method for discovery and mapping application services.pdf
Finished processing Choose the right method for discovery and mapping application services.pdf
Discovery based on Predictive Intelligence.pdf
...Processing Discovery based on Predictive Intelligence.pdf
Finished processing Discovery based on Predictive Intelligence.pdf
Traffic-based discovery in Service Mapping.pdf
...Processing Traffic-based discovery in Serv

In [15]:
data

[  Page No.                                          Page_Text  \
 0   Page 1  20/06/2024, 16:29 Automated Service Suggestion...   
 1   Page 2  20/06/2024, 16:29 Automated Service Suggestion...   
 
                        Document Name  
 0  Automated Service Suggestions.pdf  
 1  Automated Service Suggestions.pdf  ,
   Page No.                                          Page_Text  \
 0   Page 1  20/06/2024, 16:27 Tag-based discovery in Servi...   
 1   Page 2  20/06/2024, 16:27 Tag-based discovery in Servi...   
 
                                 Document Name  
 0  Tag-based discovery in Service Mapping.pdf  
 1  Tag-based discovery in Service Mapping.pdf  ,
   Page No.                                          Page_Text  \
 0   Page 1  20/06/2024, 16:28 Pattern-based discovery in S...   
 1   Page 2  20/06/2024, 16:28 Pattern-based discovery in S...   
 
                                     Document Name  
 0  Pattern-based discovery in Service Mapping.pdf  
 1  Pattern-based discove

In [22]:
# Concatenate all the DFs in the list 'data' together

service_mapping_pdfs_data = pd.concat(data, ignore_index=True)

In [23]:
service_mapping_pdfs_data

Unnamed: 0,Page No.,Page_Text,Document Name
0,Page 1,"20/06/2024, 16:29 Automated Service Suggestion...",Automated Service Suggestions.pdf
1,Page 2,"20/06/2024, 16:29 Automated Service Suggestion...",Automated Service Suggestions.pdf
2,Page 1,"20/06/2024, 16:27 Tag-based discovery in Servi...",Tag-based discovery in Service Mapping.pdf
3,Page 2,"20/06/2024, 16:27 Tag-based discovery in Servi...",Tag-based discovery in Service Mapping.pdf
4,Page 1,"20/06/2024, 16:28 Pattern-based discovery in S...",Pattern-based discovery in Service Mapping.pdf
5,Page 2,"20/06/2024, 16:28 Pattern-based discovery in S...",Pattern-based discovery in Service Mapping.pdf
6,Page 1,"20/06/2024, 16:29 Choose the right method for ...",Choose the right method for discovery and mapp...
7,Page 2,"20/06/2024, 16:29 Choose the right method for ...",Choose the right method for discovery and mapp...
8,Page 1,"20/06/2024, 16:29 Discovery based on Predictiv...",Discovery based on Predictive Intelligence.pdf
9,Page 2,"20/06/2024, 16:29 Discovery based on Predictiv...",Discovery based on Predictive Intelligence.pdf


In [24]:
service_mapping_pdfs_data['Text Length'] = service_mapping_pdfs_data['Page_Text'].apply(lambda x: len(x.split(' ')))

In [25]:
service_mapping_pdfs_data

Unnamed: 0,Page No.,Page_Text,Document Name,Text Length
0,Page 1,"20/06/2024, 16:29 Automated Service Suggestion...",Automated Service Suggestions.pdf,220
1,Page 2,"20/06/2024, 16:29 Automated Service Suggestion...",Automated Service Suggestions.pdf,25
2,Page 1,"20/06/2024, 16:27 Tag-based discovery in Servi...",Tag-based discovery in Service Mapping.pdf,628
3,Page 2,"20/06/2024, 16:27 Tag-based discovery in Servi...",Tag-based discovery in Service Mapping.pdf,27
4,Page 1,"20/06/2024, 16:28 Pattern-based discovery in S...",Pattern-based discovery in Service Mapping.pdf,438
5,Page 2,"20/06/2024, 16:28 Pattern-based discovery in S...",Pattern-based discovery in Service Mapping.pdf,394
6,Page 1,"20/06/2024, 16:29 Choose the right method for ...",Choose the right method for discovery and mapp...,881
7,Page 2,"20/06/2024, 16:29 Choose the right method for ...",Choose the right method for discovery and mapp...,32
8,Page 1,"20/06/2024, 16:29 Discovery based on Predictiv...",Discovery based on Predictive Intelligence.pdf,754
9,Page 2,"20/06/2024, 16:29 Discovery based on Predictiv...",Discovery based on Predictive Intelligence.pdf,27


In [26]:
service_mapping_pdfs_data = service_mapping_pdfs_data[service_mapping_pdfs_data['Text Length'] > 20]

In [27]:
service_mapping_pdfs_data

Unnamed: 0,Page No.,Page_Text,Document Name,Text Length
0,Page 1,"20/06/2024, 16:29 Automated Service Suggestion...",Automated Service Suggestions.pdf,220
1,Page 2,"20/06/2024, 16:29 Automated Service Suggestion...",Automated Service Suggestions.pdf,25
2,Page 1,"20/06/2024, 16:27 Tag-based discovery in Servi...",Tag-based discovery in Service Mapping.pdf,628
3,Page 2,"20/06/2024, 16:27 Tag-based discovery in Servi...",Tag-based discovery in Service Mapping.pdf,27
4,Page 1,"20/06/2024, 16:28 Pattern-based discovery in S...",Pattern-based discovery in Service Mapping.pdf,438
5,Page 2,"20/06/2024, 16:28 Pattern-based discovery in S...",Pattern-based discovery in Service Mapping.pdf,394
6,Page 1,"20/06/2024, 16:29 Choose the right method for ...",Choose the right method for discovery and mapp...,881
7,Page 2,"20/06/2024, 16:29 Choose the right method for ...",Choose the right method for discovery and mapp...,32
8,Page 1,"20/06/2024, 16:29 Discovery based on Predictiv...",Discovery based on Predictive Intelligence.pdf,754
9,Page 2,"20/06/2024, 16:29 Discovery based on Predictiv...",Discovery based on Predictive Intelligence.pdf,27


In [50]:
service_mapping_pdfs_data['Metadata'] = service_mapping_pdfs_data.apply(lambda x: {'Document Name':x['Document Name'],'Page No':x['Page No.']}, axis=1)

In [51]:
service_mapping_pdfs_data

Unnamed: 0,Page No.,Page_Text,Document Name,Text Length,metadata,Metadata
0,Page 1,"20/06/2024, 16:29 Automated Service Suggestion...",Automated Service Suggestions.pdf,220,{'Document Name': 'Automated Service Suggestio...,{'Document Name': 'Automated Service Suggestio...
1,Page 2,"20/06/2024, 16:29 Automated Service Suggestion...",Automated Service Suggestions.pdf,25,{'Document Name': 'Automated Service Suggestio...,{'Document Name': 'Automated Service Suggestio...
2,Page 1,"20/06/2024, 16:27 Tag-based discovery in Servi...",Tag-based discovery in Service Mapping.pdf,628,{'Document Name': 'Tag-based discovery in Serv...,{'Document Name': 'Tag-based discovery in Serv...
3,Page 2,"20/06/2024, 16:27 Tag-based discovery in Servi...",Tag-based discovery in Service Mapping.pdf,27,{'Document Name': 'Tag-based discovery in Serv...,{'Document Name': 'Tag-based discovery in Serv...
4,Page 1,"20/06/2024, 16:28 Pattern-based discovery in S...",Pattern-based discovery in Service Mapping.pdf,438,{'Document Name': 'Pattern-based discovery in ...,{'Document Name': 'Pattern-based discovery in ...
5,Page 2,"20/06/2024, 16:28 Pattern-based discovery in S...",Pattern-based discovery in Service Mapping.pdf,394,{'Document Name': 'Pattern-based discovery in ...,{'Document Name': 'Pattern-based discovery in ...
6,Page 1,"20/06/2024, 16:29 Choose the right method for ...",Choose the right method for discovery and mapp...,881,{'Document Name': 'Choose the right method for...,{'Document Name': 'Choose the right method for...
7,Page 2,"20/06/2024, 16:29 Choose the right method for ...",Choose the right method for discovery and mapp...,32,{'Document Name': 'Choose the right method for...,{'Document Name': 'Choose the right method for...
8,Page 1,"20/06/2024, 16:29 Discovery based on Predictiv...",Discovery based on Predictive Intelligence.pdf,754,{'Document Name': 'Discovery based on Predicti...,{'Document Name': 'Discovery based on Predicti...
9,Page 2,"20/06/2024, 16:29 Discovery based on Predictiv...",Discovery based on Predictive Intelligence.pdf,27,{'Document Name': 'Discovery based on Predicti...,{'Document Name': 'Discovery based on Predicti...


In [53]:
service_mapping_pdfs_data.drop('metadata', axis=1)

Unnamed: 0,Page No.,Page_Text,Document Name,Text Length,Metadata
0,Page 1,"20/06/2024, 16:29 Automated Service Suggestion...",Automated Service Suggestions.pdf,220,{'Document Name': 'Automated Service Suggestio...
1,Page 2,"20/06/2024, 16:29 Automated Service Suggestion...",Automated Service Suggestions.pdf,25,{'Document Name': 'Automated Service Suggestio...
2,Page 1,"20/06/2024, 16:27 Tag-based discovery in Servi...",Tag-based discovery in Service Mapping.pdf,628,{'Document Name': 'Tag-based discovery in Serv...
3,Page 2,"20/06/2024, 16:27 Tag-based discovery in Servi...",Tag-based discovery in Service Mapping.pdf,27,{'Document Name': 'Tag-based discovery in Serv...
4,Page 1,"20/06/2024, 16:28 Pattern-based discovery in S...",Pattern-based discovery in Service Mapping.pdf,438,{'Document Name': 'Pattern-based discovery in ...
5,Page 2,"20/06/2024, 16:28 Pattern-based discovery in S...",Pattern-based discovery in Service Mapping.pdf,394,{'Document Name': 'Pattern-based discovery in ...
6,Page 1,"20/06/2024, 16:29 Choose the right method for ...",Choose the right method for discovery and mapp...,881,{'Document Name': 'Choose the right method for...
7,Page 2,"20/06/2024, 16:29 Choose the right method for ...",Choose the right method for discovery and mapp...,32,{'Document Name': 'Choose the right method for...
8,Page 1,"20/06/2024, 16:29 Discovery based on Predictiv...",Discovery based on Predictive Intelligence.pdf,754,{'Document Name': 'Discovery based on Predicti...
9,Page 2,"20/06/2024, 16:29 Discovery based on Predictiv...",Discovery based on Predictive Intelligence.pdf,27,{'Document Name': 'Discovery based on Predicti...


In [None]:
#Initialize chroma collection

In [37]:
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

In [38]:
chroma_data_path = 'chroma'

In [39]:
import chromadb

In [40]:
# Call PersistentClient()

client = chromadb.PersistentClient()

In [41]:
openai.api_key = 'sk-proj-BRC4WIZodN7n84tyYxgLT3BlbkFJTxzb5ZgPgBzHJumS0HpC'

In [42]:
# Set up the embedding function using the OpenAI embedding model

model = "text-embedding-ada-002"
embedding_function = OpenAIEmbeddingFunction(api_key=openai.api_key, model_name=model)

In [43]:
document_collection = client.get_or_create_collection(name='RAG_on_ServiceNow_Documents', embedding_function=embedding_function)

In [54]:
# Convert the page text and metadata from your dataframe to lists to be able to pass it to chroma

documents_list = service_mapping_pdfs_data["Page_Text"].tolist()
metadata_list = service_mapping_pdfs_data['Metadata'].tolist()

In [57]:
document_collection.add(
    documents= documents_list,
    ids = [str(i) for i in range(0, len(documents_list))],
    metadatas = metadata_list
)

In [60]:
# Let's take a look at the first few entries in the collection

document_collection.get(
    ids = ['0'],
    include = ['embeddings', 'documents', 'metadatas']
)

{'ids': ['0'],
 'embeddings': [[-0.021748917177319527,
   -0.006173198577016592,
   0.0024056558031588793,
   -0.009402520954608917,
   -0.028916018083691597,
   0.011108318343758583,
   -0.012504595331847668,
   0.008749090135097504,
   -0.03298792243003845,
   -0.016026241704821587,
   0.005863678641617298,
   0.0033840821124613285,
   -0.014334200881421566,
   -0.01145910657942295,
   -0.007297786418348551,
   0.024183806031942368,
   0.02437639608979225,
   -0.01214692834764719,
   -0.0006091519608162344,
   -0.019217733293771744,
   -0.006066585890948772,
   -0.004990145564079285,
   0.004154442343860865,
   -0.022533033043146133,
   -0.011204613372683525,
   0.018488643690943718,
   0.0014693588018417358,
   -0.04762476310133934,
   -0.0030951970256865025,
   0.021803943440318108,
   0.013660135678946972,
   -0.02996150590479374,
   -0.019396567717194557,
   -0.006448327098041773,
   0.0047631640918552876,
   0.005574793554842472,
   0.019878042861819267,
   -0.008370787836611271

In [97]:
query = input()

  What is a tag based service?


In [65]:
results = document_collection.query(
query_texts=query,
n_results=5
)

In [66]:
results

{'ids': [['2', '3', '11', '6', '10']],
 'distances': [[0.3082152685998024,
   0.3658776687214737,
   0.3957040648481214,
   0.43148983202573366,
   0.4408169355308555]],
 'metadatas': [[{'Document Name': 'Tag-based discovery in Service Mapping.pdf',
    'Page No': 'Page 1'},
   {'Document Name': 'Tag-based discovery in Service Mapping.pdf',
    'Page No': 'Page 2'},
   {'Document Name': 'Traffic-based discovery in Service Mapping.pdf',
    'Page No': 'Page 2'},
   {'Document Name': 'Choose the right method for discovery and mapping application services.pdf',
    'Page No': 'Page 1'},
   {'Document Name': 'Traffic-based discovery in Service Mapping.pdf',
    'Page No': 'Page 1'}]],
 'embeddings': None,
 'documents': [['20/06/2024, 16:27 Tag-based discovery in Service Mapping [["Table of Contents"], ["Tag-based discovery in Service Mapping\\nRelease version: Washington DC\\nUpdated Feb 1, 2024\\n3 minutes to read\\nSummarize [Beta]\\nGPT summary: This tool offers a brief overview. For mo

In [74]:
cache_collection = client.get_or_create_collection(name='Documents_Cache', embedding_function=embedding_function)

In [80]:
cache_collection.peek()

{'ids': ['What is a tag based service?'],
 'embeddings': [[-0.0217876136302948,
   -0.018208423629403114,
   -0.022341536357998848,
   -0.007350123953074217,
   -0.006529892794787884,
   0.007208092603832483,
   -0.026091163977980614,
   -0.002675516065210104,
   -0.013308340683579445,
   -0.02370503731071949,
   0.02298067696392536,
   0.017086375504732132,
   -0.009793063625693321,
   -0.02805119752883911,
   -0.0008153488743118942,
   0.003193930722773075,
   0.02431577257812023,
   0.005709661636501551,
   -0.01097902562469244,
   -0.00851478148251772,
   -0.016844920814037323,
   0.00032844755332916975,
   -0.005237407051026821,
   -0.009203633293509483,
   -0.014089512638747692,
   0.0018694880418479443,
   0.021318910643458366,
   -0.016802312806248665,
   0.002885012421756983,
   0.023250536993145943,
   0.018719736486673355,
   -0.023719239979982376,
   -0.036644097417593,
   0.0051486375741660595,
   -0.006267134565860033,
   -0.00043652456952258945,
   0.0026719653978943825,

In [110]:
# Searh the Cache collection first
# Query the collection against the user query and return the top 20 results

cache_results = cache_collection.query(
    query_texts=query,
    n_results=1
)

In [111]:
# Implementing Cache in Semantic Search

# Set a threshold for cache search
threshold = 0.2

ids = []
documents = []
distances = []
metadatas = []
results_df = pd.DataFrame()


# If the distance is greater than the threshold, then return the results from the main collection.

if cache_results['distances'][0] == [] or cache_results['distances'][0][0] > threshold:
      # Query the collection against the user query and return the top 10 results
      results = document_collection.query(
      query_texts=query,
      n_results=10
      )

      # Store the query in cache_collection as document w.r.t to ChromaDB so that it can be embedded and searched against later
      # Store retrieved text, ids, distances and metadatas in cache_collection as metadatas, so that they can be fetched easily if a query indeed matches to a query in cache
      Keys = []
      Values = []

      for key, val in results.items():
        if val is None or key=='included':
          continue
        for i in range(10):
          Keys.append(str(key)+str(i))
          Values.append(str(val[0][i]))


      cache_collection.add(
          documents= [query],
          ids = [query],  # Or if you want to assign integers as IDs 0,1,2,.., then you can use "len(cache_results['documents'])" as will return the no. of queries currently in the cache and assign the next digit to the new query."
          metadatas = dict(zip(Keys, Values))
      )

      print("Not found in cache. Found in main collection.")

      result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], "IDs":results["ids"][0]}
      results_df = pd.DataFrame.from_dict(result_dict)
      results_df


# If the distance is, however, less than the threshold, you can return the results from cache

elif cache_results['distances'][0][0] <= threshold:
      cache_result_dict = cache_results['metadatas'][0][0]

      # Loop through each inner list and then through the dictionary
      for key, value in cache_result_dict.items():
          if 'ids' in key:
              ids.append(value)
          elif 'documents' in key:
              documents.append(value)
          elif 'distances' in key:
              distances.append(value)
          elif 'metadatas' in key:
              metadatas.append(value)

      print("Found in cache!")

      # Create a DataFrame
      results_df = pd.DataFrame({
        'IDs': ids,
        'Documents': documents,
        'Distances': distances,
        'Metadatas': metadatas
      })

Found in cache!


In [112]:
results_df

Unnamed: 0,IDs,Documents,Distances,Metadatas
0,2,"20/06/2024, 16:27 Tag-based discovery in Servi...",0.2394507686184825,{'Document Name': 'Tag-based discovery in Serv...
1,3,"20/06/2024, 16:27 Tag-based discovery in Servi...",0.2805265307409776,{'Document Name': 'Tag-based discovery in Serv...
2,11,"20/06/2024, 16:28 Traffic-based discovery in S...",0.2989274129769158,{'Document Name': 'Traffic-based discovery in ...
3,6,"20/06/2024, 16:29 Choose the right method for ...",0.312189141739491,{'Document Name': 'Choose the right method for...
4,10,"20/06/2024, 16:28 Traffic-based discovery in S...",0.3417459316567535,{'Document Name': 'Traffic-based discovery in ...
5,0,"20/06/2024, 16:29 Automated Service Suggestion...",0.3428102646311327,{'Document Name': 'Automated Service Suggestio...
6,4,"20/06/2024, 16:28 Pattern-based discovery in S...",0.350781140248529,{'Document Name': 'Pattern-based discovery in ...
7,5,"20/06/2024, 16:28 Pattern-based discovery in S...",0.3582856419551724,{'Document Name': 'Pattern-based discovery in ...
8,7,"20/06/2024, 16:29 Choose the right method for ...",0.3751033478948917,{'Document Name': 'Choose the right method for...
9,8,"20/06/2024, 16:29 Discovery based on Predictiv...",0.3753600195742919,{'Document Name': 'Discovery based on Predicti...


In [109]:
query = input()

 Can you explain me about the taag based service mapping?


In [2]:
!ls "data"

Automated Service Suggestions.pdf
Choose the right method for discovery and mapping application services.pdf
Discovery based on Predictive Intelligence.pdf
Pattern-based discovery in Service Mapping.pdf
Tag-based discovery in Service Mapping.pdf
Traffic-based discovery in Service Mapping.pdf
