In [1]:
# from vectordb.chromadb import ChromaDB
from vectorDB.qdrantdb import QdrantDB
from chunking.sentence_chunker import PDFSentenceChunker, CSVDataLoader, pdf_reader
from retrieval.mxbai_retriever import MxbaiRetriever
import pandas as pd
import requests
import json
import re
import os
import fitz
from qdrant_client import models

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Set up the QDrant DB client
db = QdrantDB()

In [5]:
pdf_reader('./documents/temp')

[{'text': '\x00\x00\x00\x01\n\x02\n\x03\n\x04\n\x05\n\x06\n\x03\n\x07\n\x08\n\t\n\n\x02\n\x0b\n\x0c\n\x06\n\x03\n\x02\n\t\n\r\n\x0e\n\x08\n\x08\n\x02\n\x01\n\x06\n\x05\n\t\n\x0f\n\x10\x11\n\t\n\x12\n\t\n\x13\n\x14\n\x15\n\x16\n\x17\n\x15\n\x16\n\x18\n\x19\n\x03\n\x1a\n\x15\n\x1b\n\x1c\n\x15\n\x16\n\x18\n\x19\n\x0b\n\x1d\n\x1e\n\x1b\n\x1f\n \n\x1e\n\x15\n\x16\n\x18\n\t\n\x05\n!\n\x15\n"\n \n\t\n#\n\x14\n!\n$\t\n#\n\x14\n!\n\x16\n\x1e\n\t\n\x10\x1f\n \n \n\x1a\n\t\n\x07\n\x14\n \n\x1d\n%\n%\n\x12\n%\n%\n&\n\'\n\x00(\n)\n\x00*\n+\n,\n-\n\x00(\n)\n(\n(\n\x00.\n/\n0\n1\n\x002\n+\n3\n3\n-\n4\n0\n,\n\x001\n+\n5\n-\n6\n1\n-\n7\n-\n1\n\x00(\n8\n9\n(\n)\n:\n;\n<\n.\n/\n0\n1\n\x002\n+\n3\n3\n-\n4\n0\n,\n\x001\n+\n5\n-\n6\n1\n-\n7\n-\n1\n\x00(\n)\n9\n(\n=\n)\n:\n<\n>\n!\n\x17\n \n\x1a\n?\n#\n!\n\x14\n\x17\n(\n)\n)\n=\n9\n(\n)\n(\n)\n\x00@\nA\n5\n-\n7\n0\n4\n0\nB\n,\nC\nDE\n(\n)\n)\n;\n9\n(\n)\n(\n)\n\x00F\n9\n8\nG\n)\nH\nB\n,\n9\nI\nJ\n5\n4\nB\n6\nC\nDE\n\x08\n\x15\n\x16\n\x1b\n!\n\x1a\n\x16\n(\n)

In [8]:
# Create chunks of all pdfs and csv files present in the documents folder
chunks = PDFSentenceChunker(file_dir='./documents').chunk() + CSVDataLoader(source_dir='./documents').load_data()

In [9]:
chunks[-1]

 'page': 79,
 'file_name': 'toyota_camry_2020.csv'}

In [10]:
# setup our retriever
retriever = MxbaiRetriever('mixedbread-ai/mxbai-embed-large-v1',db)



In [11]:
# Get the embeddings for all the chunks of text we have
chunk_sents = [chunk['text'] for chunk in chunks]
emb = retriever.embed(chunk_sents)

In [12]:
emb.shape

(4807, 1024)

In [13]:
# Load the text and the metadata (file_name, page num) into the Vector DB in the form of a new collection "test1"
db.create_collection('test1', emb, chunks)

In [14]:
# Run this cell if you want to see the list of all the collections.
retriever.vectordb_client.client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='parts_taxonomy_combined_collection'), CollectionDescription(name='nhtsa_recalls'), CollectionDescription(name='test1'), CollectionDescription(name='part_catalog_toyota'), CollectionDescription(name='parts_lookup'), CollectionDescription(name='catalog_local'), CollectionDescription(name='pcdb_catalog'), CollectionDescription(name='predii_pcdb_taxonomy'), CollectionDescription(name='repair_procedures'), CollectionDescription(name='repair_job_parts'), CollectionDescription(name='Ford_F150_2009_2010'), CollectionDescription(name='test_db'), CollectionDescription(name='nhtsa_complaints'), CollectionDescription(name='parts_taxonomy_combined_collection_intelli'), CollectionDescription(name='test_2'), CollectionDescription(name='predii_taxonomy'), CollectionDescription(name='tsb_summary'), CollectionDescription(name='pcdb')])

In [9]:
# Run this cell if you want to see the number of points of each doc in the collection
db.client.count(
    collection_name="test2",
    count_filter=models.Filter(
        must=[
            models.FieldCondition(key="file_name", match=models.MatchValue(value='31.pdf')),
        ]
    ),
    exact=True,
)

# '17.pdf'
# '2021-Ford-F-150-Owners-Manual-version-1_om_EN-US_09_2020.pdf'
# 'nvidia_10k.pdf'
# 'toyota_camry_2020.csv'

CountResult(count=11)

In [10]:
db.client.scroll(
    collection_name="test2",
    scroll_filter=models.Filter(
        must=[
            models.FieldCondition(key="file_name", match=models.MatchValue(value="31.pdf")),
        ]
    ),
    # limit=1,
    with_payload=True,
    with_vectors=True,
)

# print(x)

([Record(id=395, payload={'file_name': '31.pdf', 'metadata': 'TECHNICAL SERVICE BULLETIN Torque-On-Demand (TOD) Transfer Case -  Grinding/Clicking/Ratcheting Noise From Front Wheel Area 20-2307 15 October  2020 Model: Ford 2003-2020 Expedition 2006-2020 F-150 Lincoln 2003-2020 Navigator Issue: Some 2003-2020 Expedition/Navigator and 2006-2020 F-150 non-Raptor vehicles equipped with a  TOD transfer case may exhibit grinding/clicking/ratcheting noise from the front wheel area. This may be  due to partial engagement of the integrated wheel ends (IWE). To correct this condition, follow the Service  Procedure steps to remove and cap the vacuum supply line. Action: Follow the Service Procedure steps to correct the condition on vehicles that meet all the following  criteria: • One of the following vehicles: - 2003-2020 Expedition/Navigator - 2006-2020 F-150 except Raptor • Equipped with TOD transfer cases • Grinding/clicking/ratcheting noise from the front axle area NOTE: Part quantity refers

In [19]:
db.client

AttributeError: 'QdrantClient' object has no attribute 'help'

In [31]:
from qdrant_client.http.models import models

In [33]:
models.Prefetch()

AttributeError: module 'qdrant_client.http.models.models' has no attribute 'Prefetch'

In [48]:
# from qdrant_client import models

db.client.search(
    collection_name="test10",
    query_vector=("text", [0 for i in range(1024)]),
    query_filter=models.Filter(
        must=[
            models.FieldCondition(key="file_name", match=models.MatchValue(value="3.pdf")),
        ]
    ),
    limit=100,
)



[ScoredPoint(id=6, version=0, score=0.0, payload={'file_name': '3.pdf', 'metadata': 'TECHNICAL SERVICE BULLETIN\n \nAluminum Panel Corrosion\n19-2026\n06 February\n2019\n \nThis bulletin supersedes 17-0062 . Reason for update: Concern Carryover to New Model\nModel:\nFord\n2000-2007 Crown Victoria\n2015-2018 Edge\n2003-2018 Expedition\n2002-2018 Explorer\n2007-2010 Explorer Sport Trac\n2004-2018 F-150\n2017-2018 F-Super Duty\n2013-2018 Fusion\n2005-2006 GT\n2005-2018 Mustang\n2000-2003 Ranger\n2000-2007 Taurus\nLincoln\n2017-2018 Continental\n2000-2006 LS\n2010-2016 MKT\n2016-2018 MKX\n2013-2018 MKZ\n2000-2018 Navigator\n2000-2007 Town Car\nMercury\n2000-2007 Grand Marquis\n2003-2004 Marauder\n2002-2010 Mountaineer\n2002-2007 Sable\n \nSummary\nThis article supersedes TSB 17-0062 to update the model years.\nIssue: Some 2000 and newer Ford, Lincoln and Mercury vehicles equipped with aluminum body panels may exhibit\ncorrosion concerns appearing as bubbled and/or peeling paint with or wit

In [17]:
db.client.get_collection(collection_name="test10")

CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=9032, indexed_vectors_count=0, points_count=2188, segments_count=7, config=CollectionConfig(params=CollectionParams(vectors={'metadata': VectorParams(size=1024, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None), 'text': VectorParams(size=1024, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None)}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_thr

In [9]:
os.listdir('./streamlit/streamlit_documents')

['highlighted_docs']

RAG

In [6]:
from generator.llama_old import LlamaGenerator
# from generator.phi import PhiGenerator
from reranker.mxbai_reranker import MxbaiReranker
# from utils import validate_output, get_citations, highlight_pdf

In [7]:
# Load the generator and the re-ranker
# ranker = MxbaiReranker('mixedbread-ai/mxbai-rerank-large-v1')
generator = LlamaGenerator('meta-llama/Meta-Llama-3.1-8B-Instruct', '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')

Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.42s/it]


In [8]:
text = """{
  "answer": "TSBs around HVAC for your 2018 Ford F-150 include: TSB 20-2034, TSB 20-2236, and TSB 23-2115.",
  "citations": [
    {
      "source_id": 2,
      "quote": "Model: Ford 2015-2019 F-150"
    },
    {
      "source_id": 3,
      "quote": "Issue: Some 2015-2019 F-150, 2017-2019 F-Super Duty vehicles may exhibit a lack of heat or cooling from the cabin vents when the climate settings are adjusted between hot and cold."
    },
    {
      "source_id": 8,
      "quote": "• One of the following vehicles: - 2015-2019 F-150"
    },
    {
      "source_id": 9,
      "quote": "Reinstall the climate control housing into the vehicle. Refer to Workshop Manual (WSM), Section 412-00."
    },
    {
      "source_id": 10,
      "quote": "Issue: Some 2015-2020 F-150 and 2017-2022 F-Super Duty vehicles may exhibit a lack of heat or cooling from the cabin vents when the climate control settings are adjusted between hot and cold."
    }
  ]
}"""

In [9]:
def output_formatter(raw_output):
    output = {
        "answer": "",
        "citations": []
    }
    
    citations_list = []
    out = ' '.join(raw_output.splitlines())
    content = re.split(r'"?[A|a]nswer"?:', out)[1]
    content = re.split(r'"?[C|c]itations"?:', content)
    answer_text = content[0].strip()
    citations_text = content[1]
    citations_text = re.split(r'"?[N|n]ote"?:', citations_text)[0].strip()
    
    citations = re.split(r'\n', citations_text)
    
    for cit in citations:
        cit_text = ''
        cit_index = ''
        
        match = re.search(""" ["|'](.*)["|'] """, cit)
        if match != None:
            cit_text = match.group(1).strip()
        else:
            match = re.split("""\\([S|s]ource[ |\\-|\\_][I|i][D|d]""", cit)[0].strip()
            match = re.sub("^\\W", "", match)
            match = re.sub("\\W$", "", match)
            cit_text = match.strip()
            
        match = re.search("""\\([S|s]ource[ |\\-|\\_][I|i][D|d][\\:|\\-]? (\\d+)""", cit)
        if match != None:
            cit_index = match.group(1).strip()
        else:
            print(f'Cite Index Not Found for Citation-\n{cit}')
            
        if cit_index.isnumeric():
            cit_index = int(cit_index)
        else:
            print(f'Cite Index Is Not INT for Citation-\n{cit}')
            
        cit_dict = {
            "source_id": cit_index,
            "quote": cit_text
        }
        
        citations_list.append(cit_dict)
        
    output['answer'] = answer_text
    output['citations'] = citations_list
    
    return str(output)
    

def clean_output(raw_output):
    is_raw_output = False
    print('\n\nRaw Output from the LLM: ', raw_output, '\n\n')
    output = None
    
    st = re.search('{\n?(\s{1,4})?"[A|a]nswer"', raw_output)
    end = re.search('](\n)?}', raw_output)
    
    if (st == None or end == None):
        match = re.search(r'"?[C|c]itations"?:', raw_output)
        if match == None:
            output = raw_output
            is_raw_output = True
        
        else:
            raw_output = output_formatter(raw_output)
            output = json.loads(raw_output, strict=False)
            print('The output was not a JSON. Fixed it.')
        return output, is_raw_output

    st = st.start()
    end = end.end()
    
    if (st<0 or end<0 or st > end):
        output = raw_output
        is_raw_output = True
        print('JSON structure is not correct!')
    else:
        raw_output = raw_output[st:end]
        output = json.loads(raw_output, strict=False)
        
    return output, is_raw_output

  st = re.search('{\n?(\s{1,4})?"[A|a]nswer"', raw_output)


In [10]:
clean_output(text)



Raw Output from the LLM:  {
  "answer": "TSBs around HVAC for your 2018 Ford F-150 include: TSB 20-2034, TSB 20-2236, and TSB 23-2115.",
  "citations": [
    {
      "source_id": 2,
      "quote": "Model: Ford 2015-2019 F-150"
    },
    {
      "source_id": 3,
      "quote": "Issue: Some 2015-2019 F-150, 2017-2019 F-Super Duty vehicles may exhibit a lack of heat or cooling from the cabin vents when the climate settings are adjusted between hot and cold."
    },
    {
      "source_id": 8,
      "quote": "• One of the following vehicles: - 2015-2019 F-150"
    },
    {
      "source_id": 9,
      "quote": "Reinstall the climate control housing into the vehicle. Refer to Workshop Manual (WSM), Section 412-00."
    },
    {
      "source_id": 10,
      "quote": "Issue: Some 2015-2020 F-150 and 2017-2022 F-Super Duty vehicles may exhibit a lack of heat or cooling from the cabin vents when the climate control settings are adjusted between hot and cold."
    }
  ]
} 




({'answer': 'TSBs around HVAC for your 2018 Ford F-150 include: TSB 20-2034, TSB 20-2236, and TSB 23-2115.',
  'citations': [{'source_id': 2, 'quote': 'Model: Ford 2015-2019 F-150'},
   {'source_id': 3,
    'quote': 'Issue: Some 2015-2019 F-150, 2017-2019 F-Super Duty vehicles may exhibit a lack of heat or cooling from the cabin vents when the climate settings are adjusted between hot and cold.'},
   {'source_id': 8,
    'quote': '• One of the following vehicles: - 2015-2019 F-150'},
   {'source_id': 9,
    'quote': 'Reinstall the climate control housing into the vehicle. Refer to Workshop Manual (WSM), Section 412-00.'},
   {'source_id': 10,
    'quote': 'Issue: Some 2015-2020 F-150 and 2017-2022 F-Super Duty vehicles may exhibit a lack of heat or cooling from the cabin vents when the climate control settings are adjusted between hot and cold.'}]},
 False)

In [17]:
query = 'what is 911 assist and how does it work. Answer in detail'
# query = 'What are seatbelt pretensioners and cinch tongue. Explain in detail.'
# query = "What are some of the customer complaints for Toyota Camry?"
# query = "what types of brake pads should i use for Ford F-150 2021?"
# query = "How do I engage emergency brake in Ford F-150 2021?"

In [18]:
# Get the results from the retriever, rank them and create the list of text from these to send to the generator.
result = retriever.retrieve(collection_name='test1', query=[query], topk=5)
ranked_result = ranker.rank(query, result, topk=5)
llm_context = [res['text'] for res in ranked_result]

In [19]:
# Load the data and the query into the generator
answer = generator.generate(query, llm_context, max_new_tokens=1024, top_p=0.9, temperature=0.1)
out = answer['response']

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


In [None]:
"""
    The generator is supposed to give the output in a JSON format like below - 
    
    {
        "answer": "911 Assist is a SYNC system feature that can call for help in the event of a crash. It works by using a paired and connected Bluetooth-enabled phone to dial 911 if a crash deploys an airbag, excluding knee airbags and rear inflatable seatbelts, or activates the fuel pump shut-off. The system transmits vehicle data to the emergency service during an emergency call.",
        "citations": [
            {
                "source_id": 1,
                "quote": "911 Assist is a SYNC system feature that can call for help."
            },
            {
                "source_id": 2,
                "quote": "If a crash deploys an airbag, excluding knee airbags and rear inflatable seatbelts, 
                or activates the fuel pump shut-off, your vehicle may be able to contact emergency services by dialing 911 through a paired 
                and connected Bluetooth-enabled phone."
            },
            ...
    }
"""

In [20]:
# function to validate the output of the generator to get a JSON structure.
def clean_output(raw_output):
    output = None
    st = re.search('{(\n)?"answer"', raw_output)
    end = re.search('](\n)?}', raw_output)
    
    if (st == None or end == None):
        output = raw_output
        print('Generator response if not in the form of JSON!')
        return output
    
    st = st.start()
    end = end.end()

    if (st<0 or end<0 or st > end):
        output = raw_output
        print('JSON structure is not correct!')
    else:
        raw_output = raw_output[st:end]
        output = json.loads(raw_output)
        
    return output

In [21]:
# This is the raw LLM output
out

'{\n"answer": "911 Assist is a SYNC system feature that can call for help in the event of a crash. It works by using a paired and connected Bluetooth-enabled phone to dial 911 if a crash deploys an airbag, excluding knee airbags and rear inflatable seatbelts, or activates the fuel pump shut-off. The system transmits vehicle data to the emergency service during an emergency call.",\n"citations": [\n{\n"source_id": 1,\n"quote": "911 Assist is a SYNC system feature that can call for help."\n},\n{\n"source_id": 2,\n"quote": "If a crash deploys an airbag, excluding knee airbags and rear inflatable seatbelts, or activates the fuel pump shut-off, your vehicle may be able to contact emergency services by dialing 911 through a paired and connected Bluetooth-enabled phone."\n},\n{\n"source_id": 4,\n"quote": "During an emergency call the system transmits vehicle data to the emergency service."\n}\n]\n}'

In [22]:
# this is the cleaned and validated LLM output
output = clean_output(out)
output

{'answer': '911 Assist is a SYNC system feature that can call for help in the event of a crash. It works by using a paired and connected Bluetooth-enabled phone to dial 911 if a crash deploys an airbag, excluding knee airbags and rear inflatable seatbelts, or activates the fuel pump shut-off. The system transmits vehicle data to the emergency service during an emergency call.',
 'citations': [{'source_id': 1,
   'quote': '911 Assist is a SYNC system feature that can call for help.'},
  {'source_id': 2,
   'quote': 'If a crash deploys an airbag, excluding knee airbags and rear inflatable seatbelts, or activates the fuel pump shut-off, your vehicle may be able to contact emergency services by dialing 911 through a paired and connected Bluetooth-enabled phone.'},
  {'source_id': 4,
   'quote': 'During an emergency call the system transmits vehicle data to the emergency service.'}]}

In [23]:
output

{'answer': '911 Assist is a SYNC system feature that can call for help in the event of a crash. It works by using a paired and connected Bluetooth-enabled phone to dial 911 if a crash deploys an airbag, excluding knee airbags and rear inflatable seatbelts, or activates the fuel pump shut-off. The system transmits vehicle data to the emergency service during an emergency call.',
 'citations': [{'source_id': 1,
   'quote': '911 Assist is a SYNC system feature that can call for help.'},
  {'source_id': 2,
   'quote': 'If a crash deploys an airbag, excluding knee airbags and rear inflatable seatbelts, or activates the fuel pump shut-off, your vehicle may be able to contact emergency services by dialing 911 through a paired and connected Bluetooth-enabled phone.'},
  {'source_id': 4,
   'quote': 'During an emergency call the system transmits vehicle data to the emergency service.'}]}

In [24]:
# These functions are for validating the output JSON and highlighting the text in the documents...
# All the highlighted docs are present in the ./documents/highlighted_docs folder...

def iterate_contexts(context, text):
    for i in range(len(context)):
        res = context[i].find(text) 
        if res > -1:
            break
    
    if res < 0:
        return -1
    
    return i


def validate_output(context, output):
    context = [re.sub('\n', ' ', c) for c in context]
    context = [c.lower() for c in context]
    output = output['citations']
    
    final_idxs = []
    
    for out in output:
        idx = out['source_id']-1
        text = out['quote'].lower()
        res = context[idx].find(text) 
        
        # print('LLM output: \n',out)
       
        if res < 0:
            final_idx = iterate_contexts(context, text)
            
            if final_idx < 0:
                print(f"LLM Output is not matching any item in the Context List for citation with source_id: {idx+1}")
                
            else:
                final_idxs.append(final_idx)
        
        else:
            final_idxs.append(idx)
            
        # print('\n\ncontext id: \n', final_idxs, '\n\n')
        
    final_idxs = list(set(final_idxs))
    return final_idxs
            
            
def get_citations(llm_citation_indices, ranked_result):
    return [ranked_result[i] for i in range(len(ranked_result)) if i in llm_citation_indices]

def highlight_text_cell(s, rows):
    to_highlight = ['color: yellow;' if i in rows else '' for i in range(len(list(s)))]
    return to_highlight


def highlight_csv(highlights_list, output_path = './documents/highlighted_docs', source_path = './documents'):
    docs = list(set([doc['file_name'] for doc in highlights_list]))
    
    dfs_list = []

    for doc in docs:
        file_path = os.path.join(source_path, doc)
        output_file_extension = "_highlighted.xlsx"
        output_file_name = doc.replace(".csv",output_file_extension) 
        output_file_path = os.path.join(output_path, output_file_name)

        highlight_doc = pd.read_csv(file_path)
        
        rows = list(set([d['page'] for d in highlights_list if d['file_name'] == doc]))
        
        # print('rows: ', rows)
            
        highlight_doc = highlight_doc.style.apply(highlight_text_cell, rows = rows, axis=0)
        highlight_doc.to_excel(output_file_path)
        
        dfs_list.append(highlight_doc)
        
    return dfs_list
            
               
def highlight_pdf(highlights_list, output_path = './documents/highlighted_docs', source_path = './documents'):
    
    docs = list(set([doc['file_name'] for doc in highlights_list]))
    
    highlighted_pdfs = []
    
    for doc in docs:
        pdf_path = os.path.join(source_path, doc)
        output_file_extension = "_highlighted.pdf"
        output_file_name = doc.replace(".pdf",output_file_extension) 
        output_pdf_path = os.path.join(output_path, output_file_name)

        highlight_doc = fitz.open(pdf_path)
        
        for doc_dict in highlights_list:
            file = doc_dict['file_name']
            if file != doc:
                continue
            page_num = doc_dict['page']
            text_to_highlight = doc_dict['text']
            
            page = highlight_doc.load_page(page_num-1)
            text_instances = page.search_for(text_to_highlight.strip())
            
            print('\n\ndoc: \n', re.sub('\n', ' ', text_to_highlight), '\n\n')
            
            print('\npage num: \n', page_num-1, '\n\n')
            
            print('\npage: \n', page, '\n\n')
            
            print('\ntext_instances: \n', text_instances, '\n\n')
            
            for inst in text_instances:
                print("HIGHLIGHTING", inst)
                page.add_highlight_annot(inst)
                
            highlight_doc.save(output_pdf_path, garbage=0, deflate=False, clean=False)
            
            highlighted_pdfs.append(highlight_doc)
    
    return highlighted_pdfs

In [25]:
# Run this cell to use the above functions to validate and re-order the output JSON and highlight the text in the documents.
# All the highlighted docs are present in the ./documents/highlighted_docs folder...

final_indices = validate_output(llm_context, output)
highlights_list = get_citations(final_indices, ranked_result)

pdf_highlights_list = [data for data in highlights_list if data['file_name'].split('.')[-1] == 'pdf']
csv_highlights_list = [data for data in highlights_list if data['file_name'].split('.')[-1] == 'csv']

if len(pdf_highlights_list) > 0:
    highlighted_pdfs = highlight_pdf(pdf_highlights_list)
    
if len(csv_highlights_list) > 0:
    highlighted_dfs = highlight_csv(csv_highlights_list)



doc: 
  WHAT IS 911 ASSIST 911 Assist is a SYNC system feature that can call for help.  For more information, visit www.owner.ford.com .  



page num: 
 70 



page: 
 page 70 of ./documents/2021-Ford-F-150-Owners-Manual-version-1_om_EN-US_09_2020.pdf 



text_instances: 
 [Rect(22.67799949645996, 57.86797332763672, 131.1280059814453, 74.59796905517578), Rect(22.67799949645996, 78.9859619140625, 176.85403442382812, 90.6099624633789), Rect(22.67799949645996, 87.78594970703125, 86.38199615478516, 99.40995025634766), Rect(22.67799949645996, 100.5859375, 124.43800354003906, 112.2099380493164), Rect(22.67799949645996, 109.38592529296875, 107.07799530029297, 121.04192352294922)] 


HIGHLIGHTING Rect(22.67799949645996, 57.86797332763672, 131.1280059814453, 74.59796905517578)
HIGHLIGHTING Rect(22.67799949645996, 78.9859619140625, 176.85403442382812, 90.6099624633789)
HIGHLIGHTING Rect(22.67799949645996, 87.78594970703125, 86.38199615478516, 99.40995025634766)
HIGHLIGHTING Rect(22.6779994964

In [26]:
# This is the LLM answer. 
output['answer']

'911 Assist is a SYNC system feature that can call for help in the event of a crash. It works by using a paired and connected Bluetooth-enabled phone to dial 911 if a crash deploys an airbag, excluding knee airbags and rear inflatable seatbelts, or activates the fuel pump shut-off. The system transmits vehicle data to the emergency service during an emergency call.'

In [27]:
# These are the LLM citations.
output['citations']

[{'source_id': 1,
  'quote': '911 Assist is a SYNC system feature that can call for help.'},
 {'source_id': 2,
  'quote': 'If a crash deploys an airbag, excluding knee airbags and rear inflatable seatbelts, or activates the fuel pump shut-off, your vehicle may be able to contact emergency services by dialing 911 through a paired and connected Bluetooth-enabled phone.'},
 {'source_id': 4,
  'quote': 'During an emergency call the system transmits vehicle data to the emergency service.'}]

In [28]:
highlighted_dfs[0]

NameError: name 'highlighted_dfs' is not defined

In [29]:
# Sometimes the row number (page in citations) might not match the exact row in the csv file. The function validate_output 
# makes sure to correct the LLM output by looking again at all the ranked results. This inconsistency is prominent for csv 
# files only.

csv_highlights_list

[]

In [30]:
ranked_result

[{'file_name': '2021-Ford-F-150-Owners-Manual-version-1_om_EN-US_09_2020.pdf',
  'page': 71,
  'text': ' WHAT IS 911 ASSIST\n911 Assist is a SYNC system feature that\ncan call for help.\n For more information, visit\nwww.owner.ford.com .\n'},
 {'file_name': '2021-Ford-F-150-Owners-Manual-version-1_om_EN-US_09_2020.pdf',
  'page': 71,
  'text': 'HOW DOES 911 ASSIST WORK\nIf a crash deploys an airbag, excluding knee\nairbags and rear inflatable seatbelts, or\nactivates the fuel pump shut-off, your\nvehicle may be able to contact emergency\nservices by dialing 911 through a paired and\nconnected Bluetooth-enabled phone.\n Not all crashes will deploy an airbag or\nactivate the fuel pump shut-off.'},
 {'file_name': '2021-Ford-F-150-Owners-Manual-version-1_om_EN-US_09_2020.pdf',
  'page': 71,
 {'file_name': '2021-Ford-F-150-Owners-Manual-version-1_om_EN-US_09_2020.pdf',
  'page': 71,
 {'file_name': '2021-Ford-F-150-Owners-Manual-version-1_om_EN-US_09_2020.pdf',
  'page': 71,
  'text': '"\nIf

Rough - No Need to Run any cell from here

In [None]:
import webbrowser
webbrowser.open_new(r'./documents/highlighted_docs/2021-Ford-F-150-Owners-Manual-version-1_om_EN-US_09_2020_highlighted.pdf')

In [209]:
import fitz
import os

In [210]:
highlight_doc = fitz.open("./documents/2021-Ford-F-150-Owners-Manual-version-1_om_EN-US_09_2020.pdf")

In [211]:
page_num = highlights_list[0]['page']
        
page = highlight_doc.load_page(page_num-1)

In [212]:
page.get_text("text")



In [213]:
highlights_list[0]['text']

' WHAT IS 911 ASSIST\n911 Assist is a SYNC system feature that\ncan call for help.\n For more information, visit\nwww.owner.ford.com .\n'

In [None]:
page_num

In [None]:
ll = page.search_for(highlights_list[2]['text'])
for l in ll:
    page.add_highlight_annot(l)

highlight_doc.save('./documents/highlighted_docs/new.pdf', garbage=0, deflate=False, clean=False)

In [None]:
ll

In [None]:
output

In [None]:
"""
    <cited_answer>
    911 Assist is a SYNC system feature that can call for help. It is a system that can contact emergency services by dialing 911 through
    a paired and connected Bluetooth-enabled phone in the event of a crash.
        <answer>
            911 Assist is a feature that can automatically call for help in the event of a crash. It works by using the vehicle's sensors
            to detect a crash and then using the paired and connected Bluetooth-enabled phone to dial 911. The system can also transmit 
            vehicle data to the emergency services during the emergency call.
        </answer>    
        <citations>        
            <citation><source_id>1</source_id><quote>911 Assist is a SYNC system feature that can call for help.</quote></citation>
            
            <citation><source_id>2</source_id><quote>If a crash deploys an airbag, excluding knee airbags and rear inflatable seatbelts, 
            or activates the fuel pump shut-off, your vehicle may be able to contact emergency services by dialing 911 through a paired 
            and connected Bluetooth-enabled phone.</quote></citation>
            
            <citation><source_id>4</source_id><quote>During an emergency call the system transmits vehicle data to the emergency service.
            </quote></citation>
        </citations>
"""

In [None]:
s = """{\n"answer": "911 Assist is a SYNC system feature that can call for help in the event of a crash, and it works by using a paired and connected Bluetooth-enabled phone to dial 911 if a crash deploys an airbag or activates the fuel pump shut-off, excluding knee airbags and rear inflatable seatbelts. The system transmits vehicle data to the emergency service during an emergency call.",\n"citations": [\n{\n"source_id": 2,\n"quote": "If a crash deploys an airbag, excluding knee airbags and rear inflatable seatbelts, or activates the fuel pump shut-off, your vehicle may be able to contact emergency services by dialing 911 through a paired and connected Bluetooth-enabled phone."\n},\n{\n"source_id": 4,\n"quote": "During an emergency call the system transmits vehicle data to the emergency service."\n}\n]\n}"""

json.loads(s)

In [None]:
prompt = """
            Question: what is 911 assist and how does it work. Answer in detail
            
            Context:
            1. WHAT IS 911 ASSIST\n911 Assist is a SYNC system feature that\ncan call for help.\n For more information, visit\nwww.owner.ford.com .
            2. HOW DOES 911 ASSIST WORK\nIf a crash deploys an airbag, excluding knee\nairbags and rear inflatable seatbelts, or\nactivates the fuel pump shut-off, your\nvehicle may be able to contact emergency\nservices by dialing 911 through a paired and\nconnected Bluetooth-enabled phone.\n Not all crashes will deploy an airbag or\nactivate the fuel pump shut-off.
            3. Failure to do so may\ncause serious injury to someone or\ndamage the phone which could prevent\n911 Assist from working properly.\n WARNING: Unless the 911 Assist\nsetting is set on before a crash, the\nsystem will not dial for help which could\ndelay response time, potentially\nincreasing the risk of serious injury or\ndeath after a crash.
            4. During an emergency call the system\ntransmits vehicle data to the emergency\nservice.\n EMERGENCY CALL\nREQUIREMENTS\nWARNING: Do not wait for 911\nAssist to make an emergency call if you\ncan do it yourself. Dial emergency\nservices immediately to avoid delayed\nresponse time which could increase the\nrisk of serious injury or death after a\ncrash.
            5. If you do not cancel the call and SYNC\nmakes a successful call a pre-recorded\nmessage plays for the 911 operator. The\noccupants in your vehicle are able to talk\nwith the operator. Be prepared to provide\nyour name, phone number and location\nimmediately because not all 911 systems\nare capable of receiving this information\nelectronically.
            
            Now answer the above question using the information from the context provided above. The answer should be generated using the contexts only. If the contexts seems insufficient to answer the question respond with a message stating that question cannot be answered due to lack of information. Remember, the final output must contain both an answer and citations. A citation consists of a VERBATIM quote that \
            justifies the answer and the ID of the quote article. Remember the ID of the quote articles start from 0. Return a citation for every quote across all articles \
            that justify the answer. Do not send the question in the final output.
            
            The final output should only be in this format:

            <cited_answer>
                <answer></answer>
                <citations>
                    <citation><source_id></source_id><quote></quote></citation>
                    <citation><source_id></source_id><quote></quote></citation>
                    ...
                </citations>
            </cited_answer>
        
"""

In [None]:
import json

llm_response = answer['response']


In [None]:
answer

In [None]:
answer = generator.generate(query, llm_context)

In [None]:
prompt = """
            Question: what is 911 assist and how does it work. Answer in detail
            
            Context:
            1. WHAT IS 911 ASSIST\n911 Assist is a SYNC system feature that\ncan call for help.\n For more information, visit\nwww.owner.ford.com .
            2. HOW DOES 911 ASSIST WORK\nIf a crash deploys an airbag, excluding knee\nairbags and rear inflatable seatbelts, or\nactivates the fuel pump shut-off, your\nvehicle may be able to contact emergency\nservices by dialing 911 through a paired and\nconnected Bluetooth-enabled phone.\n Not all crashes will deploy an airbag or\nactivate the fuel pump shut-off.
            3. Failure to do so may\ncause serious injury to someone or\ndamage the phone which could prevent\n911 Assist from working properly.\n WARNING: Unless the 911 Assist\nsetting is set on before a crash, the\nsystem will not dial for help which could\ndelay response time, potentially\nincreasing the risk of serious injury or\ndeath after a crash.
            4. During an emergency call the system\ntransmits vehicle data to the emergency\nservice.\n EMERGENCY CALL\nREQUIREMENTS\nWARNING: Do not wait for 911\nAssist to make an emergency call if you\ncan do it yourself. Dial emergency\nservices immediately to avoid delayed\nresponse time which could increase the\nrisk of serious injury or death after a\ncrash.
            5. If you do not cancel the call and SYNC\nmakes a successful call a pre-recorded\nmessage plays for the 911 operator. The\noccupants in your vehicle are able to talk\nwith the operator. Be prepared to provide\nyour name, phone number and location\nimmediately because not all 911 systems\nare capable of receiving this information\nelectronically.
            
            Now answer the above question using the information from the context provided above. The answer should be generated using the contexts only. If the contexts seems insufficient to answer the question respond with a message stating that question cannot be answered due to lack of information. Remember, the final output must contain both an answer and citations. A citation consists of a VERBATIM quote that \
            justifies the answer and the ID of the quote article. Remember the ID of the quote articles start from 0. Return a citation for every quote across all articles \
            that justify the answer. Do not send the question in the final output.
            
            The final output should only be in this format:

            <cited_answer>
                <answer></answer>
                <citations>
                    <citation><source_id></source_id><quote></quote></citation>
                    <citation><source_id></source_id><quote></quote></citation>
                    ...
                </citations>
            </cited_answer>
        
"""

In [None]:
llm_context = [' WHAT IS 911 ASSIST\n911 Assist is a SYNC system feature that\ncan call for help.\n For more information, visit\nwww.owner.ford.com .\n',
 'HOW DOES 911 ASSIST WORK\nIf a crash deploys an airbag, excluding knee\nairbags and rear inflatable seatbelts, or\nactivates the fuel pump shut-off, your\nvehicle may be able to contact emergency\nservices by dialing 911 through a paired and\nconnected Bluetooth-enabled phone.\n Not all crashes will deploy an airbag or\nactivate the fuel pump shut-off.',
 'Failure to do so may\ncause serious injury to someone or\ndamage the phone which could prevent\n911 Assist from working properly.\n WARNING: Unless the 911 Assist\nsetting is set on before a crash, the\nsystem will not dial for help which could\ndelay response time, potentially\nincreasing the risk of serious injury or\ndeath after a crash.\n',
 'During an emergency call the system\ntransmits vehicle data to the emergency\nservice.\n EMERGENCY CALL\nREQUIREMENTS\nWARNING: Do not wait for 911\nAssist to make an emergency call if you\ncan do it yourself. Dial emergency\nservices immediately to avoid delayed\nresponse time which could increase the\nrisk of serious injury or death after a\ncrash.',
 '"\nIf you do not cancel the call and SYNC\nmakes a successful call a pre-recorded\nmessage plays for the 911 operator. The\noccupants in your vehicle are able to talk\nwith the operator. Be prepared to provide\nyour name, phone number and location\nimmediately because not all 911 systems\nare capable of receiving this information\nelectronically.\n']

Using deployed LLM API

In [None]:
# never change top_k and top_p

data = {
    "prompt": prompt,
    "temperature": 1e-12,
    "max_tokens": 1024,
    "top_p": 1,
    "top_k": -1
}

In [None]:

response = requests.post(url="http://127.0.0.1:8081/vllm/models/generate", data=json.dumps(data))


In [None]:
output = eval(response.text)

Rough

In [1]:
from qdrant_client import QdrantClient, models
client = QdrantClient("localhost", port=6333)

In [2]:
def get_all_collections():
    try:
        collections_list = []
        collections = client.get_collections()
        for collection in collections:
            for c in list(collection[1]):
                collections_list.append(c.name)
    except Exception as e:
        st.error(f"Error fetching collections from Qdrant: {e}")

In [5]:
for collection in collections:
    print(collection)

('collections', [CollectionDescription(name='nhtsa_recalls'), CollectionDescription(name='test_2'), CollectionDescription(name='nhtsa_complaints'), CollectionDescription(name='parts_lookup'), CollectionDescription(name='predii_pcdb_taxonomy'), CollectionDescription(name='parts_taxonomy_combined_collection_intelli'), CollectionDescription(name='test1'), CollectionDescription(name='repair_procedures'), CollectionDescription(name='parts_taxonomy_combined_collection'), CollectionDescription(name='test_db'), CollectionDescription(name='predii_taxonomy'), CollectionDescription(name='part_catalog_toyota'), CollectionDescription(name='pcdb_catalog'), CollectionDescription(name='repair_job_parts'), CollectionDescription(name='Ford_F150_2009_2010'), CollectionDescription(name='pcdb'), CollectionDescription(name='tsb_summary'), CollectionDescription(name='smart_insights'), CollectionDescription(name='catalog_local')])


In [9]:
documents = ['a', 'bc', 'c']
for idx, doc in enumerate(documents):
    print(idx)
    print

0
1
2


In [3]:
import streamlit as st
from streamlit_pdf_viewer import pdf_viewer

pdf_viewer("./streamlit/streamlit_documents/highlighted_docs/2021-Ford-F-150-Owners-Manual-version-1_om_EN-US_09_2020_highlighted.pdf")

2024-06-30 22:50:05.062 
  command:

    streamlit run /opt/predii/miniconda3/envs/RAG-citations/lib/python3.12/site-packages/ipykernel_launcher.py [ARGUMENTS]


0

In [17]:
s = """Here is the Answer:

"Programming the garage door opener involves several steps. To program a device to a previously trained button, press and hold the desired button, then follow programming steps. To program a garage door opener to your gate opener motor, press the learn button on the garage door opener motor, then return to your vehicle and press and hold one of the three HomeLink function buttons you want to program for two seconds, then release. Repeat this step. To complete the programming, press and hold the HomeLink button you programmed for two seconds, then release. You may need to do this twice to activate the door."

Citations:

* "Press and hold the desired button, then follow programming steps." (Source ID: 2)
* "Press the learn button on the garage door opener motor, then return to your vehicle and press and hold one of the three HomeLink function buttons you want to program for two seconds, then release. Repeat this step." (Source ID: 1)
* "To complete the programming, press and hold the HomeLink button you programmed for two seconds, then release. You may need to do this twice to activate the door." (Source ID: 2)

Note: The answer is generated based on the provided contexts, and the citations are provided to justify the answer. """

# ' '.join(s.splitlines()

In [19]:
ss = ' '.join(s.splitlines())
re.split(r'[A|a]nswer:', ss)

['Here is the ',
 '  "Programming the garage door opener involves several steps. To program a device to a previously trained button, press and hold the desired button, then follow programming steps. To program a garage door opener to your gate opener motor, press the learn button on the garage door opener motor, then return to your vehicle and press and hold one of the three HomeLink function buttons you want to program for two seconds, then release. Repeat this step. To complete the programming, press and hold the HomeLink button you programmed for two seconds, then release. You may need to do this twice to activate the door."  Citations:  * "Press and hold the desired button, then follow programming steps." (Source ID: 2) * "Press the learn button on the garage door opener motor, then return to your vehicle and press and hold one of the three HomeLink function buttons you want to program for two seconds, then release. Repeat this step." (Source ID: 1) * "To complete the programming, 

In [67]:
cit = """* "Press and hold the desired button, then follow programming steps." (Source ID: 2)"""

cit_text = ''
cit_index = ''

match = re.search(""" ["|'](.*)["|'] """, cit)
if match != None:
    cit_text = match.group(1).strip()
else:
    match = re.split("""\\([S|s]ource[ |\\-|\\_][I|i][D|d]""", cit)[0].strip()
    match = re.sub("^\\W", "", match)
    match = re.sub("\\W$", "", match)
    cit_text = match.strip()
    
match = re.search("""\\([S|s]ource[ |\\-|\\_][I|i][D|d][\\:|\\-]? (\\d+)""", cit)
if match != None:
    cit_index = match.group(1).strip()
else:
    print(f'Cite Index Not Found for Citation-\n{cit}')
    
if cit_index.isnumeric():
    cit_index = int(cit_index)
else:
    print(f'Cite Index Is Not INT for Citation-\n{cit}')

In [71]:
str({
    'answer': "hfjfff",
    "citations": [
        {
            "source_id": 1,
            "quote": "hfhfkf"
        }
    ]
})

"{'answer': 'hfjfff', 'citations': [{'source_id': 1, 'quote': 'hfhfkf'}]}"

In [40]:
ss = "1 Reset your oil change reminder after each engine oil and filter change."
# ss = "Replace the engine-mounted and frame-mounted fuel filters. Every 30,000 mi (48,000 km), 6 months or 600 engine hours Replace the air inlet foam filter. Every 30,000 mi (48,000 km)"

d = [{'file_name': '2021-Ford-F-150-Owners-Manual-version-1_om_EN-US_09_2020.pdf', 'page': 605, 'text': 'Make\nsure to reset the Intelligent Oil-Life Monitor\nafter each oil change.   See Resetting the\nEngine Oil Change Reminder (page 463).\n If your information display resets\nprematurely or becomes inoperative, you\nshould perform the oil change interval at\nsix months or 5,000 mi (8,000 km) from\nyour last oil change. Never exceed one year\nor 10,000 mi (16,000 km) between oil\nchange intervals.\n'}, {'file_name': '2021-Ford-F-150-Owners-Manual-version-1_om_EN-US_09_2020.pdf', 'page': 619, 'text': '1\nEvery 5,000 mi (8,000 km)\nor six months\n1Reset your oil change reminder after each engine oil and filter change.   See Resetting\nthe Engine Oil Change Reminder (page 463).\n Off-road Operation\nInspect the steering linkage, ball joints and the U-joints.\n Lubricate grease fittings, if applicable.\nInspect frequently, service\nas required\nReplace the engine air filter.\n Change the engine oil and filter.\n'}, {'file_name': '2021-Ford-F-150-Owners-Manual-version-1_om_EN-US_09_2020.pdf', 'page': 616, 'text': 'Every 30,000 mi\n(48,000 km), six months or\n600 engine hours\nReplace the air inlet foam filter.\n Every 30,000 mi\n(48,000 km)\n1 Reset the oil change reminder after engine oil and filter changes.   See Resetting the\nEngine Oil Change Reminder (page 463).\n'}, {'file_name': '2021-Ford-F-150-Owners-Manual-version-1_om_EN-US_09_2020.pdf', 'page': 612, 'text': '4 After initial inspection, inspect every other oil change until replaced.\n 5 Non-hybrid vehicles only.\n 6 Four-wheel drive vehicles only.\n 7 Initial replacement at 10 years or 200,000 mi (322,000 km), then every five years or\n100,000 mi (160,000 km).\n SPECIAL OPERATING\nCONDITIONS SCHEDULED\nMAINTENANCE - DIESEL\n'}, {'file_name': '2021-Ford-F-150-Owners-Manual-version-1_om_EN-US_09_2020.pdf', 'page': 619, 'text': '1\nEvery 5,000 mi (8,000 km)\nor six months\nInspect the wheels and related components for abnormal\nnoise, wear, looseness or drag.\n Rotate the tires, inspect tires for wear and the measure\nthe tread depth.\n 1Reset your oil change reminder after each engine oil and filter change.   See Resetting\nthe Engine Oil Change Reminder (page 463).\n'}]

ll = [dd['text'] for dd in d]

# ss.find(s)

In [41]:
ll

['Make\nsure to reset the Intelligent Oil-Life Monitor\nafter each oil change.   See Resetting the\nEngine Oil Change Reminder (page 463).\n If your information display resets\nprematurely or becomes inoperative, you\nshould perform the oil change interval at\nsix months or 5,000 mi (8,000 km) from\nyour last oil change. Never exceed one year\nor 10,000 mi (16,000 km) between oil\nchange intervals.\n',
 '1\nEvery 5,000 mi (8,000 km)\nor six months\n1Reset your oil change reminder after each engine oil and filter change.   See Resetting\nthe Engine Oil Change Reminder (page 463).\n Off-road Operation\nInspect the steering linkage, ball joints and the U-joints.\n Lubricate grease fittings, if applicable.\nInspect frequently, service\nas required\nReplace the engine air filter.\n Change the engine oil and filter.\n',
 'Every 30,000 mi\n(48,000 km), six months or\n600 engine hours\nReplace the air inlet foam filter.\n Every 30,000 mi\n(48,000 km)\n1 Reset the oil change reminder after engi

In [42]:
ss

'1 Reset your oil change reminder after each engine oil and filter change.'

In [43]:
nl = ll[1].strip()
nl = re.sub(r'\n', ' ', nl)
nl = ' '.join(nl.split())
ss = ' '.join(ss.split())

In [44]:
nl

'1 Every 5,000 mi (8,000 km) or six months 1Reset your oil change reminder after each engine oil and filter change. See Resetting the Engine Oil Change Reminder (page 463). Off-road Operation Inspect the steering linkage, ball joints and the U-joints. Lubricate grease fittings, if applicable. Inspect frequently, service as required Replace the engine air filter. Change the engine oil and filter.'

In [45]:
ss

'1 Reset your oil change reminder after each engine oil and filter change.'

In [48]:
ss = set(ss.split())
nl = set(nl.split())

In [53]:
len([i for i in ss if i not in nl])/len(ss)

0.08333333333333333

In [47]:
set(ss.split()) <= set(nl.split())

False

In [22]:
' '.join(ss.split()) in ' '.join(nl.split())

True

In [21]:
' '.join(nl.split())

'Replace the engine-mounted and frame-mounted fuel filters. Every 30,000 mi (48,000 km), 6 months or 600 engine hours Replace the air inlet foam filter. Every 30,000 mi (48,000 km) 1 Reset the oil change reminder after engine oil and filter changes. See Resetting the Engine Oil Change Reminder (page 463). 612 F-150 (TFD) Canada/United States of America, enUSA, Edition date: 202007, First-Printing Scheduled Maintenance'

In [26]:
ll[2].find('hihhhihi')

-1

In [5]:
for l in ll:
    nl = l.strip()
    nl = re.sub(r'\n', ' ', nl)
    ' '.join(ss.split()) in ' '.join(nl.split())

'Replace the engine-mounted and frame-mounted fuel filters.  Every 30,000 mi (48,000 km), 6 months or 600 engine hours Replace the air inlet foam filter.  Every 30,000 mi (48,000 km) 1 Reset the oil change reminder after engine oil and filter changes. See Resetting the Engine Oil Change Reminder (page 463).  612 F-150 (TFD) Canada/United States of America, enUSA, Edition date: 202007, First-Printing Scheduled Maintenance'

In [4]:
import spacy

In [46]:
text = ["What the hell, this is a gone case. What are you doing?"]

nlp = spacy.load("en_core_web_sm")  # use your model here

# doc = nlp(text)
# sentences = [sentence.text for sentence in doc.sents]
for doc in nlp.pipe(text, n_process=2):
    print(doc.sents)
    sentences = [sentence.text for sentence in doc.sents]

# docs_tokens = []
# for doc in nlp.pipe(docs, n_process=2):
#     tokens = [t.text for t in doc]
#     docs_tokens.append(tokens)

<generator object at 0x7f38c81ff2e0>


In [47]:
sentences

['What the hell, this is a gone case.', 'What are you doing?']

In [37]:
doc

What the hell, this is a gone case

In [12]:
# base_path = '/opt/predii/kamal/RAG/streamlit/streamlit_documents'

# doc_path = base_path + '/' + '98.pdf'

doc_path = './streamlit_documents/46.pdf'

doc = fitz.open(doc_path)

page = doc.load_page(0)
text = page.get_text()
text

'TECHNICAL SERVICE BULLETIN\nLack Of Heat Or Cooling From The Cabin Vents - Temperature Door \nOr Door Actuator Binding/Inoperative - DTC B1081:07 \n20-\n2236\n15 July 2020\nModel:\nFord\n2015-2019 F-150\n2017-2019 F-Super Duty\nIssue: Some 2015-2019 F-150, 2017-2019 F-Super Duty vehicles may exhibit a lack of heat or cooling \nfrom the cabin vents when the climate settings are adjusted between hot and cold. This may be due to a \nstuck/binding temperature door within the climate control housing or binding/inoperative temperature door \nactuator. Diagnostic trouble code (DTC) B1081:07 may be present in the front controls interface module \n(FCIM). To correct the condition, follow the Service Procedure steps to remove the climate control housing \nand replace the temperature door actuator(s) and/or lubricate the temperature blend door bearing surface.\nAction: Follow the service procedure steps to correct the condition on vehicles that meet all of the \nfollowing criteria:\n• One of the

In [16]:
text_to_highlight = ' TECHNICAL SERVICE BULLETIN\nLack Of Heat Or Cooling From The Cabin Vents - Temperature Door \nOr Door Actuator Binding/Inoperative - DTC B1081:07 \n20-\n2236\n15 July 2020\nModel:\n Ford\n2015-2019 F-150\n2017-2019 F-Super Duty\nIssue: Some 2015-2019 F-150, 2017-2019 F-Super Duty vehicles may exhibit a lack of heat or cooling \nfrom the cabin vents when the climate settings are adjusted between hot and cold.'
page.search_for(text_to_highlight.strip())

[]

In [18]:
text.find(text_to_highlight)

-1

In [13]:
text[:re.search(r'Parts|SERVICE PROCEDURE|WARRANTY STATUS|Warranty Status:', text).start()]

'TECHNICAL SERVICE BULLETIN\n2000-2020 Ford/Lincoln/Mercury Vehicles - Aluminum Panel \nCorrosion\n20-\n2233\n10 July 2020\nThis bulletin supersedes 19-2026 . Reason for update: Concern Carryover to New Model\nModel:\nFord\n2000-2020 All\nLincoln\n2000-2020 All\nMercury\n2000-2011 All\nSummary\nThis article supersedes TSB 19-2026 to update the vehicle model years affected.\nIssue: Some 2000 and newer Ford/Lincoln/Mercury vehicles equipped with aluminum body panels may \nexhibit corrosion concerns appearing as bubbled and/or peeling paint with or without accompanying white \ndust. Panel replacement is recommended.\nAction: Follow the Service Procedure to correct the condition on vehicles that meet all of the following \ncriteria:\n• 2000-2020 Ford/Lincoln or 2000-2011 Mercury vehicle\n• Equipped with an aluminum body panel\n• Corrosion concerns appearing as bubbled and/or peeling paint with or without white dust\nNOTE: Part quantity refers to the number of that service part number requi

In [18]:
import numpy as np
emb = np.zeros((2,4))
meta = [('0.pdf', 'bhjbbk'), ('1.pdf', 'hkkhjk')]

In [19]:
{meta[idx][0]:emb[idx] for idx, _ in enumerate(meta)}

{'0.pdf': array([0., 0., 0., 0.]), '1.pdf': array([0., 0., 0., 0.])}

In [53]:
page = doc.load_page(0)
rect = page.rect
height = 150
clip = fitz.Rect(0, 0, rect.width, height)
text = page.get_text(clip=clip)
text

'TECHNICAL SERVICE BULLETIN\n2000-2020 Ford/Lincoln/Mercury Vehicles - Aluminum Panel \nCorrosion\n20-\n2233\n10 July 2020\nPage 1 of 2\n'

In [21]:
# text = 'All of the good Parts are gone now. There is no disregard. \nIssue: All parts are gone. \nAction: We have order new ones. \nParts: Wheel.'

# match = re.search(r'((Action|ACTION)(.|\n)+)\n(Parts|SERVICE PROCEDURE|WARRANTY STATUS|Warranty Status:)', text)
act_match = re.search(r'Action:|ACTION', text)

metadata = ''

if act_match != None:
    metadata += text[:act_match.start()]
    sp_text = text[act_match.start():]
    
    match = re.search(r'\n(Parts|SERVICE PROCEDURE|WARRANTY STATUS|Warranty Status:)', sp_text)
    
    if match != None:
        start = match.start()
        metadata += sp_text[:start]
    
    

In [57]:
metadata

'TECHNICAL SERVICE BULLETIN\nTorque-On-Demand (TOD) Transfer Case - \nGrinding/Clicking/Ratcheting Noise From Front Wheel Area\n20-2307\n15 October \n2020\nModel:\nFord\n2003-2020 Expedition\n2006-2020 F-150\nLincoln\n2003-2020 Navigator\nIssue: Some 2003-2020 Expedition/Navigator and 2006-2020 F-150 non-Raptor vehicles equipped with a \nTOD transfer case may exhibit grinding/clicking/ratcheting noise from the front wheel area. This may be \ndue to partial engagement of the integrated wheel ends (IWE). To correct this condition, follow the Service \nProcedure steps to remove and cap the vacuum supply line.\nAction: Follow the Service Procedure steps to correct the condition on vehicles that meet all the following \ncriteria:\n• One of the following vehicles:\n- 2003-2020 Expedition/Navigator\n- 2006-2020 F-150 except Raptor\n• Equipped with TOD transfer cases\n• Grinding/clicking/ratcheting noise from the front axle area\nNOTE: Part quantity refers to the number of that service part nu

In [23]:
print(sp_text)

Action: Follow the Service Procedure to correct the condition on vehicles that meet the following criteria:
• One of the following vehicle lines:
- 2013-2018 C-Max
- 2013-2020 Escape/Fusion/MKZ
- 2015-2018 Focus
- 2015-2020 Edge/F-150/MKC/Mustang
- 2016-2018 MKX
- 2016-2019 Explorer
- 2017-2020 F-Super Duty



In [27]:
print(text[:match.start()+len(match[1])])

All of the good Parts are gone now. There is no disregard. 
Issue: All parts are gone. 
Action: We have order new ones. 


In [36]:
text

'TECHNICAL SERVICE BULLETIN\nTorque-On-Demand (TOD) Transfer Case - \nGrinding/Clicking/Ratcheting Noise From Front Wheel Area\n20-2307\n15 October \n2020\nModel:\nFord\n2003-2020 Expedition\n2006-2020 F-150\nLincoln\n2003-2020 Navigator\nIssue: Some 2003-2020 Expedition/Navigator and 2006-2020 F-150 non-Raptor vehicles equipped with a \nTOD transfer case may exhibit grinding/clicking/ratcheting noise from the front wheel area. This may be \ndue to partial engagement of the integrated wheel ends (IWE). To correct this condition, follow the Service \nProcedure steps to remove and cap the vacuum supply line.\nAction: Follow the Service Procedure steps to correct the condition on vehicles that meet all the following \ncriteria:\n• One of the following vehicles:\n- 2003-2020 Expedition/Navigator\n- 2006-2020 F-150 except Raptor\n• Equipped with TOD transfer cases\n• Grinding/clicking/ratcheting noise from the front axle area\nNOTE: Part quantity refers to the number of that service part nu

In [33]:
print(text)

TECHNICAL SERVICE BULLETIN
Torque-On-Demand (TOD) Transfer Case - 
Grinding/Clicking/Ratcheting Noise From Front Wheel Area
20-2307
15 October 
2020
Model:
Ford
2003-2020 Expedition
2006-2020 F-150
Lincoln
2003-2020 Navigator
Issue: Some 2003-2020 Expedition/Navigator and 2006-2020 F-150 non-Raptor vehicles equipped with a 
TOD transfer case may exhibit grinding/clicking/ratcheting noise from the front wheel area. This may be 
due to partial engagement of the integrated wheel ends (IWE). To correct this condition, follow the Service 
Procedure steps to remove and cap the vacuum supply line.
Action: Follow the Service Procedure steps to correct the condition on vehicles that meet all the following 
criteria:
• One of the following vehicles:
- 2003-2020 Expedition/Navigator
- 2006-2020 F-150 except Raptor
• Equipped with TOD transfer cases
• Grinding/clicking/ratcheting noise from the front axle area
NOTE: Part quantity refers to the number of that service part number required, which may

In [4]:
import torch
from sentence_transformers import SentenceTransformer

In [24]:
query = ['Show me TSBs around HVAC for my Ford F-150.']

model_name = 'intfloat/e5-large'
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
prompt = 'query: '

model = SentenceTransformer(model_name)
query_emb = model.encode(query, show_progress_bar=False, convert_to_numpy=True, batch_size=32, device=device, prompt=prompt)
contexts = db.search(collection_name='test10', query_emb=query_emb, topk=10)



In [25]:
contexts

[{'file_name': '94.pdf',
  'metadata': '4/28/23, 8:49 AM https://www.fordservicecontent.com/Ford_Content/vdirsnet/TSB/EU/~WTSB23-2115/US/EN/~UEmployee/default.aspx?VIN=&vers… https://www.fordservicecontent.com/Ford_Content/vdirsnet/TSB/EU/~WTSB23-2115/US/EN/~UEmployee/default.aspx?VIN=&version=4.0&environme… 1/9 \xa0\xa0  TECHNICAL SERVICE BULLETIN Lack Of Heat Or Cooling From The Cabin Vents, Temperature Door Or Door Actuator Binding/Inoperative - DTC B1081:07 23-2115 14 April 2023 This bulletin supersedes 21-2199. Model: Ford 2015-2020 F-150 2017-2022 F-Super Duty Summary This article supersedes TSB 21-2199 to update the vehicle model years affected, Service Procedure, and the Part List. Issue: Some 2015-2020 F-150 and 2017-2022 F-Super Duty vehicles may exhibit a lack of heat or cooling from the cabin vents when the climate control settings are adjusted between hot and cold. This may be due to a stuck/binding temperature door within the climate control housing or binding/inoperative

In [16]:
query_emb

array([[-0.34180883,  0.42854288, -0.4481907 , ..., -0.3228751 ,
         0.14512675, -0.45524088]], dtype=float32)

In [7]:
old_q_emb = query_emb
old_q_emb

array([[-0.47079986,  0.6046135 , -0.74429107, ..., -0.29396424,
         0.18610922, -0.36622542]], dtype=float32)

In [59]:
list(set([m['file_name'] for m in contexts]))

['5.pdf', '35.pdf', '42.pdf', '80.pdf', '43.pdf', '31.pdf', '47.pdf', '1.pdf']

In [43]:
# len(contexts[0]['metadata'].split())
len(x[0][0].payload['metadata'].split())

189

In [33]:
x = db.client.scroll(
    collection_name="test10",
    scroll_filter=models.Filter(
        must=[
            models.FieldCondition(key="file_name", match=models.MatchValue(value="50.pdf")),
        ]
    ),
    # limit=1,
    with_payload=True,
    with_vectors=True,
)

x

# print(x)

([Record(id=889, payload={'file_name': '50.pdf', 'metadata': 'TECHNICAL SERVICE BULLETIN Seat Pan Looseness, Lateral Movement, Or Clunk/Thump Noise On  Turns 20- 2171 15 May 2020 This bulletin supersedes 18-2361 . Reason for update: Concern Carryover to New Model Model: Ford 2013-2018 C-MAX 2018-2020 EcoSport 2015-2020 Edge 2013-2020 Escape 2018-2020 Expedition 2016-2019 Explorer 2015-2020 F-150 2017-2020 F-Super Duty 2015-2018 Focus 2013-2020 Fusion 2015-2020 Mustang 2019-2020 Ranger Lincoln 2015-2019 MKC 2016-2018 MKX 2013-2020 MKZ 2019-2020 Nautilus Summary This article supersedes TSB 18-2361 update the vehicle model years affected and vehicles affected. Issue: Some 2013-2018 C-Max, 2013-2020 Escape/Fusion/MKZ, 2015-2018 Focus,2015-2019 MKC,  2015-2020 Edge/F150/Mustang, 2016-2018 MKX, 2016-2019 Explorer, 2017-2020 F-Super Duty, 2018- 2020 EcoSport/Expedition, 2019-2020 Ranger/Nautilus vehicles may experience seat pan looseness,  lateral movement, or a clunk/thump noise on turns. Th

In [32]:
# Run this cell if you want to see the number of points of each doc in the collection
db.client.count(
    collection_name="test10",
    count_filter=models.Filter(
        must=[
            models.FieldCondition(key="file_name", match=models.MatchValue(value='50.pdf')),
        ]
    ),
    exact=True,
)

# '17.pdf'
# '2021-Ford-F-150-Owners-Manual-version-1_om_EN-US_09_2020.pdf'
# 'nvidia_10k.pdf'
# 'toyota_camry_2020.csv'

CountResult(count=23)

In [30]:
s = """{
"answer": "The Technical Service Bulletins (TSBs) related to HVAC for your 2018 Ford F-150 are:

* WTSB23-2115: Lack Of Heat Or Cooling From The Cabin Vents, Temperature Door Or Door Actuator Binding/Inoperative - DTC B1081:07
* WTSB20-2034: With Block Heater - Lack Of Heat When Operating In Temperatures Below -20°C (4°F) - Built On Or Before 1-Jan-2019
* WTSB20-2236: Lack Of Heat Or Cooling From The Cabin Vents - Temperature Door Or Door Actuator Binding/Inoperative - DTC B1081:07

These TSBs provide information on how to diagnose and repair issues related to the heating and cooling system in your vehicle, including stuck or binding temperature doors and inoperative temperature door actuators.",
"citations": [
{
"source_id": 1,
"quote": "This article supersedes TSB 21-2199 to update the vehicle model years affected, Service Procedure, and the Part List."
},
{
"source_id": 2,
"quote": "This bulletin supersedes 19-2121. Reason for update: Incorrect or Missing Parts"
},
{
"source_id": 3,
"quote": "Issue: Some 2015-2019 F-150, 2017-2019 F-Super Duty vehicles may exhibit a lack of heat or cooling from the cabin vents when the climate settings are adjusted between hot and cold."
},
{
"source_id": 4,
"quote": "This article supersedes TSB 15-0052 to update the vehicle model years, Service Procedure and Part List."
},
{
"source_id": 5,
"quote": "Reassemble the lower half of the climate control housing. Refer to WSM, Section 412-00, Installation procedure."
},
{
"source_id": 6,
"quote": "The Temperature Door Bearing Surfaces Includes Time To Remove And Install Climate Control Housing And Install Flock Tape (Do Not Use With Any Other Labor Operation)"
},
{
"source_id": 7,
"quote": "Lubricate the climate control housing and temperature blend door bearing surfaces with Motorcraft HVAC Duct Component Grease."
},
{
"source_id": 8,
"quote": "Action: Follow the service procedure steps to correct the condition on vehicles that meet all of the following criteria:"
}
]
}"""

json.loads(s, strict=False)

{'answer': 'The Technical Service Bulletins (TSBs) related to HVAC for your 2018 Ford F-150 are:\n\n* WTSB23-2115: Lack Of Heat Or Cooling From The Cabin Vents, Temperature Door Or Door Actuator Binding/Inoperative - DTC B1081:07\n* WTSB20-2034: With Block Heater - Lack Of Heat When Operating In Temperatures Below -20°C (4°F) - Built On Or Before 1-Jan-2019\n* WTSB20-2236: Lack Of Heat Or Cooling From The Cabin Vents - Temperature Door Or Door Actuator Binding/Inoperative - DTC B1081:07\n\nThese TSBs provide information on how to diagnose and repair issues related to the heating and cooling system in your vehicle, including stuck or binding temperature doors and inoperative temperature door actuators.',
 'citations': [{'source_id': 1,
   'quote': 'This article supersedes TSB 21-2199 to update the vehicle model years affected, Service Procedure, and the Part List.'},
  {'source_id': 2,
   'quote': 'This bulletin supersedes 19-2121. Reason for update: Incorrect or Missing Parts'},
  {'so