In [1]:
# Standard library
import logging
import os  # For output paths and directory creation
import re  # For regex operations

# Data Handling and Manipulation
import pandas as pd

# Natural Language Processing (NLP) and Embeddings
import spacy

# Machine Learning Pipelines
from sklearn.pipeline import Pipeline

# Custom pipeline classes
from pipeline.chroma_db import ChromaDBSaver
from pipeline.pdf_reader import PDFReader
from pipeline.text_proccessor import TextFormatter
from pipeline.chunk_proccessor import SentenceChunkerWithSummarization
from pipeline.question_generator import QuestionAnswerGenerator
from pipeline.embedding_proccessor import EmbeddingGenerator

# ChromaDBSearcher
from common.chroma_db import ChromaDBSearcher

In [2]:
# Set up logging
# Configure the root logger at INFO level so INFO records from this notebook
# and from the imported pipeline modules show up in the cell output.
logging.basicConfig(level=logging.INFO)
# Logger named after the current module; process_document passes it to PDFReader.
logger = logging.getLogger(__name__)

def log_output(string):
    """No-op logging hook: accept a message and discard it.

    The forwarding call to ``logger.info`` is disabled, so this function has
    no effect and always returns ``None``.

    Args:
        string: The message that would be logged; currently ignored.
    """
    # logger.info(string)
    return None

### Document Processing Pipeline

This cell processes a document through a series of transformations and generates embeddings, QA pairs, and chunk-level data. The chunk-level data and the QA pairs are each saved to a CSV file, and the embeddings are stored in ChromaDB.

1. **Pipeline Steps**:
   - The `process_document()` function orchestrates a pipeline of transformations on a PDF document.
   - The pipeline includes several steps that are applied sequentially to the document:
     1. **PDFReader**: Reads the PDF document from the specified file path.
     2. **TextFormatter**: Formats the text extracted from the PDF (e.g., removing unwanted characters or formatting).
     3. **SentenceChunkerWithSummarization**: Divides the document into chunks and summarizes the content.
     4. **QuestionAnswerGenerator**: Generates questions and corresponding answers for each chunk of text.
     5. **EmbeddingGenerator**: Generates embeddings for the text using a pre-trained model.
     6. **ChromaDBSaver**: Saves the embeddings and document data into a ChromaDB collection.

2. **Document Processing**:
   - A document ID is created based on the document's attributes (`make`, `model`, `year`, `style`).
   - The `PDFReader` class is instantiated manually, as it requires the file path to read the PDF.
   - The document is processed through the pipeline, where the text is formatted, chunked, questions are generated, and embeddings are created.

3. **Saving Results**:
   - The embeddings are saved to ChromaDB for further retrieval and analysis.
   - The document's chunk data and corresponding generated questions and answers are extracted into separate lists (`all_chunk_data` and `all_QandA`).
   - The chunk data and QA pairs are stored in pandas DataFrames.

4. **Exporting to CSV**:
   - The chunk-level data is saved to a CSV file in the `manuals/` directory, with the name based on the document ID (`"manuals/" + document_id + ".csv"`).
   - The generated QA pairs are saved to a separate CSV file in the same directory (`"manuals/" + document_id + "_QA.csv"`).

This pipeline allows for automated document processing, transforming raw PDF data into structured and searchable information, which can then be used for various retrieval and analysis tasks.


In [3]:
def process_document(document, output_dir="manuals"):
    """Run one PDF document through the processing steps and export the results.

    The PDF is read, formatted, chunked, enriched with generated
    question/answer pairs and embedded. The embeddings are handed to the
    ChromaDB saver, and the per-chunk data and the QA pairs are written to two
    CSV files named after the document ID.

    Args:
        document: Mapping with the keys ``make``, ``model``, ``year`` and
            ``style`` (combined into the document ID) and ``pdf_path`` (the
            PDF file to read).
        output_dir: Directory that receives ``<document_id>.csv`` and
            ``<document_id>_QA.csv``. Created if it does not exist. Defaults
            to ``"manuals"``, the previously hard-coded location.

    Returns:
        None. Results are persisted to ChromaDB and to the CSV files.

    Raises:
        KeyError: If ``document`` lacks one of the required keys. The lookup
            happens before any step object is built, so a malformed document
            fails fast instead of passing ``None`` to the PDF reader.
    """
    # Validate the input up front, before the (presumably expensive) steps are built.
    pdf_path = document['pdf_path']
    document_id = f"{document['make']}_{document['model']}_{document['year']}_{document['style']}"

    # The Pipeline is used as a named registry of steps; each step is invoked
    # explicitly below because the steps take differing extra arguments.
    # NOTE(review): the steps are rebuilt on every call, which presumably
    # reloads their models per document - consider building them once.
    pipeline = Pipeline(steps=[
        ('pdf_reader', PDFReader(pdf_path, logger)),  # Step 1: Read PDF (needs the file path)
        ('text_formatter', TextFormatter()),  # Step 2: Format text
        ('sentence_chunker', SentenceChunkerWithSummarization()),  # Step 3: Chunk sentences
        ('question_answer_generator', QuestionAnswerGenerator()),  # Step 4: Generate QA pairs
        ('embedding_generator', EmbeddingGenerator()),  # Step 5: Generate embeddings
        ('chromadb_saver', ChromaDBSaver())  # Step 6: Save to ChromaDB
    ])
    steps = pipeline.named_steps

    # Read the PDF, then pass the result through each text-processing step in order
    result = steps['pdf_reader'].fit_transform(pdf_path)
    result = steps['text_formatter'].transform(result)
    result = steps['sentence_chunker'].transform(result)
    result = steps['question_answer_generator'].transform(result)

    # Generate embeddings from the processed chunks and the document metadata
    embeddings = steps['embedding_generator'].transform(result, document)

    # Save the embeddings to ChromaDB, pairing each one with the document metadata
    steps['chromadb_saver'].transform(embeddings, [document] * len(embeddings))

    # Column layout of the two CSV exports (order defines the CSV column order)
    chunk_columns = [
        "sentence_chunk",
        "chunk_char_count",
        "chunk_word_count",
        "chunk_token_count",
        "page_number",
        "summary_text",
        "summary_char_count",
        "summary_word_count",
        "para_id",
    ]
    qa_columns = ["page_number", "para_id", "sentence_chunk", "question", "answer"]

    all_chunk_data = []
    all_QandA = []
    for chunk in result:
        all_chunk_data.append({column: chunk[column] for column in chunk_columns})

        # One QA row per generated question; the answer is looked up by position
        for index, question in enumerate(chunk["generated_questions"]):
            all_QandA.append({
                "page_number": chunk["page_number"],
                "para_id": chunk["para_id"],
                "sentence_chunk": chunk["sentence_chunk"],
                "question": question,
                "answer": chunk["generated_answers"][index],
            })

    # Passing the columns explicitly keeps the CSV header even when no rows were produced
    df = pd.DataFrame(all_chunk_data, columns=chunk_columns)
    df_qa = pd.DataFrame(all_QandA, columns=qa_columns)

    # Make sure the target directory exists, then write both CSV files
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(os.path.join(output_dir, document_id + ".csv"), index=False)
    df_qa.to_csv(os.path.join(output_dir, document_id + "_QA.csv"), index=False)


### Main Pipeline Execution: Processing Multiple Documents

This cell runs the document processing pipeline on a list of input documents. It demonstrates how to apply the pipeline to multiple documents in sequence.

1. **Input Documents**:
   - A list of dictionaries is defined, where each dictionary contains metadata for a document:
     - `make`, `model`, `year`, and `style`: These are attributes of the document (e.g., the make and model of a vehicle or product).
     - `pdf_path`: The file path to the PDF document that will be processed.
   
   Two example documents are provided:
   - "Fraggles X500 2024 FMS"
   - "Fraggles X700 2022 HCM"

2. **Processing Each Document**:
   - The code loops through each document in the `input_documents` list.
   - For each document, the `process_document()` function is called, which processes the document using the predefined pipeline (as described in the previous markdown explanation).
   - This includes reading the PDF, extracting and formatting text, chunking the text, generating question-answer pairs, creating embeddings, and saving results to ChromaDB and CSV files.

By using this loop, you can easily process multiple documents in batch, allowing for scalable processing and storage of information for various documents in the collection.


In [4]:

# Metadata for every document to ingest: the attributes that form the
# document ID (make/model/year/style) plus the location of the PDF.
input_documents = [
    dict(
        make="Fraggles",
        model="X500",
        year="2024",
        style="FMS",
        pdf_path="manuals/FragglesX500FMS-2024-V4.pdf",
    ),
    dict(
        make="Fraggles",
        model="X700",
        year="2022",
        style="HCM",
        pdf_path="manuals/FragglesX700HCM-2022-V2.pdf",
    ),
]

# Run the processing function over each document in turn
for doc in input_documents:
    process_document(doc)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpo

Reading PDF pages:   0%|          | 0/97 [00:00<?, ?it/s]

INFO:__main__:Successfully read 97 pages from manuals/FragglesX500FMS-2024-V4.pdf
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 0
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 1
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 2
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 3
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 4
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 5
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 6
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 7
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 8
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 9
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 10
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 11
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 12
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 13
INFO:pipeline.chun

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-mpnet-base-v2


Reading PDF pages:   0%|          | 0/28 [00:00<?, ?it/s]

INFO:__main__:Successfully read 28 pages from manuals/FragglesX700HCM-2022-V2.pdf
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 0
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 1
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 2
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 3
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 4
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 5
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 6
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 7
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 8
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 9
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 10
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 11
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 12
INFO:pipeline.chunk_proccessor:Extracted sentences from page: 13
INFO:pipeline.chun

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

### Example Usage: Searching with ChromaDB

This cell demonstrates how to use the `ChromaDBSearcher` class to search for relevant context within a document using a query.

1. **Initialization**:
   - A `ChromaDBSearcher` object (`searcher`) is instantiated. This object will interact with a ChromaDB collection to retrieve relevant document chunks.
   
2. **Setting the Document Source**:
   - The variable `document_source` is set to `"Fraggles_X500_2024_FMS"`, which is the document ID you wish to search within.
   - You can replace this with any other document ID (e.g., `"Ford_Mustang_2023_MACH-E"`) depending on the document you are interested in.

3. **Defining the Query**:
   - The `query` variable contains the text string `"how to use parking breakes?"`, which will be used to search for relevant answers within the specified document.
   - This is the search term or question for which you want to find relevant content from the document.

4. **Performing the Search**:
   - The `search_by_id()` method of `ChromaDBSearcher` is called with the `document_source` and `query` as arguments.
   - This method will return the top results (up to 10 by default) based on the relevance of the query and the document chunks stored in ChromaDB.

This demonstrates how you can use ChromaDB to search for specific information in a document based on a query. It retrieves relevant text chunks that may provide an answer or context to the question posed in the query.


In [5]:
# Example usage
# Create a searcher with its default configuration.
searcher = ChromaDBSearcher()
document_source = "Fraggles_X500_2024_FMS"  # Replace with the actual document ID you want to search for
#document_source = "Ford_Mustang_2023_MACH-E"
# NOTE(review): "breakes" is presumably a typo for "brakes"; left unchanged
# because the recorded cell output was produced with this exact query.
query = "how to use parking breakes?"  # Replace with the query you want to search for

# Last expression of the cell, so the value returned by the search is shown as the cell output.
searcher.search_by_id(document_source, query)


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-mpnet-base-v2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

["['FragglesX500FMS 2024          49  8. Release the brake pedal and allow the system to  maneuver the vehicle. 9. The vehicle will back into the parking space and shift  into park (P) when complete. Note: The system centers the vehicle between objects, not  based on parking lines. Exiting a Parallel Parking Space  1. Press the Parking Aid button. 2. Tap the Active Park Assist icon on the touchscreen. 3.', 'Select Parallel Park Exit mode. 4. Use the turn signal to choose the direction for exit. 5. Press and hold the brake pedal. 6. Shift into neutral (N), then press and hold the  Parking Aid button. 7. Release the parking brake and let the system  maneuver the vehicle out of the space. 8.', 'Tap the Active Park Assist icon on the touchscreen. 3. Select Perpendicular Parking mode. 4. Use the turn signal to indicate the side you want to  park. 5. Drive about 3 ft (1 m) away from parked vehicles. 6. The system detects the space; press and hold the  brake pedal. 7.']"]

In [6]:
# Shell escape: print the current GPU status as reported by nvidia-smi.
!nvidia-smi

Sat Nov 30 22:31:28 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


|   0  Tesla V100-PCIE-16GB           On  |   00000000:3B:00.0 Off |                    0 |
| N/A   36C    P0             36W /  250W |    1751MiB /  16384MiB |      1%   E. Process |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla V100-PCIE-16GB           On  |   00000000:D8:00.0 Off |                    0 |
| N/A   34C    P0             38W /  250W |       3MiB /  16384MiB |      0%   E. Process |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                      