### First is cell experimental, pip install camelot

In [None]:
# install dependencies
!pip install PyMuPDF, "camelot-py[cv]",

In [None]:
# import fitz  # PyMuPDF
# import pandas as pd
# from PIL import Image
# import io
# import os

# def extract_text_from_pdf(pdf_path):
#     document = fitz.open(pdf_path)
#     text = ""
#     for page_num in range(len(document)):
#         page = document.load_page(page_num)
#         text += page.get_text()
#     return text

# def extract_tables_from_pdf(pdf_path):
#     # Note: PyMuPDF does not directly support table extraction.
#     #       You would need to integrate with a library like Camelot for more accurate table extraction.
#     import camelot

#     tables = camelot.read_pdf(pdf_path, pages='all')
#     tables_list = [table.df for table in tables]
#     return tables_list

# def extract_images_from_pdf(pdf_path, output_folder):
#     document = fitz.open(pdf_path)
#     image_list = []

#     for page_num in range(len(document)):
#         page = document.load_page(page_num)
#         images = page.get_images(full=True)
        
#         for img_index, img in enumerate(images):
#             xref = img[0]
#             base_image = document.extract_image(xref)
#             image_bytes = base_image["image"]
#             image_ext = base_image["ext"]
#             image = Image.open(io.BytesIO(image_bytes))

#             image_filename = f"page_{page_num + 1}_img_{img_index + 1}.{image_ext}"
#             image_path = os.path.join(output_folder, image_filename)
#             image.save(image_path)
#             image_list.append(image_path)

#     return image_list

# # Usage example:
# pdf_path = 'path/to/your/document.pdf'
# output_folder = 'path/to/save/images'

# # Extract text
# text = extract_text_from_pdf(pdf_path)
# print("Extracted Text:\n", text)

# # Extract tables
# tables = extract_tables_from_pdf(pdf_path)
# for i, table in enumerate(tables):
#     print(f"Table {i + 1}:\n", table)

# # Extract images
# if not os.path.exists(output_folder):
#     os.makedirs(output_folder)
# images = extract_images_from_pdf(pdf_path, output_folder)
# print("Extracted Images:", images)


In [6]:
import fitz  # PyMuPDF
from PIL import Image
import io
import os
import json
import torch
from transformers import BertTokenizer, BertForTokenClassification
from transformers import pipeline

# Initialize the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-large-cased-finetuned-conll03-english')
model = BertForTokenClassification.from_pretrained('dbmdz/bert-large-cased-finetuned-conll03-english')
nlp = pipeline('ner', model=model, tokenizer=tokenizer)

def extract_text_from_pdf(pdf_path):
    document = fitz.open(pdf_path)
    text_data = []
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text = page.get_text("text")
        text_data.append({"page": page_num + 1, "text": text})
    return text_data

def extract_images_from_pdf(pdf_path, output_folder):
    document = fitz.open(pdf_path)
    image_list = []

    for page_num in range(len(document)):
        page = document.load_page(page_num)
        images = page.get_images(full=True)
        
        for img_index, img in enumerate(images):
            xref = img[0]
            base_image = document.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image = Image.open(io.BytesIO(image_bytes))

            image_filename = f"page_{page_num + 1}_img_{img_index + 1}.{image_ext}"
            image_path = os.path.join(output_folder, image_filename)
            image.save(image_path)
            image_list.append({"page": page_num + 1, "image_path": image_path})

    return image_list

def extract_sections_and_headings(text):
    # Use the NLP model to extract named entities which can be used as headings/sections
    nlp_results = nlp(text)
    sections = []
    current_section = {"title": "Introduction", "content": ""}
    
    for result in nlp_results:
        if result['entity'] == 'B-MISC':  # Assuming sections/headings are categorized as MISC
            if current_section["content"]:
                sections.append(current_section)
            current_section = {"title": result['word'], "content": ""}
        current_section["content"] += " " + text[result['start']:result['end']]
    
    if current_section["content"]:
        sections.append(current_section)
    
    return sections

def create_json_structure(text_data, image_data):
    manual_data = {
        "title": "Car Owner's Manual",
        "author": "Manufacturer Name",
        "content": []
    }

    for text in text_data:
        page_num = text["page"]
        images_on_page = [img for img in image_data if img["page"] == page_num]
        
        sections = extract_sections_and_headings(text["text"])
        
        page_data = {
            "page": page_num,
            "sections": sections,
            "images": images_on_page
        }
        manual_data["content"].append(page_data)

    return manual_data

# Usage example:
pdf_path = '/Users/varunbharadwaj/Desktop/BOSCH_Hackathon/manuals/exter.pdf'
output_folder = 'images_exter'

# Extract text
text_data = extract_text_from_pdf(pdf_path)

# Extract images
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
image_data = extract_images_from_pdf(pdf_path, output_folder)

# Create JSON structure
manual_json = create_json_structure(text_data, image_data)

# Save to JSON file
json_path = 'exter.json'
with open(json_path, 'w') as json_file:
    json.dump(manual_json, json_file, indent=4)

print(f"JSON saved to {json_path}")


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


JSON saved to exter.json


In [5]:
import os
import json
from datetime import datetime

def get_json_metadata(json_path):
    # Get file size
    file_size = os.path.getsize(json_path)

    # Get creation and modification date
    creation_time = os.path.getctime(json_path)
    modification_time = os.path.getmtime(json_path)

    # Convert to human-readable format
    creation_date = datetime.fromtimestamp(creation_time).isoformat()
    modification_date = datetime.fromtimestamp(modification_time).isoformat()

    # Read the JSON file to extract custom metadata
    with open(json_path, 'r') as json_file:
        data = json.load(json_file)

    # Example custom metadata extraction (adjust according to your JSON structure)
    metadata = {
        "title": data.get("title", "N/A"),
        "author": data.get("author", "N/A"),
        "creation_date": data.get("creation_date", "N/A")
    }

    # Combine all metadata
    file_metadata = {
        "file_size": file_size,
        "creation_date": creation_date,
        "modification_date": modification_date,
        "custom_metadata": metadata
    }

    return file_metadata

# Usage example
json_path = 'exter.json'
metadata = get_json_metadata(json_path)
print(metadata)


{'file_size': 2309819, 'creation_date': '2024-05-18T20:19:09.220644', 'modification_date': '2024-05-18T20:19:09.220644', 'custom_metadata': {'title': "Car Owner's Manual", 'author': 'TATA Nexon', 'creation_date': 'N/A'}}


In [None]:
{
    "title": "Car Owner's Manual",
    "author": "TATA Nexon",
    "content": [
        {
            "page": 12,
            "sections": [
                {
                    "title": "Introduction",
                    "content": " turning \nDriving On Gradients \nWhen climbing gradient, the vehicle may \nbegin to slow down and show a lack of \npower. If this happens, shift to a lower gear \nand apply power smoothly so that there is \nno loss of traction. \nWhen driving down a hill, the engine brak\u00ad\ning should be used by shifting into a lower \ngear. Do not drive in neutral gear or switch \noff the engine.\n \n \nDriving On Highway  \nStopping distance progressively, in-\ncreases with vehicle speed. Maintain a \nsufficient distance between your vehicle \nand the vehicle ahead.  \nFor long distance driving, perform safety \nchecks before starting a trip and take rest \nat certain intervals to prevent fatigue\nSEAT BELTS \nThis section of user manual describes \nyour Vehicle\u2019s seat belt, airbag and Child \nrestraints system. Please read and follow \nall these instructions care-fully to minimise \nrisk of severe injury or death. \nSeat belts are the primary restraints \n\u2022\nsystem in the vehicle. All occupants, \nincluding the driver, should always \nwear their seat belts to minimize the \nrisk of injury.  \nSit back and adjust (if equipped), the \n\u2022\nseat. Make sure that your seat is ad\u00ad\njusted to a good driving position and \nthe back of the seat is upright. \nBuckling The Shoulder Seat Belt \nGrasp the tongue then slowly pull out \n\u2022\nthe seat belt over the shoulder and \nacross the chest. When the seat belt is \nlong enough to fit, insert the tongue \ninto the lock buckle until you hear a \n\u201cCLICK\u201d which indicates that the seat \nbelt is securely locked. \nPosition the lap portion of seat belt \n\u2022\nacross your pelvic bone , below your \n WARNING\nOn long and steep gradients you must \nreduce the load on the brakes by shift\u00ad\ning early to a lower gear. This allows \nyou to take ad-vantage of the engine \nbraking effect and helps avoid over\u00ad\nheating of service brakes resulting in re\u00ad\nduced braking efficiency\nSAFETY\n3\n"
                }
            ],
            "images": [
                {
                    "page": 12,
                    "image_path": "images/page_12_img_1.png"
                },
                {
                    "page": 12,
                    "image_path": "images/page_12_img_2.png"
                }
            ]
        },
    ]
}

In [7]:
import os
import json
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

In [8]:


# Initialize the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to process a single JSON file
def process_json_file(json_path):
    with open(json_path, 'r') as json_file:
        data = json.load(json_file)

    texts = []
    metadata = []

    for page in data['content']:
        page_num = page['page']
        for section in page['sections']:
            section_title = section['title']
            section_content = section['content']
            text = f"Page {page_num}, Section: {section_title}, Content: {section_content}"
            texts.append(text)
            metadata.append({
                "file": os.path.basename(json_path),
                "page": page_num,
                "section_title": section_title
            })

    return texts, metadata

# Directory containing JSON files
json_dir = '/Users/varunbharadwaj/Desktop/BOSCH_Hackathon/trial1/json_files'

# Initialize lists to hold all texts and metadata
all_texts = []
all_metadata = []

# Process each JSON file in the directory
for filename in os.listdir(json_dir):
    if filename.endswith('.json'):
        json_path = os.path.join(json_dir, filename)
        texts, metadata = process_json_file(json_path)
        all_texts.extend(texts)
        all_metadata.extend(metadata)

# Convert texts to embeddings
embeddings = model.encode(all_texts, convert_to_tensor=False)

# Convert embeddings to a numpy array
embeddings = np.array(embeddings)

# Create a FAISS index
d = embeddings.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatL2(d)

# Add embeddings to the index
index.add(embeddings)

# Save the FAISS index and metadata
faiss.write_index(index, 'manual_index.faiss')

with open('manual_metadata.json', 'w') as meta_file:
    json.dump(all_metadata, meta_file)

print("FAISS index and metadata saved.")

FAISS index and metadata saved.


In [9]:
import faiss
from sentence_transformers import SentenceTransformer
import json

# Load the FAISS index and metadata
index = faiss.read_index('manual_index.faiss')

with open('manual_metadata.json', 'r') as meta_file:
    metadata = json.load(meta_file)

# Initialize the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to query the vector database
def query_manual(query_text, top_k=5):
    # Vectorize the query
    query_embedding = model.encode([query_text])
    
    # Search the index
    distances, indices = index.search(query_embedding, top_k)
    
    # Retrieve and print the results
    results = []
    for idx in indices[0]:
        result = metadata[idx]
        results.append(result)
    
    return results

# Example query
query = "How to drive on gradients?"
results = query_manual(query)
print("Top 5 results:")
for res in results:
    print(res)

Top 5 results:
{'file': 'nexon.json', 'page': 12, 'section_title': 'Introduction'}
{'file': 'nexon.json', 'page': 145, 'section_title': 'Introduction'}
{'file': 'exter.json', 'page': 237, 'section_title': 'Introduction'}
{'file': 'exter.json', 'page': 259, 'section_title': 'Introduction'}
{'file': 'nexon.json', 'page': 124, 'section_title': 'Introduction'}


In [11]:
import os
import json
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Directory containing JSON files
json_dir = 'json_files'

# Initialize the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize lists to hold the text and metadata
texts = []
metadata = []

# Loop through each JSON file in the directory
for json_file_name in os.listdir(json_dir):
    if json_file_name.endswith('.json'):
        json_path = os.path.join(json_dir, json_file_name)
        
        # Load JSON data
        with open(json_path, 'r') as json_file:
            data = json.load(json_file)
        
        # Extract text and metadata from the JSON structure
        for page in data['content']:
            page_num = page['page']
            for section in page['sections']:
                section_title = section['title']
                section_content = section['content']
                text = f"Page {page_num}, Section: {section_title}, Content: {section_content}"
                texts.append(text)
                metadata.append({
                    "file_name": json_file_name,
                    "page": page_num,
                    "section_title": section_title,
                    "section_content": section_content
                })

# Convert texts to embeddings
embeddings = model.encode(texts, convert_to_tensor=False)

# Convert embeddings to a numpy array
embeddings = np.array(embeddings)

# Create a FAISS index
d = embeddings.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatL2(d)

# Add embeddings to the index
index.add(embeddings)

# Save the FAISS index and metadata
faiss.write_index(index, 'manuals_index.faiss')

with open('manuals_metadata.json', 'w') as meta_file:
    json.dump(metadata, meta_file)

print("FAISS index and metadata saved.")

FAISS index and metadata saved.


In [14]:
import faiss
from sentence_transformers import SentenceTransformer
import json

# Load the FAISS index and metadata
index = faiss.read_index('manuals_index.faiss')

with open('manuals_metadata.json', 'r') as meta_file:
    metadata = json.load(meta_file)

# Initialize the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to query the vector database
def query_manual(query_text, top_k=5):
    # Vectorize the query
    query_embedding = model.encode([query_text])
    
    # Search the index
    distances, indices = index.search(query_embedding, top_k)
    
    # Retrieve and print the results
    results = []
    for idx in indices[0]:
        result = metadata[idx]
        results.append(result)
    
    return results

# Example query
query = "How to drive on gradients?"
results = query_manual(query,1)
for res in results:
    print(f"File: {res['file_name']}, Page: {res['page']}, Section Title: {res['section_title']}")
    print(f"Content: {res['section_content']}\n")


Top 5 results:
File: nexon.json, Page: 12, Section Title: Introduction
Content:  turning 
Driving On Gradients 
When climbing gradient, the vehicle may 
begin to slow down and show a lack of 
power. If this happens, shift to a lower gear 
and apply power smoothly so that there is 
no loss of traction. 
When driving down a hill, the engine brak­
ing should be used by shifting into a lower 
gear. Do not drive in neutral gear or switch 
off the engine.
 
 
Driving On Highway  
Stopping distance progressively, in-
creases with vehicle speed. Maintain a 
sufficient distance between your vehicle 
and the vehicle ahead.  
For long distance driving, perform safety 
checks before starting a trip and take rest 
at certain intervals to prevent fatigue
SEAT BELTS 
This section of user manual describes 
your Vehicle’s seat belt, airbag and Child 
restraints system. Please read and follow 
all these instructions care-fully to minimise 
risk of severe injury or death. 
Seat belts are the primary rest

In [15]:
query = "Oil change"
results = query_manual(query,1)
for res in results:
    print(f"File: {res['file_name']}, Page: {res['page']}, Section Title: {res['section_title']}")
    print(f"Content: {res['section_content']}\n")

File: exter.json, Page: 302, Section Title: Introduction
Content:  9-22
Maintenance
ENGINE OIL
Engine oil is used for lubricating, 
cooling, and operating various hydraulic 
components in the engine. Engine oil 
consumption while driving is normal, 
and it is necessary to check and refill the 
engine oil regularly. Also, check and refill 
the oil level within the recommended 
maintenance schedule to prevent 
deterioration of oil performance.
Check the engine oil following the below 
procedure.
Checking the engine oil level
1. Follow all of the oil manufacturer’s 
precautions.
2. Be sure the vehicle is on the lovel 
ground in P (Park) with the parking 
brake set. if possible, block the 
wheels.
3. Turn the engine on and warm 
the engine up until the coolant 
temperature reaches a constant 
normal temperature.
4. Turn the engine off, remove the oil 
filler cap and pull the dipstick out. 
Wait for 15 minutes for the oil to return 
to the oil pan.
5. Wipe the dipstick clean and re-insert i

In [13]:
import os
import json
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline

# Directory containing JSON files
json_dir = 'json_files'

# Initialize the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize lists to hold the text and metadata
texts = []
metadata = []

# Loop through each JSON file in the directory
for json_file_name in os.listdir(json_dir):
    if json_file_name.endswith('.json'):
        json_path = os.path.join(json_dir, json_file_name)
        
        # Load JSON data
        with open(json_path, 'r') as json_file:
            data = json.load(json_file)
        
        # Extract text and metadata from the JSON structure
        for page in data['content']:
            page_num = page['page']
            for section in page['sections']:
                section_title = section['title']
                section_content = section['content']
                text = f"Page {page_num}, Section: {section_title}, Content: {section_content}"
                texts.append(text)
                metadata.append({
                    "file_name": json_file_name,
                    "page": page_num,
                    "section_title": section_title,
                    "section_content": section_content
                })

# Convert texts to embeddings
embeddings = model.encode(texts, convert_to_tensor=False)

# Convert embeddings to a numpy array
embeddings = np.array(embeddings)

# Create a FAISS index
d = embeddings.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatL2(d)

# Add embeddings to the index
index.add(embeddings)

# Save the FAISS index and metadata
faiss.write_index(index, 'manuals_index.faiss')

with open('manuals_metadata.json', 'w') as meta_file:
    json.dump(metadata, meta_file)

print("FAISS index and metadata saved.")

# Load the FAISS index and metadata
index = faiss.read_index('manuals_index.faiss')

with open('manuals_metadata.json', 'r') as meta_file:
    metadata = json.load(meta_file)

# Initialize the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize the QA pipeline
qa_pipeline = pipeline("question-answering")

# Function to query the vector database
def query_manual(query_text, top_k=5):
    # Vectorize the query
    query_embedding = model.encode([query_text])
    
    # Search the index
    distances, indices = index.search(query_embedding, top_k)
    
    # Retrieve the results
    results = []
    for idx in indices[0]:
        result = metadata[idx]
        results.append(result)
    
    return results

# Function to generate answers
def generate_answer(query_text, top_k=5):
    results = query_manual(query_text, top_k)
    
    # Prepare context from retrieved sections
    context = " ".join([res['section_content'] for res in results])
    
    # Use the QA model to generate the answer
    answer = qa_pipeline(question=query_text, context=context)
    
    return answer

# Example query
query = "How to drive on gradients?"
answer = generate_answer(query)
print("Generated Answer:")
print(answer)

FAISS index and metadata saved.


No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Generated Answer:
{'score': 0.36999034881591797, 'start': 1670, 'end': 1702, 'answer': 'shift\xad\ning early to a lower gear'}
