In [1]:
!pip install PyPDF2 pdfplumber pdf2image pycryptodome



In [2]:
import os
import json
from PyPDF2 import PdfReader
import pdfplumber
from pdf2image import convert_from_path

In [3]:
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    text = ""
    reader = PdfReader(pdf_path)
    for page in reader.pages:
        text += page.extract_text() or ''
    return text

In [4]:
# Function to extract tables from a PDF file using pdfplumber
def extract_tables_from_pdf(pdf_path):
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            extracted_tables = page.extract_tables()
            for table in extracted_tables:
                tables.append({
                    "page": page_num,
                    "table": table
                })
    return tables

In [5]:
def extract_images_from_pdf(pdf_path, output_directory):
    images = []
    image_dir = os.path.join(output_directory, os.path.splitext(os.path.basename(pdf_path))[0])
    os.makedirs(image_dir, exist_ok=True)

    pages = convert_from_path(pdf_path)
    for page_num, page in enumerate(pages, start=1):
        image_filename = f"page_{page_num}.jpg"  # Adjust extension as needed
        image_path = os.path.join(image_dir, image_filename)
        page.save(image_path)
        images.append({
            "page": page_num,
            "image_file": os.path.relpath(image_path, output_directory)
        })
    return images

In [6]:
def main():
    pdf_directory = "manuals"
    output_directory = "extracted_content"

    # Create output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # Iterate over PDF files in the directory
    for filename in os.listdir(pdf_directory):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_directory, filename)
            
            # Extract text
            text = extract_text_from_pdf(pdf_path)
            
            # Extract tables
            tables = extract_tables_from_pdf(pdf_path)
            
            # Save text to JSON file
            text_output_file = os.path.join(output_directory, f"{os.path.splitext(filename)[0]}_text.json")
            with open(text_output_file, "w", encoding="utf-8") as f:
                json.dump({"filename": filename, "text": text}, f, ensure_ascii=False, indent=4)
            print(f"Text extracted from {pdf_path} and saved to {text_output_file}")
            
            # Save tables to JSON file
            tables_output_file = os.path.join(output_directory, f"{os.path.splitext(filename)[0]}_tables.json")
            with open(tables_output_file, "w", encoding="utf-8") as f:
                json.dump({"filename": filename, "tables": tables}, f, ensure_ascii=False, indent=4)
            print(f"Tables extracted from {pdf_path} and saved to {tables_output_file}")
            
            # Extract and save images
            images = extract_images_from_pdf(pdf_path, output_directory)
            images_output_file = os.path.join(output_directory, f"{os.path.splitext(filename)[0]}_images.json")
            with open(images_output_file, "w", encoding="utf-8") as f:
                json.dump({"filename": filename, "images": images}, f, ensure_ascii=False, indent=4)
            print(f"Images extracted from {pdf_path} and saved to {images_output_file}")

if __name__ == "__main__":
    main()

Text extracted from manuals/nexon.pdf and saved to extracted_content/nexon_text.json
Tables extracted from manuals/nexon.pdf and saved to extracted_content/nexon_tables.json
Images extracted from manuals/nexon.pdf and saved to extracted_content/nexon_images.json
Text extracted from manuals/Verna.pdf and saved to extracted_content/Verna_text.json
Tables extracted from manuals/Verna.pdf and saved to extracted_content/Verna_tables.json
Images extracted from manuals/Verna.pdf and saved to extracted_content/Verna_images.json
Text extracted from manuals/exter.pdf and saved to extracted_content/exter_text.json
Tables extracted from manuals/exter.pdf and saved to extracted_content/exter_tables.json
Images extracted from manuals/exter.pdf and saved to extracted_content/exter_images.json
Text extracted from manuals/punch.pdf and saved to extracted_content/punch_text.json
Tables extracted from manuals/punch.pdf and saved to extracted_content/punch_tables.json
Images extracted from manuals/punch.p

In [4]:
import json

def get_json_metadata(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)

    metadata = {}

    def extract_metadata(obj, metadata):
        if isinstance(obj, dict):
            metadata['type'] = 'object'
            metadata['keys'] = {}
            for key, value in obj.items():
                metadata['keys'][key] = {}
                extract_metadata(value, metadata['keys'][key])
        elif isinstance(obj, list):
            metadata['type'] = 'array'
            metadata['length'] = len(obj)
            if obj:
                metadata['element_type'] = {}
                extract_metadata(obj[0], metadata['element_type'])
        else:
            metadata['type'] = type(obj).__name__

    extract_metadata(data, metadata)
    return metadata

def print_metadata(metadata, indent=0):
    for key, value in metadata.items():
        if key == 'keys':
            for subkey, subvalue in value.items():
                print(' ' * indent + f"{subkey}:")
                print_metadata(subvalue, indent + 2)
        else:
            if key == 'element_type':
                print(' ' * indent + f"{key}:")
                print_metadata(value, indent + 2)
            else:
                print(' ' * indent + f"{key}: {value}")

# Main script
if __name__ == "__main__":
    json_file_path = 'extracted_content/nexon_text.json'  # Replace with your JSON file path
    metadata = get_json_metadata(json_file_path)
    print_metadata(metadata)


type: object
filename:
  type: str
text:
  type: str


In [7]:
import json
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Function to clean text
def clean_text(text):
    # Remove unwanted characters and patterns
    text = re.sub(r'\\u[0-9a-fA-F]{4}', '', text)  # Remove unicode escape sequences
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)    # Remove non-alphanumeric characters
    text = re.sub(r'\s+', ' ', text)              # Remove extra whitespace
    return text.strip()

# Function to clean JSON data
def clean_json(data):
    if isinstance(data, dict):
        return {key: clean_json(value) for key, value in data.items()}
    elif isinstance(data, list):
        return [clean_json(item) for item in data]
    elif isinstance(data, str):
        return clean_text(data)
    else:
        return data

# Function to load, clean, and save JSON data
def clean_json_file(input_file, output_file):
    with open(input_file, 'r') as file:
        data = json.load(file)

    cleaned_data = clean_json(data)

    with open(output_file, 'w') as file:
        json.dump(cleaned_data, file, indent=4)

# Main script
if __name__ == "__main__":
    input_json_file = 'extracted_content/nexon_text.json'  # Replace with your JSON file path
    output_json_file = 'cleaned_sample.json'  # Output file path
    clean_json_file(input_json_file, output_json_file)
    print(f"Cleaned JSON data has been saved to {output_json_file}")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/varunbharadwaj/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Cleaned JSON data has been saved to cleaned_sample.json


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/varunbharadwaj/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
import json
import pinecone
from sentence_transformers import SentenceTransformer

In [9]:
# import json
# from sentence_transformers import SentenceTransformer
# from pinecone import Pinecone, ServerlessSpec

# # Your Pinecone API key
# api_key = 'aaf5852c-6e2e-4b0f-ad7a-6ad2d9172380'

# # Initialize your Pinecone client
# pc = Pinecone(api_key=api_key)

# spec = ServerlessSpec(cloud="aws", region="us-east-1")

# index_name = "extractive-question-answering"

# # Create the index if it does not exist
# if index_name not in pc.list_indexes():
#     pc.create_index(
#         name=index_name,
#         dimension=768,  # Specify the vector dimension
#         metric="cosine",  # Specify the distance metric
#         spec=spec
#     )

# # Load the transformer model
# model = SentenceTransformer('all-MiniLM-L6-v2')

# # Load the JSON file
# with open("cleaned_sample.json", "r") as file:
#     data = json.load(file)

# # Split the text into chunks of 512 words each
# text = data["text"]
# chunks = text.split(' ')
# chunks = [' '.join(chunks[i:i+512]) for i in range(0, len(chunks), 512)]

# # Convert each chunk to a vector and upsert it to the index
# for i, chunk in enumerate(chunks):
#     vector = model.encode([chunk])[0]
#     pc.upsert(index_name=index_name, items={f"{data['filename']}-{i}": vector})



AttributeError: 'Pinecone' object has no attribute 'upsert'

In [14]:
import json
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec

# Your Pinecone API key
api_key = 'aaf5852c-6e2e-4b0f-ad7a-6ad2d9172380'

# Initialize your Pinecone client
pc = Pinecone(api_key=api_key)

index_name = "extractive-question-answering"

# Check if the index exists and create it if it does not
if index_name not in pc.list_indexes().names():
    # Define the serverless spec with your desired cloud and region
    spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,  # Specify the vector dimension
        metric="cosine",  # Specify the distance metric
        spec=spec
    )

# Connect to the index
index = pc.Index(index_name)

# Load the transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Load the JSON file
with open("cleaned_sample.json", "r") as file:
    data = json.load(file)

# Split the text into chunks of 512 words each
text = data["text"]
chunks = text.split(' ')
chunks = [' '.join(chunks[i:i+512]) for i in range(0, len(chunks), 512)]

# Convert each chunk to a vector and upsert it to the index
for i, chunk in enumerate(chunks):
    vector = model.encode([chunk])[0]
    # Perform the upsert operation for each vector
    index.upsert(vectors=[(f"{data['filename']}-{i}", vector)])

In [51]:
query = "engine oil"
query_vector = model.encode([query])[0]
print("Query Vector:", query_vector.shape)
print("Query Vector:", query_vector.tolist())


Query Vector: (384,)
Query Vector: [-0.12369763851165771, 0.00024519747239537537, 0.0394088439643383, 0.055418793112039566, 0.032709624618291855, 0.03649448603391647, 0.0627523735165596, 0.029279259964823723, 0.03466429188847542, -0.14888142049312592, -0.048900119960308075, 0.011455107480287552, -0.07829247415065765, -0.09888282418251038, -0.054690729826688766, -0.030223030596971512, 0.02255075052380562, -0.00930424127727747, -0.07605338096618652, -0.12127833813428879, 0.027209168300032616, 0.07177136838436127, 0.03078954666852951, 0.03285842761397362, -0.08181358873844147, 0.13113415241241455, -0.040131863206624985, 0.008868648670613766, 0.0569901168346405, -0.07998310029506683, 0.0574692003428936, 0.004336763639003038, -0.04538670554757118, -0.03829221427440643, -0.05458361655473709, 0.015159028582274914, -0.02968692220747471, -0.0036104752216488123, 0.037903379648923874, 0.0475994236767292, 0.007575187366455793, -0.15454751253128052, -0.041298508644104004, -0.08913002908229828, 0.03

In [27]:
index.query(
    namespace="",
    # vector=[0.1,0.97,0.61,0.85,0.78,0.31,0.17,0.89,0.72,0.1,0.86,0.21,0.78,0.7,0.78,0.51,0.19,0.5,0.51,0.46,0.91,0.56,0.39,0.59,0.23,0.73,0.94,0.1,0.54,0.43,0.38,0.76,0.29,0.86,0.29,0,0.41,0.75,0.91,0.03,0.05,0.21,0.24,0.56,0.58,0.2,0.22,0.21,0.56,0.05,0.62,0.31,0.27,0.01,0.5,0.15,0.78,0.5,0.52,0.7,0.99,0.21,0.29,0.63,0.45,0.7,0.1,0.81,0.58,0.66,0.68,0.85,0.61,0.64,0.38,0.12,0.2,0.65,0.87,0.83,0.67,0.81,0.16,0.1,0.06,0.22,0.88,0.48,0.24,0.17,0.92,0.4,0.77,0.17,0.38,0.51,0.06,0.51,0.91,0.21,0.58,0.58,0.7,0.74,0.99,0.31,0.74,0.41,0.39,0.41,0.69,0.35,0.89,0.04,0.7,0.41,0.43,0.28,0.71,0.41,0.69,0.82,0.73,0.86,0.92,0.81,0.19,0.11,0.69,0.28,0.35,0.76,0.24,0.79,0.72,0.79,0.36,0.99,0.85,0.94,0.2,0.23,0.97,0.27,0.22,0.72,0.68,0.73,0.41,0.2,0.13,0.16,0.96,0.29,0.48,0.15,0.03,0.45,0.02,0.84,0.52,0.35,0.87,0.18,0.14,0.07,0.68,0.22,0.05,0.86,0.16,0.75,0.89,0.95,0.06,0.15,0.52,0.22,0.3,0.26,0.76,0.76,0.63,0.59,0.13,0.03,0.95,0.38,0.37,0.15,0.6,0.89,0.7,0.26,0.36,0.25,0.78,0.12,0.19,0.24,0.87,0.87,0.28,0.85,0.87,0.42,0.11,0.62,0.02,0.89,0.32,0.05,0.3,0.89,0.36,0.72,0.86,0.6,0.33,0.15,0.9,0.9,0.54,0.49,0.19,0.14,0.91,0.79,0.83,0.72,0.92,0.31,0.54,0.3,0.17,0.24,0.48,0.3,0.6,0.3,0.07,0.26,0.91,0.49,0.32,0.81,0.09,0.82,0.95,0.06,0.64,0.66,0.09,0.34,0.64,0.6,0.54,0.05,0.33,0.19,0.26,0.79,0.17,0.41,0.27,0.54,0.05,0.46,0.11,0.88,0.8,0.15,0.45,0.8,0,0.34,0.76,0.27,0.77,0.67,0.68,0.12,0.05,0.35,0.95,0.01,0.35,0.93,0.57,0.44,0.9,0.29,0.38,0.11,0.52,0.03,0.56,0.95,0.27,0.7,0.92,0.93,0.76,0.18,0.23,0.25,0.41,0.84,0.2,0.24,0.66,0.42,0.59,0.5,0.33,0.28,0.93,0.73,0.02,0.77,0.26,0.72,0.33,0.73,0.09,0.67,0.33,0.56,0.24,0.1,0.8,0.28,0,0.35,0.34,0.42,0.79,0.59,0.51,0.03,0.92,0.65,0.39,0.86,0.32,0.66,0.48,0.29,0.06,0.28,0.99,0.48,0.75,0.38,0.28,0.85,0.29,0.86,0.4,0.94,0.22,0.15,0.27,0.04,0.68,0.93,0.14,0.15,0.2,0.58,0.22,0.6,0.67,0.3,0.62,0.62,0.86,0.17,0.41,0.47,0.45,0.24,0.28,0.41],
    top_k=3,
    include_values=True
)

{'matches': [{'id': 'nexonpdf-8',
              'score': 0.0581221506,
              'values': [-0.0333085023,
                         0.0177984815,
                         -0.106937662,
                         0.0629196391,
                         0.118422508,
                         0.0318471566,
                         0.129805818,
                         0.0640248135,
                         0.132255584,
                         0.00575173879,
                         0.0765763,
                         0.0270085298,
                         0.0759308711,
                         -0.0174263529,
                         -0.0300199501,
                         -0.0148449307,
                         0.077429153,
                         -0.0045468295,
                         -0.106042668,
                         -0.0055792355,
                         0.0427955315,
                         0.0235253312,
                         -0.0325792,
                         0.0206393

In [53]:
results = index.query(
    namespace="",
    vector = query_vector.tolist(),
    top_k=3,
    include_values=True
)

In [54]:
print(results["matches"])
print(len(results["matches"]))

[{'id': 'nexonpdf-76',
 'score': 0.386117786,
 'values': [-0.0937722176,
            -0.0481791571,
            0.0665149242,
            -0.0271487813,
            0.0423074886,
            0.0629701465,
            0.0217664707,
            0.177896053,
            -0.0448348783,
            -0.101913162,
            -0.0625904,
            -0.0422153734,
            -0.0221663602,
            -0.00603500754,
            0.0210156869,
            0.00226205913,
            0.0506552979,
            -0.0711495578,
            -0.011653699,
            -0.10148304,
            0.128093958,
            -0.0428119972,
            0.0291101616,
            -0.00238122721,
            -0.0690969378,
            0.0620625652,
            0.0154730594,
            0.00743263587,
            0.0350228436,
            0.0222183354,
            -0.0328341,
            0.00139899796,
            -0.0284620095,
            -0.0366159193,
            -0.0310025252,
            0.000948492205,
    

In [55]:
for result in results['matches']:
    print(f"Vector ID: {result['id']}, Distance: {result['score']}")

Vector ID: nexonpdf-76, Distance: 0.386117786
Vector ID: nexonpdf-82, Distance: 0.344657838
Vector ID: nexonpdf-81, Distance: 0.297510713
