In [1]:
!pip install PyPDF2 pdfplumber pdf2image pycryptodome



In [2]:
import os
import json
from PyPDF2 import PdfReader
import pdfplumber
from pdf2image import convert_from_path

In [3]:
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    text = ""
    reader = PdfReader(pdf_path)
    for page in reader.pages:
        text += page.extract_text() or ''
    return text

In [4]:
# Function to extract tables from a PDF file using pdfplumber
def extract_tables_from_pdf(pdf_path):
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            extracted_tables = page.extract_tables()
            for table in extracted_tables:
                tables.append({
                    "page": page_num,
                    "table": table
                })
    return tables

In [5]:
def extract_images_from_pdf(pdf_path, output_directory):
    images = []
    image_dir = os.path.join(output_directory, os.path.splitext(os.path.basename(pdf_path))[0])
    os.makedirs(image_dir, exist_ok=True)

    pages = convert_from_path(pdf_path)
    for page_num, page in enumerate(pages, start=1):
        image_filename = f"page_{page_num}.jpg"  # Adjust extension as needed
        image_path = os.path.join(image_dir, image_filename)
        page.save(image_path)
        images.append({
            "page": page_num,
            "image_file": os.path.relpath(image_path, output_directory)
        })
    return images

In [6]:
def main():
    pdf_directory = "manuals"
    output_directory = "extracted_content"

    # Create output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # Iterate over PDF files in the directory
    for filename in os.listdir(pdf_directory):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_directory, filename)
            
            # Extract text
            text = extract_text_from_pdf(pdf_path)
            
            # Extract tables
            tables = extract_tables_from_pdf(pdf_path)
            
            # Save text to JSON file
            text_output_file = os.path.join(output_directory, f"{os.path.splitext(filename)[0]}_text.json")
            with open(text_output_file, "w", encoding="utf-8") as f:
                json.dump({"filename": filename, "text": text}, f, ensure_ascii=False, indent=4)
            print(f"Text extracted from {pdf_path} and saved to {text_output_file}")
            
            # Save tables to JSON file
            tables_output_file = os.path.join(output_directory, f"{os.path.splitext(filename)[0]}_tables.json")
            with open(tables_output_file, "w", encoding="utf-8") as f:
                json.dump({"filename": filename, "tables": tables}, f, ensure_ascii=False, indent=4)
            print(f"Tables extracted from {pdf_path} and saved to {tables_output_file}")
            
            # Extract and save images
            images = extract_images_from_pdf(pdf_path, output_directory)
            images_output_file = os.path.join(output_directory, f"{os.path.splitext(filename)[0]}_images.json")
            with open(images_output_file, "w", encoding="utf-8") as f:
                json.dump({"filename": filename, "images": images}, f, ensure_ascii=False, indent=4)
            print(f"Images extracted from {pdf_path} and saved to {images_output_file}")

if __name__ == "__main__":
    main()

Text extracted from manuals/nexon.pdf and saved to extracted_content/nexon_text.json
Tables extracted from manuals/nexon.pdf and saved to extracted_content/nexon_tables.json
Images extracted from manuals/nexon.pdf and saved to extracted_content/nexon_images.json
Text extracted from manuals/Verna.pdf and saved to extracted_content/Verna_text.json
Tables extracted from manuals/Verna.pdf and saved to extracted_content/Verna_tables.json
Images extracted from manuals/Verna.pdf and saved to extracted_content/Verna_images.json
Text extracted from manuals/exter.pdf and saved to extracted_content/exter_text.json
Tables extracted from manuals/exter.pdf and saved to extracted_content/exter_tables.json
Images extracted from manuals/exter.pdf and saved to extracted_content/exter_images.json
Text extracted from manuals/punch.pdf and saved to extracted_content/punch_text.json
Tables extracted from manuals/punch.pdf and saved to extracted_content/punch_tables.json
Images extracted from manuals/punch.p

In [4]:
import json

def get_json_metadata(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)

    metadata = {}

    def extract_metadata(obj, metadata):
        if isinstance(obj, dict):
            metadata['type'] = 'object'
            metadata['keys'] = {}
            for key, value in obj.items():
                metadata['keys'][key] = {}
                extract_metadata(value, metadata['keys'][key])
        elif isinstance(obj, list):
            metadata['type'] = 'array'
            metadata['length'] = len(obj)
            if obj:
                metadata['element_type'] = {}
                extract_metadata(obj[0], metadata['element_type'])
        else:
            metadata['type'] = type(obj).__name__

    extract_metadata(data, metadata)
    return metadata

def print_metadata(metadata, indent=0):
    for key, value in metadata.items():
        if key == 'keys':
            for subkey, subvalue in value.items():
                print(' ' * indent + f"{subkey}:")
                print_metadata(subvalue, indent + 2)
        else:
            if key == 'element_type':
                print(' ' * indent + f"{key}:")
                print_metadata(value, indent + 2)
            else:
                print(' ' * indent + f"{key}: {value}")

# Main script
if __name__ == "__main__":
    json_file_path = 'extracted_content/nexon_text.json'  # Replace with your JSON file path
    metadata = get_json_metadata(json_file_path)
    print_metadata(metadata)


type: object
filename:
  type: str
text:
  type: str


In [7]:
import json
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Function to clean text
def clean_text(text):
    # Remove unwanted characters and patterns
    text = re.sub(r'\\u[0-9a-fA-F]{4}', '', text)  # Remove unicode escape sequences
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)    # Remove non-alphanumeric characters
    text = re.sub(r'\s+', ' ', text)              # Remove extra whitespace
    return text.strip()

# Function to clean JSON data
def clean_json(data):
    if isinstance(data, dict):
        return {key: clean_json(value) for key, value in data.items()}
    elif isinstance(data, list):
        return [clean_json(item) for item in data]
    elif isinstance(data, str):
        return clean_text(data)
    else:
        return data

# Function to load, clean, and save JSON data
def clean_json_file(input_file, output_file):
    with open(input_file, 'r') as file:
        data = json.load(file)

    cleaned_data = clean_json(data)

    with open(output_file, 'w') as file:
        json.dump(cleaned_data, file, indent=4)

# Main script
if __name__ == "__main__":
    input_json_file = 'extracted_content/nexon_text.json'  # Replace with your JSON file path
    output_json_file = 'cleaned_sample.json'  # Output file path
    clean_json_file(input_json_file, output_json_file)
    print(f"Cleaned JSON data has been saved to {output_json_file}")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/varunbharadwaj/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Cleaned JSON data has been saved to cleaned_sample.json


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/varunbharadwaj/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
import json
import pinecone
from sentence_transformers import SentenceTransformer

In [9]:
# import json
# from sentence_transformers import SentenceTransformer
# from pinecone import Pinecone, ServerlessSpec

# # Your Pinecone API key
# api_key = 'aaf5852c-6e2e-4b0f-ad7a-6ad2d9172380'

# # Initialize your Pinecone client
# pc = Pinecone(api_key=api_key)

# spec = ServerlessSpec(cloud="aws", region="us-east-1")

# index_name = "extractive-question-answering"

# # Create the index if it does not exist
# if index_name not in pc.list_indexes():
#     pc.create_index(
#         name=index_name,
#         dimension=768,  # Specify the vector dimension
#         metric="cosine",  # Specify the distance metric
#         spec=spec
#     )

# # Load the transformer model
# model = SentenceTransformer('all-MiniLM-L6-v2')

# # Load the JSON file
# with open("cleaned_sample.json", "r") as file:
#     data = json.load(file)

# # Split the text into chunks of 512 words each
# text = data["text"]
# chunks = text.split(' ')
# chunks = [' '.join(chunks[i:i+512]) for i in range(0, len(chunks), 512)]

# # Convert each chunk to a vector and upsert it to the index
# for i, chunk in enumerate(chunks):
#     vector = model.encode([chunk])[0]
#     pc.upsert(index_name=index_name, items={f"{data['filename']}-{i}": vector})



AttributeError: 'Pinecone' object has no attribute 'upsert'

In [14]:
import json
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec

# Your Pinecone API key
api_key = 'aaf5852c-6e2e-4b0f-ad7a-6ad2d9172380'

# Initialize your Pinecone client
pc = Pinecone(api_key=api_key)

index_name = "extractive-question-answering"

# Check if the index exists and create it if it does not
if index_name not in pc.list_indexes().names():
    # Define the serverless spec with your desired cloud and region
    spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,  # Specify the vector dimension
        metric="cosine",  # Specify the distance metric
        spec=spec
    )

# Connect to the index
index = pc.Index(index_name)

# Load the transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Load the JSON file
with open("cleaned_sample.json", "r") as file:
    data = json.load(file)

# Split the text into chunks of 512 words each
text = data["text"]
chunks = text.split(' ')
chunks = [' '.join(chunks[i:i+512]) for i in range(0, len(chunks), 512)]

# Convert each chunk to a vector and upsert it to the index
for i, chunk in enumerate(chunks):
    vector = model.encode([chunk])[0]
    # Perform the upsert operation for each vector
    index.upsert(vectors=[(f"{data['filename']}-{i}", vector)])

Query Vector: (384,)
Query Vector: [-0.12369763851165771, 0.00024519747239537537, 0.0394088439643383, 0.055418793112039566, 0.032709624618291855, 0.03649448603391647, 0.0627523735165596, 0.029279259964823723, 0.03466429188847542, -0.14888142049312592, -0.048900119960308075, 0.011455107480287552, -0.07829247415065765, -0.09888282418251038, -0.054690729826688766, -0.030223030596971512, 0.02255075052380562, -0.00930424127727747, -0.07605338096618652, -0.12127833813428879, 0.027209168300032616, 0.07177136838436127, 0.03078954666852951, 0.03285842761397362, -0.08181358873844147, 0.13113415241241455, -0.040131863206624985, 0.008868648670613766, 0.0569901168346405, -0.07998310029506683, 0.0574692003428936, 0.004336763639003038, -0.04538670554757118, -0.03829221427440643, -0.05458361655473709, 0.015159028582274914, -0.02968692220747471, -0.0036104752216488123, 0.037903379648923874, 0.0475994236767292, 0.007575187366455793, -0.15454751253128052, -0.041298508644104004, -0.08913002908229828, 0.03

In [57]:
query = "engine oil"
query_vector = model.encode([query])[0]
print("Query Vector:", query_vector.shape)
print("Query Vector:", query_vector.tolist())

results = index.query(
    namespace="",
    vector = query_vector.tolist(),
    top_k=3,
    include_values=True
)

print(results["matches"])
print(len(results["matches"]))


for result in results['matches']:
    print(f"Vector ID: {result['id']}, Distance: {result['score']}")

Query Vector: (384,)
Query Vector: [-0.12369763851165771, 0.00024519747239537537, 0.0394088439643383, 0.055418793112039566, 0.032709624618291855, 0.03649448603391647, 0.0627523735165596, 0.029279259964823723, 0.03466429188847542, -0.14888142049312592, -0.048900119960308075, 0.011455107480287552, -0.07829247415065765, -0.09888282418251038, -0.054690729826688766, -0.030223030596971512, 0.02255075052380562, -0.00930424127727747, -0.07605338096618652, -0.12127833813428879, 0.027209168300032616, 0.07177136838436127, 0.03078954666852951, 0.03285842761397362, -0.08181358873844147, 0.13113415241241455, -0.040131863206624985, 0.008868648670613766, 0.0569901168346405, -0.07998310029506683, 0.0574692003428936, 0.004336763639003038, -0.04538670554757118, -0.03829221427440643, -0.05458361655473709, 0.015159028582274914, -0.02968692220747471, -0.0036104752216488123, 0.037903379648923874, 0.0475994236767292, 0.007575187366455793, -0.15454751253128052, -0.041298508644104004, -0.08913002908229828, 0.03

In [63]:
import json
import torch
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm

# Your Pinecone API key
api_key = 'aaf5852c-6e2e-4b0f-ad7a-6ad2d9172380'

# Initialize your Pinecone client
pc = Pinecone(api_key=api_key)

index_name = "nexon-qa-index"

# Check if the index exists and create it if it does not
if index_name not in pc.list_indexes().names():
    # Define the serverless spec with your desired cloud and region
    spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,  # Specify the vector dimension
        metric="cosine",  # Specify the distance metric
        spec=spec
    )

# Connect to the index
index = pc.Index(index_name)

# Set device to GPU if available
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(device)

# Load the transformer model
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1', device=device)

# Load the JSON file
with open("cleaned_sample.json", "r") as file:
    data = json.load(file)

# Split the text into chunks of 512 words each
text = data["text"]
chunks = text.split(' ')
chunks = [' '.join(chunks[i:i+512]) for i in range(0, len(chunks), 512)]

# We will use batches of 64
batch_size = 64

for i in tqdm(range(0, len(chunks), batch_size)):
    # Find end of batch
    i_end = min(i+batch_size, len(chunks))
    # Extract batch
    batch = chunks[i:i_end]
    # Generate embeddings for batch
    emb = model.encode(batch).tolist()
    # Create unique IDs
    ids = [f"{data['filename']}-{idx}" for idx in range(i, i_end)]
    # Add all to upsert list
    to_upsert = list(zip(ids, emb))
    # Upsert/insert these records to pinecone
    _ = index.upsert(vectors=to_upsert)

# Check that we have all vectors in index
index.describe_index_stats()

mps




  0%|          | 0/2 [00:00<?, ?it/s]

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [62]:
from transformers import pipeline
from pprint import pprint

# Load the reader model into a question-answering pipeline
model_name = 'deepset/electra-base-squad2'
reader = pipeline(tokenizer=model_name, model=model_name, task='question-answering', device=device)

# Function to get context passages from the Pinecone index
def get_context(question, top_k):
    # Generate embeddings for the question
    xq = model.encode([question]).tolist()
    print(xq)
    print(len(xq))
    # Search Pinecone index for context passage with the answer
    xc = index.query(queries=xq, top_k=top_k)
    # Extract the context passage from Pinecone search result
    c = [x.metadata['context'] for x in xc.results]
    return c

# Function to extract answer from the context passage
def extract_answer(question, context):
    results = []
    for c in context:
        # Feed the reader the question and contexts to extract answers
        answer = reader(question=question, context=c)
        # Add the context to answer dict for printing both together
        answer["context"] = c
        results.append(answer)
    # Sort the result based on the score from reader model
    sorted_result = sorted(results, key=lambda x: x['score'], reverse=True)
    return sorted_result

# Test the system with a question
question = "How much oil is Egypt producing in a day?"
context = get_context(question, top_k = 1)
answers = extract_answer(question, context)
pprint(answers)

[[0.01111477892845869, 0.11207541078329086, -0.015426329337060452, -0.006429098546504974, 0.005341485608369112, -0.099352166056633, 0.029795169830322266, -0.044964663684368134, -0.03979458287358284, 0.00988997146487236, 0.02483304776251316, -0.10342174023389816, -0.006280284840613604, -0.03984164446592331, -0.007921525277197361, 0.0035200088750571012, -0.012359962798655033, -0.05668162554502487, 0.009998122230172157, -0.042927421629428864, 0.028049813583493233, 0.01786077953875065, -0.02239406481385231, 0.012631033547222614, 0.035489823669195175, 0.04879404604434967, -0.0004921272629871964, -0.059247419238090515, 0.02665776200592518, 0.02488497458398342, -0.06549574434757233, -0.017194539308547974, 0.03774109482765198, -0.011165713891386986, -0.015685342252254486, 0.024127695709466934, -0.037603821605443954, -0.017420640215277672, 0.10302082449197769, 0.08294710516929626, 0.060143787413835526, -0.08198979496955872, 0.05081189051270485, -0.06554122269153595, -0.049079231917858124, 0.032

PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Sat, 18 May 2024 12:27:46 GMT', 'Content-Type': 'text/plain', 'Content-Length': '89', 'Connection': 'keep-alive', 'server': 'envoy'})
HTTP response body: queries[383]: invalid value 0.014012962579727173 for type type.googleapis.com/QueryVector


In [65]:
import json
import torch
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm

# Your Pinecone API key
api_key = 'aaf5852c-6e2e-4b0f-ad7a-6ad2d9172380'

# Initialize your Pinecone client
pc = Pinecone(api_key=api_key)

index_name = "nexon-qa-index"

# Check if the index exists and create it if it does not
if index_name not in pc.list_indexes().names():
    # Define the serverless spec with your desired cloud and region
    spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,  # Specify the vector dimension
        metric="cosine",  # Specify the distance metric
        spec=spec
    )

# Connect to the index
index = pc.Index(index_name)

# Set device to GPU if available
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(device)

# Load the transformer model
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1', device=device)

# Load the JSON file
with open("cleaned_sample.json", "r") as file:
    data = json.load(file)

# Split the text into chunks of 512 words each
text = data["text"]
chunks = text.split(' ')
chunks = [' '.join(chunks[i:i+512]) for i in range(0, len(chunks), 512)]

# We will use batches of 64
batch_size = 64

for i in tqdm(range(0, len(chunks), batch_size)):
    # Find end of batch
    i_end = min(i+batch_size, len(chunks))
    # Extract batch
    batch = chunks[i:i_end]
    # Generate embeddings for batch
    emb = model.encode(batch).tolist()
    # Create unique IDs
    ids = [f"{data['filename']}-{idx}" for idx in range(i, i_end)]
    # Add all to upsert list
    to_upsert = list(zip(ids, emb))
    # Print embeddings and IDs
    print(to_upsert)
    # Upsert/insert these records to pinecone
    result = index.upsert(vectors=to_upsert)
    # Print result of upsert operation
    print(result)

# Check that we have all vectors in index
index.describe_index_stats()

mps


  0%|          | 0/2 [00:00<?, ?it/s]

[('nexonpdf-0', [-0.07914534956216812, 1.8386625015409663e-05, 0.010439461097121239, 0.007447647396475077, -0.019995741546154022, 0.028304290026426315, 0.016936330124735832, -0.0012372304918244481, 0.008347122929990292, -0.038205359131097794, 0.03558303043246269, 0.01713426224887371, 0.03411044925451279, 0.03500239923596382, -0.0006287000724114478, -0.01410872582346201, 0.041826073080301285, -0.004244827665388584, -0.05564126744866371, -0.036406826227903366, 0.011903936974704266, 0.045460354536771774, -0.009962860494852066, 0.009977028705179691, -0.050309598445892334, -0.05141221359372139, 0.06116935983300209, 0.05667686089873314, -0.07236669957637787, -0.03663066402077675, 0.041983865201473236, 0.0723474845290184, -0.039169639348983765, 0.02208758518099785, 0.024270247668027878, -0.028409065678715706, -0.04265609011054039, 0.027052998542785645, 0.026621181517839432, -0.10489063709974289, -0.015059598721563816, -0.02539011649787426, -0.032171256840229034, 0.013753583654761314, -0.03026

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [None]:
query = "engine oil"
query_vector = model.encode([query])[0]
# print("Query Vector:", query_vector.shape)
# print("Query Vector:", query_vector.tolist())

results = index.query(
    namespace="",
    vector = query_vector.tolist(),
    top_k=3,
    include_values=True
)

print(results)

In [77]:
def get_context(question, top_k):
    # Generate embeddings for the question
    xq= model.encode([query])[0].tolist()

    # Search Pinecone index for context passage with the answer
    xc = index.query(namespace="", vector=xq, top_k=top_k, include_values=True)
    # Extract the context passage from Pinecone search result
    # c = [index.fetch(ids=[x.id]).values()[0] for x in xc.results]
    c = [x['values'] for x in xc['matches']]
    return c

In [83]:
get_context("engine oil", top_k=1)

[[-0.0374343731,
  0.0661880895,
  0.0227364972,
  -0.0643775314,
  0.0963629857,
  0.0585226379,
  -0.019275682,
  0.157973334,
  -0.0331987068,
  -0.0619163252,
  -0.0346189849,
  0.0398934446,
  0.00487361802,
  -0.0257654674,
  -0.00230638683,
  0.0173990559,
  0.0948727801,
  -0.0050230748,
  -0.101118334,
  -0.107591562,
  0.0702061877,
  -0.0376188233,
  0.000952682516,
  -0.0104021765,
  -0.0948222503,
  0.0825335234,
  -0.0301556606,
  0.0835483372,
  -0.0355252028,
  0.0433645919,
  -0.0341373906,
  0.0617017895,
  -0.0968979374,
  0.0536033362,
  -0.0120487558,
  -0.0529374033,
  -0.0878597,
  -0.0364907086,
  0.027466381,
  -0.0177395456,
  -0.0275824517,
  -0.135729849,
  -0.0337304063,
  -5.43890783e-05,
  0.080215767,
  0.0339389667,
  0.0396496318,
  -0.0332247727,
  0.0429098681,
  -0.0835897326,
  -0.0302931722,
  0.0385701843,
  0.0108266138,
  0.0393598154,
  -0.0311733931,
  -0.0706491619,
  -0.0581227392,
  -0.0451001264,
  -0.0190989636,
  -0.0323826447,
  -0.032

In [84]:
from tqdm.auto import tqdm
import pandas as pd

# Load the transformer model
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1', device=device)

# Load the JSON file
with open("cleaned_sample.json", "r") as file:
    data = json.load(file)

# Convert the data to a DataFrame
df = pd.DataFrame(data)



ValueError: If using all scalar values, you must pass an index