In [4]:
# Import necessary libraries
import warnings
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.md import partition_md
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import dict_to_elements
import langchain, groq, fastembed, qdrant_client, unstructured
import unstructured.partition
%reload_ext watermark
%watermark -a "Izzet Turkalp Akbasli" -vmp langchain,openai

# Suppress all warnings for cleaner output
warnings.filterwarnings('ignore')

# Display version information for imported libraries
%watermark --iversions

# Specify the path to your PDF file
filename = "/home/turkalp/Desktop/POKEMON_RAG/data/data_for_tutorials/redbook.pdf"
#path = "images"  # Uncomment and specify path if you want to extract images

# Partition the PDF into its raw document elements (titles, text, tables, ...).
#
# Chunking is deliberately NOT requested here. The later cells inspect the element
# types and then call chunk_by_title on `pdf_elements` themselves. Passing
# chunking_strategy="by_title" to partition_pdf as well would make `pdf_elements`
# a list of already-built chunks, so the type counts would describe chunks instead
# of document elements and chunk_by_title would re-chunk chunks whose title
# boundaries are already gone.
pdf_elements = partition_pdf(
    filename=filename,                                    # Path to the PDF file
    strategy="hi_res",                                    # Strategy for extraction, "hi_res" for high resolution
    extract_images_in_pdf=False,                          # Set to True to extract images from PDF
    #extract_image_block_types=["Image", "Table"],         # optional
    hi_res_model_name="yolox",                            # Model name for high resolution extraction
    infer_table_structure=True,                           # Set to True to infer the structure of tables
    #extract_image_block_output_dir=path,                 # Uncomment and specify path to save extracted image blocks
)



Author: Izzet Turkalp Akbasli

Python implementation: CPython
Python version       : 3.9.19
IPython version      : 8.12.0

langchain: 0.2.5
openai   : 1.34.0

Compiler    : GCC 12.3.0
OS          : Linux
Release     : 6.5.0-35-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 24
Architecture: 64bit

unstructured : 0.14.6
fastembed    : 0.3.1
langchain    : 0.2.5
qdrant_client: 1.9.1
groq         : 0.9.0



In [3]:
# Tally how many extracted elements there are of each Python type
category_counts = {}
for element in pdf_elements:
    # The key is the string form of the element's class
    category = str(type(element))
    # Start from 0 the first time a type is seen, then add one
    category_counts[category] = category_counts.get(category, 0) + 1

# Distinct type names that were tallied above
unique_categories = set(category_counts)

# Show the per-type tallies
print(category_counts)

# Dictionary form of every extracted element
element_dict = [el.to_dict() for el in pdf_elements]

# Distinct values of the 'type' field across the element dictionaries
unique_types = {item['type'] for item in element_dict}

# Show the distinct 'type' values
print(unique_types)


NameError: name 'pdf_elements' is not defined

In [None]:
# Import the chunk_by_title function from the unstructured.chunking.title module
from unstructured.chunking.title import chunk_by_title

# Explicit limits for the first chunking run:
# - max_characters: upper bound on the number of characters in a single chunk
# - combine_text_under_n_chars: text pieces shorter than 200 characters are
#   combined with adjacent text instead of becoming chunks of their own
chunking_kwargs = {
    "max_characters": 3000,
    "combine_text_under_n_chars": 200,
}

# First run: chunk the PDF elements by title using the explicit limits above
chunk_elements = chunk_by_title(pdf_elements, **chunking_kwargs)

# Report how many chunks the explicit limits produced
print(len(chunk_elements))

# Second run: chunk the same elements by title with the library's default arguments.
# Change the arguments here to compare different chunking results.
elements = chunk_by_title(pdf_elements)

# Report how many chunks the default arguments produced
print(len(elements))

# Summary of the arguments used in this cell:
# - pdf_elements: the list of elements extracted from the PDF, passed positionally.
# - combine_text_under_n_chars: minimum size for a stand-alone text piece; shorter
#   pieces are combined with adjacent text until the total exceeds this value.
# - max_characters: maximum number of characters per chunk; text that would exceed
#   it is split into smaller chunks.


In [None]:
import os
from langchain_core.documents import Document
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Qdrant
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Get API keys and URLs from environment variables
# (os.getenv returns None for any variable that is not set)
qdrant_url = os.getenv("QDRANT_URL")
qdrant_api_key = os.getenv("QDRANT_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")

# Initialize an empty list to store document objects
documents = []

# Convert each chunked element to a Document object with its metadata
for element in elements:
    # Convert element metadata to a plain dictionary
    metadata = element.metadata.to_dict()
    # Drop the 'languages' field if present. pop() with a default is used instead
    # of `del` so an element whose metadata has no 'languages' entry does not
    # abort the whole loop with a KeyError.
    metadata.pop("languages", None)
    # Mirror the originating file name under 'source'; skipped when the
    # metadata carries no 'filename' entry rather than raising a KeyError.
    if "filename" in metadata:
        metadata["source"] = metadata["filename"]
    # Create a Document object with text content and metadata
    documents.append(Document(page_content=element.text, metadata=metadata))

# Initialize the FastEmbedEmbeddings object for embedding generation
embeddings = FastEmbedEmbeddings()

# Create a Qdrant vector store from the documents with embeddings
# This step can take some time depending on the number of documents and their size
vectorstore = Qdrant.from_documents(
    documents=documents,          # List of documents to be indexed
    embedding=embeddings,         # Embedding model to use
    url=qdrant_url,               # URL for the Qdrant instance
    collection_name="redbook",        # Name of the Qdrant collection
    api_key=qdrant_api_key        # API key for authentication
)

# Create a retriever object from the vector store for querying
retriever = vectorstore.as_retriever(
    search_type="similarity",     # Type of search to perform, here it's similarity search
    search_kwargs={"k": 3}        # Additional search parameters, 'k' specifies the number of top results to return
)

# Argument Explanations:
# - documents: A list of Document objects, each containing the text and metadata to be indexed.
# - embedding: An instance of the embedding model used to convert text into vector representations.
# - url: The URL of the Qdrant instance where the documents will be indexed.
# - collection_name: The name of the collection in Qdrant where the documents will be stored.
# - api_key: The API key for authenticating with the Qdrant instance.
# - search_type: The type of search to perform, in this case, a similarity search to find similar documents.
# - search_kwargs: Additional keyword arguments for the search, 'k' specifies the number of similar results to return.
