In [7]:
# Import necessary libraries
import warnings
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.md import partition_md
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import dict_to_elements
import langchain, groq, fastembed, qdrant_client, unstructured
import unstructured.partition
%reload_ext watermark
%watermark -a "Izzet Turkalp Akbasli" -vmp langchain,openai

# Suppress all warnings for cleaner output
warnings.filterwarnings('ignore')

# Display version information for imported libraries
%watermark --iversions

# Specify the path to your PDF file
filename = "/home/turkalp/Desktop/POKEMON_RAG/data/data_for_tutorials/redbook.pdf"
#path = "images"  # Uncomment and specify path if you want to extract images

# Extract images, tables, and chunk text from the PDF
pdf_elements = partition_pdf(
    filename=filename,                                    # Path to the PDF file
    strategy="hi_res",                                    # Strategy for extraction, "hi_res" for high resolution
    extract_images_in_pdf=False,                          # Set to True to extract images from PDF
    #extract_image_block_types=["Image", "Table"],         # optional    
    hi_res_model_name="yolox",                            # Model name for high resolution extraction
    infer_table_structure=False,                           # Set to True to infer the structure of tables
    chunking_strategy="by_title",                         # Strategy to chunk text, here by titles
    max_characters=3000,                                  # Maximum characters per chunk
    #new_after_n_chars=2000,                              # Uncomment to start new chunk after specified characters
    combine_text_under_n_chars=200,                       # Combine text chunks if under specified characters
    #extract_image_block_output_dir=path,                 # Uncomment and specify path to save extracted image blocks
)



Author: Izzet Turkalp Akbasli

Python implementation: CPython
Python version       : 3.9.19
IPython version      : 8.12.0

langchain: 0.2.5
openai   : 1.34.0

Compiler    : GCC 12.3.0
OS          : Linux
Release     : 6.5.0-35-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 24
Architecture: 64bit

unstructured : 0.14.6
fastembed    : 0.3.1
langchain    : 0.2.5
qdrant_client: 1.9.1
groq         : 0.9.0



In [8]:
# Create a dictionary to store counts of each type of PDF element
category_counts = {}

# Iterate through each element in the extracted PDF elements
for element in pdf_elements:
    # Get the type of the element as a string
    category = str(type(element))
    # Increment the count if the category already exists in the dictionary
    if category in category_counts:
        category_counts[category] += 1
    # Add the category to the dictionary with an initial count of 1 if it does not exist
    else:
        category_counts[category] = 1

# Create a set of unique categories from the keys of the category_counts dictionary
unique_categories = set(category_counts.keys())

# Output the category counts
print(category_counts)

# Convert each PDF element to a dictionary and store in a list
element_dict = [el.to_dict() for el in pdf_elements]

# Create a set to store unique types of elements
unique_types = set()

# Iterate through each dictionary element
for item in element_dict:
    # Add the 'type' of each element to the unique_types set
    unique_types.add(item['type'])

# Output the set of unique types
print(unique_types)


{"<class 'unstructured.documents.elements.CompositeElement'>": 2460, "<class 'unstructured.documents.elements.Table'>": 284, "<class 'unstructured.documents.elements.TableChunk'>": 12}
{'CompositeElement', 'Table'}


In [9]:
# Import the chunk_by_title function from the unstructured.chunking.title module
from unstructured.chunking.title import chunk_by_title

# Chunk the PDF elements based on their titles with specific arguments
chunk_elements = chunk_by_title(
    pdf_elements,                   # List of elements to be chunked
    combine_text_under_n_chars=200,  # Combine text elements that are under 20 characters
    max_characters=3000             # Maximum number of characters allowed in a single chunk
)

# Print the number of chunked elements
print(len(chunk_elements))

# Chunk the PDF elements based on their titles with default arguments
elements = chunk_by_title(pdf_elements)  # You can modify the arguments to see different chunking results

# Print the number of elements after chunking with default arguments
print(len(elements))

# Argument Explanations:
# - pdf_elements: This is the list of elements extracted from the PDF.
# - combine_text_under_n_chars: This argument specifies the minimum number of characters for individual text elements. 
#   If a text element has fewer characters than this value, it will be combined with adjacent text elements until the total exceeds this value.
# - max_characters: This sets the maximum number of characters for each chunk. If a chunk exceeds this limit, it will be split into smaller chunks.


2045
10384


In [10]:
import os
from langchain_core.documents import Document
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Qdrant
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()    

# Get API keys and URLs from environment variables
qdrant_url = os.getenv("QDRANT_URL")
qdrant_api_key = os.getenv("QDRANT_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")

# Initialize an empty list to store document objects
documents = []

# Convert each element to a Document object with necessary metadata
for element in elements:
    # Convert element metadata to a dictionary
    metadata = element.metadata.to_dict()
    # Remove unnecessary 'languages' field from metadata
    del metadata["languages"]
    # Set the source to the filename from metadata
    metadata["source"] = metadata["filename"]
    # Create a Document object with text content and metadata
    documents.append(Document(page_content=element.text, metadata=metadata))

# Initialize the FastEmbedEmbeddings object for embedding generation
embeddings = FastEmbedEmbeddings()

# Create a Qdrant vector store from the documents with embeddings
# This step can take some time depending on the number of documents and their size
vectorstore = Qdrant.from_documents(
    documents=documents,          # List of documents to be indexed
    embedding=embeddings,         # Embedding model to use
    url=qdrant_url,               # URL for the Qdrant instance
    collection_name="redbook",        # Name of the Qdrant collection
    api_key=qdrant_api_key        # API key for authentication
)

# Create a retriever object from the vector store for querying
retriever = vectorstore.as_retriever(
    search_type="similarity",     # Type of search to perform, here it's similarity search
    search_kwargs={"k": 3}        # Additional search parameters, 'k' specifies the number of top results to return
)

# Argument Explanations:
# - documents: A list of Document objects, each containing the text and metadata to be indexed.
# - embedding: An instance of the embedding model used to convert text into vector representations.
# - url: The URL of the Qdrant instance where the documents will be indexed.
# - collection_name: The name of the collection in Qdrant where the documents will be stored.
# - api_key: The API key for authenticating with the Qdrant instance.
# - search_type: The type of search to perform, in this case, a similarity search to find similar documents.
# - search_kwargs: Additional keyword arguments for the search, 'k' specifies the number of similar results to return.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

model_optimized.onnx:   0%|          | 0.00/66.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/706 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

In [13]:
import os
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain_core.runnables import RunnableSequence
from langchain.chains.history_aware_retriever import create_history_aware_retriever
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.vectorstores import Qdrant
from langchain.embeddings import FastEmbedEmbeddings
from langchain.chains import LLMChain
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Get API keys and URLs from environment variables
qdrant_url = os.getenv("QDRANT_URL")
qdrant_api_key = os.getenv("QDRANT_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")

# Define the prompt template with an additional instruction
template = """You are a medical AI assistant specializing in infectious diseases and public health in children. You will receive excerpts from the Red Book along with related questions. 
Respond conversationally and in language similar to the question. If you're unsure of the answer, simply say, "Hmm, I'm not sure," without attempting to provide incorrect information.
Question: {input}
=========
{context}
=========
Answer in Markdown:"""

# Create a PromptTemplate object with the defined template
prompt = PromptTemplate(template=template, input_variables=["input", "context"])

# Initialize the language model with specific parameters
llm = ChatGroq(temperature=0, model_name="llama3-8b-8192")

# Create the document chain using the language model and the map-reduce strategy
doc_chain = load_qa_with_sources_chain(llm, chain_type="map_reduce")

# Create embeddings using FastEmbedEmbeddings
embeddings = FastEmbedEmbeddings()

# Set up the Qdrant vector store to connect to the existing collection
vectorstore = Qdrant.from_existing_collection(
    embedding=embeddings,         # Embedding model to use
    url=qdrant_url,               # URL for the Qdrant instance
    collection_name="redbook",        # Name of the existing Qdrant collection
    api_key=qdrant_api_key,       # API key for authentication
    path=None                     # Path to the existing collection, set to None if not needed
)

# Create a retriever object from the vector store for querying
retriever = vectorstore.as_retriever(
    search_type="similarity",     # Type of search to perform, here it's similarity search
    search_kwargs={"k": 3}        # Additional search parameters, 'k' specifies the number of top results to return
)

# Create the history-aware retriever for context-aware querying
history_aware_retriever = create_history_aware_retriever(
    llm=RunnableSequence(prompt | llm),  # Language model sequence combining prompt and LLM
    retriever=retriever,                 # Retriever for fetching relevant documents
    prompt=prompt                        # Prompt template for the retriever
)

# Create the LLM chain for generating answers
qa_chain = LLMChain(
    llm=llm,                # Language model to use for generating answers
    prompt=prompt           # Prompt template to guide the answer generation
)

# Argument Explanations:
# - template: The text template that defines how the AI should respond, with placeholders for input and context.
# - input_variables: Variables that will be replaced in the template (e.g., "input" and "context").
# - temperature: Parameter for the language model that controls the randomness of the output (0 means deterministic).
# - model_name: Specifies the model version of the language model.
# - embedding: The embedding model used to convert text into vector representations.
# - url: The URL of the Qdrant instance where the documents are stored.
# - collection_name: The name of the existing collection in Qdrant.
# - api_key: The API key for authenticating with the Qdrant instance.
# - search_type: The type of search to perform, in this case, a similarity search to find similar documents.
# - search_kwargs: Additional keyword arguments for the search, 'k' specifies the number of similar results to return.
# - llm: The language model used for generating responses and integrated into various chains.
# - retriever: The component used to fetch relevant documents from the vector store based on the query.
# - prompt: The prompt template guiding the AI on how to respond to questions.
# - RunnableSequence: Combines the prompt and the language model to create a sequence for the history-aware retriever.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [17]:
question = "What are the recommended concentrations of free chlorine and bromine for maintaining safe swimming pool water?"
context = """
                The AAP Red Book is a comprehensive guide for infectious disease guidelines in pediatric practice, covering immunizations, diseases, antimicrobial therapy, and management strategies. 
                Ask for specific recommendations from the Red Book on infectious diseases anytime!
           """
result = qa_chain.invoke({"input": question, "context": context, "chat_history": []})
print(result['text'])

**According to the Red Book, the recommended concentrations for maintaining safe swimming pool water are:**

* Free chlorine: 1-3 parts per million (ppm)
* Bromine: 3-5 ppm

These concentrations are recommended to effectively kill bacteria, viruses, and other microorganisms that can cause illnesses in children.


In [20]:
question = "Why are children aged 5 to 14 particularly susceptible to acute otitis externa (AOE) during recreational water activities?"
context = """
                The AAP Red Book is a comprehensive guide for infectious disease guidelines in pediatric practice, covering immunizations, diseases, antimicrobial therapy, and management strategies. 
                Ask for specific recommendations from the Red Book on infectious diseases anytime!
           """
result = qa_chain.invoke({"input": question, "context": context, "chat_history": []})
print(result['text'])

**According to the Red Book, children aged 5 to 14 are particularly susceptible to acute otitis externa (AOE) during recreational water activities because of the combination of factors that increase their risk.**

The Red Book notes that this age group is more likely to engage in water activities, such as swimming and diving, which can lead to water entering the ear canal and increasing the risk of AOE. Additionally, children in this age group may not have developed the necessary ear canal anatomy to effectively prevent water from entering the ear, making them more prone to AOE.

**It's also worth noting that the Red Book recommends that children with a history of AOE or ear canal abnormalities should avoid water activities that involve submerging their head underwater, and that parents and caregivers should take steps to prevent water from entering the ear canal, such as using earplugs or avoiding water activities when the ear canal is irritated or infected.**


In [21]:
question = "What are the typical symptoms associated with symptomatic intestinal amebiasis caused by Entamoeba histolytica?"
context = """
                The AAP Red Book is a comprehensive guide for infectious disease guidelines in pediatric practice, covering immunizations, diseases, antimicrobial therapy, and management strategies. 
                Ask for specific recommendations from the Red Book on infectious diseases anytime!
           """
result = qa_chain.invoke({"input": question, "context": context, "chat_history": []})
print(result['text'])

**Typical symptoms of symptomatic intestinal amebiasis caused by Entamoeba histolytica in children include:**

* Diarrhea, often bloody
* Abdominal pain, tenderness, and cramping
* Weight loss
* Fever
* Nausea and vomiting
* Abdominal distension
* Tenesmus (a feeling of rectal urgency)

These symptoms can range from mild to severe and may be accompanied by other signs of infection, such as anemia, malnutrition, and growth retardation.


In [22]:
question = "How is Entamoeba histolytica transmitted to humans, and what factors influence its prevalence in different populations?"
context = """
                The AAP Red Book is a comprehensive guide for infectious disease guidelines in pediatric practice, covering immunizations, diseases, antimicrobial therapy, and management strategies. 
                Ask for specific recommendations from the Red Book on infectious diseases anytime!
           """
result = qa_chain.invoke({"input": question, "context": context, "chat_history": []})
print(result['text'])

**Transmission and Prevalence of Entamoeba histolytica**

According to the Red Book, Entamoeba histolytica is typically transmitted to humans through the fecal-oral route, where contaminated food, water, or surfaces are ingested. This can occur through direct contact with an infected person's feces, contaminated food or water, or through poor hygiene practices.

The prevalence of Entamoeba histolytica infection varies greatly depending on factors such as geographic location, socioeconomic status, and access to clean water and sanitation. The Red Book notes that the infection is more common in developing countries with poor sanitation and hygiene practices, as well as in areas with high population density and crowding.

In addition, the Red Book highlights that certain populations, such as children under the age of 5, are more susceptible to infection due to their immature immune systems and increased exposure to contaminated environments.

Overall, it's essential to emphasize the impor