# chunking techniques in RAG

## Text extraction research

### Spire pdf python package

In [1]:
from spire.pdf.common import *
from spire.pdf import *

def extract_text_from_pdf(file_path, output_file):
    # Load a PDF document
    doc = PdfDocument()
    doc.LoadFromFile(file_path)

    extracted_text = []

    # Iterate over the pages of the document
    for i in range(doc.Pages.Count):
        page = doc.Pages.get_Item(i)
        # Extract the text from the page
        textExtractor = PdfTextExtractor(page)
        option = PdfTextExtractOptions()        
        text = textExtractor.ExtractText(option)
        extracted_text.append(text)

    # Save the extracted text to a text file
    with open(output_file, "w", encoding="utf-8") as text_file:
        text_file.write("\n".join(extracted_text))

    doc.Close()

# Example usage
file_path = "./pdfs/Effect_of_bump_height_on_the_strain_variation_during_the_thermal_cycling_test_of_ACA_flip-chip_joints.pdf"
output_file = "DocumentText.txt"
extract_text_from_pdf(file_path, output_file)

The output does not deal with multiple columns. 

### pymupdf multi-column.py file
Started with the instalation of the pymupdf using `pip install pymupdf`. Then I copied the multi-column.py file to the root of this project to use the script.

In [6]:
import pymupdf
from multi_column import column_boxes

# read the pdf file and write in a text file

doc = pymupdf.open(file_path)
text = ""
for page in doc:
    bboxes = column_boxes(page, footer_margin=50, no_image_text=True)

    for rect in bboxes:
        text += page.get_text(clip=rect, sort=True)
        text += "\n"
    text += "-" * 80
    text += "\n"
    text += "\n"

with open("text_pymupdf.txt", "w", encoding="utf-8") as text_file:
    text_file.write(text)



The result is acurate to the point that the columns are seperated cleanly. 

### Pymupdf4llm
Trying 1 last package with llm integration. installation of the package using `pip install pymupdf4llm`

In [7]:
import pymupdf4llm
import pathlib
md_text = pymupdf4llm.to_markdown(file_path)

pathlib.Path("text_pymupdf4llm.md").write_bytes(md_text.encode())

Processing ./pdfs/Effect_of_bump_height_on_the_strain_variation_during_the_thermal_cycling_test_of_ACA_flip-chip_joints.pdf...


21179

This has the best results as it follows the text from right to left column no matter the vertical position of the columns. Which the package before its messed up. 

## Chunking script research

Requirements:
* LLM-api_key
* PDF
* embedding model

In [None]:
# imports
from llama_index.core.node_parser import SentenceSplitter, TokenTextSplitter
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SemanticSplitterNodeParser
import matplotlib.pyplot as plt
import numpy as np
from llama_index.embeddings.ollama import OllamaEmbedding
from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI
from langchain import hub
from pydantic import BaseModel # wtf is this?
from langchain.chains import create_extraction_chain_pydantic
from langchain.docstore.document import Document



### Character level chunking (LlamaIndex)

In [33]:
reader = SimpleDirectoryReader(input_files=["text_pymupdf4llm.md"])

doc = reader.load_data()

splitter = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=20,
)
nodes = splitter.get_nodes_from_documents(doc)
print(nodes[0].text)
print('length of nodes:', len(nodes))

Effect of Bump Height on the Strain Variation During the Thermal Cycling Test of ACA Flip-Chip Joints
length of nodes: 15


In [32]:
splitter = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=20,
)
nodes = splitter.get_nodes_from_documents(doc)
print(nodes[0].text)
print('length of nodes:', len(nodes))

Effect of Bump Height on the Strain Variation During the Thermal Cycling Test of ACA Flip-Chip Joints
length of nodes: 20


### Token Based Chunking - Langchain
* Tokens in chunk: 80
* Tokenizer: cl100k_base

In [2]:
from langchain_text_splitters import CharacterTextSplitter

# This is a long document we can split up.
with open("text_pymupdf4llm.md", "r") as f:
    text = f.read()

In [7]:
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base", chunk_size=100, chunk_overlap=0
)
docs = text_splitter.create_documents(text)

### Sentence Based chunking- Llamaindex

In [10]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import SimpleDirectoryReader

text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=20 )  
reader = SimpleDirectoryReader(input_files=["text_pymupdf4llm.md"])

doc = reader.load_data()
docs = text_splitter.get_nodes_from_documents(doc)

### Semantic chunking

In [31]:
ollama_embedding = OllamaEmbedding(
    model_name="nomic-embed-text",
    base_url="http://localhost:11434",
    ollama_additional_kwargs={"mirostat": 0},
)
splitter = SemanticSplitterNodeParser(
    buffer_size=1, breakpoint_percentile_threshold=95, embed_model=ollama_embedding
)

nodes = splitter.get_nodes_from_documents(doc)
print(nodes[0].text)
print('length of nodes:', len(nodes))



Effect of Bump Height on the Strain Variation During the Thermal Cycling Test of ACA Flip-Chip Joints

length of nodes: 28


In [47]:
text = ''

for d in doc:
    text += d.text


### Semantic Chunking Percentile based- Langchain

In [4]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_ollama import OllamaEmbeddings



In [5]:
embeddings = OllamaEmbeddings(
    model="nomic-embed-text",
)
text_splitter = SemanticChunker(embeddings, breakpoint_threshold_type="percentile")
docs = text_splitter.create_documents([text])


### Semantic chunking double-pass merging

### Agentic Chunking

In [103]:
obj = hub.pull("wfh/proposal-indexing")
# llm = ChatGroq(model='llama-3.1-70b-versatile', api_key= GROQ_API, temperature=0)
llm = ChatOpenAI(model='gpt-4-turbo', api_key=OpenAIAPI, temperature=0)
runnable = obj | llm



In [104]:
class Sentences(BaseModel):
    sentences: List[str]

extraction_chain = llm.with_structured_output(Sentences)
    
# Extraction
# extraction_chain = create_extraction_chain_pydantic(pydantic_schema=Sentences, llm=llm)
def get_propositions(text):
    runnable_output = runnable.invoke({
    	"input": text
    }).content
    propositions = extraction_chain.invoke(runnable_output).sentences
    return propositions
    
paragraphs = text.split("\n\n")

# remove paragraphs with less than 10 characters
paragraphs = [para for para in paragraphs if len(para) > 50]
text_propositions = []
for i, para in enumerate(paragraphs):
    propositions = get_propositions(para)
    text_propositions.extend(propositions)
    print (f"Done with {i}")

print (f"You have {len(text_propositions)} propositions")
print(text_propositions[:10])

Done with 0
Done with 1
Done with 2
Done with 3
Done with 4
Done with 5
Done with 6
Done with 7
Done with 8
Done with 9
Done with 10
Done with 11
Done with 12
Done with 13
Done with 14
Done with 15
Done with 16
Done with 17
Done with 18
Done with 19
Done with 20
Done with 21
Done with 22
Done with 23
Done with 24
Done with 25
Done with 26
Done with 27
Done with 28
Done with 29
Done with 30
Done with 31
Done with 32
Done with 33
Done with 34
Done with 35
Done with 36
Done with 37
You have 336 propositions
['The effect of bump height on the strain variation is studied.', 'The study occurs during the thermal cycling test.', 'The thermal cycling test involves ACA flip-chip joints.', 'Kuntjoro Pinardi is an individual.', 'Zonghe Lai is a Member of IEEE.', 'Dietmar Vogel is an individual.', 'Yi Lan Kang is an individual.', 'Johan Liu is a Senior Member of IEEE.', 'Sheng Liu is an Associate Member of IEEE.', 'Ralf Haug is an individual.']


In [113]:
# save the propositions in a text file
with open("propositions.txt", "w", encoding="utf-8") as text_file:
    text_file.write("\n".join(text_propositions))

In [1]:
# load the propositions from the text file
with open("propositions.txt", "r", encoding="utf-8") as text_file:
    text_propositions = text_file.readlines()

In [2]:
from agentic_chunker import AgenticChunker
ac = AgenticChunker()


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from agentic_chunker import AgenticChunker


In [None]:
ac.add_propositions(text_propositions)

In [None]:
chunks = ac.get_chunks(get_type='list_of_strings')

### QA Generation

In [17]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import QAGenerationChain
# from gptcache.adapter.langchain_models import LangChainChat
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
from langchain.evaluation.qa.generate_chain import QAGenerateChain

text_splitter =  RecursiveCharacterTextSplitter(chunk_overlap=500, chunk_size=2000)

chat = ChatOpenAI(model='gpt-4-turbo', api_key=OpenAIAPI, temperature=0) # using the following code to cache with gptcache
# chat = LangChainChat(chat=ChatOpenAI(temperature=0))

# chain = QAGenerationChain.from_llm(chat, text_splitter=text_splitter)
chain = QAGenerateChain(llm=chat)

ValidationError: 1 validation error for QAGenerateChain
prompt
  Field required [type=missing, input_value={'llm': ChatOpenAI(client...2ElT', openai_proxy='')}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing

In [8]:
loader = TextLoader("./text_pymupdf4llm.md")
doc = loader.load()[0]

In [13]:
qa = chain.invoke(doc.page_content)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [15]:
qa

NameError: name 'qa' is not defined