In [1]:
import re
import unicodedata
import fitz 
from spacy.lang.en import English
from tqdm import tqdm  

class PDFPreprocessor:
    def __init__(self, ignore_pages=None, minimum_sentence_length=20):
        """
        Initialize the PDFPreprocessor.

        :param ignore_pages: A list of page numbers (0-indexed) to ignore.
        :param minimum_sentence_length: Minimum length (in characters) for a sentence to be kept.
        """
        self.ignore_pages = ignore_pages if ignore_pages is not None else []
        self.minimum_sentence_length = minimum_sentence_length
        
        # Set up a basic spaCy pipeline using the English model.
        self.nlp = English()
        
        # Add common abbreviations as special cases to avoid erroneous sentence splits.
        special_cases = [
            "Dr.", "Mr.", "Mrs.", "Ms.", "Prof.", "Inc.", "Ltd.", "Co.", "Corp.",
            "e.g.", "i.e.", "etc.", "vs.", "Fig.", "Figs.", "No.", "Vol.", "Ed.",
            "Jr.", "Sr.", "al."
        ]
        for case in special_cases:
            self.nlp.tokenizer.add_special_case(case, [{"ORTH": case}])
        
        # Add the sentencizer with custom punctuation.
        config = {"punct_chars": [".", "!", "?"]}
        self.nlp.add_pipe("sentencizer", config=config)
    
    def decode_if_bytes(self, raw_input):
        """
        If raw_input is of type bytes, decode using UTF-8.
        """
        if isinstance(raw_input, bytes):
            return raw_input.decode('utf-8')
        return raw_input

    def clean_pdf_text(self, raw_text):
        """
        Clean and normalize extracted text from a PDF.
        For chapter pages with a header block, extract only the chapter title and remove everything else.
        Then apply general cleanup.
        """
        # Ensure input is a Unicode string.
        text = self.decode_if_bytes(raw_text)
        
        # Step 1: Unicode normalization (NFC is usually a good choice)
        text = unicodedata.normalize('NFC', text)
        
        # Step 2: Remove copyright header block.
        copyright_pattern = (
            r'\n\s*Programming Massively Parallel Processors\. DOI:\s*https?://[^\n]+\s*'
            r'\n\s*©\s*\d{4}\s*Elsevier Inc\. All rights reserved\.'
        )
        text = re.sub(copyright_pattern, '', text, flags=re.IGNORECASE)
        
        # Step 3: Replace the chapter header block with only the chapter title.
        # This pattern assumes the header starts with "CHAPTER", a chapter number, then the chapter title,
        # then "Chapter Outline" and ends with either "Exercises" or "References" or "Future outlook".
        chapter_pattern = r'(?s)^CHAPTER\s*\n\s*\d+\s*\n(.*?)\n\s*Chapter Outline.*\n\s*(?:Exercises|References|Future outlook)\s*\n'
        text = re.sub(chapter_pattern, '', text, flags=re.IGNORECASE)
        
        # Step 4: Remove trailing page headers (pattern: newline, digits, newline, then header text).
        text = re.sub(r'\n\d+\n.*$', '', text, flags=re.DOTALL)
        
        # Step 5: Remove lines that are mainly punctuation (e.g., lines consisting mostly of dots and numbers).
        lines = text.splitlines()
        clean_lines = [line for line in lines if not re.fullmatch(r'[\.\s\d]+', line)]
        text = "\n".join(clean_lines)
        
        # Step 6: Remove figure references (e.g., "FIGURE 4.8").
        text = re.sub(r'\bFIGURE\s+\d+(\.\d+)?\b', '', text, flags=re.IGNORECASE)
        
        # Step 7: Fix hyphenated line breaks (e.g., "hy-\nphenated" becomes "hyphenated").
        text = re.sub(r'(\w)-\s*\n\s*(\w)', r'\1\2', text)
        
        # Step 8: Remove extra newlines, tabs, and multiple spaces.
        text = re.sub(r'[\r\n\t]+', ' ', text)
        text = re.sub(r' +', ' ', text).strip()
        
        # Step 9: Remove unwanted special characters while preserving punctuation,
        # the multiplication sign (×) and the percent sign (%).
        text = re.sub(r'[^\w\s\.,:;!?()\-\u00D7%]', '', text)
        
        # Step 10: Remove lines that contain only numbers.
        text = "\n".join(line for line in text.splitlines() if not re.fullmatch(r'\s*\d+\s*', line))
        
        return text

    def get_sentences(self, text):
        """
        Split the cleaned text into a list of sentences using spaCy.
        Only sentences with length greater than or equal to self.minimum_sentence_length are returned.
        """
        doc = self.nlp(text)
        sentences = [sent.text.strip() for sent in doc.sents if len(sent.text.strip()) >= self.minimum_sentence_length]
        return sentences

    def extract_chapters_info(self, pdf_path):
        """
        Extract chapter information from the PDF.
        
        For each page, check if it contains a chapter header using a regex pattern.
        If so, record the chapter number and title as the start of a chapter.
        Then, accumulate text from subsequent pages until an "Exercises" or "References"
        marker is found, at which point that chapter is finalized.
        
        Returns a list of dictionaries, where each dictionary contains:
            - 'chapter_number': The chapter number as a string.
            - 'chapter_title': The extracted chapter title.
            - 'start_page': The page number (0-indexed) where the chapter starts.
            - 'end_page': The page number where the chapter content ends.
            - 'combined_text': The cleaned text from the start page up to (but not including) text after the marker.
            - 'sentences': A list of individual sentences extracted from the combined_text.
        """
        doc = fitz.open(pdf_path)
        chapters = []
        current_chapter = None
        current_text = ""
        
        # Regex to capture chapter header: captures chapter number and title.
        chapter_header_pattern = r'(?s)^CHAPTER\s*\n\s*(\d+)\s*\n(.*?)\n\s*Chapter Outline'
        # Pattern to detect end-of-chapter marker (Exercises or References).
        marker_pattern = r'\n\s*(Exercises|References)\s*\n'
        
        for page_num in range(len(doc)):
            raw_text = doc[page_num].get_text()
            # Skip contents pages.
            # if self.is_contents_page(raw_text):
                # continue
            
            marker_match = re.search(marker_pattern, raw_text, flags=re.IGNORECASE)
            # Check if the page contains a chapter header.
            chap_match = re.search(chapter_header_pattern, raw_text, flags=re.IGNORECASE)
            if chap_match:
                # If a chapter is already in progress, finalize it.
                if current_chapter is not None:
                    current_chapter['end_page'] = page_num - 1
                    current_chapter['combined_text'] = current_text.strip()
                    current_chapter['sentences'] = self.get_sentences(current_text.strip())
                    chapters.append(current_chapter)
                    current_chapter = None
                    current_text = ""
                
                # Start a new chapter.
                chapter_number = chap_match.group(1).strip()
                chapter_title = re.sub(r'\s+', ' ', chap_match.group(2)).strip()
                # Remove "With special contributions" and everything after if present
                if "With special contributions" in chapter_title:
                    chapter_title = chapter_title.split("With special contributions")[0].strip()
                current_chapter = {
                    'chapter_number': chapter_number,
                    'chapter_title': chapter_title,
                    'start_page': page_num,
                    'end_page': None,
                    'combined_text': ''
                }
                current_text += self.clean_pdf_text(raw_text) + " "
            # if "Exercises" comes at the starting, we skip processing the page
            elif raw_text.lstrip().startswith("Exercises"):
                current_chapter['end_page'] = page_num - 1
                current_chapter['combined_text'] = current_text.strip()
                current_chapter['sentences'] = self.get_sentences(current_text.strip())
                chapters.append(current_chapter)
                current_chapter = None
                current_text = ""
            # execute logic where "Exercises" appear within a page
            elif marker_match and current_chapter is not None:
                cutoff = marker_match.start()
                page_body = raw_text[:cutoff]
                current_text += self.clean_pdf_text(page_body) + " "
                current_chapter['end_page'] = page_num
                current_chapter['combined_text'] = current_text.strip()
                current_chapter['sentences'] = self.get_sentences(current_text.strip())
                chapters.append(current_chapter)
                current_chapter = None
                current_text = ""
            else:
                # If inside a chapter, accumulate page text.
                if current_chapter is not None:
                    current_text += self.clean_pdf_text(raw_text) + " "
        
        # Finalize any chapter still in progress.
        if current_chapter is not None:
            current_chapter['end_page'] = len(doc) - 1
            current_chapter['combined_text'] = current_text.strip()
            current_chapter['sentences'] = self.get_sentences(current_text.strip())
            chapters.append(current_chapter)
        
        return chapters


if __name__ == "__main__":
    pdf_file = "../data/ppmp.pdf"
    
    # Specify pages to ignore (0-indexed) if needed.
    preprocessor = PDFPreprocessor(ignore_pages=[0, 1, 2], minimum_sentence_length=20)
    
    # Extract chapter information.
    chapters_info = preprocessor.extract_chapters_info(pdf_file)


FileNotFoundError: no such file: 'data/ppmp.pdf'

In [None]:
chapters_info[0]['combined_text']

In [None]:
chapters_info[0]['sentences']

In [None]:
chapters_info[0]['sentences_dict'] = [{'sentence': x, 'index' : i} for i, x in enumerate(chapters_info[0]['sentences'])]
chapters_info[0]['sentences_dict'][:3]

In [10]:
def combine_sentences(sentences, buffer_size=1):
    # Go through each sentence dict
    for i in range(len(sentences)):

        # Create a string that will hold the sentences which are joined
        combined_sentence = ''

        # Add sentences before the current one, based on the buffer size.
        for j in range(i - buffer_size, i):
            # Check if the index j is not negative (to avoid index out of range like on the first one)
            if j >= 0:
                # Add the sentence at index j to the combined_sentence string
                combined_sentence += sentences[j]['sentence'] + ' '

        # Add the current sentence
        combined_sentence += sentences[i]['sentence']

        # Add sentences after the current one, based on the buffer size
        for j in range(i + 1, i + 1 + buffer_size):
            # Check if the index j is within the range of the sentences list
            if j < len(sentences):
                # Add the sentence at index j to the combined_sentence string
                combined_sentence += ' ' + sentences[j]['sentence']

        # Then add the whole thing to your dict
        # Store the combined sentence in the current sentence dict
        sentences[i]['combined_sentence'] = combined_sentence

    return sentences

chapters_info[0]['combined_sentences'] = combine_sentences(chapters_info[0]['sentences_dict'])

In [None]:
chapters_info[0]['combined_sentences']

In [None]:
# generate embeddings

In [5]:
from vllm import LLM, TokensPrompt
# For pooling models (task={embed,classify,reward,score}) only
llm = LLM(model="intfloat/multilingual-e5-large-instruct", task="embed", trust_remote_code=True)  # Name or path of your model

INFO 03-04 22:42:44 config.py:422] Found sentence-transformers modules configuration.
INFO 03-04 22:42:44 config.py:442] Found pooling configuration.
INFO 03-04 22:42:44 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='intfloat/multilingual-e5-large-instruct', speculative_config=None, tokenizer='intfloat/multilingual-e5-large-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=512, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=intfloat/m

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 84.15it/s]



INFO 03-04 22:42:49 model_runner.py:1115] Loading model weights took 1.0417 GB


In [6]:
tokenizer = llm.llm_engine.get_tokenizer_group()

prompt = "Hello, my name is"
prompt_token_ids=tokenizer.encode(prompt=prompt)
output = llm.encode(TokensPrompt(prompt_token_ids=prompt_token_ids))
output

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.91it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


[PoolingRequestOutput(request_id='0', outputs=PoolingOutput(data=tensor([ 0.0293,  0.0004, -0.0139,  ..., -0.0115, -0.0028,  0.0349])), prompt_token_ids=[0, 35378, 4, 759, 9351, 83, 2], finished=True)]

In [None]:
output[0].outputs.data.shape

In [None]:
# Assuming the tokenizer is already defined as in your previous code
for combined_sentence in tqdm(chapters_info[0]['combined_sentences']):
    prompt = combined_sentence['combined_sentence']
    prompt_token_ids = tokenizer.encode(prompt=prompt)
    combined_sentence['token_ids'] = prompt_token_ids
    combined_sentence['token_id_length'] = len(prompt_token_ids)

In [None]:
import numpy as np
token_len=[]
for i in range(len(chapters_info[0]['combined_sentences'])):
    token_len.append(chapters_info[0]['combined_sentences'][i]['token_id_length'])
print(token_len[:10])
token_len = np.array(token_len)
print(token_len.mean())
print(token_len.std())
print(token_len.max())
print(token_len.min())

In [None]:
token_embeddings = llm.encode([TokensPrompt(prompt_token_ids=chapters_info[0]['combined_sentences'][i]['token_ids']) for i in range(len(chapters_info[0]['combined_sentences']))])
len(token_embeddings)

In [32]:
for i in range(len(chapters_info[0]['combined_sentences'])):
    chapters_info[0]['combined_sentences'][i]['token_embeddings'] = token_embeddings[i].outputs.data

In [None]:
chapters_info[0]['combined_sentences'][0]['token_embeddings']

In [37]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_distances(sentences):
    distances = []
    for i in range(len(sentences) - 1):
        embedding_current = sentences[i]['token_embeddings']
        embedding_next = sentences[i + 1]['token_embeddings']
        
        # Calculate cosine similarity
        similarity = cosine_similarity([embedding_current], [embedding_next])[0][0]
        
        # Convert to cosine distance
        distance = 1 - similarity

        # Append cosine distance to the list
        distances.append(distance)

        # Store distance in the dictionary
        sentences[i]['distance_to_next'] = distance

    # Optionally handle the last sentence
    # sentences[-1]['distance_to_next'] = None  # or a default value

    return distances, sentences

In [38]:
distances, chapters_info[0]['combined_sentences'] = calculate_cosine_distances(chapters_info[0]['combined_sentences'])


In [None]:
chapters_info[0]['combined_sentences']

In [None]:
distances[:3]

In [None]:
import matplotlib.pyplot as plt

plt.plot(distances)

In [None]:
import numpy as np

plt.plot(distances)

y_upper_bound = .2
plt.ylim(0, y_upper_bound)
plt.xlim(0, len(distances))

# We need to get the distance threshold that we'll consider an outlier
# We'll use numpy .percentile() for this
breakpoint_percentile_threshold = 95
breakpoint_distance_threshold = np.percentile(distances, breakpoint_percentile_threshold) # If you want more chunks, lower the percentile cutoff
plt.axhline(y=breakpoint_distance_threshold, color='r', linestyle='-')

# Then we'll see how many distances are actually above this one
num_distances_above_theshold = len([x for x in distances if x > breakpoint_distance_threshold]) # The amount of distances above your threshold
plt.text(x=(len(distances)*.01), y=y_upper_bound/50, s=f"{num_distances_above_theshold + 1} Chunks")

# Then we'll get the index of the distances that are above the threshold. This will tell us where we should split our text
indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold] # The indices of those breakpoints on your list

# Start of the shading and text
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
for i, breakpoint_index in enumerate(indices_above_thresh):
    start_index = 0 if i == 0 else indices_above_thresh[i - 1]
    end_index = breakpoint_index if i < len(indices_above_thresh) - 1 else len(distances)

    plt.axvspan(start_index, end_index, facecolor=colors[i % len(colors)], alpha=0.25)
    plt.text(x=np.average([start_index, end_index]),
             y=breakpoint_distance_threshold + (y_upper_bound)/ 20,
             s=f"Chunk #{i}", horizontalalignment='center',
             rotation='vertical')

# # Additional step to shade from the last breakpoint to the end of the dataset
if indices_above_thresh:
    last_breakpoint = indices_above_thresh[-1]
    if last_breakpoint < len(distances):
        plt.axvspan(last_breakpoint, len(distances), facecolor=colors[len(indices_above_thresh) % len(colors)], alpha=0.25)
        plt.text(x=np.average([last_breakpoint, len(distances)]),
                 y=breakpoint_distance_threshold + (y_upper_bound)/ 20,
                 s=f"Chunk #{i+1}",
                 rotation='vertical')

plt.title("PG Essay Chunks Based On Embedding Breakpoints")
plt.xlabel("Index of sentences in essay (Sentence Position)")
plt.ylabel("Cosine distance between sequential sentences")
plt.show()

In [None]:
# Initialize the start index
start_index = 0

# Create a list to hold the grouped sentences
chunks = []

# Iterate through the breakpoints to slice the sentences
for index in indices_above_thresh:
    # The end index is the current breakpoint
    end_index = index

    # Slice the sentence_dicts from the current start index to the end index
    group = chapters_info[0]['combined_sentences'][start_index:end_index + 1]
    combined_text = ' '.join([d['combined_sentence'] for d in group])
    chunks.append(combined_text)
    
    # Update the start index for the next group
    start_index = index + 1

# The last group, if any sentences remain
if start_index < len(chapters_info[0]['combined_sentences']):
    combined_text = ' '.join([d['combined_sentence'] for d in chapters_info[0]['combined_sentences'][start_index:]])
    chunks.append(combined_text)

# grouped_sentences now contains the chunked sentences

In [None]:
for i, chunk in enumerate(chunks[:2]):
    buffer = 200
    
    print (f"Chunk #{i}")
    print (chunk[:buffer].strip())
    print ("...")
    print (chunk[-buffer:].strip())
    print ("\n")

In [None]:
chapters_info[0]['combined_sentences'][0]['combined_sentence']

In [None]:
for i in range(len(chapters_info[0]['combined_sentences'])):
    chapters_info[0]['combined_sentences'][i]['combined_sentence_len'] = len(chapters_info[0]['combined_sentences'][i]['combined_sentence'])

chapters_info[0]['combined_sentences'][0]['combined_sentence_len']


In [None]:
indices_above_thresh

In [None]:
len(chunks)

In [7]:
import re
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

class SemanticChunker:
    def __init__(self, llm, tokenizer, buffer_size=1, threshold_percentile=95, context_window=512):
        self.llm = llm
        self.tokenizer = tokenizer
        self.buffer_size = buffer_size
        self.threshold_percentile = threshold_percentile
        self.context_window = context_window

        self.sentences = None

    def preprocess_sentences(self, sentences):
        """
        Preprocess the sentences to create a sentences_dict and combined_sentences.
        :param sentences: List of sentences to preprocess.
        :return: List of dictionaries containing sentence and index.
        """
        # Create sentences_dict
        sentences_dict = [{'sentence': x, 'index': i} for i, x in enumerate(sentences)]
        
        # Combine sentences into combined_sentences
        combined_sentences = self.combine_sentences(sentences_dict)
        
        return combined_sentences

    def combine_sentences(self, sentences):
        """
        Combine sentences into a single string based on a buffer size.
        :param sentences: List of sentence dictionaries.
        :return: List of sentences with combined sentences added.
        """
        for i in range(len(sentences)):
            combined_sentence = ''

            # Add sentences before the current one, based on the buffer size.
            for j in range(i - self.buffer_size, i):
                if j >= 0:
                    combined_sentence += sentences[j]['sentence'] + ' '

            # Add the current sentence
            combined_sentence += sentences[i]['sentence']

            # Add sentences after the current one, based on the buffer size
            for j in range(i + 1, i + 1 + self.buffer_size):
                if j < len(sentences):
                    combined_sentence += ' ' + sentences[j]['sentence']

            # Store the combined sentence in the current sentence dict
            sentences[i]['combined_sentence'] = combined_sentence

        return sentences

    def split_into_chunks(self, text):
        """
        Split text into chunks that fit within the context window.
        """
        sentences = re.split(r'(?<=[.!?]) +', text)  # Split by sentence boundaries
        chunks = []
        current_chunk = []

        for sentence in sentences:
            current_chunk.append(sentence)
            current_length = sum(len(self.tokenizer.encode(s)) for s in current_chunk)

            if current_length > self.context_window:
                # If the current chunk exceeds the context window, finalize the chunk
                chunks.append(' '.join(current_chunk[:-1]))  # Add all but the last sentence
                current_chunk = [sentence]  # Start a new chunk with the last sentence

        if current_chunk:
            chunks.append(' '.join(current_chunk))  # Add any remaining sentences as a chunk

        return chunks

    def generate_embeddings(self):
        """
        Generate embeddings for the combined sentences, ensuring they fit within the context window.
        """
        for combined_sentence in tqdm(self.sentences):
            prompt = combined_sentence['combined_sentence']
            prompt_token_ids = self.tokenizer.encode(prompt=prompt)
            combined_sentence['token_ids'] = prompt_token_ids
            combined_sentence['token_id_length'] = len(prompt_token_ids)

            # Split into chunks if the token length exceeds the context window
            if combined_sentence['token_id_length'] > self.context_window:
                chunks = self.split_into_chunks(prompt)
                combined_sentence['token_chunks'] = chunks
                combined_sentence['token_id_chunks'] = [self.tokenizer.encode(chunk) for chunk in chunks]
            else:
                combined_sentence['token_chunks'] = [prompt]
                combined_sentence['token_id_chunks'] = [prompt_token_ids]

        # Generate embeddings for each chunk
        all_embeddings = []
        for combined_sentence in self.sentences:
            token_id_chunks = combined_sentence['token_id_chunks']
            embeddings = self.llm.encode([TokensPrompt(prompt_token_ids=chunk) for chunk in token_id_chunks])
            all_embeddings.extend(embeddings)

        for i, combined_sentence in enumerate(self.sentences):
            combined_sentence['token_embeddings'] = all_embeddings[i].outputs.data

    def calculate_cosine_distances(self):
        """
        Calculate cosine distances between consecutive sentence embeddings.
        """
        distances = []
        for i in range(len(self.sentences) - 1):
            embedding_current = self.sentences[i]['token_embeddings']
            embedding_next = self.sentences[i + 1]['token_embeddings']
            similarity = cosine_similarity([embedding_current], [embedding_next])[0][0]
            distance = 1 - similarity
            distances.append(distance)
            self.sentences[i]['distance_to_next'] = distance

        return distances

    def chunk_sentences(self):
        """
        Chunk sentences based on calculated distances and a defined threshold.
        """
        distances = self.calculate_cosine_distances()
        breakpoint_distance_threshold = np.percentile(distances, self.threshold_percentile)

        chunks = []
        start_index = 0
        indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold]
        print(indices_above_thresh)

        for index in indices_above_thresh:
            end_index = index
            group = self.sentences[start_index:end_index + 1]
            combined_text = ' '.join([d['combined_sentence'] for d in group])
            chunks.append(combined_text)
            start_index = index + 1

        if start_index < len(self.sentences):
            combined_text = ' '.join([d['combined_sentence'] for d in self.sentences[start_index:]])
            chunks.append(combined_text)

        return chunks

    def finalize_chunks(self, chunks):
        """
        Further split the final chunks if their token length exceeds the context window.
        :param chunks: List of chunks produced by chunk_sentences.
        :return: List of finalized chunks that fit within the context window.
        """
        finalized_chunks = []
        
        for chunk in chunks:
            # Check the token length of the chunk
            token_ids = self.tokenizer.encode(chunk)
            token_length = len(token_ids)

            if token_length > self.context_window:
                # If the chunk exceeds the context window, split it
                sub_chunks = self.split_into_chunks(chunk)
                finalized_chunks.extend(sub_chunks)
            else:
                finalized_chunks.append(chunk)

        return finalized_chunks

    def process(self):
        """
        Execute all steps sequentially: preprocess, generate embeddings, chunk sentences, and finalize chunks.
        :return: List of finalized chunks.
        """
        self.generate_embeddings()
        chunks = self.chunk_sentences()
        finalized_chunks = self.finalize_chunks(chunks)
        return finalized_chunks

    def process_all_chapters(self, chapters_info):
        """
        Execute the chunking process for all chapters in chapters_info.
        :param chapters_info: List of dictionaries containing chapter information.
        :return: List of finalized chunks for all chapters.
        """
        all_finalized_chunks = []
        
        for chapter in chapters_info:
            # Extract sentences from the chapter
            sentences = chapter['sentences']
            # Initialize the chunker for the current chapter
            self.sentences = self.preprocess_sentences(sentences)
            
            # Process the current chapter
            chapter['finalized_chunks'] = self.process()
        
# Usage
semantic_chunker = SemanticChunker(llm, tokenizer, buffer_size=1, context_window=450)
semantic_chunker.process_all_chapters(chapters_info)

100%|██████████| 384/384 [00:00<00:00, 5451.46it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 111.63it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 115.24it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 143.85it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 144.71it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 145.54it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 143.43it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 138.68it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 142.46it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[6, 16, 34, 44, 63, 85, 107, 126, 127, 146, 153, 160, 161, 166, 189, 215, 284, 287, 341, 342]


100%|██████████| 401/401 [00:00<00:00, 7061.86it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 70.69it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 130.94it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 129.32it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 129.95it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 132.22it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 133.65it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 80.07it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 144.07it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proces

[5, 28, 38, 41, 66, 70, 79, 99, 208, 211, 219, 227, 240, 247, 269, 295, 319, 332, 359, 364]


100%|██████████| 321/321 [00:00<00:00, 6806.84it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 130.30it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 148.92it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 151.98it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 150.91it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 155.94it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 155.47it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 154.41it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 151.79it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[29, 73, 90, 91, 99, 105, 110, 126, 131, 168, 180, 242, 249, 261, 271, 310]


100%|██████████| 416/416 [00:00<00:00, 7120.19it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 131.22it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 153.43it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 150.30it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 152.44it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 148.49it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 150.58it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 149.09it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 153.74it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[18, 44, 51, 83, 90, 93, 147, 154, 157, 164, 167, 172, 250, 255, 269, 293, 320, 340, 345, 358, 398]


100%|██████████| 484/484 [00:00<00:00, 6713.53it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 124.69it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 139.52it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 141.10it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 141.53it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 144.88it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 142.89it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 144.95it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 141.48it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[5, 10, 45, 46, 89, 147, 196, 201, 236, 310, 325, 336, 337, 374, 376, 404, 411, 413, 430, 435, 438, 465, 469, 471, 474]


100%|██████████| 394/394 [00:00<00:00, 6335.54it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 132.25it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 138.87it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 139.13it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 132.77it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 141.61it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 145.35it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 145.95it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 140.46it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[9, 11, 18, 20, 23, 26, 34, 129, 148, 191, 209, 244, 290, 308, 314, 317, 318, 374, 388, 391]


100%|██████████| 300/300 [00:00<00:00, 6711.28it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 133.39it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 140.00it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 152.39it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 146.16it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 142.56it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 148.98it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 153.33it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 152.78it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[6, 18, 22, 38, 62, 63, 84, 114, 122, 127, 143, 158, 162, 279, 280]


100%|██████████| 205/205 [00:00<00:00, 6036.54it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 118.88it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 150.73it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 152.03it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 135.96it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 146.96it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 141.52it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 138.56it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 133.48it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[9, 12, 37, 41, 102, 121, 124, 128, 131, 155, 183]


100%|██████████| 296/296 [00:00<00:00, 6779.89it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 127.60it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 145.66it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 147.24it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 147.80it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 150.33it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 144.01it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 150.55it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 144.86it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[6, 9, 14, 54, 67, 68, 72, 109, 117, 119, 122, 130, 250, 260, 288]


100%|██████████| 347/347 [00:00<00:00, 6829.80it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 128.21it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 147.04it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 148.04it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 148.42it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 150.75it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 150.52it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 150.44it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 148.85it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[3, 14, 29, 55, 61, 64, 75, 78, 80, 83, 87, 90, 151, 197, 216, 282, 299, 304]


100%|██████████| 487/487 [00:00<00:00, 6769.22it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 128.18it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 144.00it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 141.40it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 140.74it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 140.99it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 141.39it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 145.15it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 140.94it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[8, 34, 47, 50, 57, 65, 70, 73, 86, 104, 115, 132, 135, 153, 158, 161, 208, 214, 217, 222, 256, 415, 417, 421, 483]


100%|██████████| 444/444 [00:00<00:00, 6789.47it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 123.12it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 141.41it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 137.97it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 144.78it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 142.37it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 141.04it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 147.30it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 143.80it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[5, 8, 33, 46, 60, 90, 99, 136, 150, 153, 166, 212, 216, 243, 247, 283, 336, 372, 381, 384, 413, 424, 427]


100%|██████████| 280/280 [00:00<00:00, 6596.82it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 129.77it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 147.80it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 149.09it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 144.42it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 149.89it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 150.15it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 146.99it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 137.72it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[11, 31, 42, 61, 70, 105, 109, 212, 220, 236, 237, 241, 253, 255]


100%|██████████| 339/339 [00:00<00:00, 6881.60it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 124.10it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 144.76it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 153.48it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 151.81it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 143.63it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 147.41it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 151.98it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 153.95it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[25, 28, 29, 37, 43, 63, 80, 99, 110, 116, 123, 148, 162, 244, 252, 319, 337]


100%|██████████| 375/375 [00:00<00:00, 7095.87it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 133.37it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 150.85it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 155.03it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 148.91it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 155.03it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 154.94it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 154.48it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 144.20it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[61, 80, 107, 115, 117, 128, 137, 183, 188, 230, 258, 265, 294, 307, 344, 346, 364, 367, 368]


100%|██████████| 496/496 [00:00<00:00, 5561.12it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 127.31it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 149.52it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 147.92it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 149.61it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 147.31it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 148.15it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 147.83it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 144.12it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[1, 9, 12, 35, 50, 59, 82, 110, 114, 120, 137, 146, 154, 157, 162, 165, 235, 247, 279, 321, 352, 451, 462, 467, 468]


100%|██████████| 376/376 [00:00<00:00, 5342.05it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 127.39it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 147.07it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 151.51it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 149.43it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 152.80it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 144.14it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 147.95it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 143.22it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[4, 5, 14, 48, 53, 58, 128, 163, 214, 244, 266, 275, 279, 318, 319, 335, 358, 360, 369]


100%|██████████| 250/250 [00:00<00:00, 5779.57it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 127.07it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 144.75it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 141.52it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 138.33it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 142.77it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 139.25it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 142.60it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 142.76it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[7, 27, 30, 39, 58, 85, 115, 125, 147, 151, 154, 156, 245]


100%|██████████| 259/259 [00:00<00:00, 6121.66it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 131.45it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 148.95it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 150.27it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 146.34it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 142.98it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 153.95it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 153.06it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 155.39it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[8, 60, 71, 157, 165, 185, 191, 205, 206, 229, 232, 251, 254]


100%|██████████| 453/453 [00:00<00:00, 7064.35it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 128.33it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 149.94it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 152.23it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 150.91it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 152.22it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 146.17it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 149.05it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 149.97it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[8, 16, 21, 26, 42, 101, 102, 121, 135, 140, 157, 189, 213, 231, 251, 255, 291, 292, 296, 343, 397, 400, 417]


100%|██████████| 237/237 [00:00<00:00, 6135.13it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 131.55it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 150.46it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 151.34it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 150.63it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 144.27it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 147.51it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 144.92it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 148.91it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[3, 16, 21, 77, 82, 105, 145, 182, 195, 214, 225, 227]


100%|██████████| 253/253 [00:00<00:00, 5380.37it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 126.16it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 144.27it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 144.77it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 143.63it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 137.98it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 136.05it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 140.00it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 141.24it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[10, 22, 25, 72, 126, 129, 134, 164, 172, 175, 185, 186, 204]


100%|██████████| 363/363 [00:00<00:00, 6331.67it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 128.34it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 145.54it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 144.38it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 143.17it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 141.47it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 139.89it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 143.92it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 143.70it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Proc

[11, 14, 19, 36, 39, 47, 52, 61, 65, 109, 157, 224, 254, 273, 277, 327, 338, 350, 353]


In [10]:
len(chapters_info[0]['finalized_chunks'])

97

In [None]:
chapters_info[-1].keys()

In [None]:
chapters_info[-1]['chapter_number']

In [11]:
for i in range(len(chapters_info)):
    print(f"Chapter {i} -> {len(chapters_info[i]['finalized_chunks'])} chunks")

Chapter 0 -> 97 chunks
Chapter 1 -> 98 chunks
Chapter 2 -> 83 chunks
Chapter 3 -> 101 chunks
Chapter 4 -> 115 chunks
Chapter 5 -> 103 chunks
Chapter 6 -> 81 chunks
Chapter 7 -> 57 chunks
Chapter 8 -> 73 chunks
Chapter 9 -> 85 chunks
Chapter 10 -> 124 chunks
Chapter 11 -> 113 chunks
Chapter 12 -> 71 chunks
Chapter 13 -> 87 chunks
Chapter 14 -> 95 chunks
Chapter 15 -> 143 chunks
Chapter 16 -> 111 chunks
Chapter 17 -> 65 chunks
Chapter 18 -> 65 chunks
Chapter 19 -> 107 chunks
Chapter 20 -> 64 chunks
Chapter 21 -> 70 chunks
Chapter 22 -> 86 chunks


In [None]:
sum=0
for i in range(len(chapters_info)):
    print(f"Chapter {i} -> {len(chapters_info[i]['finalized_chunks'])} chunks")
    sum += len(chapters_info[i]['finalized_chunks'])
sum


In [None]:
import torch

class GenerateEmbeddings:
    def __init__(self, model_id: str, task: str = "embed", trust_remote_code: bool = True):
        """
        Initialize the GenerateEmbeddings class with the specified model.
        :param model_id: The ID or path of the model to use for generating embeddings.
        :param task: The task type for the model (default is "embed").
        :param trust_remote_code: Whether to trust remote code (default is True).
        """
        self.llm = LLM(model=model_id, task=task, trust_remote_code=trust_remote_code)
        self.prompt_template = (
            "Generate a dense semantic representation for the following excerpt. "
            "Ensure the embedding captures key technical themes, context, and narrative details.\n\n"
            "Chapter: {chapter_name}\n"
            "{text_chunk}\n"
        )

    def generate_embeddings(self, chapters_info, save_path):
        """
        Generate embeddings for each chunk in chapters_info.
        :param chapters_info: List of dictionaries containing chapter information with finalized chunks.
        :return: List of embeddings for all chunks.
        """
        all_embeddings = []

        for chapter in tqdm(chapters_info, desc="Generating embeddings"):
            chapter_title = chapter.get('chapter_title', 'Untitled Chapter')  # Get chapter title or default
            prompts = [
                    self.prompt_template.format(chapter_name=chapter_title, text_chunk=chunk)
                    for chunk in chapter['finalized_chunks']
                ]
            embeddings = self.llm.encode(prompts)
            for embedding in embeddings:
                all_embeddings.append(embedding.outputs.data)

        # Convert the list of embeddings to a PyTorch tensor
        embeddings_tensor = torch.tensor(np.array(all_embeddings))
        # Save the embeddings tensor to disk
        torch.save(embeddings_tensor, save_path)

        return embeddings_tensor 
    

if __name__ == "__main__":
    embedding_generator = GenerateEmbeddings(model_id="intfloat/multilingual-e5-large-instruct")
    embeddings_tensor = embedding_generator.generate_embeddings(chapters_info, save_path="data/embeddings.pt")


In [None]:
# llm model 
from vllm import LLM

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")






In [None]:
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")

# Pass the default decoding hyperparameters of Qwen2.5-7B-Instruct
# max_tokens is for the maximum length for generation.
sampling_params = SamplingParams(temperature=0.7, top_p=0.8, repetition_penalty=1.05, max_tokens=512)

# Input the model name or path. Can be GPTQ or AWQ models.
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")

# Prepare your prompts
prompt = "Tell me something about large language models."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

# generate outputs
outputs = llm.generate([text], sampling_params)

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")