In [9]:
# set working directory
import os
os.chdir(r'C:\Users\a1bg532573\repo\Bulgarian-AI-Folktales\preprocesing')
# imports 
from langchain.document_loaders.pdf import PyPDFDirectoryLoader  # Importing PDF loader from Langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter  # Importing text splitter from Langchain

from langchain_openai import OpenAIEmbeddings # Importing OpenAI embeddings from Langchain
from langchain.schema import Document  # Importing Document schema from Langchain
from langchain_chroma import Chroma  # Importing Chroma vector store from Langchain
from dotenv import load_dotenv # Importing dotenv to get API key from .env file
from langchain.chat_models import ChatOpenAI

import os  # Importing os module for operating system functionalities
import shutil  # Importing shutil module for high-level file operations

import json
from langchain.schema import Document

from collections import defaultdict

import time
from typing import List, Tuple
from langchain.schema import Document
from langchain_openai import ChatOpenAI
from langchain_chroma import Chroma
import os

### 1. Load documents and clean duplicates`


In [3]:
def load_documents(DATA_PATH):
    """
    Load text documents from JSON files in the specified directory.

    Returns:
        List of Document objects: Loaded text documents represented as Langchain Document objects.
    """
    documents = []

    # Iterate over each file in the directory
    for filename in os.listdir(DATA_PATH):
        if filename.endswith(".json"):
            file_path = os.path.join(DATA_PATH, filename)
            
            # Open and read each JSON file
            with open(file_path, 'r', encoding='utf-8') as json_file:
                data = json.load(json_file)
                
                # Extract book, author, and stories from JSON
                book_name = data.get("book", "")
                author = data.get("author", "")
                stories = data.get("stories", [])

                # Iterate through each story and create Document objects
                for story in stories:
                    if "story" in story or "author" in story:
                        story_title = story.get("story") or story.get("author")
                        story_content = story.get("content", "")

                        # Append the story title to the beginning of the story content
                        combined_content = f"{story_title}: {story_content}"

                        # Create a Document object with metadata
                        document = Document(
                            page_content=combined_content,
                            metadata={
                                "book": book_name,
                                "author": author,
                                "story": story_title
                            }
                        )
                        # Append the document to the list of documents
                        documents.append(document)

    return documents

def keep_unique_documents(documents):
    """
    Keep only one unique record for each duplicate, removing redundant records.

    Args:
        documents (list): List of Document objects.

    Returns:
        List of Document objects: List with unique documents, retaining one instance for each duplicate.
    """
    content_tracker = {}
    unique_documents = []

    # Track each document's combined content and keep only the first occurrence
    for doc in documents:
        content = doc.page_content.strip().lower()  # Normalize for comparison
        if content not in content_tracker:
            # Add the first occurrence to the unique list
            content_tracker[content] = doc
            unique_documents.append(doc)

    return unique_documents

DATA_PATH = 'output_json_files'
documents = load_documents(DATA_PATH)  # Load documents from a source
unique_documents = keep_unique_documents(documents)  # Keep only unique documents

### 2. Building a small quick token size function


In [5]:
import tiktoken

def count_tokens(text: str, model: str = "gpt-3.5-turbo") -> int:
    """
    Count the number of tokens in a given text using the specified model's tokenizer.

    Args:
        text (str): The input text to tokenize.
        model (str): The name of the model to use for tokenization (default: "gpt-3.5-turbo").

    Returns:
        int: The number of tokens in the text.
    """
    model = "gpt-4o-mini"
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))

### 3. Sort documents by token size and print first and last examples

In [6]:
# sort unique_documents by unique_documents[i].page_content token size
sorted_documents = sorted(unique_documents, key=lambda x: count_tokens(x.page_content), reverse=True)

# Print the first 10 documents
for doc in sorted_documents[:10]:
    print(f"Document: {doc.metadata['story'][:30]}... | Tokens: {count_tokens(doc.page_content)}")
print("---------------")
# Print last 10 documents
for doc in sorted_documents[-30:]:
    print(f"Document: {doc.page_content}... | Tokens: {count_tokens(doc.page_content)}")

Document: повест за една гора... | Tokens: 75356
Document: срещата на най-големия с ламят... | Tokens: 18720
Document: щетинското ханче... | Tokens: 10889
Document: в тронната зала... | Tokens: 8410
Document: веселият монах... | Tokens: 7511
Document: босата команда... | Tokens: 6840
Document: гарван грачи... | Tokens: 5535
Document: юнакът със звезда на челото и ... | Tokens: 5135
Document: юнакът със звезда на челото и ... | Tokens: 5129
Document: врабчетата на стрина дойна... | Tokens: 4829
---------------
Document: жените или мъжете са повече на земята?: където ходели настрадин ходжа и хитър петър, все се нещо препирали, изпитвали, надлъгвали. като вървели веднъж. хитър петър разправял, че жените са повече на земното кълбо, а ходжата казвал, че мъжете са повече. слушал го, слушал хитър петър, па му рекъл — не си прав, ходжа! жените са повече, защото има мъже, които слушат жените си затова и те се числят към жените!... | Tokens: 141
Document: изгубил доверие: в селото на хитър петър

### 4. Sort docments in bins based on token counts
#### 1. Dropping binned_documents['0-60'] as it is too small and possible left noise from parsing
#### 2. Taking bin ['60-1000'] as full ebedding
#### 3. Taking bin ['1000-'float('inf')] as chunk embeding

In [7]:
def sort_documents_into_bins(documents, bin_ranges):
    """
    Sort documents into bins based on their token counts.
    
    Args:
        documents (list): List of Document objects.
        bin_ranges (list): List of tuples representing the start and end of each bin range.
    
    Returns:
        dict: A dictionary with bin ranges as keys and sorted lists of documents as values.
    """
    bins = {f"{start}-{end}": [] for start, end in bin_ranges}
    
    for doc in documents:
        token_count = count_tokens(doc.page_content)
        for start, end in bin_ranges:
            if start <= token_count < end:
                bins[f"{start}-{end}"].append((doc, token_count))
                break
    
    # Sort documents within each bin by token count (descending order)
    for bin_range in bins:
        bins[bin_range].sort(key=lambda x: x[1], reverse=True)
    
    return bins

# Define bin ranges
bin_ranges = [(0, 60), (60, 1000), (1000, float('inf'))]

# Sort documents into bins
binned_documents = sort_documents_into_bins(unique_documents, bin_ranges)

# Print the number of documents in each bin
for bin_range, docs in binned_documents.items():
    print(f"Bin {bin_range}: {len(docs)} documents")

# Print example documents from each bin
for bin_range, docs in binned_documents.items():
    print(f"\nBin {bin_range}:")
    for doc, token_count in docs[:20]:  # Print first 20 documents in each bin
        print(f"  Document: {doc.metadata['story'][:30]}... | Tokens: {token_count}")
    if len(docs) > 20:
        print("  ...")
    # Print last 20 documents in each bin
    print(f"  Last 20 documents in {bin_range}:")
    for doc, token_count in docs[-20:]:
        print(f"    Document: {doc.metadata['story'][:30]}... | Tokens: {token_count}")

Bin 0-60: 16 documents
Bin 60-1000: 398 documents
Bin 1000-inf: 433 documents

Bin 0-60:
  Document: елин пелин... | Tokens: 33
  Document: леда милева... | Tokens: 31
  Document: елин пелин... | Tokens: 27
  Document: сава попов... | Tokens: 25
  Document: горска хубавица... | Tokens: 23
  Document: ангел каралийчев... | Tokens: 18
  Document: ангел каралийчев... | Tokens: 17
  Document: ангел каралийчев... | Tokens: 17
  Document: ангел каралийчев... | Tokens: 17
  Document: ангел каралийчев... | Tokens: 16
  Document: ангел каралийчев... | Tokens: 13
  Document: ангел каралийчев... | Tokens: 13
  Document: през планини и морета... | Tokens: 13
  Document: елин пелин... | Tokens: 12
  Document: елин пелин... | Tokens: 12
  Document: елин пелин... | Tokens: 11
  Last 20 documents in 0-60:
    Document: елин пелин... | Tokens: 33
    Document: леда милева... | Tokens: 31
    Document: елин пелин... | Tokens: 27
    Document: сава попов... | Tokens: 25
    Document: горска хубавица... |

### 5. Custom Contextual Enhancing embedding for ['60-1000'] bin (**Small Stories**) and ['1000-inf'] bin (**Large Stories**) with GPT-4o-mini

- Timeout mechanism to avoid hitting TPM limit of gpt-4o-mini (200 000 TPM) - keep track of the number of tokens sent to the LLM and pause for 1 minute the process if the limit is about to be reached
- Handling large documents:
1. Document size: Handles texts ranging from 1,000 to 20,000 tokens.
2. Dynamic chunking: Uses a text splitter to divide documents into ~1,000 token chunks with 200 token overlap. Chunk count adapts to document length.
3. Contextual embedding: Applies `get_contextual_embedding()` with `prompt_type="contextual"` to each chunk, providing context within the full document.
4. Document-level enhancement: Calls `get_contextual_embedding()` with `prompt_type="enhancing"` once for the entire document.
5. Information combination: Appends both chunk-specific contextual info and document-level enhancing info to each chunk, resulting in multiple chunks with shared enhancing info but unique contextual details.

In [11]:
# Constants
TEST_CHROMA_PATH = "vector_store_test"
TPM_LIMIT = 200000
PAUSE_TIME = 80  # seconds

os.environ['OPENAI_API_KEY'] = ''
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Initialize components
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# Set up ChatOpenAI model
llm = ChatOpenAI(
    model="gpt-4o-mini",  # Use a smaller model for faster response times
    api_key=OPENAI_API_KEY,
    temperature=0,       # Increase temperature for more creativity
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

def get_contextual_embedding(doc: Document, llm: ChatOpenAI, prompt_type: str, full_text: str = None) -> str:
    """
    Generate a contextual embedding for a document using the LLM.
    
    Args:
        doc (Document): The document or chunk to embed.
        llm (ChatOpenAI): The language model to use for generating context.
        prompt_type (str): Type of prompt to use ("contextual" or "enhancing").
        full_text (str): The full text of the document (used for contextual embedding).
    
    Returns:
        str: The contextual embedding as a string.
    """
    if prompt_type == "contextual":
        prompt = f"""
        Here is the chunk we want to situate within the whole document
        <chunk>
        {doc.page_content}
        </chunk>

        Here is the content of the whole document
        <document>
        {full_text}
        </document>

        Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk.
        Answer only with the succinct context and nothing else.
        """
    elif prompt_type == "enhancing":
        prompt = f"""
        Based on the following story, provide a concise summary including:
        1. A brief synopsis of the story
        2. The main characters
        3. The setting or environment
        4. The moral or main message of the story
        <story>
        Story: {doc.page_content}
        </story>

        Respond in a concise paragraph format: "**brief summary of the story:** ..., **main characters:** ..., **setting:** ..., **moral:** ..." *... are placeholders for the actual values you will provide after reading the story. Please give a succinct answer to augment the document for the purposes of improving search retrieval of the story.
        Separate every value in a separate line with a new line character. 
        """
    
    response = llm.invoke(prompt)
    return response.content

def embed_documents(documents: List[Tuple[Document, int]], llm: ChatOpenAI, embeddings, chroma_path: str) -> None:
    """
    Embed documents with contextual information and store in Chroma DB.
    Handles both small (60-1000 tokens) and large (1000+ tokens) documents differently.
    
    Args:
        documents (List[Tuple[Document, int]]): List of (document, token_count) tuples.
        llm (ChatOpenAI): The language model to use for generating context.
        embeddings: The embedding model to use.
        chroma_path (str): The path to store the Chroma database.
    """
    total_tokens = 0
    start_time = time.time()
    combined_docs = []

    for doc, token_count in documents:
        # Check if we're approaching the TPM limit
        if total_tokens + token_count > TPM_LIMIT:
            elapsed_time = time.time() - start_time
            if elapsed_time < PAUSE_TIME:
                print(f"Approaching TPM limit. Pausing for {PAUSE_TIME - elapsed_time:.2f} seconds.")
                time.sleep(PAUSE_TIME - elapsed_time)
            total_tokens = 0
            start_time = time.time()

        # Determine if it's a large document (1000+ tokens)
        is_large_doc = token_count >= 1000

        if is_large_doc:
            # Calculate the number of chunks based on character count
            character_count = len(doc.page_content)
            num_chunks = max(2, token_count // 1000)
            chunk_size = (character_count // num_chunks) + 200
            
            # Create a text splitter
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=200,
                length_function=len,
            )

            # Split the document into chunks
            chunks = text_splitter.split_documents([doc])

            # Generate enhancing embedding for the whole document
            enhancing_info = get_contextual_embedding(doc, llm, prompt_type="enhancing")

            # Process each chunk
            for chunk in chunks:
                # Generate contextual embedding for the chunk
                contextual_info = get_contextual_embedding(chunk, llm, prompt_type="contextual", full_text=doc.page_content)

                # Combine original content with contextual and enhancing info
                combined_content = f"{chunk.page_content}\n\nContextual Information:\n{contextual_info}\n\nEnhancing Information:\n{enhancing_info}"
                
                # Create a new document with combined content and original metadata
                contextual_doc = Document(page_content=combined_content, metadata=doc.metadata)
                
                combined_docs.append(contextual_doc)

            print(f"Embedded large document: {doc.metadata['story'][:30]}...")
            print(f"Number of splits: {len(chunks)} for Character count: {character_count} for Tokens {token_count}")
        else:
            # For smaller documents, use the original approach
            enhancing_info = get_contextual_embedding(doc, llm, prompt_type="enhancing")
            
            # Combine original content with enhancing info
            combined_content = f"{doc.page_content}\n\nEnhancing Information:\n{enhancing_info}"
            
            # Create a new document with combined content and original metadata
            contextual_doc = Document(page_content=combined_content, metadata=doc.metadata)
            
            combined_docs.append(contextual_doc)
            print(f"Embedded small document: {doc.metadata['story'][:30]}...")

        total_tokens += token_count
        print(f"Total tokens: {total_tokens}")
    
    # Create and persist the Chroma database
    db = Chroma.from_documents(
        combined_docs, embeddings, persist_directory=chroma_path
    )
    print(f"Saved {len(combined_docs)} documents to {chroma_path}.")

# Test with documents from both bins
small_docs = binned_documents['60-1000'][:1]
large_docs = [binned_documents['1000-inf'][-1]]
test_docs = small_docs + large_docs

embed_documents(test_docs, llm, embeddings, TEST_CHROMA_PATH)
print("Finished embedding all documents and saved to Chroma DB.")

Embedded small document: трите патенца...
Total tokens: 999
Embedded large document: врабчето си иска зърното...
Number of splits: 2 for Character count: 2523 for Tokens 1009
Total tokens: 2008
Saved 3 documents to vector_store_test.
Finished embedding all documents and saved to Chroma DB.


### 6. Testing the retrieval of the stored embeddings

In [12]:
db = Chroma(persist_directory=TEST_CHROMA_PATH)
result = db.get(include=['embeddings'])
result

{'ids': ['b51a5ca9-5852-47dc-8a76-518726a93bc1',
  '87a8cc1b-58d6-4146-a101-a59db6e670ed',
  '0f97b8f7-7fb5-4eb4-a809-5922889cdd8c'],
 'embeddings': array([[ 0.06076654,  0.02530093,  0.01129333, ...,  0.0064741 ,
         -0.01222809,  0.03312524],
        [ 0.03261885,  0.0120194 , -0.03974689, ...,  0.00992206,
          0.02024011,  0.04889894],
        [ 0.01968575,  0.01906617, -0.04697545, ...,  0.02292447,
          0.00985696,  0.05542427]]),
 'metadatas': None,
 'documents': None,
 'uris': None,
 'data': None,
 'included': ['embeddings']}

In [14]:
db.get('87a8cc1b-58d6-4146-a101-a59db6e670ed')

{'ids': ['87a8cc1b-58d6-4146-a101-a59db6e670ed'],
 'embeddings': None,
 'metadatas': [{'author': 'ангел каралийчев',
   'book': 'Български народни приказки ак',
   'story': 'врабчето си иска зърното'}],
 'documents': ["врабчето си иска зърното: сивото врабче кацнало върху един плет и почнало да си ниже герданче от мънистени зрънца. както нижело, изтървало едно зърно. зърното паднало в тръните, търколило се някъде и се загубило. — хей, плет — изчуруликало врабчето, — дай ми мънистеното зърно или ще кажа на огъня да те изгори! — кажи му де! — отвърнал плетът. — огънчо — хвръкнало врабчето над огъня, — изгори плета! — не ща — отвърнал огънят. — докато си имам сухи букови дървета, много ми е притрябвало да горя трънливия плет и да се бода на тръните му. — ще кажа на реката да те угаси! — кажи й де! — отвърнал огънят. врабчето литнало над реката и зачуруликало. — речице, моля ти се, угаси огъня! — ами — отговорила реката, — много ми е притрябвало да гася огън. додето си имам тия гладки камъ

In [13]:
from IPython.display import display, Markdown, Latex
display(Markdown(db.get('87a8cc1b-58d6-4146-a101-a59db6e670ed')["documents"][0]))

врабчето си иска зърното: сивото врабче кацнало върху един плет и почнало да си ниже герданче от мънистени зрънца. както нижело, изтървало едно зърно. зърното паднало в тръните, търколило се някъде и се загубило. — хей, плет — изчуруликало врабчето, — дай ми мънистеното зърно или ще кажа на огъня да те изгори! — кажи му де! — отвърнал плетът. — огънчо — хвръкнало врабчето над огъня, — изгори плета! — не ща — отвърнал огънят. — докато си имам сухи букови дървета, много ми е притрябвало да горя трънливия плет и да се бода на тръните му. — ще кажа на реката да те угаси! — кажи й де! — отвърнал огънят. врабчето литнало над реката и зачуруликало. — речице, моля ти се, угаси огъня! — ами — отговорила реката, — много ми е притрябвало да гася огън. додето си имам тия гладки камъчета, които сега броя, що ми трябва да се паря с огън? — ще кажа на бивола да те изпие! — заканило се врабчето. — кажи му де! — биволчо, изпий реката! — кацнало врабчето върху единия рог на бивола. — как не — отвърнал биволът, — аз се напасох с такава росна трева, че ако сръбна и вода — ще ми се надуе коремът и ще се пукне. — тогава ще кажа на вълка да те изяде. — кажи му де! — рекъл биволът. врабчето отишло при вълка в гората. — вълчо — помолило го то, — ела да изядеш бивола. — какво приказваш — отговорил вълкът, — додето има такива крехки агънца, що ми трябва жилаво биволско месо! — ще кажа на овчаря да насъска кучетата и те ще ти разкъсат кожуха! — кажи му де! —

Contextual Information:
The chunk is a narrative segment from a fable about a sparrow trying to retrieve a lost bead from a thorny hedge. It details the sparrow's attempts to threaten various elements of nature (fire, river, buffalo, wolf, shepherd) to get the bead back, showcasing a chain of interactions among these characters. This section illustrates themes of resourcefulness and the interconnectedness of nature, ultimately leading to the resolution where the sparrow receives the bead.

Enhancing Information:
**brief summary of the story:** A gray sparrow loses a bead while stringing them on a fence and demands it back, threatening various elements of nature. Each entity refuses to comply, leading to a chain reaction of threats until a cat intervenes, causing a series of events that ultimately results in the sparrow retrieving its lost bead from the fence. 

**main characters:** The gray sparrow, the fence, the fire, the river, the buffalo, the wolf, the shepherd, the mice, and the cat.

**setting:** The story takes place in a natural environment featuring a fence, a fire, a river, and a forest.

**moral:** The story illustrates the futility of threats and the interconnectedness of nature, emphasizing that cooperation and understanding are more effective than intimidation.