In [14]:
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import JSONLoader
from langchain.storage import InMemoryStore
from langchain_core.documents import Document

import os
from dotenv import load_dotenv

In [20]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.storage import InMemoryStore
from langchain_core.documents import Document

import os
from dotenv import load_dotenv

# Load key/value pairs from a local .env file into the process environment.
# GOOGLE_API_KEY must be defined there (or already exported in the shell);
# presumably it is what GoogleGenerativeAIEmbeddings authenticates with — confirm
# against the langchain-google-genai docs for the installed version.
load_dotenv()

def read_file_with_fallback_encoding(file_path):
    """Read a text file, trying encodings from strictest to most permissive.

    Order of attempts:
      1. ``utf-16``    - only when the file starts with a UTF-16 byte-order mark.
      2. ``utf-8-sig`` - UTF-8 (and therefore ASCII), dropping an optional BOM.
      3. ``cp1252``    - Windows Western European text (curly quotes, dashes...).
      4. ``latin-1``   - last resort; maps every byte to a code point.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file content as ``str``.

    Raises:
        ValueError: If none of the encodings can decode the file.
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # Sniff the first two bytes. UTF-16 is attempted only when a BOM is present:
    # without one, almost any even-length byte string "decodes" into garbage.
    with open(file_path, 'rb') as raw_file:
        has_utf16_bom = raw_file.read(2) in (b'\xff\xfe', b'\xfe\xff')

    # 'latin-1' never raises UnicodeDecodeError, so it must come last; any
    # encoding listed after it would be unreachable.
    encodings = ['utf-8-sig', 'cp1252', 'latin-1']
    if has_utf16_bom:
        encodings.insert(0, 'utf-16')

    for encoding in encodings:
        try:
            # Text mode is kept so newline translation matches a plain open().
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError:
            continue
    # Defensive: kept so callers that catch ValueError keep working.
    raise ValueError(f"Unable to read the file {file_path} with any of the attempted encodings")

def load_rag_content_from_text(
    text_file_path,
    chunk_size=1000,
    chunk_overlap=200,
    embedding_model="models/gemini-embedding-001",
    k=5,
):
    """Build a similarity-search retriever over the contents of a text file.

    The file is read, wrapped in a single ``Document``, split into overlapping
    chunks, embedded with Google Generative AI embeddings and indexed in a
    Chroma vector store.

    Args:
        text_file_path: Path of the text file to index (``str`` or path-like).
        chunk_size: Maximum chunk length, measured in characters (``len``).
        chunk_overlap: Number of characters shared between adjacent chunks.
        embedding_model: Name of the Google embedding model to use. Must be an
            embedding model; chat/generation models such as
            ``gemini-1.5-flash`` cannot produce embeddings.
        k: Number of chunks the retriever returns per query.

    Returns:
        A retriever over the indexed chunks, or ``None`` if any step failed
        (the error is printed rather than raised).
    """
    try:
        # Read the file content
        file_content = read_file_with_fallback_encoding(text_file_path)

        # Wrap it in a Document; the source is stored as a plain string so that
        # path-like inputs do not end up as non-primitive metadata values.
        document = Document(
            page_content=file_content,
            metadata={"source": str(text_file_path)},
        )

        # Split the document into overlapping character-based chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )
        splits = text_splitter.split_documents([document])

        # An empty or whitespace-only file yields nothing to index; report that
        # clearly instead of relying on whatever the vector store does with [].
        if not splits:
            raise ValueError("no text chunks were produced (is the file empty?)")

        # Initialize the embedding function with a dedicated embedding model
        embedding_function = GoogleGenerativeAIEmbeddings(model=embedding_model)

        # Embed the chunks and index them in a Chroma vector store
        vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_function)

        # Expose the store as a retriever returning the top-k matches
        return vectorstore.as_retriever(search_kwargs={"k": k})
    except Exception as e:
        # Deliberate best-effort: callers check for None instead of catching.
        print(f"Error processing file {text_file_path}: {str(e)}")
        return None

# Usage
# The path is relative to the notebook's working directory.
text_file_path = "rag_string.txt"
# `retriever` is None when loading/indexing failed (the error is printed by
# load_rag_content_from_text); later cells must check for that.
retriever = load_rag_content_from_text(text_file_path)

Error processing file rag_string.txt: Could not import chromadb python package. Please install it with `pip install chromadb`.


In [None]:
# Smoke-test the retriever built earlier in the notebook. `retriever` is None
# when its construction failed for any reason (missing package, bad API key,
# unreadable file, ...), not only because of a file reading problem.
if retriever is not None:
    query = "What is the main topic of the content?"
    # `invoke` is the current retriever entry point; `get_relevant_documents`
    # is deprecated in langchain-core.
    results = retriever.invoke(query)

    for doc in results:
        # Show only a short preview of each retrieved chunk.
        print(f"Content: {doc.page_content[:100]}...")
        print(f"Metadata: {doc.metadata}")
        print("-" * 50)
else:
    print("Failed to create retriever; see the error message printed when it was built.")