In [62]:
# !pip install langchain
# !pip install langchain-community
# !pip install langchain-experimental
# !pip install PyPDF2
# !pip install langchain_groq

In [53]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from rich import print
import PyPDF2
import requests
from bs4 import BeautifulSoup
from google.colab import userdata

In [54]:
# Embedding Model
embedding_model = HuggingFaceInferenceAPIEmbeddings(api_key =userdata.get('HF_TOKEN'),model_name="BAAI/bge-base-en-v1.5")

# Defining the text for reference

In [11]:
def get_content(url:str):

  # Send a GET request to the webpage
  response = requests.get(url)

  # Check if the request was successful
  if response.status_code == 200:
      # Parse the HTML content using BeautifulSoup
      soup = BeautifulSoup(response.text, "html.parser")

      # Extract the main content (text)
      # This site has the main text within <font> tags
      content = soup.find_all("font")

      # Combine and clean the extracted text
      full_text = "\n".join([element.get_text() for element in content])

      return full_text
  else:
      print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

url = "https://paulgraham.com/persistence.html"
# essay = get_content(url)
# print(essay)

The above code is just for your reference that if you want to explore the online blogs then you can simply pass the url and it will be automatically extracted.

In [31]:
# Open the PDF file in binary mode
with open("/content/RAG.pdf", "rb") as file:
    reader = PyPDF2.PdfReader(file)
    content = ""

    # Calculate the number of pages to read (excluding the last 3 pages)
    total_pages = len(reader.pages)
    pages_to_read = total_pages - 5

    # Extract text from the pages, excluding the last 3
    for page_num in range(pages_to_read):
        content += reader.pages[page_num].extract_text()

## Technqiue 1 : Character Text Splitter

In [43]:
text_splitter = CharacterTextSplitter(chunk_size = 1000, chunk_overlap=200, separator='', strip_whitespace=True)
docs_technique1 = text_splitter.create_documents([content])

In [48]:
print("Total documents created:",len(docs_technique1))
print(docs_technique1[1])

## Technique 2 : Recursive text splitter

In [46]:
text_splitter_recursive = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, separators=["\n\n", "\n", " ", ""])
docs_technique2 = text_splitter_recursive.create_documents([content])

In [49]:
print("Total documents created:",len(docs_technique2))
print(docs_technique2[1])

## Semantic Splitting

In [56]:
%%time
text_splitter_semantic = SemanticChunker(embedding_model)
docs_technique3 = text_splitter_semantic.create_documents([content])

CPU times: user 364 ms, sys: 73.5 ms, total: 438 ms
Wall time: 1.43 s


In [57]:
print("Total documents created:",len(docs_technique3))
print(docs_technique3[1])

## Agentic Spliting

In [64]:
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_community.chat_models import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.chains import create_extraction_chain
from typing import Optional, List
from langchain.chains import create_extraction_chain_pydantic
from langchain_core.pydantic_v1 import BaseModel
from langchain import hub
from langchain_groq import ChatGroq
import os

In [65]:
# Loading the LLM model
os.environ["GROQ_API_KEY"] = userdata.get('GROQ_API_KEY')
model = ChatGroq(model="llama3-8b-8192")

In [60]:
# Pulling out propositions is done via a well-crafted prompt. Let's pull it from LangHub, LangChain's home for prompts.
obj = hub.pull("wfh/proposal-indexing",api_key = userdata.get('LANGSMITH'))

The output from a runnable is a json-esque structure in a string. We need to pull the sentences out. I found that LangChain's example extraction was giving me a hard time so I'm doing it manually with a pydantic data class. There is definitely room to improve this.

Create your class then put it in an extraction chain.

In [66]:
# use it in a runnable
runnable = obj | model

# Pydantic data class
class Sentences(BaseModel):
    sentences: List[str]

# Extraction
extraction_chain = create_extraction_chain_pydantic(pydantic_schema=Sentences, llm=model)

In [67]:
def get_propositions(text):
    runnable_output = runnable.invoke({
    	"input": text
    }).content

    propositions = extraction_chain.run(runnable_output)[0].sentences
    return propositions

In [70]:
test_content = docs_technique2[2].page_content

In [72]:
results = get_propositions(test_content)

BadRequestError: Error code: 400 - {'error': {'message': "Failed to call a function. Please adjust your prompt. See 'failed_generation' for more details.", 'type': 'invalid_request_error', 'code': 'tool_use_failed', 'failed_generation': 'Here is the extracted information in JSON format:\n\n<tool-use>\n{\n  "tool_calls": [\n    {\n      "id": "pending",\n      "type": "function",\n      "function": {\n        "name": "information_extraction",\n        "parameters": {\n          "info": {\n            "sentences": [\n              "LARGE language models have achieved remarkable success, though they still face significant limitations.",\n              "LARGE language models face significant limitations, especially in domain-specific or knowledge-intensive tasks.",\n              "Domain-specific or knowledge-intensive tasks are challenging for LARGE language models.",\n              "LARGE language models produce \'hallucinations\' when handling queries beyond their training data or requiring current information.",\n              "Retrieval-Augmented Generation (RAG) enhances LARGE language models by retrieving relevant document chunks from external knowledge base through semantic similarity calculation.",\n              "RAG effectively reduces the problem of generating factually incorrect content by referencing external knowledge.",\n              "The integration of RAG into LARGE language models has resulted in widespread adoption.",\n              "Index Terms —Large language model, retrieval-augmented generation, natural language processing, information retrieval"\n            ]\n          }\n        }\n      }\n    }\n  ]\n}\n</tool-use>'}}