In [2]:
import os
from dotenv import load_dotenv

# Load the environment variables from .env file
# NOTE(review): no cell visible in this notebook reads an environment
# variable afterwards — confirm this call (and the `os` import) is still needed.
load_dotenv()

# Name of the model handed to ChatOllama in the next cell.
# Alternatives that were previously assigned here: "llama3", "gemma:7b".
# Change the single value below to switch models; stacking reassignments
# leaves dead stores where only the last line takes effect.
MODEL = "stablelm2"

In [3]:
from langchain_community.llms import Ollama
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
# Chat model used by every chain in this notebook; its name comes from MODEL.
model = ChatOllama(model=MODEL)

# Embedding client used when the vector store is built. It is constructed
# with no arguments, so it relies on the library's default embedding model
# rather than MODEL — NOTE(review): confirm that is intended.
embeddings = OllamaEmbeddings()


In [4]:
# print(model.invoke("Tell me a joke")) # Completion model -> gemma:7b

In [5]:
from langchain_core.output_parsers import StrOutputParser
# String output parser applied to the model's result in every chain below.
output_parser = StrOutputParser()
# Minimal chain: the model's output is piped straight into the parser.
# `chain` is rebuilt with a prompt (and later a retriever) in later cells.
chain = model | output_parser
# print(chain.invoke("Tell me a joke"))

In [6]:
from langchain_community.document_loaders import PyPDFLoader

# Create a document loader for the report PDF using PyPDFLoader
# (the path is relative, so the file must sit in the working directory).
document_loader = PyPDFLoader('BTDA-Report.pdf')

# Load the PDF and split it into Document objects; the output below shows
# Documents tagged with 'source' and 'page' metadata.
# NOTE(review): later cells index `chunks` (the web page) rather than `pages`
# — confirm whether the PDF was meant to go into the vector store as well.
pages = document_loader.load_and_split()
pages

[Document(page_content='Brain Tumor Detection Assistant (BTDA)Project Report\nSudarshan R 21BAI1257Mukundh J 21BAI1133\nProject Report1', metadata={'source': 'BTDA-Report.pdf', 'page': 0}),
 Document(page_content='Table of Contents Literature Review………………………………………………………………………………3 Problem Statement…………………………………………………………………………..4 Dataset………………………………………………………………………………………..……5 Model...............................................................6 Hyperparameters..............................................7 Coding...............................................................8 Performance Improvement.................................9 Execution.......................................................... 10 Results.............................................................11 Conclusion........................................................13 References........................................................14 \nProject Report2', metadata={'source': 'BTDA-Report.pdf', 'page': 1}),
 Document(

In [7]:
# Report how many Document objects the PDF loader produced.
print("Number of pages:", len(pages))

Number of pages: 13


In [8]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter


def extract_text_with_selenium(url, driver_path='/usr/local/bin/chromedriver'):
    """Render ``url`` in headless Chrome and return its text as a Document.

    The page is loaded with Selenium, the resulting HTML is parsed with
    BeautifulSoup, and the extracted text (pieces stripped and joined with
    single spaces) is wrapped in a LangChain ``Document`` whose metadata
    records the source URL.

    Args:
        url: Address of the web page to fetch.
        driver_path: Filesystem path of the chromedriver executable. Defaults
            to the location this notebook originally hard-coded.

    Returns:
        A ``Document`` holding the page text, or ``None`` if any step fails.
        Failures are logged as warnings and never raised (best-effort).
    """
    import logging  # local import so the function carries its own dependency

    # Bind the name before the try block: if building the driver fails, the
    # finally clause would otherwise reference an unbound local and raise
    # UnboundLocalError instead of letting the function return None.
    driver = None
    try:
        # Configure Selenium WebDriver (adjust for your setup)
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        service = Service(executable_path=driver_path)
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # Load the webpage
        driver.get(url)

        # NOTE(review): an implicit wait applies to element lookups, and no
        # lookup follows here, so this probably does not delay reading
        # page_source. Kept for parity with the original; consider an
        # explicit wait if pages turn out to be captured half-rendered.
        driver.implicitly_wait(10)

        # Parse the page source (after JavaScript execution) and pull its text
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        text = soup.get_text(separator=' ', strip=True)

        # Wrap the text in a LangChain Document tagged with its origin
        return Document(page_content=text, metadata={"source": url})

    except Exception as exc:
        # Best-effort by design: report the failure and let the caller skip
        # this URL rather than aborting the whole run.
        logging.getLogger(__name__).warning(
            "Error extracting text from the webpage %s: %s", url, exc
        )
        return None

    finally:
        # Close the browser only if one was actually started
        if driver is not None:
            driver.quit()

# "https://medium.com/@deepanshut041/introduction-to-surf-speeded-up-robust-features-c7396d6e7c4e", "https://www.freecodecamp.org/news/beginners-guide-to-langchain/"
# Example usage: the list of pages to fetch and index
urls = [
    "https://medium.com/@sakhujasaiyam/dimensionality-reduction-using-haar-wavelet-transform-a1678c5dc6e2",
]

# Fetch each page, skipping any URL whose extraction returned nothing
documents = []
for url in urls:
    document = extract_text_with_selenium(url)
    if not document:
        continue
    documents.append(document)


# Cut the fetched documents into chunks of up to 1000 characters, with
# 200 characters of overlap between neighbouring chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)
chunks = text_splitter.split_documents(documents)
chunks

[Document(page_content='Dimensionality Reduction using Haar Wavelet Transform: Theory & Implementation | by Saiyam Sakhuja | Apr, 2024 | Medium Open in app Sign up Sign in Write Sign up Sign in Dimensionality Reduction using Haar Wavelet Transform: Theory & Implementation Saiyam Sakhuja · Follow 4 min read · Apr 18, 2024 -- Listen Share Dimensionality reduction is a crucial technique in machine learning and data analysis, aimed at reducing the number of features or variables in a dataset while preserving its essential information. One powerful approach to dimensionality reduction is through the use of wavelet transforms, which decompose signals into localized frequency components. In this blog post, we’ll explore how to implement Haar Wavelet Transform, a simple yet effective wavelet transform, for dimensionality reduction in Python. Understanding Haar Wavelet Transform Introducing Haar Wavelet: Haar Wavelet is the simplest form of wavelet transform, named after mathematician Alfréd Ha

In [9]:
from langchain.prompts import PromptTemplate

# Prompt for grading whether a retrieved document is relevant to a question.
# NOTE: `template` is reassigned in the following cell, so this text is not
# the one passed to PromptTemplate.from_template() later on.
template = """

You are a grader assessing relevance of a retrieved document to a user question. If the document contains keywords related to the user question,
grade it as relevant. It does not need to be a stringent test. The goal is to filter out erroneous retrievals.
Give a binary score 'yes' or 'no' to indicate whether the document is relevant to the question.
Provide the binary score as a JSON with a single key 'score' and no preamble or explanation.

Context: {context}

Question: {question}

"""

In [10]:
# Prompt for the answering chain: the model receives the retrieved context
# plus the user's question and is told to answer briefly.
template = """


You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.
Use three sentences maximum and keep the answer concise. Keep in mind that the goal is to provide a quick and accurate answer to the user's question.

Context: {context}

Question: {question}

"""




In [11]:
# Build a PromptTemplate from the current `template` string; its {context}
# and {question} placeholders become the inputs the prompt expects.
prompt = PromptTemplate.from_template(template)
# print(prompt.format(context = "The quick brown fox jumps over the lazy dog.", question = "What does the fox jump over?"))

In [12]:
# Prompt -> model -> string parser. This replaces the earlier prompt-less
# `chain`; the retrieval cell further down rebuilds `chain` once more.
chain = prompt | model | output_parser


In [13]:
# print(chain.invoke({
#     "context": "The quick brown fox jumps over the lazy dog.",
#     "question": "What does the fox jump over?"
# }))

In [14]:
# Inspect the inputs the chain expects; the output below lists the
# 'context' and 'question' string properties.
chain.input_schema.schema()

{'title': 'PromptInput',
 'type': 'object',
 'properties': {'context': {'title': 'Context', 'type': 'string'},
  'question': {'title': 'Question', 'type': 'string'}}}

In [15]:
# Sanity check: the PDF pages and the web-page chunks are the same kind of
# container, so either could be handed to the vector store.
type(pages) is type(chunks)

True

In [16]:
from langchain_community.vectorstores import DocArrayInMemorySearch

# Embed the web-page chunks and index them in the "rag-chroma" collection.
# Alternative store kept for reference:
#   vectorstore = DocArrayInMemorySearch.from_documents(chunks, embedding=embeddings)
# NOTE(review): re-running this cell in the same kernel may add the same
# chunks to the collection again — confirm and reset the collection if so.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    collection_name="rag-chroma",
)

In [17]:
# Expose the vector store as a retriever, then smoke-test it with a sample
# query; the matching chunks are shown as the cell output.
retriever = vectorstore.as_retriever()
retriever.invoke("Haar wavelet")

[Document(page_content='Dimensionality Reduction using Haar Wavelet Transform: Theory & Implementation | by Saiyam Sakhuja | Apr, 2024 | Medium Open in app Sign up Sign in Write Sign up Sign in Dimensionality Reduction using Haar Wavelet Transform: Theory & Implementation Saiyam Sakhuja · Follow 4 min read · Apr 18, 2024 -- Listen Share Dimensionality reduction is a crucial technique in machine learning and data analysis, aimed at reducing the number of features or variables in a dataset while preserving its essential information. One powerful approach to dimensionality reduction is through the use of wavelet transforms, which decompose signals into localized frequency components. In this blog post, we’ll explore how to implement Haar Wavelet Transform, a simple yet effective wavelet transform, for dimensionality reduction in Python. Understanding Haar Wavelet Transform Introducing Haar Wavelet: Haar Wavelet is the simplest form of wavelet transform, named after mathematician Alfréd Ha

In [19]:
from operator import itemgetter
# Retrieval chain: the incoming dict's "question" is sent to the retriever to
# fill {context} and is also passed through unchanged as {question}; the
# filled prompt then goes to the model and the string parser.
chain = (
    {
        "context": itemgetter("question") | retriever,
        "question": itemgetter("question"),
    }
    | prompt
    | model
    | output_parser
)

print(chain.invoke({"question": "What is a haar wavelet?"}))

A Haar Wavelet Transform is a powerful tool for dimensionality reduction and signal processing tasks in Python, commonly used in applications such. It applies the Haar Wavelet Transform to capture local features and sharp transitions in data, making it well-suited for analyzing complex signals and images. By selecting a subset of the approximation coefficients obtained from the Haar Wavelet Transform, we can effectively reduce the dimensionality of datasets while preserving important information and patterns.
