### Simple Gen AI app with langchain and langsmith

In [1]:
import os
from dotenv import load_dotenv
load_dotenv()  # take environment variables from .env.

True

### scrap the data from a website

In [2]:
from langchain_community.document_loaders import WebBaseLoader

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [48]:


from langchain_community.document_loaders import WebBaseLoader

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/115.0 Safari/537.36"}

loader = WebBaseLoader(
    "https://www.gutenberg.org/files/84/84-h/84-h.htm",
    requests_kwargs={"headers": headers}
)

docs = loader.load()
docs

[Document(metadata={'source': 'https://www.gutenberg.org/files/84/84-h/84-h.htm', 'title': 'Frankenstein | Project Gutenberg', 'language': 'en'}, page_content='\n\n\n\n\nFrankenstein | Project Gutenberg\n\n\n\n*** START OF THE PROJECT GUTENBERG EBOOK 84 ***\nFrankenstein;\nor, the Modern Prometheus\nby Mary Wollstonecraft (Godwin) Shelley\n\n\nCONTENTS\n\n\n\nLetter 1\n\n\n\n\nLetter 2\n\n\n\n\nLetter 3\n\n\n\n\nLetter 4\n\n\n\n\nChapter 1\n\n\n\n\nChapter 2\n\n\n\n\nChapter 3\n\n\n\n\nChapter 4\n\n\n\n\nChapter 5\n\n\n\n\nChapter 6\n\n\n\n\nChapter 7\n\n\n\n\nChapter 8\n\n\n\n\nChapter 9\n\n\n\n\nChapter 10\n\n\n\n\nChapter 11\n\n\n\n\nChapter 12\n\n\n\n\nChapter 13\n\n\n\n\nChapter 14\n\n\n\n\nChapter 15\n\n\n\n\nChapter 16\n\n\n\n\nChapter 17\n\n\n\n\nChapter 18\n\n\n\n\nChapter 19\n\n\n\n\nChapter 20\n\n\n\n\nChapter 21\n\n\n\n\nChapter 22\n\n\n\n\nChapter 23\n\n\n\n\nChapter 24\n\n\n\n\n\nLetter 1\n\nTo Mrs. Saville, England.\n\n\r\nSt. Petersburgh, Dec. 11th, 17—.\r\n\n\r\nYou wi

In [None]:
from selenium import webdriver
from bs4 import BeautifulSoup
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader

class SeleniumWebLoader(BaseLoader):
    def __init__(self, url: str):
        self.url = url

    def load(self):
        # Start browser
        driver = webdriver.Chrome()   # make sure chromedriver is installed
        driver.get(self.url)

        # Extract HTML
        html = driver.page_source
        driver.quit()

        # Parse with BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")
        text = soup.get_text(separator="\n", strip=True)

        # Return as LangChain Document
        return [Document(page_content=text, metadata={"source": self.url})]


# -------------------------------
# ✅ Usage
# -------------------------------
url = "https://learnenglish.britishcouncil.org/general-english/story-zone/b2-c1-stories/bad-blood-b2/c1/"
loader = SeleniumWebLoader(url)
docs = loader.load()

print(docs[0].page_content[:500])   # show first 500 chars


In [63]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter  =   RecursiveCharacterTextSplitter(chunk_size=750, chunk_overlap=150)
splitted_docs_ori = splitter.split_documents(docs)
len(splitted_docs_ori)

823

In [64]:
splitted_docs = splitted_docs_ori[:33]
len(splitted_docs)

33

In [65]:
from langchain_community.embeddings import OllamaEmbeddings
embeddings = OllamaEmbeddings(model="mxbai-embed-large")

In [66]:
a = embeddings.embed_documents([d.page_content for d in splitted_docs])

In [67]:
len(a)

33

In [68]:
from langchain_community.vectorstores import Chroma

# Create a Chroma database and store docs
db = Chroma.from_documents(
    splitted_docs,
    embeddings,
    persist_directory="./chroma_store2"  # saves DB locally
)

# Save database to disk
db.persist()


In [71]:
# Reload the DB later
db = Chroma(
    persist_directory="./chroma_store2",
    embedding_function=embeddings
)

# Example query
query = " Who is the recipient of Robert Walton's first letter, and from what city is it sent?"
results = db.similarity_search(query, k=3)

for r in results:
    print(r.page_content[:300])
    print("++++++++++++++++++++++++++++")  # show first 300 chars


But to return to dearer considerations. Shall I meet you again, after having
traversed immense seas, and returned by the most southern cape of Africa or
America? I dare not expect such success, yet I cannot bear to look on the
reverse of the picture. Continue for the present to write to me by eve
++++++++++++++++++++++++++++
Your affectionate brother,
R. Walton



Letter 2

To Mrs. Saville, England.


Archangel, 28th March, 17—.


How slowly the time passes here, encompassed as I am by frost and snow! Yet a
second step is taken towards my enterprise. I have hired a vessel and am
occupied in collecting my sailors;
++++++++++++++++++++++++++++
Chapter 21




Chapter 22




Chapter 23




Chapter 24





Letter 1

To Mrs. Saville, England.


St. Petersburgh, Dec. 11th, 17—.


You will rejoice to hear that no disaster has accompanied the commencement of
an enterprise which you have regarded with such evil forebodings. I arrived
here ye
++++++++++++++++++++++++++++


In [78]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.llms import Ollama

# Retriever from Chroma
retriever = db.as_retriever(search_kwargs={"k": 5})

# Prompt with context slot
prompt = ChatPromptTemplate.from_messages([
    ("system",  '''You are a knowledgeable literature assistant. 
                Use ONLY the provided context (which comes from classic literature texts such as Project Gutenberg) 
                to answer the question. 
                If the context does not contain the answer, say 'I don't know based on the given text.' 
                Do not use outside knowledge or make assumptions.'''),
    ("human", "{question}")
])

# LLM
llm = Ollama(model="gemma3:4b")

# Parser
output_parser = StrOutputParser()

# Function to pull docs from retriever
def get_context(inputs: dict):
    docs = retriever.get_relevant_documents(inputs["question"])
    context = "\n\n".join(d.page_content for d in docs)
    return {"context": context, "question": inputs["question"]}

# Full chain = retriever -> prompt -> llm -> parser
rag_chain = get_context | prompt | llm | output_parser

# Test query
response = rag_chain.invoke({"question": "Who is the recipient of Robert Walton's first letter, and from what city is it sent?"})
print(response) 


I don't know based on the given text.
