In [19]:
import os
from dotenv import load_dotenv

# Merge variables from a local .env file into the process environment, then
# look up the OpenAI key (None when the variable is not defined).
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")


In [6]:
# Setup Embeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_postgres import PGVector

# Hugging Face model identifier of the embedding model.
modelPath = "Alibaba-NLP/gte-base-en-v1.5"

# Options handed to the model when it is loaded: run on the CPU and allow the
# model repository's own code to be executed (trust_remote_code).
model_kwargs = dict(device="cpu", trust_remote_code=True)

# Options handed to the encoder on every embedding call.
encode_kwargs = dict(normalize_embeddings=True)

hugging_face_embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

import os

# Connection URL for the Postgres/pgvector database. It is taken from the
# PGVECTOR_CONNECTION environment variable (which the .env file read by
# load_dotenv() can supply) so that real credentials never have to be typed
# into the notebook. The fallback is the local development database this
# notebook was originally written against; do not reuse it anywhere else.
PGVECTOR_CONNECTION = os.getenv(
    "PGVECTOR_CONNECTION",
    "postgresql+psycopg://postgres:admin@localhost:5432/fyp",
)

# Vector store backed by the "test_docs" collection; documents are embedded
# with the Hugging Face model configured above.
vectorstore = PGVector(
    embeddings=hugging_face_embeddings,
    collection_name="test_docs",
    connection=PGVECTOR_CONNECTION,
)

In [3]:
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Parse only the elements whose CSS class marks the post title, header or
# content; everything else in the HTML page is ignored.
bs4_strainer = bs4.SoupStrainer(
    class_=("post-title", "post-header", "post-content"),
)

# Fetch the page served locally on port 5500 and parse it with the strainer.
loader = WebBaseLoader(
    web_paths=("http://127.0.0.1:5500/test.html",),
    bs_kwargs=dict(parse_only=bs4_strainer),
)
docs = loader.load()

# Cut the loaded documents into chunks (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)


import uuid

# Give every chunk a stable ID derived from the collection name, the chunk's
# source URL and its position in `splits`. Without explicit IDs each run of
# this cell stores the same chunks again under new random IDs, so the
# collection fills up with duplicates; with stable IDs a re-run overwrites the
# rows written by the previous run instead.
chunk_ids = [
    str(
        uuid.uuid5(
            uuid.NAMESPACE_URL,
            f"{vectorstore.collection_name}|{chunk.metadata.get('source', '')}#{position}",
        )
    )
    for position, chunk in enumerate(splits)
]

vectorstore.add_documents(splits, ids=chunk_ids)


USER_AGENT environment variable not set, consider setting it to identify your requests.


ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=5500): Max retries exceeded with url: /test.html (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001A3014B8B90>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

In [8]:
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_core.callbacks import CallbackManager
from langchain import hub
from langchain_community.llms import Ollama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Retrieve and generate using the relevant snippets of the blog.
# The retriever is obtained from the vector store; the prompt template is
# pulled from the LangChain Hub under the name "rlm/rag-prompt".
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")

# Ollama model "llama3.1:8b". The stdout handler is passed through
# `callbacks`; the `callback_manager` argument used previously is deprecated
# in LangChain and only forwarded to `callbacks` with a warning.
llm = Ollama(
    model="llama3.1:8b",
    callbacks=[StreamingStdOutCallbackHandler()],
)
def format_docs(docs):
    """Merge documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line. An empty iterable yields an empty string.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


# First stage of the chain: the incoming question is sent to the retriever,
# whose documents are joined by format_docs into "context", and is also
# forwarded unchanged as "question".
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Inputs -> prompt -> LLM -> string output parser.
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

rag_chain.invoke("University of Nottingham Malaysia offers what courses?")

In [32]:
# Cleanup: ask the vector store to delete its collection ("test_docs", as set
# when `vectorstore` was created). This is destructive, so run it deliberately
# rather than as part of a normal top-to-bottom run.
vectorstore.delete_collection()

Collection not found
