In [1]:
!pip install langchain langchain-community llama-cpp-python sentence-transformers faiss-cpu beautifulsoup4
print("--- Instalation completed ---")

--- Instalowanie zależności (ok. 2-3 min) ---
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.16.tar.gz (50.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
INFO: pip is looking at multiple versions of langchain-community to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-community
  Downloading langchain_community-0.4-py3-none-any.whl.metadata (3.0 kB)
  Downloading langchai

In [2]:
print("--- Downloading Phi-3 (2.2GB)... ---")
!wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf
print("--- Model downloaded ---")

--- Pobieranie modelu Phi-3 (2.2GB)... ---
--2025-10-29 22:23:01--  https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf
Resolving huggingface.co (huggingface.co)... 3.165.160.11, 3.165.160.61, 3.165.160.12, ...
Connecting to huggingface.co (huggingface.co)|3.165.160.11|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cas-bridge.xethub.hf.co/xet-bridge-us/662698108f7573e6a6478546/df220524a4e4a750fe1c325e41f09ff69137f38b52d8831ba22dcbee3cc8ab6d?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20251029%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20251029T222301Z&X-Amz-Expires=3600&X-Amz-Signature=1e3b68ab00eaeee37b01003c84f43cefffaaf7d5f7ef3b4e404100b2b567ad9a&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27Phi-3-mini-4k-instruct-q4.gguf%3B+filename%3D%22Phi-3-mini-4k-instruct-q4.gguf%22%3B&x-id=G

In [3]:

import os
from langchain.vectorstores import FAISS

from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import LlamaCpp




--- Biblioteki zaimportowane ---


In [8]:

# Web pages that form the knowledge base of the RAG pipeline.
# The order is kept as-is because it is the order in which pages are loaded and indexed.
URLS = [
    "https://batman.fandom.com/wiki/Bruce_Wayne",
    "https://dc.fandom.com/wiki/Batman_(Bruce_Wayne)",
    "https://en.wikipedia.org/wiki/Batman",
    "https://www.dc.com/characters/batman",
    "https://dc.fandom.com/wiki/Bruce_Wayne_(Absolute_Universe)",
    "https://graemesliterarytimemachine.wordpress.com/2015/10/27/the-complete-history-of-bruce-wayne-a-study-of-before-he-was-batman-1939-1956-1980-1987-1989-1990-1993-2003-2004-2008-2011-2012/",
    "https://dc.fandom.com/wiki/Bruce_Wayne_(New_Earth)",
    "https://dc.fandom.com/wiki/Dick_Grayson",
    "https://dc.fandom.com/wiki/Damian_Wayne",
    "https://mysupersuit.com/en/blogs/story/batman-everything-you-ignore-about-heros-dc?srsltid=AfmBOoqakLSI3_88Uj5-3-mKIAopOtvDeiqtc-7NwA_t9p2JI0wuKhCT",
    "https://www.dc.com/blog/2020/11/04/five-batman-facts-you-think-are-true-but-arent",
]

# Hugging Face identifier of the sentence-embedding model used to build the index.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Local file name of the GGUF model fetched by the download cell.
LLM_MODEL_FILE = "Phi-3-mini-4k-instruct-q4.gguf"

def load_and_process_webpages(urls, model_name, chunk_size=1500, chunk_overlap=200):
    """Download web pages, split them into chunks and index the chunks in FAISS.

    Args:
        urls: URL or sequence of URLs handed to ``WebBaseLoader``.
        model_name: Name of the embedding model handed to ``HuggingFaceEmbeddings``.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``
            (default 1500, the value previously hard-coded).
        chunk_overlap: ``chunk_overlap`` passed to the splitter
            (default 200, the value previously hard-coded).

    Returns:
        The vector store produced by ``FAISS.from_documents`` for the chunks.

    Raises:
        ValueError: If ``urls`` is empty, or if the pages yield no chunks at all.
    """
    # Fail early with a clear message instead of an obscure error deep in the pipeline.
    if not urls:
        raise ValueError("No URLs given: at least one page is needed to build the index.")

    loader = WebBaseLoader(urls)
    documents = loader.load()
    print("Loaded web content")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    split_documents = text_splitter.split_documents(documents)
    print(f"Documents splitted into {len(split_documents)} fragmeents.")

    # An index cannot be built from nothing; report that here rather than
    # letting the vector store constructor fail on an empty list.
    if not split_documents:
        raise ValueError("The loaded pages produced no text chunks; nothing to index.")

    print(f"Downloading model embeddings: {model_name}...")
    embeddings_model = HuggingFaceEmbeddings(model_name=model_name)

    vector_store = FAISS.from_documents(split_documents, embeddings_model)
    return vector_store

def get_llm_model(model_path, n_ctx=4096, n_batch=512, n_gpu_layers=-1):
    """Load a local GGUF model through the ``LlamaCpp`` wrapper.

    Args:
        model_path: Path of the model file on disk.
        n_ctx: ``n_ctx`` forwarded to ``LlamaCpp`` (default 4096, as before).
        n_batch: ``n_batch`` forwarded to ``LlamaCpp`` (default 512, as before).
        n_gpu_layers: ``n_gpu_layers`` forwarded to ``LlamaCpp`` (default -1, as
            before; presumably "offload every layer" - confirm against the
            llama-cpp-python documentation).

    Returns:
        The constructed ``LlamaCpp`` instance.

    Raises:
        FileNotFoundError: If ``model_path`` does not point to an existing file.
    """
    # Fail fast with an actionable message instead of an error from inside the loader.
    if not os.path.isfile(model_path):
        raise FileNotFoundError(
            f"LLM model file not found: {model_path!r}. Run the download cell first."
        )

    print(f"\n--- Loading LLM model: {model_path} ---")

    llm = LlamaCpp(
        model_path=model_path,
        n_ctx=n_ctx,
        n_batch=n_batch,
        verbose=False,
        n_gpu_layers=n_gpu_layers,
    )

    print("--- Model loaded ---")
    return llm

print("--- done ---")

--- Funkcje zdefiniowane ---


In [20]:

# Assemble the RAG chain. The vector store and the LLM are expensive to build,
# so objects already present in the kernel namespace are reused on re-runs.
if not os.path.exists(LLM_MODEL_FILE):
    print("Error: LLM model file not found. Run Cell 2.")
else:
    # At cell (module) level the global namespace is the one being inspected.
    if "vector_store" not in globals():
        vector_store = load_and_process_webpages(URLS, EMBEDDING_MODEL_NAME)

    if "llm" not in globals():
        llm = get_llm_model(LLM_MODEL_FILE)

    # Hand the 5 best-matching chunks to the prompt for every question.
    retriever = vector_store.as_retriever(search_kwargs=dict(k=5))

    # Prompt wrapped in the <|user|> / <|end|> / <|assistant|> chat markers.
    template = """
<|user|>
You are an expert on the DC Comics universe.
Use the following context snippets to answer the user's question.
Answer in English, comprehensively but concisely. List all relevant details from the context, such as multiple people or examples, if available. The question is about DC universe.
If you don't know the answer based on the context, say "I don't know this based on my data".

Context:
{context}

Question: {input}
<|end|>
<|assistant|>
"""
    rag_prompt = PromptTemplate.from_template(template)

    # Retrieved documents are stuffed into {context}; the question fills {input}.
    document_chain = create_stuff_documents_chain(llm, rag_prompt)
    retrieval_chain = create_retrieval_chain(retriever, document_chain)

    print(
        "\n-------- WE ARE READY! --------",
        "Ready for your questions in the next cell!",
        sep="\n",
    )


-------- WE ARE READY! --------
Ready for your questions in the next cell!


In [17]:

user_question = "Who invented batman?" # <-- ENTER QUESTIONS HERE

print(
    f"Your question: {user_question}",
    "Searching for an answer... (this may take a little)",
    sep="\n",
)

# Run retrieval + generation; the result is indexed by "answer" below
# (and by "context" in the optional sources snippet).
response = retrieval_chain.invoke({"input": user_question})

print("\n--------- ANSWER ----------")
# Keep only the text before the first "<|end|>" marker, if the model emitted one.
clean_answer = response["answer"].partition("<|end|>")[0].strip()
print(clean_answer)

# Optional: uncomment to list the retrieved chunks the answer was based on.
# print("\n--- Sources (Context) ---")
# for i, doc in enumerate(response["context"]):
#     print(f"\nSource {i+1} (from page: {doc.metadata.get('source', 'N/A')}):")
#     print(f"...{doc.page_content}...")

Your question: Who invented batman?
Searching for an answer... (this may take 10-30 seconds)

--- ANSWER ---
Batman, a character in the DC Comics universe, was created by artist Bob Kane and writer Bill Finger. His first appearance is in Detective Comics #27 published on March 30, 1939.
