<a href="https://colab.research.google.com/github/vlad-pirvu/LLM_testing/blob/main/Guild_Wars_Ancient_Library.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence-transformers faiss-cpu gradio beautifulsoup4 requests

In [18]:
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import gradio as gr
import time

In [19]:
BASE_URL = "https://wiki.guildwars.com"
MAIN_PAGE = "https://wiki.guildwars.com/wiki/Main_Page"

def get_links_from_main():
    res = requests.get(MAIN_PAGE)
    soup = BeautifulSoup(res.text, 'html.parser')

    # Find relevant internal links in the main content area
    content = soup.find(id="mw-content-text")
    links = content.find_all('a', href=True)

    urls = set()
    for a in links:
        href = a['href']
        if href.startswith("/wiki/") and not ":" in href and len(href) > 6:
            full_url = BASE_URL + href
            urls.add(full_url)

    return list(urls)

urls = get_links_from_main()
print(f"Found {len(urls)} article URLs")


Found 149 article URLs


In [20]:
def scrape_text_from_url(url):
    try:
        res = requests.get(url)
        soup = BeautifulSoup(res.text, 'html.parser')
        content = soup.find(id="mw-content-text")
        if content:
            paragraphs = content.find_all('p')
            text = "\n".join(p.get_text() for p in paragraphs)
            return text.strip()
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
    return ""

# Scrape texts with a small delay to be polite to server
texts = []
for i, url in enumerate(urls[:30]):  # limit to first 30 for speed & API friendliness
    print(f"Scraping {i+1}/{len(urls)}: {url}")
    text = scrape_text_from_url(url)
    if text:
        texts.append((url, text))
    time.sleep(1)  # polite delay


Scraping 1/149: https://wiki.guildwars.com/wiki/Storyline
Scraping 2/149: https://wiki.guildwars.com/wiki/News
Scraping 3/149: https://wiki.guildwars.com/wiki/Storybook
Scraping 4/149: https://wiki.guildwars.com/wiki/Game_mechanics
Scraping 5/149: https://wiki.guildwars.com/wiki/NPC_service
Scraping 6/149: https://wiki.guildwars.com/wiki/Guide_to_hard_mode
Scraping 7/149: https://wiki.guildwars.com/wiki/Guide_to_character_creation
Scraping 8/149: https://wiki.guildwars.com/wiki/Campaign_travel
Scraping 9/149: https://wiki.guildwars.com/wiki/Guild_hall
Scraping 10/149: https://wiki.guildwars.com/wiki/Build
Scraping 11/149: https://wiki.guildwars.com/wiki/Zaishen_Vanquish
Scraping 12/149: https://wiki.guildwars.com/wiki/Armorer
Scraping 13/149: https://wiki.guildwars.com/wiki/Unique_item
Scraping 14/149: https://wiki.guildwars.com/wiki/Emote
Scraping 15/149: https://wiki.guildwars.com/wiki/Skill
Scraping 16/149: https://wiki.guildwars.com/wiki/Collector
Scraping 17/149: https://wiki.guil

In [21]:
def build_index(texts, model_name='all-MiniLM-L6-v2'):
    model = SentenceTransformer(model_name)
    chunks = []
    meta = []
    for url, text in texts:
        # Split text into overlapping chunks of ~500 characters
        for i in range(0, len(text), 400):
            chunk = text[i:i+500]
            chunks.append(chunk)
            meta.append(url)
    embeddings = model.encode(chunks, show_progress_bar=True)
    embeddings = np.array(embeddings).astype('float32')
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return model, index, chunks, meta

model, index, chunks, meta = build_index(texts)
print(f"Indexed {len(chunks)} chunks of lore text")


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Indexed 345 chunks of lore text


In [22]:
def search_guild_wars(query, model, index, chunks, meta, top_k=3):
    query_embedding = model.encode([query])
    query_embedding = np.array(query_embedding).astype('float32')
    D, I = index.search(query_embedding, top_k)
    results = []
    for i in I[0]:
        results.append(f"🔗 {meta[i]}:\n{chunks[i]}")
    return "\n\n---\n\n".join(results)


In [23]:
def safe_search(q):
    try:
        return search_guild_wars(q, model, index, chunks, meta)
    except Exception as e:
        return f"⚠️ Error during search:\n{e}"

with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
    gr.Markdown("# 📜 Guild Wars Lore Retriever 📜\nAsk any question about Guild Wars lore!")
    query_input = gr.Textbox(label="Your question")
    output_text = gr.Textbox(label="Lore Answer", lines=10)
    search_button = gr.Button("🔮 Search Lore")

    search_button.click(fn=safe_search, inputs=query_input, outputs=output_text)

demo.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c93edb0c261f22d7d9.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


