Web scraping all of the nearly 18,000 UTS webpages listed in the site's sitemap.

In [None]:
# Install dependencies
!pip install -q beautifulsoup4 requests

import requests
from bs4 import BeautifulSoup
import time
import pickle
import os
from langchain.docstore.document import Document

# STEP 1: Get all URLs from UTS sitemap
def get_urls_from_sitemap(sitemap_url, timeout=30):
    """Download an XML sitemap and return the URLs in its ``<loc>`` elements.

    Args:
        sitemap_url: Address of the XML sitemap to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of URL strings in document order, with surrounding
        whitespace removed and empty entries dropped.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    response = requests.get(sitemap_url, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error page into an
    # empty URL list, which would make the whole scrape a silent no-op.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "xml")
    # NOTE(review): if the sitemap is a sitemap *index*, these <loc> entries
    # are child sitemaps rather than pages — confirm against the live file.
    urls = (loc.text.strip() for loc in soup.find_all("loc"))
    return [url for url in urls if url]

# STEP 2: Scrape and save each batch
def scrape_and_save_batch(urls, batch_num, batch_size=1000, output_dir="/content"):
    """Scrape one batch of URLs and pickle the extracted documents.

    The batch covers ``urls[batch_num * batch_size : (batch_num + 1) * batch_size]``
    (clipped to the end of the list). Each page is fetched, stripped of
    ``script``/``style``/``noscript`` tags, and kept only if it yields more
    than 100 characters of text.

    Args:
        urls: Full list of URLs; the batch is sliced out of it.
        batch_num: Zero-based index of the batch to process.
        batch_size: Number of URLs per batch.
        output_dir: Directory that receives ``web_docs_batch_<n>.pkl``.

    Returns:
        None. The result is the pickle file written to ``output_dir``.
    """
    start = batch_num * batch_size
    end = min(start + batch_size, len(urls))
    docs = []

    for position, url in enumerate(urls[start:end], start=start + 1):
        try:
            res = requests.get(url, timeout=5)
            # Treat 4xx/5xx as failures so error pages are not indexed as content.
            res.raise_for_status()
            soup = BeautifulSoup(res.content, "html.parser")
            for tag in soup(["script", "style", "noscript"]):
                tag.decompose()
            text = soup.get_text(separator="\n", strip=True)
            if len(text) > 100:
                docs.append(Document(page_content=text, metadata={"source": url}))
                print(f"[{position}] ✅ {url}")
            else:
                # Report skipped pages as skipped rather than as successes.
                print(f"[{position}] ⚠️ {url} (skipped: too little text)")
        except Exception as e:
            # Deliberately best-effort: one bad page must not abort the batch.
            print(f"[{position}] ❌ {url} ({e})")
        time.sleep(0.3)  # be polite to the server

    # Save batch
    os.makedirs(output_dir, exist_ok=True)
    batch_file = os.path.join(output_dir, f"web_docs_batch_{batch_num}.pkl")
    # Write to a temporary name and rename afterwards: the resume logic treats
    # the mere existence of the batch file as "done", so a half-written file
    # left by an interrupted run must never appear under the final name.
    tmp_file = batch_file + ".tmp"
    with open(tmp_file, "wb") as f:
        pickle.dump(docs, f)
    os.replace(tmp_file, batch_file)
    print(f"✅ Saved batch {batch_num} ({len(docs)} docs) to {batch_file}")

# STEP 3: Orchestrate scraping with resume support
sitemap_url = "https://www.uts.edu.au/sitemap.xml"
all_urls = get_urls_from_sitemap(sitemap_url)

max_pages = 18000
batch_size = 1000
output_dir = "/content"

# Apply the cap to the URL list itself. Capping only the batch count would let
# the final batch run past max_pages whenever max_pages is not a multiple of
# batch_size.
urls_to_scrape = all_urls[:max_pages]
num_batches = (len(urls_to_scrape) + batch_size - 1) // batch_size  # ceiling division

print(f"🌐 Total URLs: {len(all_urls)} | Max pages: {max_pages} | Total batches: {num_batches}")

for batch_num in range(num_batches):
    batch_file = os.path.join(output_dir, f"web_docs_batch_{batch_num}.pkl")
    # Resume support: a batch whose output file already exists is not re-scraped.
    if os.path.exists(batch_file):
        print(f"⏩ Batch {batch_num} already exists. Skipping.")
        continue
    scrape_and_save_batch(urls_to_scrape, batch_num, batch_size, output_dir)



Combining the webpage document batches into a single file.

In [None]:
import pickle, glob

# Load all batch files
import os
import re


def _batch_sort_key(path):
    """Order batch files by their numeric suffix (batch_2 before batch_10)."""
    match = re.search(r"web_docs_batch_(\d+)\.pkl$", os.path.basename(path))
    # Files without a numeric suffix sort last, alphabetically among themselves.
    return (int(match.group(1)) if match else float("inf"), path)


# A plain sorted() is lexicographic and would put batch_10 before batch_2,
# so the combined list would not follow the original scrape order.
batch_files = sorted(glob.glob("/content/web_docs_batch_*.pkl"), key=_batch_sort_key)
all_docs = []

for batch_path in batch_files:
    # SECURITY: pickle executes code on load; only load batch files that were
    # produced by the scraping cell, never files from an untrusted source.
    with open(batch_path, "rb") as f:
        all_docs.extend(pickle.load(f))

print(f"✅ Loaded {len(all_docs)} documents from {len(batch_files)} batches.")

# Save to a single combined .pkl file
with open("/content/web_docs_combined.pkl", "wb") as f:
    pickle.dump(all_docs, f)

print("💾 Combined documents saved to /content/web_docs_combined.pkl")


Building the vector index and loading the pretrained LLMs (no model is trained or fine-tuned in this cell).

In [None]:
# STEP 1: Install packages
!pip install -q langchain-community langchain faiss-cpu sentence-transformers pdfplumber gradio transformers accelerate huggingface_hub

# STEP 2: Mount Drive
# Mounts Google Drive at /content/drive. The PDF folder and the FAISS index
# path used further down both live under /content/drive/MyDrive.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this cell elsewhere.
from google.colab import drive
drive.mount('/content/drive')

# STEP 3: Imports
import os, pickle
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import gradio as gr
import torch

# STEP 4: Load PDFs
# Folder on the mounted Google Drive that is scanned for PDF files.
pdf_folder_path = "/content/drive/MyDrive/UTS_Chatbot/training_pdf"

def load_all_pdfs_from_folder(folder_path):
    """Extract the text of every PDF directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list with one ``Document`` per PDF, in file-name order. Each
        document's ``page_content`` is the text of all pages that yielded
        text, every page followed by a newline; ``metadata["source"]`` is
        the file name.

    Raises:
        OSError: If ``folder_path`` cannot be listed.
    """
    documents = []
    # sorted(): os.listdir returns entries in arbitrary order; a fixed order
    # keeps the resulting document list reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        # Case-insensitive so files named "X.PDF" are not silently ignored.
        if not filename.lower().endswith(".pdf"):
            continue
        full_path = os.path.join(folder_path, filename)
        with pdfplumber.open(full_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Pages without extractable text (e.g. scanned images) are skipped.
        text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)
        documents.append(Document(page_content=text, metadata={"source": filename}))
    return documents

# Documents extracted from the PDFs in the Drive folder.
pdf_docs = load_all_pdfs_from_folder(pdf_folder_path)

# STEP 5: Load web docs
# Read the combined scrape output back from the pickle file.
with open("/content/web_docs_combined.pkl", "rb") as combined_file:
    web_docs = pickle.loads(combined_file.read())

# STEP 6: Combine & Split
# PDFs first, then web pages; each document is cut into 600-character chunks
# that overlap their neighbours by 80 characters.
all_docs = [*pdf_docs, *web_docs]
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=80,
    chunk_size=600,
)
documents = splitter.split_documents(all_docs)

# STEP 7: Embedding & FAISS (load the saved index, or build and save it)
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
faiss_path = "/content/drive/MyDrive/UTS_Chatbot/faiss_index"

# Load the index when one has been saved; otherwise build it from the split
# documents and persist it, so no code has to be commented in or out by hand.
# NOTE(review): "index.faiss" is assumed to be the default file name written
# by FAISS.save_local — confirm against the installed LangChain version.
if os.path.exists(os.path.join(faiss_path, "index.faiss")):
    # SECURITY: allow_dangerous_deserialization unpickles the stored docstore.
    # Acceptable only because this index is one we wrote ourselves; never point
    # faiss_path at an index obtained from an untrusted source.
    vectorstore = FAISS.load_local(faiss_path, embedding_model, allow_dangerous_deserialization=True)
else:
    vectorstore = FAISS.from_documents(documents, embedding_model)
    vectorstore.save_local(faiss_path)

# Return the 4 most similar chunks for each query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

# STEP 8: Load all models
# SECURITY: the Hugging Face token is read from the environment and must never
# be written into the notebook. Any token that has ever appeared in a saved or
# shared copy of this file should be revoked and replaced.
hf_token = os.environ.get("HF_TOKEN") or None
if hf_token is None:
    print("⚠️ HF_TOKEN is not set; models that require authentication will fail to download.")

# Display name (used as the tab label) -> Hugging Face Hub model id.
model_ids = {
    "LLaMA 3.2 3B": "meta-llama/Llama-3.2-3B-Instruct",
    "Mistral": "mistralai/Mistral-7B-Instruct-v0.2",
    "Phi Mini": "microsoft/Phi-3-mini-4k-instruct",
    "Gemma 2B": "google/gemma-2b-it",
    "Tiny LLaMA": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
}

# Display name -> ready-to-call text-generation pipeline.
# NOTE(review): every model stays loaded at the same time; confirm the runtime
# has enough GPU/CPU memory for all five.
pipelines = {}

for name, model_id in model_ids.items():
    print(f"🔄 Loading: {name}")
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=hf_token,
        # Half precision on GPU to save memory; full precision on CPU.
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto"
    )
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        # temperature/top_p only take effect when sampling is enabled; without
        # do_sample=True a model whose default is greedy decoding ignores them.
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
    )
    pipelines[name] = pipe

# STEP 9: Prompt & chatbot function
def format_prompt(context, query):
    """Assemble the text prompt sent to the language model.

    The prompt opens with a blank line and the system instruction, followed
    by the retrieved ``context``, the user's ``query``, and a trailing
    ``Answer:`` marker for the model to continue from.
    """
    sections = (
        "",
        "You are an expert guide for international students at UTS.",
        "",
        "Context:",
        f"{context}",
        "",
        f"Question: {query}",
        "",
        "Answer:",
    )
    return "\n".join(sections)

def create_chatbot_func(model_name, max_context_chars=1000):
    """Build a question-answering function bound to one loaded model.

    Args:
        model_name: Key into the module-level ``pipelines`` dict.
        max_context_chars: Upper bound on the characters of retrieved context
            placed in the prompt.

    Returns:
        A function ``chatbot(query) -> str`` that retrieves context for the
        query, prompts the model, and returns the generated answer.

    Raises:
        KeyError: If ``model_name`` has no entry in ``pipelines``.
    """
    pipe = pipelines[model_name]

    def chatbot(query):
        # Skip retrieval and generation for an empty or whitespace-only query.
        if not query or not query.strip():
            return "Please enter a question."
        docs = retriever.get_relevant_documents(query)
        # Join the retrieved chunks, then cap the total length to bound the prompt.
        context = "\n\n".join(doc.page_content for doc in docs)[:max_context_chars]
        prompt = format_prompt(context, query)
        # return_full_text=False makes the pipeline return only the newly
        # generated continuation. Splitting the full text on "Answer:" would
        # truncate any answer in which the model itself writes "Answer:".
        result = pipe(prompt, return_full_text=False)[0]["generated_text"]
        return result.strip()

    return chatbot

# STEP 10: Gradio UI with tabs for each model
# One Interface per model. They are created at top level, not inside gr.Tab
# contexts: gr.TabbedInterface builds the tabs itself from the list below.
tabs = [
    gr.Interface(
        fn=create_chatbot_func(model_name),
        inputs="text",
        outputs="text",
        title=f"🤖 UTS Student assistant ({model_name})",
        description="Ask anything about life and study at UTS",
    )
    for model_name in model_ids
]

# Tab labels follow the same order as the interfaces above.
gr.TabbedInterface(tabs, tab_names=list(model_ids)).launch()
