### Tải thông tin Tuyến Buýt

In [2]:
# download_bus.py
import requests, os, json, time

DATA_DIR = "raw_data/bus"
os.makedirs(DATA_DIR, exist_ok=True)

BASE = "http://apicms.ebms.vn"  # endpoints từ SaiGonBusMap README

def save_json(obj, name):
    path = os.path.join(DATA_DIR, name)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)
    print("Saved", path)

def fetch_stops_in_bounds(sw_lng, sw_lat, ne_lng, ne_lat, timeout=10):
    url = f"{BASE}/businfo/getstopsinbounds/{sw_lng}/{sw_lat}/{ne_lng}/{ne_lat}"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.json()

def fetch_routes_through_stop(stop_id, timeout=10):
    url = f"{BASE}/businfo/getroutesthroughstop/{stop_id}"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.json()

def fetch_prediction_by_stop(stop_id, timeout=10):
    url = f"{BASE}/prediction/predictbystopid/{stop_id}"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.json()

if __name__ == "__main__":
    # bounding box toàn TP.HCM (chỉnh nếu cần chính xác hơn)
    sw_lng, sw_lat = 106.45, 10.70
    ne_lng, ne_lat = 106.90, 10.95

    # lấy danh sách trạm
    stops = fetch_stops_in_bounds(sw_lng, sw_lat, ne_lng, ne_lat)
    save_json(stops, "stops_hcm.json")

    # chuẩn hóa thành list
    if isinstance(stops, list):
        stop_list = stops
    elif isinstance(stops, dict):
        stop_list = stops.get("stops", [])
    else:
        stop_list = []

    print("Số trạm lấy được:", len(stop_list))

    # lấy routes cho vài stop sample
    for s in stop_list[:30]:
        sid = s.get("stop_id") or s.get("id") or s.get("stopCode")
        if not sid:
            continue
        try:
            r = fetch_routes_through_stop(sid)
            save_json(r, f"routes_stop_{sid}.json")
            # lấy luôn dự báo giờ đến
            p = fetch_prediction_by_stop(sid)
            save_json(p, f"prediction_stop_{sid}.json")
            time.sleep(0.2)
        except Exception as e:
            print("Lỗi khi lấy stop", sid, e)


Saved raw_data/bus\stops_hcm.json
Số trạm lấy được: 4170


In [1]:
# download_signs_vn.py
import requests, os, json, time, re
from urllib.parse import urlparse

DATA_DIR = "raw_data/signs"
IM_DIR = os.path.join(DATA_DIR, "images")
META_PATH = os.path.join(DATA_DIR, "traffic_signs_vn.json")
os.makedirs(IM_DIR, exist_ok=True)

HEADERS = {"User-Agent": "hcm-rag-crawler/1.0 (+https://your.project)"}
API = "https://commons.wikimedia.org/w/api.php"

# Categories specific to Vietnam on Commons
CATEGORIES = [
    "Road signs in Vietnam",
    "Diagrams of road signs of Vietnam",
    "SVG road signs in Vietnam",
    "Warning road signs in Vietnam",
    "Prohibitory road signs in Vietnam",
    "Informational road signs in Vietnam",
    "Diagrams of Vietnamese-language road signs",
    "Diagrams of national road signs of Vietnam"
]

# Limits
LIMIT_PER_CATEGORY = 300   # chỉnh nếu cần
MAX_DOWNLOAD = 800        # tổng ảnh tối đa tải về

def fetch_category_files(category, limit=200):
    titles = []
    cmcontinue = None
    fetched = 0
    while True:
        params = {
            "action": "query",
            "list": "categorymembers",
            "cmtitle": f"Category:{category}",
            "cmtype": "file",
            "cmlimit": 50,
            "format": "json"
        }
        if cmcontinue:
            params["cmcontinue"] = cmcontinue
        r = requests.get(API, params=params, headers=HEADERS, timeout=20)
        r.raise_for_status()
        j = r.json()
        members = j.get("query", {}).get("categorymembers", [])
        for m in members:
            titles.append(m["title"])
            fetched += 1
            if fetched >= limit:
                return titles
        cmcontinue = j.get("continue", {}).get("cmcontinue")
        if not cmcontinue:
            break
        time.sleep(0.12)
    return titles

def fetch_imageinfo(titles):
    out = []
    for i in range(0, len(titles), 50):
        chunk = titles[i:i+50]
        params = {
            "action": "query",
            "titles": "|".join(chunk),
            "prop": "imageinfo",
            "iiprop": "url|mime|size|metadata",
            "format": "json"
        }
        r = requests.get(API, params=params, headers=HEADERS, timeout=20)
        r.raise_for_status()
        j = r.json()
        pages = j.get("query", {}).get("pages", {})
        for p in pages.values():
            if "imageinfo" in p:
                ii = p["imageinfo"][0]
                out.append({
                    "title": p.get("title"),
                    "url": ii.get("url"),
                    "mime": ii.get("mime"),
                    "size": ii.get("size"),
                    "metadata": ii.get("metadata", [])
                })
        time.sleep(0.12)
    return out

def safe_download(url, dst, retries=3):
    for _ in range(retries):
        try:
            r = requests.get(url, headers=HEADERS, timeout=40, stream=True)
            r.raise_for_status()
            with open(dst, "wb") as f:
                for chunk in r.iter_content(1024*16):
                    if chunk:
                        f.write(chunk)
            return True
        except Exception:
            time.sleep(0.5)
    return False

def extract_sign_code(title):
    # ví dụ title: "Vietnam road sign P122.svg" -> P122
    m = re.search(r'([A-Z]{0,2}\d{1,4}[a-zA-Z0-9]*)', title)
    if m:
        return m.group(1)
    # fallback: last token of title
    parts = title.split()
    return parts[-1] if parts else title

def collect_vn_signs():
    all_titles = []
    for cat in CATEGORIES:
        print("Fetching titles from category:", cat)
        t = fetch_category_files(cat, limit=LIMIT_PER_CATEGORY)
        print("  got", len(t))
        all_titles.extend(t)
        # avoid duplicates
        all_titles = list(dict.fromkeys(all_titles))
        # stop early if enough
        if len(all_titles) >= LIMIT_PER_CATEGORY * 2:
            break
    print("Total unique titles:", len(all_titles))

    infos = fetch_imageinfo(all_titles)
    print("Imageinfo fetched:", len(infos))

    # filter: ensure 'Vietnam' or Vietnamese-language signs (conservative)
    filtered = []
    for it in infos:
        title = it.get("title","").lower()
        url = it.get("url","")
        # accept if category indicates Vietnam or filename contains 'vietnam' or path has '/Vietnam_'
        if "vietnam" in title or "/Vietnam" in url or "vn" in title:
            filtered.append(it)
        else:
            # still allow SVG files that include 'Vietnam' in metadata or SVGs in VN categories
            filtered.append(it)  # permissive: categories already VN-specific
    print("After filtering:", len(filtered))

    # download images up to MAX_DOWNLOAD
    saved = []
    for idx, it in enumerate(filtered[:MAX_DOWNLOAD]):
        u = it.get("url")
        if not u:
            it["local"] = None
            continue
        parsed = urlparse(u.split("?")[0])
        ext = os.path.splitext(parsed.path)[1] or (".svg" if it.get("mime","").startswith("image/svg") else ".jpg")
        fname = f"vn_sign_{idx:04d}{ext}"
        dst = os.path.join(IM_DIR, fname)
        if os.path.exists(dst):
            it["local"] = dst
            saved.append(it)
            continue
        ok = safe_download(u, dst)
        it["local"] = dst if ok else None
        it["code"] = extract_sign_code(it.get("title",""))
        saved.append(it)
        print(f"Download {idx+1}/{min(len(filtered),MAX_DOWNLOAD)} ->", "ok" if ok else "fail")
        time.sleep(0.12)
    # save metadata
    with open(META_PATH, "w", encoding="utf-8") as f:
        json.dump(saved, f, ensure_ascii=False, indent=2)
    print("Saved metadata to", META_PATH)
    return saved

if __name__ == "__main__":
    collect_vn_signs()


Fetching titles from category: Road signs in Vietnam
  got 30
Fetching titles from category: Diagrams of road signs of Vietnam
  got 201
Fetching titles from category: SVG road signs in Vietnam
  got 170
  got 6
Fetching titles from category: Prohibitory road signs in Vietnam
  got 6
Fetching titles from category: Informational road signs in Vietnam
  got 9
Fetching titles from category: Diagrams of Vietnamese-language road signs
  got 93
Fetching titles from category: Diagrams of national road signs of Vietnam
  got 199
Total unique titles: 627
Imageinfo fetched: 627
After filtering: 627
Download 1/627 -> ok
Download 2/627 -> ok
Download 3/627 -> ok
Download 4/627 -> ok
Download 5/627 -> ok
Download 6/627 -> ok
Download 7/627 -> ok
Download 8/627 -> ok
Download 9/627 -> ok
Download 10/627 -> ok
Download 11/627 -> ok
Download 12/627 -> ok
Download 13/627 -> ok
Download 14/627 -> ok
Download 15/627 -> ok
Download 16/627 -> ok
Download 17/627 -> ok
Download 18/627 -> ok
Download 19/627 -

In [4]:
# download_laws.py
import requests, os, json
from bs4 import BeautifulSoup
from urllib.parse import urljoin

DATA_DIR = "raw_data/laws"
os.makedirs(DATA_DIR, exist_ok=True)
HEADERS = {"User-Agent": "hcm-rag-crawler/1.0"}

# danh sách url mẫu (bạn có thể thêm)
LAW_PAGES = [
    "https://vanban.chinhphu.vn/default.aspx?docid=81140",  # Luật GTĐB 2008 (ví dụ)
    "https://moj.gov.vn"  # trang chính Bộ Tư pháp (crawl danh sách sau)
]

def save_json(obj, name):
    with open(os.path.join(DATA_DIR, name), "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

def fetch_page_links(url):
    try:
        r = requests.get(url, headers=HEADERS, timeout=15)
        r.raise_for_status()
    except Exception as e:
        print("fetch fail", url, e)
        return []
    soup = BeautifulSoup(r.text, "html.parser")
    links = []
    for a in soup.select("a"):
        href = a.get("href")
        txt = a.get_text(strip=True)
        if not href or not txt:
            continue
        if "luật" in txt.lower() or "giao thông" in txt.lower() or href.endswith(".pdf"):
            links.append({"title": txt, "url": urljoin(url, href)})
    return links

if __name__ == "__main__":
    out = []
    for u in LAW_PAGES:
        out += fetch_page_links(u)
    save_json(out, "law_links.json")
    print("Luật tìm:", len(out))


Luật tìm: 108


In [5]:
# download_culture.py
import requests, os, json
from bs4 import BeautifulSoup
from urllib.parse import urljoin

DATA_DIR = "raw_data/culture"
os.makedirs(DATA_DIR, exist_ok=True)
HEADERS = {"User-Agent": "hcm-rag-crawler/1.0"}

# ví dụ crawl trang Sở Văn hoá TP.HCM (thay URL thực tế nếu khác)
START_URLS = [
    "https://sovhtt.hochiminhcity.gov.vn",  # nếu không đúng, bạn thay URL chính xác
    "https://vi.wikipedia.org/wiki/Thể_loại:Văn_hóa_việt_nam"
]

def save_json(obj, name):
    with open(os.path.join(DATA_DIR, name), "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

def crawl_wikipedia_category(url):
    try:
        r = requests.get(url, headers=HEADERS, timeout=15)
        r.raise_for_status()
    except:
        return []
    soup = BeautifulSoup(r.text, "html.parser")
    items = []
    for li in soup.select(".mw-category a"):
        title = li.get_text(strip=True)
        href = li.get("href")
        items.append({"title": title, "url": urljoin("https://vi.wikipedia.org/", href)})
    return items

if __name__ == "__main__":
    data = []
    data += crawl_wikipedia_category("https://vi.wikipedia.org/wiki/Thể_loại:Văn_hóa_Việt_Nam")
    save_json(data, "culture_wiki_list.json")
    print("Culture items:", len(data))


Culture items: 222


In [6]:
# download_tourism.py
import requests, os, json
from bs4 import BeautifulSoup
from urllib.parse import urljoin

DATA_DIR = "raw_data/tourism"
os.makedirs(DATA_DIR, exist_ok=True)
HEADERS = {"User-Agent": "hcm-rag-crawler/1.0"}

START_URLS = [
    "https://www.vietnam.travel/vn",  # portal du lịch VN
    "https://www.vietnam.travel/vn/chu-de/ho-chi-minh-city"  # trang HCM nếu có
]

def save_json(obj, name):
    with open(os.path.join(DATA_DIR, name), "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

def crawl_vietnam_travel_list(url):
    try:
        r = requests.get(url, headers=HEADERS, timeout=15)
        r.raise_for_status()
    except:
        return []
    soup = BeautifulSoup(r.text, "html.parser")
    out = []
    for a in soup.select("a"):
        href = a.get("href")
        txt = a.get_text(strip=True)
        if href and txt and "/vn/diem-den" in href:
            out.append({"title": txt, "url": urljoin(url, href)})
    return out

if __name__ == "__main__":
    res = []
    for u in START_URLS:
        res += crawl_vietnam_travel_list(u)
    save_json(res, "tourism_vietnam_travel.json")
    print("Tourism items:", len(res))


Tourism items: 0
