In [12]:
import re
import requests
from bs4 import BeautifulSoup
from typing import Optional, Tuple, Dict


# One shared HTTP session so every request in the notebook reuses the same
# connection pool and default headers.
_session = requests.Session()
# Browser-like User-Agent and a Vietnamese-first Accept-Language header.
_session.headers["User-Agent"] = "Mozilla/5.0"
_session.headers["Accept-Language"] = "vi-VN,vi;q=0.9,en-US;q=0.8"


def get_soup(url: str, timeout: int = 20) -> Optional[BeautifulSoup]:
    """Fetch ``url`` through the shared session and parse the body as HTML.

    Args:
        url: Address to request.
        timeout: Timeout in seconds handed to ``requests`` for this call.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend, or
        ``None`` when the request raises a ``requests`` error or the response
        status code is not 200.
    """
    try:
        resp = _session.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Timeouts / connection errors are reported the same way as a bad
        # status code so callers only ever have to handle the ``None`` case.
        print("GET failed", url, exc)
        return None

    print("GET", resp.status_code, resp.url)

    if resp.status_code != 200:
        return None

    return BeautifulSoup(resp.text, "html.parser")

def clean_text(text: Optional[str]) -> str:
    """Normalise a string: collapse whitespace runs and trim both ends.

    ``None`` (or any other falsy value) is treated as the empty string.
    """
    source = text if text else ""
    collapsed = re.sub(r"\s+", " ", source)
    return collapsed.strip()


def first_int(text: Optional[str]) -> Optional[int]:
    """Return the first run of digits found in ``text`` as an ``int``.

    Returns ``None`` when ``text`` is empty/``None`` or contains no digits.
    """
    found = re.search(r"\d+", text) if text else None
    if found is None:
        return None
    return int(found.group())


def parse_price_ty_vnd(text: Optional[str]) -> Tuple[Optional[float], Optional[str]]:
    """Convert a Vietnamese price string into billions of VND ("tỷ").

    Example: '3 tỷ 500 triệu' -> (3.5, '3 tỷ 500 triệu').

    Every "<number> tỷ" contributes the number itself and every
    "<number> triệu" contributes number / 1000. The number may be attached to
    the unit ("3tỷ") and may carry a decimal part written with "," or "."
    ("2,05 tỷ"); both separators are read as a decimal point.
    NOTE(review): a "." used as a thousands separator ("1.500 triệu") would be
    read as 1.5 — confirm that format never appears in the price element.

    Args:
        text: Raw price text, or ``None``.

    Returns:
        ``(price_in_ty, normalized_text)`` where ``normalized_text`` is the
        whitespace-normalised, lower-cased input. ``(None, None)`` when
        ``text`` is empty. ``price_in_ty`` is ``None`` when no amount with a
        recognised unit is present, so a missing price is never stored as 0.
    """
    if not text:
        return None, None

    normalized = clean_text(text).lower()

    total_ty = 0.0
    found_amount = False
    # Pair each number with the unit that follows it; scanning the whole
    # string avoids indexing a "previous token" that may not exist or may not
    # be a plain integer.
    for number, unit in re.findall(r"(\d+(?:[.,]\d+)?)\s*(tỷ|triệu)", normalized):
        value = float(number.replace(",", "."))
        total_ty += value if unit == "tỷ" else value / 1000
        found_amount = True

    return (total_ty if found_amount else None), normalized


def parse_area_m2(text: Optional[str]) -> Optional[float]:
    """Extract an area in square metres from free text.

    Recognises the spellings "78 m²", "78 m2", "78 m^2" and "78 m 2".

    Number formats:
        * only "," or only "." present -> treated as the decimal separator
          ("78,5" -> 78.5);
        * both present -> the right-most one is the decimal separator and the
          other is dropped as a grouping separator ("1.234,5" -> 1234.5).

    Args:
        text: Raw attribute text, or ``None``.

    Returns:
        The area as a float, or ``None`` when ``text`` is empty, no
        "<number> m2" pattern is found, or the number cannot be parsed.
    """
    if not text:
        return None

    # Fold every supported spelling of the unit into the literal "m2".
    t = clean_text(text).lower()
    t = t.replace("m²", "m2")
    t = re.sub(r"m\s*\^\s*2", "m2", t)
    t = re.sub(r"m\s*2", "m2", t)

    m = re.search(r"([\d\.,]+)\s*m2\b", t)
    if not m:
        return None

    number = m.group(1)
    if "." in number and "," in number:
        # Mixed separators: whichever appears last is the decimal mark.
        if number.rfind(",") > number.rfind("."):
            number = number.replace(".", "").replace(",", ".")
        else:
            number = number.replace(",", "")
    else:
        number = number.replace(",", ".")

    try:
        return float(number)
    except ValueError:
        # e.g. a lone "." or several decimal marks — not a usable number.
        return None


def extract_info_attr_rows(soup: BeautifulSoup) -> Dict[str, str]:
    """Collect the attribute rows of a page into ``{label_lower: value_text}``.

    Rows are the ``div.info-attr.clearfix`` elements; when none exist, the
    ``div.info-attr`` children of the first ``div.info-attrs.clearfix``
    container are used instead. A row is read from its first two direct
    ``<span>`` children when it has at least two, otherwise from its flattened
    text split at the first ":". Rows with an empty label or value are skipped.
    """
    attr_rows = soup.select("div.info-attr.clearfix")
    if not attr_rows:
        container = soup.select_one("div.info-attrs.clearfix")
        if container:
            attr_rows = container.select("div.info-attr")

    collected: Dict[str, str] = {}
    for attr_row in attr_rows:
        direct_spans = attr_row.find_all("span", recursive=False)

        if len(direct_spans) >= 2:
            # Structured row: first span is the label, second is the value.
            key = clean_text(direct_spans[0].get_text(" ", strip=True)).lower()
            val = clean_text(direct_spans[1].get_text("", strip=True))
        else:
            # Unstructured row: fall back to "label: value" plain text.
            flat = clean_text(attr_row.get_text(" ", strip=True))
            if ":" not in flat:
                continue
            head, _, tail = flat.partition(":")
            key = clean_text(head).lower()
            val = clean_text(tail)

        if key and val:
            collected[key] = val

    return collected


def pick_value(data: Dict[str, str], keys) -> Optional[str]:
    """Return the value of the first entry whose label contains any of ``keys``.

    Labels are lower-cased before the substring test; entries are scanned in
    the dict's iteration order. Returns ``None`` when nothing matches.
    """
    return next(
        (
            val
            for name, val in data.items()
            if any(k in name.lower() for k in keys)
        ),
        None,
    )


def parse_detail_htmlparser(url: str) -> Optional[dict]:
    """Download one listing page and flatten it into a dict of fields.

    Returns ``None`` when ``get_soup`` yields nothing. Otherwise the dict holds
    the title, the link, the price (``gia_vnd`` is the first element returned
    by ``parse_price_ty_vnd``, ``gia_raw`` the second), address, land / usable
    area, bedroom and bathroom counts, legal status and the description.
    """
    soup = get_soup(url)
    if not soup:
        return None

    def joined_text(node, separator):
        # Cleaned text of an element, or None when the element is absent.
        return clean_text(node.get_text(separator)) if node else None

    # Free-text parts of the page.
    title = joined_text(soup.find("h1"), " ")
    price_text = joined_text(soup.find("div", class_="price"), " ")
    address = joined_text(soup.find("div", class_="address"), " ")
    description = joined_text(soup.find("div", class_="info-content-body"), "\n")

    price_value, price_raw = parse_price_ty_vnd(price_text)

    # Label/value attribute table.
    attrs = extract_info_attr_rows(soup)

    # Areas: usable area first, then land area with a generic "area" fallback.
    usable_text = pick_value(attrs, ["diện tích sử dụng", "dien tich su dung", "dtsd"])
    land_text = pick_value(attrs, ["diện tích đất", "dien tich dat", "dt đất", "dt dat"])
    if not land_text:
        land_text = pick_value(attrs, ["diện tích", "dien tich"])

    usable_area = parse_area_m2(usable_text) if usable_text else None
    land_area = parse_area_m2(land_text) if land_text else None

    # Bedrooms / bathrooms: first integer found in the matching attribute.
    bedrooms = first_int(
        pick_value(attrs, ["phòng ngủ", "số phòng ngủ", "so phong ngu", "pn"])
    )
    bathrooms = first_int(
        pick_value(attrs, ["phòng tắm", "nhà tắm", "toilet", "wc", "số toilet"])
    )

    legal_status = pick_value(attrs, ["pháp lý", "phap ly", "giấy tờ", "giay to"])

    return {
        "tieu_de": title,
        "link": url,
        "gia_raw": price_raw,
        "gia_vnd": price_value,
        "dia_chi": address,
        "dien_tich_dat_m2": land_area,
        "dien_tich_su_dung_m2": usable_area,
        "phong_ngu": bedrooms,
        "nha_tam": bathrooms,
        "phap_ly": legal_status,
        "gioi_thieu": description
    }


In [13]:
from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode

def set_cp_param(url, page: int) -> str:
    """Return ``url`` with its ``cp`` query parameter set to ``page``.

    Other query parameters (including blank ones) are kept; an existing ``cp``
    is overwritten in place, otherwise ``cp`` is appended.
    """
    split = urlsplit(url)
    params = {
        **parse_qs(split.query, keep_blank_values=True),
        'cp': [str(page)],
    }
    return urlunsplit(split._replace(query=urlencode(params, doseq=True)))

In [None]:
import re
import time
import random
from urllib.parse import urljoin

# Matches the "-id<digits>" part of a URL (at least 5 digits, case-insensitive);
# group 1 is the numeric id.
PATTERN_DETAIL = re.compile(r"-id(\d{5,})", re.I)


def extract_listing_links(listing_soup, base_url):
    """Return the distinct detail-page URLs found in one parsed page.

    Each ``<a href>`` is resolved against ``base_url`` and cut at the first
    "?". A URL is kept when it contains "mogi.vn" and matches
    ``PATTERN_DETAIL``; first-seen order is preserved. An empty list is
    returned for a falsy ``listing_soup``.
    """
    if not listing_soup:
        return []

    # Dict keys double as an insertion-ordered set.
    ordered = {}

    for anchor in listing_soup.select("a[href]"):
        href = anchor.get("href", "")
        unusable = (not href) or href.startswith("#") or ("javascript:" in href)
        if unusable:
            continue

        candidate = urljoin(base_url, href).split("?")[0]
        if "mogi.vn" not in candidate:
            continue
        if not PATTERN_DETAIL.search(candidate):
            continue

        ordered.setdefault(candidate, None)

    return list(ordered)

def collect_links_by_cp(
    base_url,
    start_page=1,
    max_pages=30,
    sleep_range=(1.0, 2.0),
    break_no_new_pages=2
):
    """Walk listing pages via the ``cp`` parameter and gather detail links.

    Args:
        base_url: Listing URL whose ``cp`` parameter is rewritten per page.
        start_page: First ``cp`` value to request.
        max_pages: Maximum number of consecutive pages to request.
        sleep_range: ``(low, high)`` bounds in seconds for the random pause
            after each page.
        break_no_new_pages: Stop once this many pages in a row add no new ad id.

    Returns:
        Detail URLs in discovery order, one per distinct ad id. Crawling also
        stops at the first page that cannot be loaded.
    """
    all_links = []
    seen_ad_ids = set()
    stale_pages = 0

    for cp in range(start_page, start_page + max_pages):
        page_url = set_cp_param(base_url, cp)
        soup = get_soup(page_url)

        if not soup:
            print("Ko tai duoc", page_url)
            break

        links = extract_listing_links(soup, page_url)

        # Keep only links whose ad id has not been seen on an earlier page.
        new_links = []
        for link in links:
            id_match = PATTERN_DETAIL.search(link)
            if not id_match:
                continue
            ad_id = id_match.group(1)
            if ad_id in seen_ad_ids:
                continue
            seen_ad_ids.add(ad_id)
            new_links.append(link)

        print(
            f"Trang cp={cp}: {len(new_links)}/{len(links)} link mới; "
            f"Tổng: {len(all_links) + len(new_links)}"
        )

        # A page with at least one new link resets the stale-page streak.
        stale_pages = 0 if new_links else stale_pages + 1
        all_links.extend(new_links)

        if stale_pages >= break_no_new_pages:
            print("Khong co link moi nhieu trang lien tiep")
            break

        pause = random.uniform(*sleep_range)
        time.sleep(pause)

    return all_links


In [None]:
import os
import csv
import re
import time
import random


# Column names of the output CSV, in order. Used as the DictWriter fieldnames
# and as the key list when projecting a parsed item onto a CSV row.
FIELDS = [
    "ad_id", "tieu_de", "link", "dia_chi", "gia_raw", "gia_vnd",
    "dien_tich_dat_m2", "dien_tich_su_dung_m2",
    "phong_ngu", "nha_tam", "phap_ly", "gioi_thieu"
]

def extract_adid(url):
    """Return the numeric ad id embedded in a detail URL, or ``None``.

    The id is the run of at least five digits following "-id". Matching is
    case-insensitive so that any URL accepted by the case-insensitive
    link-collection pattern also yields an id here.

    Args:
        url: Detail-page URL; ``None`` or an empty string yields ``None``.

    Returns:
        The id as a string of digits, or ``None`` when the URL has no id.
    """
    if not url:
        return None
    m = re.search(r"-id(\d{5,})", url, re.I)
    return m.group(1) if m else None


def should_skip(adid, processed_set):
    """Tell whether ``adid`` was already processed.

    A missing (falsy) id is never skipped.
    """
    if not adid:
        return False
    return adid in processed_set


def format_csv_row(item):
    """Project ``item`` onto the CSV columns listed in ``FIELDS``.

    Keys missing from ``item`` become ``None``; keys not in ``FIELDS`` are dropped.
    """
    row = {}
    for column in FIELDS:
        row[column] = item.get(column)
    return row


def load_existing_ids(path):
    """Read the non-empty ``ad_id`` values already stored in a CSV file.

    Returns an empty set when the file does not exist or has zero bytes.
    """
    file_missing = not os.path.exists(path)
    if file_missing or os.path.getsize(path) == 0:
        return set()

    seen = set()
    with open(path, "r", encoding="utf-8-sig", newline="") as handle:
        for record in csv.DictReader(handle):
            value = record.get("ad_id")
            if value:
                seen.add(value)
    return seen


def need_write_header(path):
    """Return True when ``path`` is absent or empty, i.e. has no header row yet."""
    if not os.path.exists(path):
        return True
    return os.path.getsize(path) == 0


def writer_with_header(path):
    """Open ``path`` for appending and return ``(file, DictWriter)``.

    The header row is written first when the file is still empty. The caller
    owns the returned file object and must close it.
    """
    handle = open(path, "a", newline="", encoding="utf-8-sig")
    dict_writer = csv.DictWriter(handle, fieldnames=FIELDS)

    # Checked after opening: a file just created by the append-mode open is
    # still zero bytes, so it is treated the same as a pre-existing empty file.
    header_missing = need_write_header(path)
    if header_missing:
        dict_writer.writeheader()

    return handle, dict_writer


def write_row(writer, row):
    """Write one ``row`` through ``writer`` by calling its ``writerow`` method."""
    writer.writerow(row)


def polite_sleep(delay_range):
    """Sleep for a random duration drawn uniformly from ``delay_range``.

    ``delay_range`` is unpacked as the two bounds (in seconds) of
    ``random.uniform``.
    """
    pause = random.uniform(*delay_range)
    time.sleep(pause)

def crawl_streaming(detail_links, out_csv="quan9.csv", sleep_range=(1.5, 2.5)):
    """Fetch each detail page and append its parsed row to ``out_csv``.

    Ad ids already present in ``out_csv`` are skipped, and so is any id written
    earlier in the same call, so a repeated link is fetched and stored once.
    Pages that fail to parse or return nothing are skipped after the usual
    pause. The file is flushed every 50 written rows and always closed on exit.

    Args:
        detail_links: Iterable of detail-page URLs.
        out_csv: Path of the CSV file to append to (created when missing).
        sleep_range: ``(low, high)`` bounds in seconds for the pause after
            every attempted page.
    """
    processed_ids = load_existing_ids(out_csv)

    file, writer = writer_with_header(out_csv)
    total_written = 0

    try:
        for url in detail_links:

            adid = extract_adid(url)

            if should_skip(adid, processed_ids):
                continue

            try:
                item = parse_detail_htmlparser(url)
            except Exception as e:
                # Best effort: one broken page must not stop the whole crawl.
                print("Lỗi parse:", url, e)
                polite_sleep(sleep_range)
                continue

            if not item:
                polite_sleep(sleep_range)
                continue

            item["ad_id"] = adid

            row = format_csv_row(item)

            write_row(writer, row)
            total_written += 1

            # Remember the id so a duplicate later in detail_links is skipped
            # instead of being fetched and written a second time.
            if adid:
                processed_ids.add(adid)

            if total_written % 50 == 0:
                file.flush()
                print(f"Đã ghi {total_written} dòng vào {out_csv}")

            polite_sleep(sleep_range)

    finally:
        file.close()


In [16]:
# Listing page to crawl; the path points at apartment sales in Bình Thạnh district.
url = "https://mogi.vn/ho-chi-minh/quan-binh-thanh/mua-can-ho"
# Gather detail links from up to 5 listing pages (cp=1..5).
quan1 = collect_links_by_cp(url, start_page=1, max_pages=5, sleep_range=(0.5, 1.5))
# NOTE(review): out_csv is not passed, so rows are appended to the function's
# default "quan9.csv" — confirm that is the intended file for this URL.
crawl_streaming(quan1, sleep_range=(1.5, 2.5))

GET 200 https://mogi.vn/ho-chi-minh/quan-binh-thanh/mua-can-ho?cp=1
Trang cp=1: 15/15 link mới; Tổng: 15
GET 200 https://mogi.vn/ho-chi-minh/quan-binh-thanh/mua-can-ho?cp=2
Trang cp=2: 15/15 link mới; Tổng: 30
GET 200 https://mogi.vn/ho-chi-minh/quan-binh-thanh/mua-can-ho?cp=3
Trang cp=3: 15/15 link mới; Tổng: 45
GET 200 https://mogi.vn/ho-chi-minh/quan-binh-thanh/mua-can-ho?cp=4
Trang cp=4: 15/15 link mới; Tổng: 60
GET 200 https://mogi.vn/ho-chi-minh/quan-binh-thanh/mua-can-ho?cp=5
Trang cp=5: 15/15 link mới; Tổng: 75
GET 200 https://mogi.vn/quan-binh-thanh/mua-can-ho-chung-cu/can-ho-2pn-nguyen-hong-51m2-full-noi-that-so-hong-rieng-2-05ty-o-ngay-id22712784
GET 200 https://mogi.vn/quan-binh-thanh/mua-can-ho-chung-cu/can-ho-le-quang-dinh-2pn-nhu-hinh-2-17-ty-full-noi-that-o-ngay-san-so-id22707679
GET 200 https://mogi.vn/quan-binh-thanh/mua-can-ho-chung-cu/can-ho-2pn-ngay-pham-van-dong-2-2ty-o-ngay-so-rieng-bao-phi-vay-70-id22705798
GET 200 https://mogi.vn/quan-binh-thanh/mua-can-ho-chun