In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
功能：
1) 自动从 Singapore Pools 抓取过去 N 天（默认 31 天）的 TOTO 开奖页面
2) 输出每期（日期、期号、6 个主号、附加号）
3) 用简单的频率加权 + 约束过滤，给出 1 组“娱乐预测号”

注意：
- 仅做信息抓取与娱乐化组合，不改变中头奖的数学概率。
- 如官网 DOM 结构调整，需相应更新解析逻辑。
"""

import re
import sys
import time
import base64
import argparse
import datetime as dt
from typing import List, Dict, Any, Optional

import requests
from bs4 import BeautifulSoup
from dateutil import parser as dtparser
from dateutil import tz

SG_URL_MAIN = "https://www.singaporepools.com.sg/en/product/sr/pages/toto_results.aspx"
SG_URL_DRAW_FMT = "https://www.singaporepools.com.sg/en/product/sr/Pages/toto_results.aspx?sppl={encoded}"  # encoded = base64("DrawNumber=<no>")
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; TOTO-Scraper/1.0; +https://example.com)"
}

def b64_draw_no(n: int) -> str:
    raw = f"DrawNumber={n}".encode("utf-8")
    return base64.b64encode(raw).decode("ascii")

def fetch_html(url: str, sleep_sec: float = 0.6) -> str:
    r = requests.get(url, headers=HEADERS, timeout=15)
    r.raise_for_status()
    time.sleep(sleep_sec)  # 轻微限速，礼貌抓取
    return r.text

def extract_latest_draw_no_and_date(html: str) -> Dict[str, Any]:
    """
    从官网“当前结果”页提取最近一期的 Draw No 与日期。
    页面示例包含：
      - "Draw No. 4101"
      - "Mon, 04 Aug 2025"
    """
    # Draw No.
    m = re.search(r"Draw\s*No\.\s*(\d+)", html, re.IGNORECASE)
    if not m:
        raise ValueError("未找到最近一期的 Draw No.")
    draw_no = int(m.group(1))

    # 日期（页面含英文缩写星期与日期）
    # 例如：Mon, 04 Aug 2025（或 Fri, 08 Aug 2025）
    mdate = re.search(r"(Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s*\d{2}\s+\w+\s+\d{4}", html)
    if not mdate:
        # 备用：有时会以 <time> 或其他元素展示，故放宽匹配
        mdate = re.search(r"\b\d{2}\s+\w+\s+\d{4}\b", html)
    if not mdate:
        raise ValueError("未找到最近一期的开奖日期文本。")
    date_str = mdate.group(0)

    # 解析为新加坡时区日期
    dt_sgt = dtparser.parse(date_str).astimezone(tz.gettz("Asia/Singapore")).date()
    return {"draw_no": draw_no, "date": dt_sgt}

def parse_numbers_from_html(html: str) -> Dict[str, Any]:
    """
    从单期结果页 HTML 解析 6 个主号与 1 个附加号。
    逻辑：
      - 限定在“Winning Numbers”区域附近提取数字
      - 同时在“Additional Number”区域提取 1 个数字
    做法偏稳健：优先找标题，再在相邻元素内抓连续的 1-2 位数字
    """
    soup = BeautifulSoup(html, "html.parser")

    # 取整页文本，备用的保底方案（避免抓到金额/份额等），我们会先就近定位
    full_text = " ".join(soup.get_text(separator=" ").split())

    def extract_after_heading(heading_regex: str, max_take: int) -> List[int]:
        # 找到含有标题关键字的节点
        heading = None
        regex = re.compile(heading_regex, re.IGNORECASE)
        for tag in soup.find_all(text=regex):
            heading = tag
            break
        if heading:
            # 在 heading 所在容器附近抽取数字
            # 取 heading 父节点的后代文本，尽量局部化
            container = heading.parent
            local_text = " ".join(container.get_text(separator=" ").split())
            nums = [int(x) for x in re.findall(r"\b\d{1,2}\b", local_text)]
            # 若本地文本不足，再扩大到整页文本
            if len(nums) < max_take:
                nums = [int(x) for x in re.findall(r"\b\d{1,2}\b", full_text)]
            return nums[:max_take]
        # 找不到标题时退化到整页提取（不推荐，但可容错）
        nums = [int(x) for x in re.findall(r"\b\d{1,2}\b", full_text)]
        return nums[:max_take]

    # Winning Numbers：抓 6 个
    # 有时数字会以图片/标签呈现，但文本中仍通常可匹配到
    win_candidates = extract_after_heading(r"Winning\s+Numbers", 12)
    # 过滤明显不属于开奖号码的 2 位数字（比如年份、日期中的 25、24、31 等）：
    # 简化策略：后续用附加号定位，尽量靠近“Additional Number”
    add_candidates = extract_after_heading(r"Additional\s+Number", 3)

    # 再精细一点：在“Additional Number”邻近文本里抓一个数字作为附加号
    addl = None
    for v in add_candidates:
        if 1 <= v <= 49:
            addl = v
            break

    # 最终 Winning Numbers：在全页文本中，找出紧挨着“Winning Numbers”附近的 6 个 1..49 数字
    # 粗糙但实用：从 win_candidates 中取 1..49 的数字，剔除可能的 addl，保留前 6 个
    wn = [v for v in win_candidates if 1 <= v <= 49]
    if addl in wn and wn.count(addl) >= 1:
        # 有时附加号会被混在一起，尝试移除一个实例
        wn.remove(addl)
    wn = wn[:6]
    if len(wn) != 6 or addl is None:
        # 兜底：尝试另一种方式——在“Winning Numbers”和“Additional Number”之间抓数字
        # 这一步需要全页文本的先后关系，简单起见仍用前面的结果
        pass

    wn.sort()
    return {"winning_numbers": wn, "additional_number": addl}

def fetch_draw_page(draw_no: int) -> Dict[str, Any]:
    url = SG_URL_DRAW_FMT.format(encoded=b64_draw_no(draw_no))
    html = fetch_html(url)
    # 日期
    mdate = re.search(r"(Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s*\d{2}\s+\w+\s+\d{4}", html)
    if mdate:
        d = dtparser.parse(mdate.group(0)).astimezone(tz.gettz("Asia/Singapore")).date()
    else:
        d = None
    parsed = parse_numbers_from_html(html)
    return {
        "draw_no": draw_no,
        "date": d,
        "numbers": parsed["winning_numbers"],
        "additional": parsed["additional_number"],
        "source": url,
    }

def within_days(d: dt.date, days: int, today_sgt: Optional[dt.date] = None) -> bool:
    if not d:
        return False
    today_sgt = today_sgt or dt.datetime.now(tz.gettz("Asia/Singapore")).date()
    return (today_sgt - d).days <= days

# ----------------- 预测（娱乐性质） -----------------

import random
from collections import Counter

def build_weights(draws: List[List[int]], epsilon: float = 0.5) -> Dict[int, float]:
    cnt = Counter()
    for d in draws:
        cnt.update(d)
    return {n: cnt[n] + epsilon for n in range(1, 50)}

def has_long_consecutive(nums: List[int], max_len: int = 2) -> bool:
    nums = sorted(nums)
    run = 1
    for i in range(1, len(nums)):
        if nums[i] == nums[i-1] + 1:
            run += 1
            if run > max_len:
                return True
        else:
            run = 1
    return False

def decade_bucket(n: int) -> int:
    if 1 <= n <= 9: return 0
    if 10 <= n <= 19: return 1
    if 20 <= n <= 29: return 2
    if 30 <= n <= 39: return 3
    return 4

def passes_constraints(picks: List[int]) -> bool:
    picks = sorted(picks)
    if has_long_consecutive(picks, 2):
        return False
    buckets = {decade_bucket(n) for n in picks}
    if len(buckets) < 3:
        return False
    s = sum(picks)
    if not (90 <= s <= 200):
        return False
    odd = sum(1 for n in picks if n % 2)
    even = 6 - odd
    if odd < 2 or even < 2:
        return False
    return True

def weighted_sample_no_replacement(weights: Dict[int, float], k: int = 6, rng: Optional[random.Random] = None) -> List[int]:
    rng = rng or random
    pool = list(weights.keys())
    w = [weights[n] for n in pool]
    picks = []
    for _ in range(k):
        total = sum(w)
        r = rng.random() * total
        acc = 0.0
        for i, wi in enumerate(w):
            acc += wi
            if r <= acc:
                picks.append(pool[i])
                pool.pop(i); w.pop(i)
                break
    return sorted(picks)

def predict_one_set(recent_draws: List[List[int]], seed: Optional[int] = None, attempts: int = 5000) -> List[int]:
    rng = random.Random(seed) if seed is not None else random
    weights = build_weights(recent_draws, epsilon=0.6)
    cand = None
    for _ in range(attempts):
        cand = weighted_sample_no_replacement(weights, 6, rng)
        if passes_constraints(cand):
            return sorted(cand)
    return sorted(cand) if cand else []

# ----------------- 主流程 -----------------

def run(days: int = 366, seed: Optional[int] = None) -> None:
    today_sgt = dt.datetime.now(tz.gettz("Asia/Singapore")).date()
    print(f"📅 今天（SGT）: {today_sgt}  | 回看天数: {days}")

    # 1) 获取最近一期期号与日期
    main_html = fetch_html(SG_URL_MAIN)
    latest = extract_latest_draw_no_and_date(main_html)
    latest_no = latest["draw_no"]
    print(f"最新一期：Draw No. {latest_no} | 日期: {latest['date']}")

    # 2) 递减期号，抓取直到超过 days 的窗口
    draws: List[Dict[str, Any]] = []
    dn = latest_no
    while dn > 0:
        info = fetch_draw_page(dn)
        if not info["date"]:
            # 若某期解析不到日期，跳过
            dn -= 1
            continue
        if within_days(info["date"], days, today_sgt):
            if len(info["numbers"]) == 6 and all(1 <= x <= 49 for x in info["numbers"]):
                draws.append(info)
            dn -= 1
            continue
        else:
            # 超出窗口，结束
            break

    if not draws:
        print("未抓到最近一个月内的开奖数据。可能网站结构变更或网络不可达。")
        sys.exit(1)

    # 按日期升序输出
    draws.sort(key=lambda x: x["date"])

    print("\n=== 过去一个月 TOTO 开奖（主号 + 附加号）===\n")
    for d in draws:
        nums = ", ".join(f"{n:02d}" for n in sorted(d["numbers"]))
        addl = f"{d['additional']:02d}" if d["additional"] is not None else "??"
        print(f"{d['date']}  | Draw {d['draw_no']:<4d} | [{nums}] + ({addl})")

    # 3) 预测（娱乐）：基于近月主号频率 + 约束，输出 1 组
    recent_draws = [d["numbers"] for d in draws]
    guess = predict_one_set(recent_draws, seed=seed)
    if guess:
        print("\n🎯 娱乐预测（下一期 6 个主号）：", ", ".join(f"{n:02d}" for n in guess))
    else:
        print("\n未能生成预测号（可能数据不足）。")

In [2]:
# if __name__ == "__main__":
#     ap = argparse.ArgumentParser()
#     ap.add_argument("--days", type=int, default=31, help="回看天数（默认 31 天）")
#     ap.add_argument("--seed", type=int, default=None, help="随机种子（可选，用于复现预测结果）")
#     args = ap.parse_args()
#     run(days=args.days, seed=args.seed)

In [3]:
# 在 Notebook 中直接调用
run(days=366, seed=20250811)

📅 今天（SGT）: 2025-08-11  | 回看天数: 366
最新一期：Draw No. 4102 | 日期: 2025-08-08


  for tag in soup.find_all(text=regex):



=== 过去一个月 TOTO 开奖（主号 + 附加号）===

2024-08-12  | Draw 3999 | [02, 08, 27, 33, 34, 43] + (12)
2024-08-15  | Draw 4000 | [15, 24, 35, 36, 42, 46] + (15)
2024-08-19  | Draw 4001 | [02, 03, 16, 26, 29, 41] + (19)
2024-08-22  | Draw 4002 | [02, 05, 08, 43, 45, 48] + (22)
2024-08-26  | Draw 4003 | [06, 08, 18, 34, 35, 37] + (26)
2024-08-29  | Draw 4004 | [03, 09, 10, 12, 27, 41] + (29)
2024-09-02  | Draw 4005 | [06, 07, 13, 30, 37, 39] + (02)
2024-09-05  | Draw 4006 | [01, 13, 27, 38, 40, 43] + (05)
2024-09-09  | Draw 4007 | [01, 04, 10, 16, 18, 29] + (09)
2024-09-12  | Draw 4008 | [05, 06, 14, 36, 45, 49] + (12)
2024-09-16  | Draw 4009 | [02, 08, 15, 19, 33, 38] + (16)
2024-09-19  | Draw 4010 | [07, 13, 17, 22, 31, 37] + (19)
2024-09-23  | Draw 4011 | [09, 20, 22, 32, 37, 47] + (23)
2024-09-26  | Draw 4012 | [17, 23, 26, 31, 38, 40] + (26)
2024-09-30  | Draw 4013 | [04, 33, 34, 38, 40, 43] + (30)
2024-10-03  | Draw 4014 | [02, 09, 15, 17, 40, 48] + (03)
2024-10-07  | Draw 4015 | [16, 22, 23, 