In [4]:
# -*- coding: utf-8 -*-
# === Singapore Pools TOTO：抓取过去一年 -> 日期/星期/生日特征 -> 预测今天一组娱乐号（Jupyter友好，单文件版） ===

import re, time, base64, datetime as dt, random
from typing import List, Dict, Any, Optional
from collections import Counter

import requests
from bs4 import BeautifulSoup
from dateutil import parser as dtparser
from dateutil import tz
import numpy as np

# ===== 基本配置 =====
MY_BIRTHDAY = dt.date(1999, 10, 13)
SG_URL_MAIN = "https://www.singaporepools.com.sg/en/product/sr/pages/toto_results.aspx"
SG_URL_DRAW_FMT = "https://www.singaporepools.com.sg/en/product/sr/Pages/toto_results.aspx?sppl={encoded}"  # base64("DrawNumber=<no>")
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; TOTO-Scraper/1.0)"}

# ===== 实用函数（日期/网络） =====
def _today_sgt_date() -> dt.date:
    return dt.datetime.now(tz.gettz("Asia/Singapore")).date()

def _b64_draw_no(n: int) -> str:
    return base64.b64encode(f"DrawNumber={n}".encode("utf-8")).decode("ascii")

def _fetch_html(url: str, sleep_sec: float = 0.5) -> str:
    r = requests.get(url, headers=HEADERS, timeout=15)
    r.raise_for_status()
    time.sleep(sleep_sec)   # 礼貌限速
    return r.text

# ===== 解析官网页面 =====
def _extract_latest_draw_no_and_date(html: str) -> Dict[str, Any]:
    m = re.search(r"Draw\s*No\.\s*(\d+)", html, re.IGNORECASE)
    if not m:
        raise ValueError("未找到最近一期 Draw No.")
    draw_no = int(m.group(1))
    mdate = re.search(r"(Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s*\d{2}\s+\w+\s+\d{4}", html)
    if not mdate:
        mdate = re.search(r"\b\d{2}\s+\w+\s+\d{4}\b", html)
    if not mdate:
        raise ValueError("未找到开奖日期文本。")
    dt_sgt = dtparser.parse(mdate.group(0)).astimezone(tz.gettz("Asia/Singapore")).date()
    return {"draw_no": draw_no, "date": dt_sgt}

def _parse_numbers_from_html(html: str) -> Dict[str, Any]:
    soup = BeautifulSoup(html, "html.parser")
    full_text = " ".join(soup.get_text(separator=" ").split())

    def _extract_after_heading(heading_regex: str, max_take: int) -> List[int]:
        regex = re.compile(heading_regex, re.IGNORECASE)
        heading = None
        for tag in soup.find_all(text=regex):
            heading = tag; break
        if heading:
            container = heading.parent
            local_text = " ".join(container.get_text(separator=" ").split())
            nums = [int(x) for x in re.findall(r"\b\d{1,2}\b", local_text)]
            if len(nums) < max_take:
                nums = [int(x) for x in re.findall(r"\b\d{1,2}\b", full_text)]
            return nums[:max_take]
        return [int(x) for x in re.findall(r"\b\d{1,2}\b", full_text)][:max_take]

    win_candidates = _extract_after_heading(r"Winning\s+Numbers", 12)
    add_candidates = _extract_after_heading(r"Additional\s+Number", 3)
    addl = next((v for v in add_candidates if 1 <= v <= 49), None)
    wn = [v for v in win_candidates if 1 <= v <= 49]
    if addl in wn:
        wn.remove(addl)
    wn = sorted(wn[:6])
    return {"winning_numbers": wn, "additional_number": addl}

def _fetch_draw_page(draw_no: int) -> Dict[str, Any]:
    url = SG_URL_DRAW_FMT.format(encoded=_b64_draw_no(draw_no))
    html = _fetch_html(url)
    mdate = re.search(r"(Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s*\d{2}\s+\w+\s+\d{4}", html)
    d = dtparser.parse(mdate.group(0)).astimezone(tz.gettz("Asia/Singapore")).date() if mdate else None
    parsed = _parse_numbers_from_html(html)
    return {
        "draw_no": draw_no,
        "date": d,
        "numbers": parsed["winning_numbers"],
        "additional": parsed["additional_number"],
        "source": url,
    }

def scrape_draws_since(since: dt.date, sleep_sec: float = 0.5) -> List[Dict[str, Any]]:
    """从最近一期往回抓，直到日期 < since 为止；返回按日期升序。"""
    main_html = _fetch_html(SG_URL_MAIN, sleep_sec)
    latest = _extract_latest_draw_no_and_date(main_html)
    latest_no = latest["draw_no"]
    out = []
    dn = latest_no
    while dn > 0:
        info = _fetch_draw_page(dn)
        if info["date"] and info["date"] >= since:
            if len(info["numbers"]) == 6 and all(1 <= x <= 49 for x in info["numbers"]):
                out.append(info)
            dn -= 1
            continue
        break
    out.sort(key=lambda x: x["date"])
    return out

# ===== 额外特征（日期/星期/生日） =====
def date_features(dates: list, birthday: dt.date) -> np.ndarray:
    """给一批开奖日期生成：年内日序(归一化) + 星期(7维one-hot) + 生日距离(归一化) 共 1+7+1=9 维"""
    feats = []
    for d in dates:
        day_of_year = d.timetuple().tm_yday / 366.0
        weekday = d.weekday()  # 0=Mon,...,6=Sun
        weekday_onehot = [1.0 if i == weekday else 0.0 for i in range(7)]
        this_year_bday = birthday.replace(year=d.year)
        delta_days = (d - this_year_bday).days % 365
        bday_dist_norm = delta_days / 365.0
        feats.append([day_of_year] + weekday_onehot + [bday_dist_norm])
    return np.array(feats, dtype=np.float32)

# ===== 频率加权（无DL回退） =====
def _build_weights(draws: List[List[int]], epsilon: float = 0.6) -> Dict[int, float]:
    cnt = Counter()
    for drow in draws:
        cnt.update(drow)
    return {n: cnt[n] + epsilon for n in range(1, 50)}

def _has_long_consecutive(nums: List[int], max_len: int = 2) -> bool:
    nums = sorted(nums); run = 1
    for i in range(1, len(nums)):
        run = run + 1 if nums[i] == nums[i-1] + 1 else 1
        if run > max_len: return True
    return False

def _decade_bucket(n: int) -> int:
    if 1 <= n <= 9: return 0
    if 10 <= n <= 19: return 1
    if 20 <= n <= 29: return 2
    if 30 <= n <= 39: return 3
    return 4

def _passes_constraints(picks: List[int]) -> bool:
    picks = sorted(picks)
    if _has_long_consecutive(picks, 2): return False
    if len({_decade_bucket(n) for n in picks}) < 3: return False
    s = sum(picks)
    if not (90 <= s <= 200): return False
    odd = sum(1 for n in picks if n % 2)
    if odd < 2 or (6 - odd) < 2: return False
    return True

def _weighted_sample_no_replacement(weights: Dict[int, float], k: int = 6, rng: Optional[random.Random] = None) -> List[int]:
    rng = rng or random
    pool = list(weights.keys())
    w = [weights[n] for n in pool]
    picks = []
    for _ in range(k):
        total = sum(w)
        r = rng.random() * total
        acc = 0.0
        for i, wi in enumerate(w):
            acc += wi
            if r <= acc:
                picks.append(pool[i])
                pool.pop(i); w.pop(i)
                break
    return sorted(picks)

def predict_freq_only(draws_main: List[List[int]], seed: Optional[int] = None, attempts: int = 5000) -> List[int]:
    rng = random.Random(seed) if seed is not None else random
    weights = _build_weights(draws_main, epsilon=0.6)
    cand = None
    for _ in range(attempts):
        cand = _weighted_sample_no_replacement(weights, 6, rng)
        if _passes_constraints(cand):
            return sorted(cand)
    return sorted(cand) if cand else []

# ===== 深度学习（含日期/星期/生日特征；无PyTorch时返回None） =====
def predict_with_dl(draws_main: List[List[int]], dates: list, today: dt.date, birthday: dt.date, seed: int = 2025) -> Optional[List[int]]:
    try:
        import torch, torch.nn as nn
    except Exception:
        return None  # 无 PyTorch 则回退

    rng = np.random.default_rng(seed)

    def to_multi_hot(batch):
        Y = np.zeros((len(batch), 49), dtype=np.float32)
        for i, nums in enumerate(batch):
            for n in nums:
                Y[i, n-1] = 1.0
        return Y

    Y = to_multi_hot(draws_main)

    base_feats = np.zeros((len(draws_main), 8), dtype=np.float32) + rng.normal(0, 0.01, (len(draws_main), 8)).astype(np.float32)
    date_feats = date_features(dates, birthday)   # 9 维
    X = np.hstack([base_feats, date_feats])       # 8 + 9 = 17 维

    Xt = torch.tensor(X)
    Yt = torch.tensor(Y)

    class MLP(nn.Module):
        def __init__(self, in_dim=X.shape[1], hidden=64, out_dim=49):
            super().__init__()
            self.net = nn.Sequential(
                nn.Linear(in_dim, hidden), nn.ReLU(),
                nn.Linear(hidden, hidden), nn.ReLU(),
                nn.Linear(hidden, out_dim)
            )
        def forward(self, x): return self.net(x)

    model = MLP()
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)
    crit = nn.BCEWithLogitsLoss()

    for _ in range(600):
        model.train()
        logits = model(Xt)
        loss = crit(logits, Yt)
        opt.zero_grad(); loss.backward(); opt.step()

    # —— 用“今天”的日期特征进行推断 —— #
    today_base = np.zeros((1, 8), dtype=np.float32)
    today_date_feat = date_features([today], birthday)  # (1,9)
    today_X = np.hstack([today_base, today_date_feat])  # (1,17)
    today_Xt = torch.tensor(today_X)

    @torch.no_grad()
    def gen_one():
        model.eval()
        logits = model(today_Xt)
        probs = torch.sigmoid(logits).flatten().numpy()
        probs = np.clip(probs, 1e-6, 1.0); probs = probs / probs.sum()

        def ok(nums):
            nums = sorted(nums)
            run=1
            for i in range(1,6):
                run = run+1 if nums[i]==nums[i-1]+1 else 1
                if run>=3: return False
            if len({(n-1)//10 for n in nums}) < 3: return False
            odd = sum(n%2 for n in nums)
            if odd < 2 or (6-odd) < 2: return False
            return True

        for _ in range(5000):
            cand = sorted(np.random.default_rng(seed+_).choice(np.arange(1,50), size=6, replace=False, p=probs))
            if ok(cand): return cand
        return sorted(np.random.default_rng(seed).choice(np.arange(1,50), size=6, replace=False, p=probs))

    return gen_one()

# ===== 一键执行：抓过去一年 -> 打印样例 -> 预测今天 =====
def run_full_pipeline_with_birthday(seed: Optional[int] = 20250808, sleep_sec: float = 0.5):
    today = _today_sgt_date()
    since = today - dt.timedelta(days=366)
    print(f"📅 今天（SGT）: {today}（周{today.isoweekday()}） | 回溯自: {since}")

    # 抓取数据
    year_draws = scrape_draws_since(since, sleep_sec=sleep_sec)
    if not year_draws:
        print("❌ 未抓到数据（可能官网结构变更或网络问题）。")
        return

    print(f"✅ 抓到 {len(year_draws)} 期。最近几期：")
    for d in year_draws[-min(5, len(year_draws)):]:
        nums = ", ".join(f"{n:02d}" for n in sorted(d["numbers"]))
        addl = f"{d['additional']:02d}" if d["additional"] is not None else "??"
        # 打印周几（1=周一…7=周日）
        print(f"  {d['date']} (周{d['date'].isoweekday()}) | Draw {d['draw_no']:<4d} | [{nums}] + ({addl})")

    draws_main = [rec["numbers"] for rec in year_draws]
    draw_dates = [rec["date"] for rec in year_draws]

    # 深度学习（含日期/星期/生日），失败则回退到频率版
    pred_dl = predict_with_dl(draws_main, draw_dates, today, MY_BIRTHDAY, seed=2025)
    if pred_dl:
        final_pick = pred_dl
        print("\n🎯 深度学习娱乐预测（含日期/星期/生日）：", ", ".join(f"{n:02d}" for n in final_pick))
    else:
        final_pick = predict_freq_only(draws_main, seed=seed)
        print("\n🎯 频率加权娱乐预测：", ", ".join(f"{n:02d}" for n in final_pick))

    # 输出“今天”的预测（娱乐）
    print(f"\n🧾 基于 {since}—{today} 的数据，给出 {today} (周{today.isoweekday()}) 的娱乐选号：")
    print("   " + ", ".join(f"{n:02d}" for n in final_pick))


# —— 直接运行 —— #
run_full_pipeline_with_birthday()


📅 今天（SGT）: 2025-08-11（周1） | 回溯自: 2024-08-10


  for tag in soup.find_all(text=regex):


✅ 抓到 104 期。最近几期：
  2025-07-24 (周4) | Draw 4098 | [09, 11, 24, 32, 39, 49] + (24)
  2025-07-28 (周1) | Draw 4099 | [02, 14, 16, 21, 36, 47] + (28)
  2025-07-31 (周4) | Draw 4100 | [07, 19, 20, 21, 22, 29] + (31)
  2025-08-04 (周1) | Draw 4101 | [30, 32, 40, 43, 45, 49] + (04)
  2025-08-08 (周5) | Draw 4102 | [02, 15, 28, 39, 42, 44] + (08)

🎯 深度学习娱乐预测（含日期/星期/生日）： 08, 17, 18, 24, 33, 40

🧾 基于 2024-08-10—2025-08-11 的数据，给出 2025-08-11 (周1) 的娱乐选号：
   08, 17, 18, 24, 33, 40
