In [None]:
import os
import re
from datetime import datetime, date
from urllib.parse import urljoin, urlparse, parse_qs

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Request headers passed as ``headers=`` to the ``requests.get`` calls below.
UA = {"User-Agent": "Mozilla/5.0"}
# Default listing page scraped by get_dateids(); the last path segment is a
# percent-encoded tag name.
TAG_URL = "https://min-repo.com/tag/%E3%83%80%E3%82%A4%E3%82%A8%E3%83%BC%E7%94%BA%E5%8C%97%E7%94%BA%E5%BA%97/"
# Matches the machine name looked for among the links of a daily page.
AIM_PATTERN = re.compile(r"ネオアイムジャグラーEX")

def _extract_dateid_from_href(href: str) -> str | None:
    if not href:
        return None
    u = urlparse(href)
    last = u.path.rstrip("/").split("/")[-1]
    if last.isdigit():
        return last
    qs = parse_qs(u.query)
    for k in ("dateid", "date_id", "id", "p"):
        if k in qs and qs[k]:
            return qs[k][0]
    m = re.search(r"(?<!\d)(\d{6,})(?!\d)", href)
    return m.group(1) if m else None

def get_dateids(list_url: str = TAG_URL) -> list[str]:
    """Collect report identifiers from a listing page, in page order.

    The page at ``list_url`` is downloaded and every row matched by
    ``div.table_wrap table tbody tr`` is inspected: the link in the row's
    first cell is handed to ``_extract_dateid_from_href``. Rows without such
    a link, or whose link yields no identifier, are skipped. Duplicates are
    dropped, keeping the first occurrence.

    Raises ``requests.HTTPError`` when the response has an error status.
    """
    response = requests.get(list_url, headers=UA, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    # A dict keeps insertion order, so it doubles as an ordered set.
    collected: dict[str, None] = {}
    for row in soup.select("div.table_wrap table tbody tr"):
        link = row.select_one("td:nth-of-type(1) a[href]")
        if link is None:
            continue
        dateid = _extract_dateid_from_href(link.get("href"))
        if dateid:
            collected.setdefault(dateid, None)
    return list(collected)


def _to_int(s):

    """
    数値を整数に変換する関数。
    サイト側の表記揺れ（全角マイナスやダッシュ等）によるエラーを回避し、
    確実にマイナス値を取得できるように処理。
    """
   
    if pd.isna(s) or s is None:
        return None
    
   
    s_str = str(s).replace(",", "")
    
    s_str = re.sub(r"[－−—–]", "-", s_str)
    
    try:
        return int(s_str)
    except ValueError:
       
        return None


def _parse_iso_date(text: str) -> date | None:
    """Parse a ``YYYY-MM-DD`` string, returning ``None`` when it is not a valid date."""
    try:
        return datetime.strptime(text, "%Y-%m-%d").date()
    except ValueError:
        return None


def parse_daily_page(dateid: str, base: str = "https://min-repo.com/") -> dict:
    """Fetch one day's summary page and return its date and the target machine link.

    Args:
        dateid: Identifier of the daily report; appended to ``base`` as a
            path segment to build the page URL.
        base: Root URL the daily page URL is resolved against.

    Returns:
        A dict with three keys:

        * ``"date"``: the report date as a ``datetime.date``;
        * ``"dow"``: ``date.strftime("%a")`` of that date (abbreviated
          weekday name; NOTE: ``%a`` follows the process locale);
        * ``"kishu_url"``: absolute URL of the first link whose text matches
          ``AIM_PATTERN``, or ``None`` when the page has no such link.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        RuntimeError: If no valid report date can be found on the page.
    """
    day_url = urljoin(base, f"{dateid}/")
    r = requests.get(day_url, headers=UA, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "lxml")

    # Machine link: prefer links inside the machine table, then fall back to
    # any anchor on the page whose string matches the pattern.
    kishu_url = None
    for a in soup.select("table.kishu a[href]"):
        if AIM_PATTERN.search(a.get_text(strip=True) or ""):
            kishu_url = urljoin(day_url, a["href"])
            break
    if not kishu_url:
        a_tag = soup.find("a", string=AIM_PATTERN)
        if a_tag and a_tag.has_attr("href"):
            kishu_url = urljoin(day_url, a_tag["href"])

    # Report date: first try <time class="date" datetime="...">, using only
    # the leading YYYY-MM-DD part of the attribute.
    report_date = None
    time_tag = soup.find("time", class_="date")
    if time_tag and time_tag.has_attr("datetime"):
        report_date = _parse_iso_date(time_tag["datetime"][:10])

    # Fallback: a "date=YYYY-MM-DD" occurrence in the raw HTML. A match that
    # is not a real calendar date is skipped instead of leaking a ValueError.
    if report_date is None:
        for m in re.finditer(r"date=(\d{4}-\d{2}-\d{2})", r.text):
            report_date = _parse_iso_date(m.group(1))
            if report_date is not None:
                break

    if report_date is None:
        raise RuntimeError(f"日付の特定に失敗しました: {day_url}")

    return {
        "date": report_date,
        "dow": report_date.strftime("%a"),
        "kishu_url": kishu_url,
    }

def scrape_kishu_data_table(kishu_url: str) -> pd.DataFrame:
    """Download a machine page and return its data table as a DataFrame.

    The table is located in two steps: first the table following an
    ``h2``/``h3`` heading whose text contains "データ一覧"; failing that, the
    first table whose header cells include all of 台番, 差枚 and G数.

    Column names come from the ``th`` cells of the table's first row; each
    later row that has ``td`` cells becomes one record (cells are paired with
    headers positionally). The 差枚, G数, BB and RB columns, when present,
    are converted with ``_to_int``.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        RuntimeError: If no suitable table is found.
    """
    response = requests.get(kishu_url, headers=UA, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    def _is_data_heading(tag):
        return tag.name in ("h2", "h3") and "データ一覧" in tag.get_text()

    # Step 1: table that follows the data-list heading.
    table = None
    heading = soup.find(_is_data_heading)
    if heading is not None:
        table = heading.find_next("table")

    # Step 2: first table exposing all required header labels.
    if table is None:
        required = {"台番", "差枚", "G数"}
        table = next(
            (
                candidate
                for candidate in soup.select("table")
                if required
                <= {th.get_text(strip=True) for th in candidate.select("tr th")}
            ),
            None,
        )
    if table is None:
        raise RuntimeError("データ一覧のテーブルが見つかりませんでした。")

    table_rows = table.select("tr")
    headers = [th.get_text(strip=True) for th in table_rows[0].select("th")]

    # Rows without any <td> (e.g. extra header rows) are ignored.
    records = [
        {name: cell.get_text(strip=True) for name, cell in zip(headers, cells)}
        for cells in (tr.find_all("td") for tr in table_rows[1:])
        if cells
    ]
    df = pd.DataFrame(records)

    numeric_columns = [c for c in ("差枚", "G数", "BB", "RB") if c in df.columns]
    for column in numeric_columns:
        df[column] = df[column].map(_to_int)

    return df

def save_aim_table(dateid: str, out_dir: str = ".") -> str:
    """Scrape one day's machine table and save the columns of interest as CSV.

    The output file is named ``aim-<YYYY-MM-DD>-<dow>.csv`` from the values
    returned by ``parse_daily_page``. If that file already exists it is left
    untouched and the machine page is not downloaded at all.

    Args:
        dateid: Identifier of the daily report to process.
        out_dir: Directory the CSV is written to; created if missing.

    Returns:
        Path of the CSV file (whether newly written or already present).

    Raises:
        RuntimeError: If the daily page has no link to the target machine.
    """
    page_info = parse_daily_page(dateid)

    kishu_url = page_info["kishu_url"]
    if not kishu_url:
        raise RuntimeError("対象機種のリンクが見つかりません。")

    fname = f"aim-{page_info['date']:%Y-%m-%d}-{page_info['dow']}.csv"
    path = os.path.join(out_dir, fname)

    # Decide whether to skip before scraping: the file name depends only on
    # the daily page, so an existing file saves a second HTTP request.
    if os.path.exists(path):
        print(f"[SKIP] 既存データあり: {path}")
        return path

    df = scrape_kishu_data_table(kishu_url)

    # Keep only the wanted columns that the scraped table actually has,
    # in this fixed order.
    wanted = ["台番", "差枚", "G数", "出率", "BB", "RB", "合成", "BB率", "RB率"]
    df = df[[c for c in wanted if c in df.columns]]

    os.makedirs(out_dir, exist_ok=True)
    # utf-8-sig writes a BOM at the start of the file.
    df.to_csv(path, index=False, encoding="utf-8-sig")
    print(f"[OK] 保存完了: {path}")
    return path

def main():
    """Save the table for the first report listed on the tag page.

    Exits with an error message when the listing yields no identifiers.
    """
    dateids = get_dateids(TAG_URL)
    if not dateids:
        raise SystemExit("dateidが取れませんでした。")

    # Per the original author's note, index 0 is the previous day's report.
    newest_dateid, *_ = dateids

    # Output directory for the CSV.
    save_aim_table(newest_dateid, out_dir="data")

# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

[OK] 保存完了: .\aim-2026-02-13-Fri.csv
