In [7]:
import time
import csv
import sys
from urllib.parse import urljoin, urlparse, parse_qs, urlencode, urlunparse
import requests
import pandas as pd
from bs4 import BeautifulSoup # --> pip install beautifulsoup4

In [5]:

# Listing page that is crawled by default; also the base URL against which
# hrefs found in the page are resolved with urljoin().
BASE = "https://www.chess.com/tournament/live/titled-tuesdays"
# HTTP headers sent with every request issued by get_soup().
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; titled-tuesday-archiver/1.2; +https://example.org)",
    "Accept-Language": "en-US,en;q=0.9",
}
REQUEST_DELAY_SECS = 1.25  # polite delay: seconds slept between listing-page requests


def get_soup(session, url):
    """
    Download `url` through `session` and return the body parsed as HTML.

    The request is sent with the module-level HEADERS and a 30 second timeout.
    A non-success HTTP status is turned into an exception by
    `raise_for_status()` before any parsing happens.
    """
    response = session.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    html = response.text
    return BeautifulSoup(html, "html.parser")


def extract_rows_with_date_and_link(soup):
    """
    Pair every date element on a listing page with a tournament link.

    For each element carrying the CSS class 'tournaments-live-date' the
    function climbs at most four ancestors. At the first ancestor that
    contains an <a href> whose href includes '/tournament/live/' and
    (case-insensitively) 'titled', it records the date text together with the
    first such link, resolved against BASE. Date elements for which no
    ancestor within that range holds a matching link are skipped.

    Returns a list of dicts with the keys 'Date' and 'tournament_url'.
    """
    def _is_date_class(value):
        # Tags without a class attribute pass None here.
        return value and "tournaments-live-date" in value.split()

    pairs = []
    for node in soup.find_all(attrs={"class": _is_date_class}):
        label = node.get_text(strip=True)

        ancestor = node
        levels_left = 4  # climb a few levels at most to reach the row container
        while levels_left > 0:
            levels_left -= 1
            ancestor = ancestor.find_parent()
            if ancestor is None:
                break

            # Every anchor in this ancestor, in document order, that looks
            # like a Titled Tuesday tournament page.
            matches = [
                urljoin(BASE, anchor["href"])
                for anchor in ancestor.find_all("a", href=True)
                if "/tournament/live/" in anchor["href"]
                and "titled" in anchor["href"].lower()
            ]
            if matches:
                # The first match is taken as the row's canonical link.
                pairs.append({"Date": label, "tournament_url": matches[0]})
                break

    return pairs


def find_next_page_href(soup):
    """
    Locate the link to the next listing page, if the page exposes one.

    Lookup order:
      1. the first <a> whose `rel` attribute contains 'next' and which has a
         non-empty href;
      2. otherwise the first <a href> whose aria-label is exactly 'next'
         (case-insensitive) or whose stripped, lower-cased text is one of
         'next', 'older', '›', '»'.

    Returns the href resolved against BASE, or None when nothing matches.
    """
    rel_next = soup.find("a", rel=lambda value: value and "next" in value.lower())
    if rel_next and rel_next.get("href"):
        return urljoin(BASE, rel_next["href"])

    next_texts = {"next", "older", "›", "»"}
    resolved = (
        urljoin(BASE, anchor["href"])
        for anchor in soup.find_all("a", href=True)
        if anchor.get("aria-label", "").lower() == "next"
        or anchor.get_text(strip=True).lower() in next_texts
    )
    return next(resolved, None)


def with_page_param(url, page):
    """
    Return `url` with its `page` query parameter set to `page`.

    An existing `page` parameter is replaced; all other query parameters,
    including keys that occur more than once, are preserved. The scheme,
    host, path and fragment are left untouched.

    Note: the query string is parsed with `parse_qs` defaults, so parameters
    with a blank value (e.g. `?q=`) are not carried over.

    Parameters:
        url:  absolute or relative URL string.
        page: page number (anything `str()` can render).

    Returns:
        The rewritten URL as a string.
    """
    parts = urlparse(url)
    query = parse_qs(parts.query)
    query["page"] = [str(page)]
    # doseq=True emits one key=value pair per list element; without it a
    # repeated key would be encoded as the repr of a Python list.
    return urlunparse(parts._replace(query=urlencode(query, doseq=True)))


def crawl_pages(start_url = BASE, max_pages = 3, start_page = 1):
    """
    Crawl up to 'max_pages' listing pages and return a DataFrame with:
        columns = ['Date', 'tournament_url']

    Parameters:
        start_url:  listing URL to start from (also the URL to which
                    ?page=N is appended when probing).
        max_pages:  maximum number of pages to fetch; None means no limit.
        start_page: page number to start at; values > 1 make the first
                    request go to ?page=<start_page>.

    Behavior:
      - Follow a visible 'Next' link for as long as one is found.
      - Once no usable 'Next' link exists, probe ?page=N, where N is the page
        after the one just fetched.
      - Stop when 'max_pages' pages were fetched, when the next URL was
        already visited, or when a page without a 'Next' link yields no rows
        (taken as the end of the listing).
      - Sleep REQUEST_DELAY_SECS between consecutive requests.
      - Deduplicate rows by (Date, URL).

    Raises:
        Whatever get_soup() raises (e.g. on a non-success HTTP status);
        rows collected so far are not returned in that case.
    """
    seen_pages = set()
    rows = []

    url = with_page_param(start_url, start_page) if start_page > 1 else start_url
    page_count = 0
    follow_next_links = True   # flips to False once a page has no usable Next link
    probe_page = max(start_page, 1)

    # The context manager closes the session's connections on exit.
    with requests.Session() as session:
        while url and (max_pages is None or page_count < max_pages):
            if url in seen_pages:
                break
            if page_count:
                # Polite delay between requests (none before the first one
                # and none after the last).
                time.sleep(REQUEST_DELAY_SECS)
            seen_pages.add(url)
            page_count += 1

            print(f"[page {page_count}] {url}", file=sys.stderr)
            soup = get_soup(session, url)

            page_rows = extract_rows_with_date_and_link(soup)
            print(f"  extracted {len(page_rows)} date/url pairs", file=sys.stderr)
            rows.extend(page_rows)

            # Try a visible Next link first
            if follow_next_links:
                nxt = find_next_page_href(soup)
                if nxt and nxt not in seen_pages:
                    url = nxt
                    continue
                # Switch to probe mode: sync the counter with the page that
                # was just fetched so the single increment below targets the
                # page right after it.
                follow_next_links = False
                try:
                    current_query = parse_qs(urlparse(url).query)
                    probe_page = int(current_query.get("page", [probe_page])[0])
                except ValueError:
                    pass  # non-numeric ?page= value: keep the running counter

            # In probe mode an empty page means we ran past the last page.
            if not page_rows:
                break

            # Probe mode via ?page=N
            probe_page += 1
            candidate = with_page_param(start_url, probe_page)
            url = candidate if candidate not in seen_pages else None

    # Build DataFrame and de-duplicate
    df = pd.DataFrame(rows, columns=["Date", "tournament_url"]).drop_duplicates().reset_index(drop=True)
    return df


def main():
    """
    Example entry point: crawl the first two listing pages, write the result
    to 'titled_tuesday_dates_and_links.csv' (without the index column) and
    print the row count, the output path and the first rows.
    """
    output_path = "titled_tuesday_dates_and_links.csv"

    # Adjust max_pages to crawl more of the archive.
    listing = crawl_pages(max_pages=2)
    print(f"\nCollected {len(listing)} rows")

    listing.to_csv(output_path, index=False)
    print(f"Saved to {output_path}")

    # Quick plain-text preview of the first rows.
    preview = listing.head().to_string(index=False)
    print(preview)


In [8]:
# Crawl up to 24 listing pages starting at BASE. crawl_pages() sleeps
# REQUEST_DELAY_SECS between page requests, so this cell takes a while to run.
links = crawl_pages(BASE, max_pages=24)

[page 1] https://www.chess.com/tournament/live/titled-tuesdays
  extracted 25 date/url pairs
[page 2] https://www.chess.com/tournament/live/titled-tuesdays?page=2
  extracted 25 date/url pairs
[page 3] https://www.chess.com/tournament/live/titled-tuesdays?page=3
  extracted 25 date/url pairs
[page 4] https://www.chess.com/tournament/live/titled-tuesdays?page=4
  extracted 25 date/url pairs
[page 5] https://www.chess.com/tournament/live/titled-tuesdays?page=5
  extracted 25 date/url pairs
[page 6] https://www.chess.com/tournament/live/titled-tuesdays?page=6
  extracted 25 date/url pairs
[page 7] https://www.chess.com/tournament/live/titled-tuesdays?page=7
  extracted 25 date/url pairs
[page 8] https://www.chess.com/tournament/live/titled-tuesdays?page=8
  extracted 25 date/url pairs
[page 9] https://www.chess.com/tournament/live/titled-tuesdays?page=9
  extracted 25 date/url pairs
[page 10] https://www.chess.com/tournament/live/titled-tuesdays?page=10
  extracted 25 date/url pairs
[page

In [15]:
# Parse the scraped date strings into datetimes and order newest-first.
# .assign() builds a new frame, so the raw `links` result of crawl_pages() is
# left unmodified and this cell can be re-run without side effects.
links_df = (
    links
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
    .sort_values(by="Date", ascending=False)
)

In [16]:
# Display the date-sorted frame (the notebook renders a truncated preview).
links_df

Unnamed: 0,Date,tournament_url
0,2025-10-21 08:00:00,https://www.chess.com/tournament/live/titled-t...
1,2025-10-14 08:00:00,https://www.chess.com/tournament/live/titled-t...
2,2025-10-07 08:00:00,https://www.chess.com/tournament/live/titled-t...
3,2025-09-30 08:00:00,https://www.chess.com/tournament/live/titled-t...
4,2025-09-23 08:00:00,https://www.chess.com/tournament/live/titled-t...
...,...,...
556,2015-03-03 11:00:00,https://www.chess.com/tournament/live/-titled-...
557,2015-02-24 11:00:00,https://www.chess.com/tournament/live/-titled-...
558,2015-01-27 12:30:00,https://www.chess.com/tournament/live/-titled-...
559,2015-01-27 11:00:00,https://www.chess.com/tournament/live/-titled-...


In [17]:
# Persist the sorted links. index=False is not passed, so the DataFrame index
# is written as an unnamed first column of the CSV.
links_df.to_csv('tournament_link.csv')