In [8]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
import random
from IPython.display import display
import re

# 1) Create a Session with a random or fixed User-Agent
s = requests.Session()
s.headers.update({
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/109.0.0.0 Safari/537.36"
    )
})

def get_soup(session, url):
    """Fetch a URL, return BeautifulSoup or None on failure."""
    try:
        resp = session.get(url, timeout=10)
        # Check status code
        if resp.status_code == 200:
            return BeautifulSoup(resp.text, 'html.parser')
        elif resp.status_code == 429:
            print("Received 429. Too many requests. Backing off.")
            # Wait longer or do an exponential backoff
            time.sleep(60)
            return None
        else:
            print(f"Error: got status {resp.status_code} for {url}")
            return None
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None
    
def get_competition_links(allowed_links, soup):
    table = soup.find('table', class_='liste')
    if not table:
        print("Could not find table with class 'liste'")
        return []
    
    links = table.find_all('a')
    print(f"Found {len(links)} links in table")

    hrefs = [l.get('href') for l in links if l.get('href')]
    print(f"Found {len(hrefs)} hrefs")
    
    #Select only for Top 5 competitions
    competition_urls = []
    for link in links:
        if link['href'] in allowed_links:
            competition_urls.append(f"http://clubelo.com/{link['href']}")
         
    return competition_urls

def main():
    elo_url = "http://clubelo.com/"
    print(f"Fetching {elo_url}")
    
    soup_elo = get_soup(s, elo_url)
    if not soup_elo:
        print("Failed to get soup from main page")
        return
    
    allowed_links = {"/ENG", "/ESP", "/ITA", "/GER", "/FRA"}
    competition_links = get_competition_links(allowed_links, soup_elo)
    print(f"\nTotal competition links found: {len(competition_links)}")

    for link in competition_links:
        print(link)

if __name__ == "__main__":
    main()


    




Fetching http://clubelo.com/
Found 60 links in table
Found 60 hrefs

Total competition links found: 5
http://clubelo.com//ENG
http://clubelo.com//ESP
http://clubelo.com//ITA
http://clubelo.com//FRA
http://clubelo.com//GER
