<a href="https://colab.research.google.com/github/thmswhelan/GAA_Rankings/blob/main/Gaa_Football_Rankings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import csv

# URL for GAA fixtures or competition page
url = "https://www.gaa.ie/fixtures-results"

# Send HTTP request to fetch the page content
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    page_content = response.content
    soup = BeautifulSoup(page_content, "html.parser")

    # Prepare CSV file to save the data
    with open('gaa_matches.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Write headers to the CSV
        writer.writerow(["Competition", "Home Team", "Away Team", "Date", "Venue", "Home Score", "Away Score"])

        # Extract match details
        match_details = soup.find_all("div", class_="gar-match-item")

        for match in match_details:
            # Example of extracting team names
            home_team = match.find("div", class_="gar-match-item__team -home")
            away_team = match.find("div", class_="gar-match-item__team -away")

            # Extract the competition name for each match (assuming it may be inside the match item)
            competition = match.find_previous("h3", class_="gar-matches-list__group-name")
            competition_name = competition.get_text(strip=True) if competition else "Unknown Competition"

            # Extract the date of the match
            match_date = match.get("data-match-date", "No date available")

            # Extract the venue for the match
            venue = match.find("div", class_="gar-match-item__venue")
            venue_name = venue.get_text(strip=True) if venue else "No venue available"

            # Extract the home and away scores
            scores = match.find_all("div", class_="gar-match-item__score")

            # If scores are found, we process them correctly
            if len(scores) == 2:
                home_score = scores[0].get_text(strip=True)
                away_score = scores[1].get_text(strip=True)
            else:
                home_score = "N/A"
                away_score = "N/A"

            # If the score contains a date, we make sure to extract only the score
            if '-' not in home_score or '-' not in away_score:
                home_score = "Invalid score"
                away_score = "Invalid score"

            # Get team names
            if home_team and away_team:
                home_team_name = home_team.get_text(strip=True)
                away_team_name = away_team.get_text(strip=True)
            else:
                home_team_name = "Unknown"
                away_team_name = "Unknown"

            # Write each match's data to the CSV file
            writer.writerow([competition_name, home_team_name, away_team_name, match_date, venue_name, home_score, away_score])

    print("Data saved to 'gaa_matches.csv'.")
else:
    print("Failed to retrieve the page. Status Code:", response.status_code)



Data saved to 'gaa_matches.csv'.


In [3]:
# Install Playwright and dependencies
!pip install playwright
!playwright install


╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Missing libraries:                                   ║
║     libwoff2dec.so.1.0.2                             ║
║     libgstgl-1.0.so.0                                ║
║     libgstcodecparsers-1.0.so.0                      ║
║     libavif.so.13                                    ║
║     libharfbuzz-icu.so.0                             ║
║     libenchant-2.so.2                                ║
║     libsecret-1.so.0                                 ║
║     libhyphen.so.0                                   ║
║     libmanette-0.2.so.0                              ║
╚══════════════════════════════════════════════════════╝
    at validateDependenciesLinux (/usr/local/lib/python3.11/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:216:9)
    at async Registry._validateHostRequirements (/usr/local/lib/python3.11/dist-packages/playwright/driver/package/l

In [12]:
!pip install playwright nest_asyncio
!playwright install


╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Missing libraries:                                   ║
║     libwoff2dec.so.1.0.2                             ║
║     libgstgl-1.0.so.0                                ║
║     libgstcodecparsers-1.0.so.0                      ║
║     libavif.so.13                                    ║
║     libharfbuzz-icu.so.0                             ║
║     libenchant-2.so.2                                ║
║     libsecret-1.so.0                                 ║
║     libhyphen.so.0                                   ║
║     libmanette-0.2.so.0                              ║
╚══════════════════════════════════════════════════════╝
    at validateDependenciesLinux (/usr/local/lib/python3.11/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:216:9)
[90m    at process.processTicksAndRejections (node:internal/process/task_queues:105:5)[39m
    at async Registry._

In [15]:
import nest_asyncio
import asyncio
import pandas as pd
from playwright.async_api import async_playwright

nest_asyncio.apply()

async def scrape_gaa_fixtures():
    years = [2024, 2025]
    months = [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December"
    ]

    all_matches = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()

        print("Opening GAA Fixtures page...")
        await page.goto("https://www.gaa.ie/fixtures-results/", timeout=90000)

        # ✅ Wait for a *more reliable* element (like the header) to confirm page is ready
        await page.wait_for_selector('header', timeout=90000)
        print("Page loaded!")

        for year in years:
            for month in months:
                print(f"Selecting {year} - {month}")

                try:
                    # Open Year dropdown
                    await page.click('div[data-name="year"]')
                    await asyncio.sleep(1)

                    # Select year
                    year_locator = page.locator(f'ul[role="listbox"] >> text="{year}"')
                    await year_locator.click()
                    await asyncio.sleep(1)

                    # Open Month dropdown
                    await page.click('div[data-name="month"]')
                    await asyncio.sleep(1)

                    # Select month
                    month_locator = page.locator(f'ul[role="listbox"] >> text="{month}"')
                    await month_locator.click()
                    await asyncio.sleep(1)

                    # ✅ Click Filter button AFTER filters selected
                    await page.click('button:has-text("Filter")')
                    await asyncio.sleep(3)  # wait for results to update

                    matches = await page.query_selector_all('.gar-match-item')

                    if not matches:
                        print(f"No matches for {year}-{month}")
                        continue

                    for match in matches:
                        try:
                            competition = await match.query_selector('.gar-match-item__competition')
                            competition = await competition.inner_text() if competition else ''

                            home_team = await match.query_selector('.gar-match-item__team.-home')
                            home_team = await home_team.inner_text() if home_team else ''

                            away_team = await match.query_selector('.gar-match-item__team.-away')
                            away_team = await away_team.inner_text() if away_team else ''

                            scores = await match.query_selector_all('.gar-match-item__score')
                            home_score = await scores[0].inner_text() if len(scores) > 0 else ''
                            away_score = await scores[1].inner_text() if len(scores) > 1 else ''

                            date = await match.query_selector('.gar-match-item__date')
                            date = await date.inner_text() if date else ''

                            venue = await match.query_selector('.gar-match-item__venue')
                            venue = await venue.inner_text() if venue else ''

                            all_matches.append({
                                'Competition': competition,
                                'Home Team': home_team,
                                'Home Score': home_score,
                                'Away Team': away_team,
                                'Away Score': away_score,
                                'Date': date,
                                'Venue': venue,
                                'Month': month,
                                'Year': year
                            })
                        except Exception as e:
                            print(f"Error parsing match: {e}")

                except Exception as e:
                    print(f"Error selecting {year}-{month}: {e}")

        await browser.close()

    # Save the results
    df = pd.DataFrame(all_matches)
    df.to_csv('gaa_fixtures_results.csv', index=False)
    print("✅ Data saved to gaa_fixtures_results.csv")

# Run
await scrape_gaa_fixtures()


Opening GAA Fixtures page...
Page loaded!
Selecting 2024 - January
Error selecting 2024-January: Page.click: Timeout 30000ms exceeded.
Call log:
  - waiting for locator("div[data-name=\"year\"]")

Selecting 2024 - February
Error selecting 2024-February: Page.click: Timeout 30000ms exceeded.
Call log:
  - waiting for locator("div[data-name=\"year\"]")

Selecting 2024 - March
Error selecting 2024-March: Page.click: Timeout 30000ms exceeded.
Call log:
  - waiting for locator("div[data-name=\"year\"]")

Selecting 2024 - April
Error selecting 2024-April: Page.click: Timeout 30000ms exceeded.
Call log:
  - waiting for locator("div[data-name=\"year\"]")

Selecting 2024 - May


CancelledError: 

In [19]:
# Install necessary dependencies
!apt-get update
!apt-get install -y wget
!apt-get install -y libnss3 libgdk-pixbuf2.0-0 libasound2 libatk-bridge2.0-0 libgtk-3-0
!pip install playwright
!playwright install

# Run your Playwright scraping script in headless mode
import asyncio
from playwright.async_api import async_playwright
import csv

# Function to scrape GAA fixtures and results
async def scrape_gaa_fixtures():
    async with async_playwright() as p:
        # Launch the browser in headless mode
        browser = await p.chromium.launch(headless=True)  # Use 'chromium' for headless mode
        page = await browser.new_page()

        # Navigate to the GAA fixtures page
        print("Opening GAA Fixtures page...")
        await page.goto("https://www.gaa.ie/fixtures-results/")
        print("Page loaded!")

        # Wait for the page to load completely
        await page.wait_for_selector('div.gar-fr-filter__year')  # Wait for the year filter to load

        # Now, let's go through the year and month options
        years = ["2024", "2025"]  # Example years to scrape
        months = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]

        all_matches = []

        for year in years:
            # Select the year filter
            print(f"Selecting {year}")
            year_selector = await page.query_selector(f'div.head-text:has-text("{year}")')
            if year_selector:
                await year_selector.click()
                await page.wait_for_selector(f'li.dropdown-li:has-text("{year}")')

            for month in months:
                print(f"Selecting {year}-{month}")
                month_selector = await page.query_selector(f'div.head-text:has-text("{month}")')
                if month_selector:
                    await month_selector.click()
                    await page.wait_for_selector(f'li.dropdown-li:has-text("{month}")')

                # Click the "Filter" button to apply the selected year and month
                await page.click("button:has-text('Filter')")
                await page.wait_for_selector("div.gar-match-item")  # Wait for the matches to load

                # Now, scrape the match data for the selected month and year
                matches = await page.query_selector_all('div.gar-match-item')

                for match in matches:
                    try:
                        competition = await match.query_selector('div.gar-match-item__competition')
                        competition_text = await competition.inner_text() if competition else 'N/A'

                        home_team = await match.query_selector('div.gar-match-item__team.-home')
                        home_team_text = await home_team.inner_text() if home_team else 'N/A'

                        away_team = await match.query_selector('div.gar-match-item__team.-away')
                        away_team_text = await away_team.inner_text() if away_team else 'N/A'

                        scores = await match.query_selector_all('div.gar-match-item__score')
                        home_score = await scores[0].inner_text() if len(scores) > 0 else 'N/A'
                        away_score = await scores[1].inner_text() if len(scores) > 1 else 'N/A'

                        date = await match.query_selector('div.gar-match-item__date')
                        date_text = await date.inner_text() if date else 'N/A'

                        venue = await match.query_selector('div.gar-match-item__venue')
                        venue_text = await venue.inner_text() if venue else 'N/A'

                        all_matches.append({
                            'Competition': competition_text,
                            'Home Team': home_team_text,
                            'Home Score': home_score,
                            'Away Team': away_team_text,
                            'Away Score': away_score,
                            'Date': date_text,
                            'Venue': venue_text,
                            'Month': month,
                            'Year': year
                        })
                    except Exception as e:
                        print(f"Error parsing match: {e}")

        # Save all data to a CSV file
        with open('gaa_fixtures_results.csv', 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Competition', 'Home Team', 'Home Score', 'Away Team', 'Away Score', 'Date', 'Venue', 'Month', 'Year']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_matches)

        print("✅ Data saved to gaa_fixtures_results.csv")

        # Close the browser after scraping
        await browser.close()

# Run the scraper using asyncio
asyncio.run(scrape_gaa_fixtures())


0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Get:2 https://dl.google.com/linux/chrome/deb stable InRelease [1,825 B]
Hit:3 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 https://dl.google.com/linux/chrome/deb stable/main amd64 Packages [1,218 B]
Hit:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Fetched 260 kB in 1s (182 kB/s)
Reading package lists... Done
W: Skipping acquire of con

TimeoutError: Page.click: Timeout 30000ms exceeded.
Call log:
  - waiting for locator("button:has-text('Filter')")
