In [None]:
pip install requests



In [162]:
# Import necessary libraries
# import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

class ResultsScraper:
    """
    A class to scrape and process results from a paginated HTML results table.

    A page is downloaded with ``fetch_html_data``, parsed with ``parse_html``
    and turned into dictionaries by ``extract_results``. Header labels are
    cached on the instance after the first successful ``get_column_names``.
    """

    # (output key, index into the sliced cell list, read the text of the
    # cell's <a> tag instead of the cell itself). The order here is the key
    # order of every result dictionary.
    _FIELD_LAYOUT = (
        ("Pos", 0, False),
        ("Name", 4, True),
        ("Time", 6, False),
        ("Medal", 7, False),
        ("Club", 8, False),
        ("Gender", 9, True),
        ("Category", 11, True),
        ("Start", 13, False),
        ("5km", 14, False),
        ("14km", 15, False),
        ("28km", 16, False),
        ("42.2km", 17, False),
        ("50km", 18, False),
    )

    # Highest index used above is 18, so a usable row needs 19 sliced cells.
    _MIN_CELLS = 19

    def __init__(self, base_url):
        """
        Initialize the ResultsScraper object.

        Parameters:
            base_url (str): The base URL of the web page to scrape.
        """
        self.base_url = base_url
        # Filled lazily by get_column_names(); None until then.
        self.column_names = None

    def fetch_html_data(self, url, timeout=30):
        """
        Fetch HTML data from a URL.

        Parameters:
            url (str): The URL to fetch HTML data from.
            timeout (float): Seconds to wait for the server before giving up.

        Returns:
            str or None: The decoded page body, or None when the request
            fails (the error is printed, not raised).
        """
        # Imported here so the method works without touching the file's
        # top-level imports; both modules are part of the standard library.
        import http.client
        import urllib.request

        try:
            with urllib.request.urlopen(url, timeout=timeout) as response:
                # Use the charset announced by the server, else assume UTF-8.
                charset = response.headers.get_content_charset() or "utf-8"
                return response.read().decode(charset, errors="replace")
        except (OSError, ValueError, LookupError, http.client.HTTPException) as e:
            # OSError covers URLError/HTTPError and timeouts, ValueError a
            # malformed URL, LookupError an unknown charset name.
            print(f"Error fetching HTML from {url}: {e}")
            return None

    def parse_html(self, html_data):
        """
        Parse HTML data using BeautifulSoup.

        Parameters:
            html_data (str): The HTML data to parse.

        Returns:
            BeautifulSoup object: Parsed BeautifulSoup object, or None when
            html_data is empty or None.
        """
        if html_data:
            return BeautifulSoup(html_data, "html.parser")
        return None

    def get_column_names(self, soup):
        """
        Extract column names from the HTML data.

        The labels are read from the links in the second table row (at most
        the first 12). 'Race No' is relabelled 'Name' and a 'Medal' column is
        inserted right after 'Time'; each adjustment is applied only when
        the corresponding label is present.

        Parameters:
            soup (BeautifulSoup object): Parsed BeautifulSoup object.

        Returns:
            list: List of column names. An empty list is returned (and
            nothing is cached) when soup is None or has no header row.
        """
        if self.column_names:
            return self.column_names
        if soup is None:
            return []

        table_rows = soup.find_all('tr')
        if len(table_rows) < 2:
            return []

        # Build locally so a failure cannot leave a half-filled cache behind.
        names = [tag.text for tag in table_rows[1].find_all('a')[:12]]
        if 'Race No' in names:
            names[names.index('Race No')] = 'Name'
        if 'Time' in names:
            names.insert(names.index('Time') + 1, 'Medal')

        self.column_names = names
        return self.column_names

    @staticmethod
    def _cell_text(cell, from_link):
        """Return the text of the cell's <a> tag when requested and present, else the cell text."""
        if from_link:
            link = cell.find('a')
            if link is not None:
                return link.text
        return cell.text

    def extract_results(self, soup, page_number):
        """
        Extract results from the HTML data.

        Parameters:
            soup (BeautifulSoup object): Parsed BeautifulSoup object.
            page_number (int): The page number of the results.

        Returns:
            list: List of dictionaries containing result data. Rows with too
            few cells are skipped instead of aborting the whole page.
        """
        if not soup:
            return []

        all_results = []
        # Skip the first two rows and the last one; presumably title/header
        # rows and a trailing pager row (verify against the live page).
        for row in soup.find_all('tr')[2:-1]:
            cells = row.find_all('td')[2:21]
            if len(cells) < self._MIN_CELLS:
                continue

            result = {"Page No": page_number}
            for key, index, from_link in self._FIELD_LAYOUT:
                result[key] = self._cell_text(cells[index], from_link)
            all_results.append(result)
        return all_results


In [163]:
# Results endpoint; the page number is appended directly after "PageNo="
base_url = "https://results.finishtime.co.za/results.aspx?CId=35&RId=30353&EId=1&dt=0&PageNo="

# Accumulator for the result dictionaries gathered from every page
all_results = list()


In [164]:
# Read the table's column headings from page 1 of the results
first_page_scraper = ResultsScraper(base_url)
first_page_url = f"{base_url}1"
first_page_html_data = first_page_scraper.fetch_html_data(first_page_url)
first_page_soup = first_page_scraper.parse_html(first_page_html_data)
column_names = first_page_scraper.get_column_names(first_page_soup)

In [165]:
# Scrape every results page (page 1 included) and collect the rows
LAST_PAGE = 245  # page count hard-coded for this result set
# One scraper is enough: extract_results keeps no per-page state on the instance.
scraper = ResultsScraper(base_url)
for page_number in range(1, LAST_PAGE + 1):
    url = base_url + str(page_number)
    html_data = scraper.fetch_html_data(url)
    soup = scraper.parse_html(html_data)
    # A failed fetch yields soup=None, for which extract_results returns [].
    results = scraper.extract_results(soup, page_number)
    all_results.extend(results)


In [177]:
# Display the first scraped record to sanity-check the extracted fields
all_results[0]

{'Page No': 1,
 'Pos': '1',
 'Name': 'ONALENNA KHONKHOBE',
 'Time': '03:09:30',
 'Medal': 'Gold',
 'Club': 'NEDBANK DEVELOPMENT CLUB CENTRAL NORTH WEST',
 'Gender': 'Male',
 'Category': 'Senior',
 'Start': '05:15:04',
 '5km': '00:17:38',
 '14km': '00:48:12',
 '28km': '01:36:39',
 '42.2km': '02:25:21',
 '50km': '02:52:00'}

In [173]:
# Build a table from the scraped records: one row per result, one column per dictionary key
df = pd.DataFrame(data=all_results)

In [176]:
# Save the table as a CSV file in the working directory, leaving out the row index
df.to_csv(path_or_buf='56km_results.csv', index=False)