In [None]:
import time
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Get user input for the URL (surrounding whitespace from copy/paste is dropped)
user_input_url = input("Enter the tournament URL: ").strip()
# Append the "zeilen=99999" parameter to the user-provided URL; this expands the
# listing to show all the rows beyond the default 150. Use "?" when the URL has
# no query string yet and "&" otherwise, so the result is always well-formed.
query_separator = "&" if "?" in user_input_url else "?"
url_to_scrape = user_input_url + query_separator + "zeilen=99999"

# Record start time and print validation statement
start_time = time.time()

print('Fetching FIDE ID data from',
      url_to_scrape)  # Print a validation statement

# Send a GET request and parse the HTML content. The timeout keeps the script
# from hanging forever on an unresponsive server, and raise_for_status() stops
# early on an HTTP error instead of silently parsing an error page.
response = requests.get(url_to_scrape, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Find all h2 tags (reused further down to locate the start-list table)
h2_tags = soup.find_all('h2')

# The tournament name is the text of the first <h2> tag; fall back to a
# placeholder when the page contains no <h2> at all instead of crashing.
tournament_name = h2_tags[0].text.strip() if h2_tags else "Unknown tournament"


# Defining a function that collects the FIDE IDs of all participants in a starting rank list. This is already sorted by Elo!
def scrape_fide_ids(url):
    """Return the FIDE IDs listed in the table that follows the page's second <h2>.

    The page at ``url`` is downloaded and parsed. The first row of that table
    is treated as the header and searched for a cell whose text contains
    "FideID"; the text of that column is then collected from every later row.

    Args:
        url: Address of the page to download.

    Returns:
        A list of stripped cell texts (strings), one per data row that is long
        enough to contain the FideID column. An empty list is returned, after
        printing a message, when the second <h2>, its table, or the FideID
        header cell cannot be found.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    # Bound the wait and fail loudly on HTTP errors rather than parsing an
    # error page as if it were the start list.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the first table after the second h2 tag
    headings = soup.find_all('h2')
    if len(headings) < 2:
        print("Second h2 tag not found on the page.")
        return []
    table = headings[1].find_next('table')
    if table is None:
        print("No table found after the second h2 tag.")
        return []

    rows = table.find_all('tr')
    if not rows:
        print("FideID column not found in the header.")
        return []

    # Locate the "FideID" column in the header row. Both <th> and <td> cells
    # are considered, and the same selector is used for the data rows below so
    # that header and data indices stay aligned.
    header_cells = rows[0].find_all(['th', 'td'])
    fide_id_index = next(
        (index for index, cell in enumerate(header_cells)
         if "FideID" in cell.text),
        None,
    )
    if fide_id_index is None:
        print("FideID column not found in the header.")
        return []

    # Extract FIDE IDs from the "FideID" column, skipping the header row and
    # any row too short to contain that column.
    fide_ids = []
    for row in rows[1:]:
        cells = row.find_all(['th', 'td'])
        if fide_id_index < len(cells):
            fide_ids.append(cells[fide_id_index].text.strip())

    return fide_ids


# Collect FIDE IDs
fide_ids = scrape_fide_ids(url_to_scrape)

# Ratings collected from the start-list table. Initialised up front so the
# DataFrame construction below still works when the table cannot be located.
rtg_data = []

# Check if there are at least two h2 tags
if len(h2_tags) >= 2:
    # Find the table following the second h2 tag and all of its rows
    table = h2_tags[1].find_next('table')
    rows = table.find_all('tr') if table is not None else []

    # Identify the position of the rating column in the header row (the first
    # row). A header containing 'RtgI' wins; otherwise the last header
    # containing 'Rtg' that was seen is used.
    header_cells = rows[0].find_all(['th', 'td']) if rows else []

    rtg_i_index = None
    rtg_index = None

    for index, cell in enumerate(header_cells):
        cell_text = cell.text
        if 'RtgI' in cell_text:
            rtg_i_index = index
            break
        elif 'Rtg' in cell_text:
            rtg_index = index

    rating_column = rtg_i_index if rtg_i_index is not None else rtg_index

    # Check if 'RtgI' or 'Rtg' were found in the header
    if rating_column is not None:
        # Loop through rows and extract data
        for row in rows[1:]:  # Skip header row
            columns = row.find_all(['td', 'th'])
            if rating_column >= len(columns):
                # Row too short to hold a rating (e.g. an empty row); skip it
                # the same way scrape_fide_ids skips short rows.
                continue
            rating_text = columns[rating_column].text.strip()
            try:
                rtg_data.append(int(rating_text))
            except ValueError:
                # Blank or non-numeric rating: keep the row so it stays
                # aligned with fide_ids, but record it as missing so it does
                # not distort the average.
                rtg_data.append(float('nan'))

# Create a DataFrame from the collected data
data = {'Rtg': rtg_data}
df = pd.DataFrame(data)

# Append FIDE IDs to the DataFrame
df['FideID'] = fide_ids

# Load the pre-processed urs_ratings_df DataFrame
# NOTE(review): unpickling can execute arbitrary code; only load a
# 'urs_ratings.pkl' that comes from a trusted source.
urs_ratings_df = pd.read_pickle('urs_ratings.pkl')

# Make sure both DataFrames have the FideID entries as strings
df['FideID'] = df['FideID'].astype(str)
urs_ratings_df['FideID'] = urs_ratings_df['FideID'].astype(str)

# Merge the columns of urs_ratings_df by matching on 'FideID'; the left join
# keeps every scraped participant even without a match.
# NOTE(review): the code below expects a 'URating' column to come from
# urs_ratings_df - confirm against the pickle's schema.
df = df.merge(urs_ratings_df, on='FideID', how='left')

print('URS data loaded successfully and merged with original DataFrame')

# Print summary stats in boldface and some color formatting for readability
print(
    f"\033[1m{tournament_name} has an average Elo of {df['Rtg'].mean():.0f} and average URS of {df['URating'].mean():.0f}\033[0m"
)

# Calculate and print runtime evaluation
end_time = time.time()
runtime = end_time - start_time
print(f"Script runtime: {runtime:.4f} seconds")

df['URating'] = df['URating'].astype(float)
df.index = df.index + 1  # Number the rows from 1 instead of 0
df.head(50).drop(columns=['FideID']).style.format({'URating': '{:.0f}'}) # Shows a cross-section of the tournament start list