In [40]:
import re
import time
from pathlib import Path

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Scraping 
* `get_dataframe` reads every row of the results table on a page and parses the rows into a DataFrame.
* Exception: for the years 2013-2015 the table reports 'Percent' instead of points. The column is renamed to 'Points', but the values stored for 2013-2015 are still percentages.
* The 2022 page has a different table structure, so a separate function had to be written for it.

## 2022 
* The structure of this table is different; the function scrapes the data and returns it in a structure suitable for appending to the data from the other years.

In [220]:
# functions for scraping data 
def get_dataframe(driver, page, year):
    """Scrape the results table of one yearly page into a DataFrame.

    Loads ``page`` in ``driver``, waits up to 10 seconds for the element with
    class ``table-responsive`` and walks over every ``tr`` inside it. The
    first row is treated as the header: its last four text lines become the
    column names (a ``Percent`` column is renamed to ``Points``), followed by
    ``year`` and ``ranking``. Every later row contributes its last four text
    lines plus ``year`` and its position in the table as ``ranking``.

    Parameters
    ----------
    driver
        Selenium WebDriver used to load the page.
    page : str
        URL of the page to scrape.
    year : int
        Value stored in the ``year`` column. ``2022`` is delegated to
        ``get_dataframe_for_2022`` because that page is laid out differently.

    Returns
    -------
    tuple
        ``(data, error)``. ``data`` is the parsed DataFrame; it is empty when
        the table contains no rows. ``error`` is a list with one entry per row
        whose number of fields does not match the header; each entry holds all
        text lines of that row followed by ``year`` and the row position.
    """
    if year == 2022:
        return get_dataframe_for_2022(driver, page)

    wait = WebDriverWait(driver=driver, timeout=10)
    driver.get(page)
    tabular_data = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'table-responsive'))
    )

    columns = []   # stays empty if the table has no header row at all
    records = []   # collected first, turned into a DataFrame once at the end
    error = []
    for idx, record in enumerate(tabular_data.find_elements(By.TAG_NAME, 'tr')):
        cells = record.text.strip().split('\n')
        if idx == 0:
            header = ['Points' if name == 'Percent' else name for name in cells[-4:]]
            columns = header + ['year', 'ranking']
            continue

        row = cells[-4:] + [year, idx]
        if len(row) == len(columns):
            records.append(row)
        else:
            # Keep the complete raw row so it can be inspected or fixed by hand.
            error.append(cells + [year, idx])

    return pd.DataFrame(records, columns=columns), error

def _split_2022_row(fields):
    """Return ``[fields[1]] + fields[3:]`` for one 2022 row, or ``None`` if malformed."""
    if len(fields) < 5:
        return None
    fields = list(fields)  # work on a copy; the caller still needs the raw lines
    if '/' in fields[4]:
        # A field containing '/' continues on the next line (presumably a club
        # cell such as "Club A / Club B" that wraps - TODO confirm on the page).
        if len(fields) < 6:
            return None
        fields[4] += fields[5]
        # Delete by position: removing by value would drop an earlier field
        # that happens to hold the same text.
        del fields[5]
    return [fields[1]] + fields[3:]


def get_dataframe_for_2022(driver, page, year=2022):
    """Scrape the differently structured 2022 table into the common layout.

    Loads ``page`` in ``driver``, waits up to 10 seconds for the element with
    class ``table-responsive`` and walks over every ``tr`` inside it. The
    column names are the last five text lines of the header row without
    ``Position``, followed by ``year`` and ``ranking``. From every later row
    the second text line and everything from the fourth line on are kept; a
    fifth line containing ``/`` is first joined with the line after it.

    Parameters
    ----------
    driver
        Selenium WebDriver used to load the page.
    page : str
        URL of the page to scrape.
    year : int, optional
        Value stored in the ``year`` column (default ``2022``).

    Returns
    -------
    tuple
        ``(data, error)``. ``data`` is the parsed DataFrame; it is empty when
        the table contains no rows. ``error`` holds one entry per row that
        could not be fitted to the header: the row's second line and its lines
        from the fourth on (or all of its lines when it has fewer than two),
        followed by ``year`` and the row position.
    """
    wait = WebDriverWait(driver=driver, timeout=10)
    driver.get(page)
    tabular_data = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'table-responsive'))
    )

    columns = []   # stays empty if the table has no header row at all
    records = []   # collected first, turned into a DataFrame once at the end
    error = []
    for idx, record in enumerate(tabular_data.find_elements(By.TAG_NAME, 'tr')):
        raw_fields = record.text.strip().split('\n')
        if idx == 0:
            columns = raw_fields[-5:] + ['year', 'ranking']
            if 'Position' in columns:
                columns.remove('Position')
            continue

        row = _split_2022_row(raw_fields)
        if row is not None and len(row) + 2 == len(columns):
            records.append(row + [year, idx])
        else:
            # Report the un-merged raw lines so the row can be fixed by hand.
            if len(raw_fields) > 1:
                kept = [raw_fields[1]] + raw_fields[3:]
            else:
                kept = raw_fields
            error.append(kept + [year, idx])

    return pd.DataFrame(records, columns=columns), error
            
    

In [227]:
# Scrape every yearly page from 1956 through 2023 with a single browser session.
# The browser is opened here (no reliance on a `driver` left over from an
# earlier run) and is always closed again, even if a page fails to load.
driver = webdriver.Safari()
yearly_frames = []
errors = []
try:
    for year in range(1956, 2024):
        if year == 2020:
            # No page is requested for 2020 (presumably because no award was
            # given that year - verify against the site).
            continue
        page = f'https://www.topendsports.com/sport/soccer/awards/ballondor-{year}.htm'
        yeardata, error = get_dataframe(driver, page, year)
        if error:
            errors.append(error)
        yearly_frames.append(yeardata)
        time.sleep(0.5)  # short pause between requests
finally:
    driver.quit()

# Concatenate once instead of growing the frame inside the loop.
data = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()

In [230]:
# Replace the index inherited from concatenation with a fresh 0..n-1 index,
# discarding the old index values instead of keeping them as a column.
data = data.reset_index(drop=True)
data

Unnamed: 0,Player,Club,Nationality,Points,year,ranking
0,Stanley Matthews,Blackpool,England,47,1956,1
1,Alfredo Di Stéfano,Real Madrid,Spain,44,1956,2
2,Raymond Kopa,Reims / Real Madrid,France,33,1956,3
3,Ferenc Puskás,Budapest Honvéd,Hungary,32,1956,4
4,Lev Yashin,Dynamo Moscow,Soviet Union,19,1956,5
...,...,...,...,...,...,...
1944,Jamal Musiala,Bayern Munich,Germany,0,2023,26
1945,Nicolo Barella,Inter Milan,Italy,0,2023,27
1946,Randal Kolo Muani,Eintracht Frankfurt / PSG,France,0,2023,28
1947,Martin Odegaard,Arsenal,Norway,0,2023,29


In [231]:
# Replace non-breaking spaces (\xa0) in the text columns with ordinary spaces.
text_columns = ['Player', 'Club', 'Nationality']
data[text_columns] = data[text_columns].map(
    lambda cell: re.sub('\\xa0', ' ', cell)
)
data

Unnamed: 0,Player,Club,Nationality,Points,year,ranking
0,Stanley Matthews,Blackpool,England,47,1956,1
1,Alfredo Di Stéfano,Real Madrid,Spain,44,1956,2
2,Raymond Kopa,Reims / Real Madrid,France,33,1956,3
3,Ferenc Puskás,Budapest Honvéd,Hungary,32,1956,4
4,Lev Yashin,Dynamo Moscow,Soviet Union,19,1956,5
...,...,...,...,...,...,...
1944,Jamal Musiala,Bayern Munich,Germany,0,2023,26
1945,Nicolo Barella,Inter Milan,Italy,0,2023,27
1946,Randal Kolo Muani,Eintracht Frankfurt / PSG,France,0,2023,28
1947,Martin Odegaard,Arsenal,Norway,0,2023,29


In [232]:
# Write the table to disk. The output folder is created first so the export
# also works when ./datasets does not exist yet.
output_path = Path('./datasets/winnersList.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(path_or_buf=output_path, index=False)

# improvements
* Scrape the women's Ballon d'Or (Ballon d'Or Féminin) list too