In [5]:
import concurrent.futures
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [6]:
def save_html(html, path):
    """Write the raw bytes ``html`` to ``path``, overwriting any existing file."""
    with open(path, mode='wb') as out_file:
        out_file.write(html)
        
def open_html(path):
    """Return the full contents of the file at ``path`` as bytes."""
    with open(path, mode='rb') as in_file:
        contents = in_file.read()
    return contents

# Request headers sent with every HTTP GET in this notebook; the User-Agent
# identifies the client as a desktop Chrome browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
    }
def save_scrapes(urls, file_names):
    """Download each URL and save its raw response body to the matching file.

    ``urls`` and ``file_names`` are paired positionally with ``zip``, so any
    surplus entries in the longer sequence are ignored.

    Args:
        urls: iterable of page URLs to fetch.
        file_names: iterable of destination paths, one per URL.
    """
    for page, file_name in zip(urls, file_names):
        r = requests.get(page, headers=headers)
        # Write to this page's own file; passing the whole `file_names` list
        # to open() raises TypeError.
        save_html(r.content, file_name)

In [7]:
# Only extract All England, Indonesia Open, China Open, and Malaysia Open
open_keywords = ["all-england-open", "malaysia-open", "indonesia-open", "victor-china-open"]

# One completed-tournament calendar page, and one local cache file, per season.
tour_years = range(2018, 2025)
tour_urls = [
    f"https://bwfworldtour.bwfbadminton.com/calendar/?cyear={year}&rstate=completed"
    for year in tour_years
]
tour_files = [f"tour_{year}.txt" for year in tour_years]

# Uncomment to (re-)download the calendar pages into the cache files above.
#save_scrapes(tour_urls, tour_files)

In [10]:
game_keywords = ["all-england-open", "malaysia-open", "indonesia-open", "victor-china-open"]
# Exclude cancelled opens and non super-1000 malaysia opens
exclude_opens = [
    'malaysia-open-2018',
    'malaysia-open-2019',
    'malaysia-open-2020',
    'malaysia-open-2021',
    'malaysia-open-2022',
    'china-open-2020',
    'china-open-2021',
    'china-open-2022',
    'indonesia-open-2020',
]

# Walk every cached calendar page and keep each result link whose href
# mentions one of the tournaments of interest.
tour_urls = []
for file in tour_files:
    calendar_soup = bs(open_html(file), 'html.parser')
    page_content = calendar_soup.select('div.page-content')[0]
    for game in page_content.select('div.item-results'):
        hrefs = (link.get('href', '') for link in game.select('div.tblResultLanding a'))
        tour_urls.extend(
            href for href in hrefs
            if any(keyword in href for keyword in game_keywords)
        )

# Drop the excluded editions and point each remaining URL at its podium results page.
filtered_urls = [
    url + '/results/podium/'
    for url in tour_urls
    if not any(keyword in url for keyword in exclude_opens)
]

In [5]:
# Collect the per-day results link from every tournament's podium page.
matches_url = []
for url in filtered_urls:
    response = requests.get(url, headers=headers)
    day_tabs = bs(response.content, 'html.parser').select('#ajaxTabsResults li')
    # The first and last tab entries are skipped; only the ones in between are used.
    for day in day_tabs[1:-1]:
        a_tag = day.find('a')
        if not a_tag or 'href' not in a_tag.attrs:
            continue
        matches_url.append(a_tag['href'])

In [12]:
# Create the data dict: one empty list per output column, filled row by row
# while the saved match pages are parsed.
data_columns = [
    "Tournament Name",
    "Tournament Date",
    "Tournament Country",
    "Discipline",
    "Number of Sets Played",
    "Retired",  # 1 if true, 0 if false
    "Match Duration",
    "Team 1 Nationalities",
    "Team 2 Nationalities",
    "Team 1 Name(s)",
    "Team 1 Seed",
    "Team 2 Name(s)",
    "Team 2 Seed",
    "Points Set 1 Team 1",
    "Points Set 1 Team 2",
    "Points Set 2 Team 1",
    "Points Set 2 Team 2",
    "Points Set 3 Team 1",  # empty if not applicable
    "Points Set 3 Team 2",
    "Total Points Team 1",
    "Total Points Team 2",
    "Sets Won Team 1",
    "Sets Won Team 2",
    "Total Game Points Team 1",
    "Total Game Points Team 2",
    "Most Consecutive Points Team 1",
    "Most Consecutive Points Team 2",
    "Team 1 Head to Head Analysis",
    "Team 2 Head to Head Analysis",
    "Match Winner",  # 1 if Team 1 wins, 2 if Team 2 wins
]
data = {column: [] for column in data_columns}

In [13]:
# Derive one cache file name per match-day URL from the URL's sixth path
# segment (the tournament slug) and its last segment, then download them all.
matches_filename = []
for url_parts in (url.split('/') for url in matches_url):
    tournament_name = url_parts[5].replace('-', '_')
    match_date = url_parts[-1]
    matches_filename.append(f"{tournament_name}_{match_date}_match.txt")
save_scrapes(matches_url, matches_filename)

In [14]:
# Tournament function
def extract_tournament_info(file_name):
    """Split a saved match file name into tournament name, date and host country.

    The name is expected to look like ``<slug>_<date>_match.txt``, i.e. the
    last two underscore-separated fields are the date and a fixed suffix.

    Args:
        file_name: cache file name produced for a match-day page.

    Returns:
        ``(tournament_name, tournament_date, tournament_country)``.
        ``tournament_country`` is ``None`` when none of the known country
        keywords occurs in the file name.
    """
    parts = file_name.replace(".txt", "").split("_")
    # Everything before the date field is the slug; the last five characters
    # are then dropped (assumes the slug ends in "_<4-digit year>" -- TODO confirm).
    tournament_name = "_".join(parts[:-2])[:-5]
    tournament_date = parts[-2]

    country_keywords = ['indonesia', 'china', 'england', 'malaysia']
    lowered = file_name.lower()
    # First matching keyword wins. Defaulting to None avoids an
    # UnboundLocalError on the return when no keyword matches.
    tournament_country = next(
        (country.capitalize() for country in country_keywords if country in lowered),
        None,
    )
    return tournament_name, tournament_date, tournament_country

# Name and seed extraction function
def extract_name_and_seed(player_info):
    """Split a ``'Name [Seed]'`` string into ``(name, seed)``.

    When the text does not contain both ``[`` and ``]`` the whole stripped
    text is returned as the name and the seed is ``None``.
    """
    if '[' not in player_info or ']' not in player_info:
        return player_info.strip(), None

    # Name is everything before the first '['; the seed runs from there up to
    # the next ']'.
    name, _, after_bracket = player_info.partition('[')
    seed = after_bracket.partition(']')[0]
    return name.strip(), seed.strip()

# Extract nationality from png
def extract_nationality(flag_url):
    """Return the final path segment of ``flag_url`` with every ``.png`` removed.

    Example: ``'.../flags/bulgaria.png'`` gives ``'bulgaria'``.
    """
    image_name = flag_url.rsplit('/', 1)[-1]
    return image_name.replace('.png', '')

# Match duration extraction
def extract_duration(duration_str):
    """Convert an ``hours<sep>minutes`` duration string to total minutes.

    Both ``'1:07'`` and ``'1.07'`` mean 1 hour 7 minutes and return 67. Only
    the first two fields are used; any further field is ignored.

    Args:
        duration_str: duration text with hours and minutes separated by
            ``':'`` or ``'.'``.

    Returns:
        The duration in whole minutes as an ``int``.

    Raises:
        ValueError: if a field is not an integer.
        IndexError: if there is no minutes field.
    """
    # Normalise the dotted form so one split handles both separators.
    parts = duration_str.strip().replace('.', ':').split(':')
    hours = int(parts[0])
    minutes = int(parts[1])
    return hours * 60 + minutes

def determine_match_winner(sets_won_team_1, sets_won_team_2):
    """Return which team won the match from the number of sets each took.

    Args:
        sets_won_team_1: sets won by team 1.
        sets_won_team_2: sets won by team 2.

    Returns:
        ``1`` if team 1 won more sets, ``2`` if team 2 won more sets, and
        ``0`` when both won the same number.
    """
    if sets_won_team_1 > sets_won_team_2:
        return 1  # Team 1 wins
    if sets_won_team_1 < sets_won_team_2:
        return 2  # Team 2 wins
    return 0  # Draw

def process_score_stats(scores):
    """Compute total points and sets won for both teams from a flat score list.

    ``scores`` alternates team 1 and team 2 points, one pair per set:
    ``[s1_t1, s1_t2, s2_t1, s2_t2, ...]``.

    Args:
        scores: flat sequence of numeric set points.

    Returns:
        ``(total_points_team_1, total_points_team_2, sets_won_team_1,
        sets_won_team_2)``.
    """
    # Even positions belong to Team 1, odd positions to Team 2.
    team1_points = list(scores[0::2])
    team2_points = list(scores[1::2])

    total_points_team_1 = int(sum(team1_points))
    total_points_team_2 = int(sum(team2_points))

    # zip() pairs the sets and stops at the shorter list, so a trailing
    # unpaired score still counts toward the totals above but cannot raise
    # IndexError here. A set with equal points is credited to neither team.
    paired_sets = list(zip(team1_points, team2_points))
    sets_won_team_1 = sum(1 for t1, t2 in paired_sets if t1 > t2)
    sets_won_team_2 = sum(1 for t1, t2 in paired_sets if t2 > t1)
    return total_points_team_1, total_points_team_2, sets_won_team_1, sets_won_team_2

In [15]:
# Guards appends to the shared dict of column lists. Worker threads run
# process_single_match concurrently; without the lock one thread's appends can
# interleave with another's and put values from different matches on one row.
_ROW_LOCK = threading.Lock()


def _stat_cell_as_int(row, css_class):
    """Return the integer in the row's ``td`` of class ``css_class``, or 0 if its text is not all digits."""
    text = row.find('td', class_=css_class).get_text(strip=True)
    return int(text) if text.isdigit() else 0


def _extract_teams(discipline, player_info):
    """Return ``(team_1_name, team_1_seed, team_2_name, team_2_seed)`` for a singles or doubles match."""
    if discipline in ['MS', 'WS']:  # Singles Match
        player_1_info = player_info[0].select_one('div.player1-wrap').text.strip()
        player_2_info = player_info[1].select_one('div.player3-wrap').text.strip()
        team_1_name, team_1_seed = extract_name_and_seed(player_1_info)
        team_2_name, team_2_seed = extract_name_and_seed(player_2_info)
    elif discipline in ['MD', 'WD', 'XD']:  # Doubles Match
        player_1_team_1_info = player_info[0].select_one('div.player1-wrap').text.strip()
        player_2_team_1_info = player_info[0].select_one('div.player2-wrap').text.strip()
        player_1_team_2_info = player_info[1].select_one('div.player3-wrap').text.strip()
        player_2_team_2_info = player_info[1].select_one('div.player4-wrap').text.strip()

        # The seed is taken from the first player of each pair only.
        player_1_team_1_name, team_1_seed = extract_name_and_seed(player_1_team_1_info)
        player_2_team_1_name, _ = extract_name_and_seed(player_2_team_1_info)
        player_1_team_2_name, team_2_seed = extract_name_and_seed(player_1_team_2_info)
        player_2_team_2_name, _ = extract_name_and_seed(player_2_team_2_info)

        team_1_name = f"{player_1_team_1_name}, {player_2_team_1_name}"
        team_2_name = f"{player_1_team_2_name}, {player_2_team_2_name}"
    else:
        # Fail loudly rather than reuse the previous match's names or crash
        # later on an unbound local.
        raise ValueError(f"Unrecognised discipline {discipline!r}")
    return team_1_name, team_1_seed, team_2_name, team_2_seed


def process_single_match(file, headers, doubles_data):
    """Parse every match on one saved match-day page and append one row per match.

    For each non-walkover match the function reads scores, duration, flags and
    player names from the saved page, then issues two HTTP GETs (the match's
    ``stab=match`` and ``stab=h2h`` views) for the statistics and head-to-head
    figures. The finished row is appended to ``doubles_data`` under a lock so
    that concurrent workers cannot misalign the columns.

    Args:
        file: path of the saved match-day page; also parsed for tournament
            name, date and country.
        headers: HTTP headers passed to ``requests.get``.
        doubles_data: dict mapping each output column name to a list; mutated
            in place. Every column is written through this parameter.

    Returns:
        None.

    Raises:
        ValueError: if a match's discipline is not MS/WS/MD/WD/XD, or a score
            or head-to-head figure is not an integer.
        AttributeError, IndexError: if an expected element is missing from the
            page. Rows appended before the failure are kept.
    """
    # Open saved web scrape and extract tournament info from the file name
    soup = bs(open_html(file), 'html.parser')
    tournament_name, tournament_date, tournament_country = extract_tournament_info(file)

    for ul in soup.select('ul.list-sort-time'):
        for li in ul.find_all('li'):
            li_classes = li.get('class', [])
            # Location headers and stats rows are not matches.
            if 'location-name' in li_classes or 'stats' in li_classes:
                continue

            # Extract Game and Player Information
            discipline = li.select_one('div.round').text.strip()
            score_divs = li.select('div.player-score-wrap > div.score')
            set_scores = [part for div in score_divs for part in div.text.strip().split(',')]
            player_info = li.select('div.player-wrap > div.team-details-wrap')
            if 'Walkover' in set_scores:  # skip over walkover games
                continue

            # Match Statistics and Player Information
            match_duration = extract_duration(li.select_one('div.timer1 > span').text)
            team_1_nationality = extract_nationality(player_info[0].select_one('div.flag > img')['src'])
            team_2_nationality = extract_nationality(player_info[1].select_one('div.flag > img')['src'])

            # A score entry mentioning "retired" flags the match and is dropped
            # before the numeric parsing below.
            retired = any('retired' in score.lower() for score in set_scores)
            if retired:
                set_scores = [score for score in set_scores if 'retired' not in score.lower()]

            # "21-15" -> 21, 15: flat list alternating Team 1 / Team 2 points.
            set_scores = [int(points) for score in set_scores for points in score.split('-')]

            points_set_1_team_1, points_set_1_team_2 = None, None
            points_set_2_team_1, points_set_2_team_2 = None, None
            points_set_3_team_1, points_set_3_team_2 = None, None
            if len(set_scores) >= 2:
                points_set_1_team_1, points_set_1_team_2 = set_scores[0], set_scores[1]
            if len(set_scores) >= 4:
                points_set_2_team_1, points_set_2_team_2 = set_scores[2], set_scores[3]
            if len(set_scores) >= 6:
                points_set_3_team_1, points_set_3_team_2 = set_scores[4], set_scores[5]

            # Calculate score game stats
            total_points_team_1, total_points_team_2, sets_won_team_1, sets_won_team_2 = process_score_stats(set_scores)
            match_winner = determine_match_winner(sets_won_team_1, sets_won_team_2)

            href = li.select_one('a#match-link').get('href')

            # The match number comes from the li's "match-<n>" class and
            # selects the corresponding "stats-<n>" element on the fetched pages.
            match_classes = [cls for cls in li_classes if cls.startswith('match-')]
            table_r = requests.get(href.replace('stab=result', 'stab=match'), headers=headers)
            table_soup = bs(table_r.content, 'html.parser')
            match_number = match_classes[0].split('-')[-1]
            stats_div = table_soup.select_one(f'li.stats.stats-{match_number}')

            # Stats table: the first row is read as most consecutive points,
            # the second as game points.
            rows = stats_div.find('table').find_all('tr')
            t1_most_consecutive = _stat_cell_as_int(rows[0], 't1')
            t2_most_consecutive = _stat_cell_as_int(rows[0], 't2')
            t1_game_points = _stat_cell_as_int(rows[1], 't1')
            t2_game_points = _stat_cell_as_int(rows[1], 't2')

            # Extracting the h2h analysis
            h2h_r = requests.get(href.replace('stab=result', 'stab=h2h'), headers=headers)
            h2h_soup = bs(h2h_r.content, 'html.parser')
            stats_div_h2h = h2h_soup.select_one(f'li.stats.stats-{match_number}')
            t1_h2h = int(stats_div_h2h.select_one('div.cifr1').get_text(strip=True))
            t2_h2h = int(stats_div_h2h.select_one('div.cifr2').get_text(strip=True))

            team_1_name, team_1_seed, team_2_name, team_2_seed = _extract_teams(discipline, player_info)

            row = {
                "Tournament Name": tournament_name,
                "Tournament Date": tournament_date,
                "Tournament Country": tournament_country,
                "Discipline": discipline,
                # set_scores holds two entries per set (one per team).
                "Number of Sets Played": len(set_scores) // 2,
                "Retired": retired,
                "Match Duration": match_duration,
                "Team 1 Nationalities": team_1_nationality,
                "Team 2 Nationalities": team_2_nationality,
                "Team 1 Name(s)": team_1_name,
                "Team 2 Name(s)": team_2_name,
                "Team 1 Seed": team_1_seed,
                "Team 2 Seed": team_2_seed,
                "Points Set 1 Team 1": points_set_1_team_1,
                "Points Set 1 Team 2": points_set_1_team_2,
                "Points Set 2 Team 1": points_set_2_team_1,
                "Points Set 2 Team 2": points_set_2_team_2,
                "Points Set 3 Team 1": points_set_3_team_1,
                "Points Set 3 Team 2": points_set_3_team_2,
                "Total Points Team 1": total_points_team_1,
                "Total Points Team 2": total_points_team_2,
                "Sets Won Team 1": sets_won_team_1,
                "Sets Won Team 2": sets_won_team_2,
                "Total Game Points Team 1": t1_game_points,
                "Total Game Points Team 2": t2_game_points,
                "Most Consecutive Points Team 1": t1_most_consecutive,
                "Most Consecutive Points Team 2": t2_most_consecutive,
                "Team 1 Head to Head Analysis": t1_h2h,
                "Team 2 Head to Head Analysis": t2_h2h,
                "Match Winner": match_winner,
            }
            # Append the whole row atomically so all columns stay the same
            # length and aligned across threads.
            with _ROW_LOCK:
                for column, value in row.items():
                    doubles_data[column].append(value)

def process_matches_in_parallel(matches_filename, headers):
    """Run ``process_single_match`` for every file on a 10-worker thread pool.

    Each worker is handed the module-level ``data`` dict to fill. A failure in
    one file is printed and does not stop the others. Returns ``data``.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Map each submitted future back to its file so errors can name it.
        futures = {}
        for file in matches_filename:
            submitted = executor.submit(process_single_match, file, headers, data)
            futures[submitted] = file

        for finished in concurrent.futures.as_completed(futures):
            failed_file = futures[finished]
            try:
                finished.result()
            except Exception as e:
                print(f"Error processing file {failed_file}: {e}")
        return data

In [16]:
# Parse every saved match-day page on worker threads. The function returns the
# same module-level `data` dict it fills, so `data` still refers to that dict.
data = process_matches_in_parallel(matches_filename, headers)

In [21]:
# Create dataframes and fix some formatting
data_df = pd.DataFrame(data)

# Title-case the free-text columns.
cap_columns = [
    'Team 1 Name(s)', 'Team 2 Name(s)',
    'Team 1 Nationalities', 'Team 2 Nationalities',
    'Tournament Country', 'Tournament Name'
]
for col in cap_columns:
    data_df[col] = data_df[col].str.title()

# After title-casing, the flag code 'Chn' is spelled out as 'China'.
for nationality_col in ('Team 1 Nationalities', 'Team 2 Nationalities'):
    data_df[nationality_col] = data_df[nationality_col].replace('Chn', 'China')

# Every tournament hosted in England gets the same tournament name.
hosted_in_england = data_df['Tournament Country'] == 'England'
data_df.loc[hosted_in_england, 'Tournament Name'] = 'Yonex_All_England_Open'

data_df = data_df.astype({'Number of Sets Played': int, 'Retired': int})

# Seeds and per-set points become numeric columns.
to_int_points = [
    'Points Set 1 Team 1', 'Points Set 1 Team 2',
    'Points Set 2 Team 1', 'Points Set 2 Team 2',
    'Points Set 3 Team 1', 'Points Set 3 Team 2'
]
for col in ['Team 1 Seed', 'Team 2 Seed'] + to_int_points:
    data_df[col] = pd.to_numeric(data_df[col])

data_df = data_df.rename(columns={'Match Duration': 'Match Duration (min)'})



### Player Ranking Web Scrape

In [22]:
# Maps the `cat_id` query value used in the rankings URL to the discipline
# code under which that ranking's rows are stored.
discipline_ids = {
    "57" : "MS",
    "58" : "WS",
    "59" : "MD",
    "60" : "WD",
    "61" : "XD"
}

def fetch_options_url(discipline, cat_id, headers):
    """Build one rankings URL per week listed in a category's week selector.

    Fetches the category's rankings page for 2018 week 3 and reads every
    ``<option>`` of the ``select#ranking-week`` element. Each option value is
    expected to be ``"<year>--<week>"``.

    Args:
        discipline: discipline code, returned unchanged so the caller can key
            the result.
        cat_id: value for the ``cat_id`` query parameter.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        ``(discipline, urls)``; ``urls`` is empty when the selector is absent.
    """
    base_url = f"https://bwfworldtour.bwfbadminton.com/rankings/?id=9&cat_id={cat_id}"
    paging = "page_size=100&page_no=1"

    r = requests.get(f"{base_url}&ryear=2018&week=3&{paging}", headers=headers)
    soup = bs(r.text, 'html.parser')

    ranking_select = soup.find('select', id='ranking-week')
    urls = []
    if ranking_select is not None:
        for option in ranking_select.find_all('option'):
            year, separator, week = option.get('value', '').partition('--')
            if not separator:
                # Option without a "<year>--<week>" value: nothing to build a URL from.
                continue
            # Build from the bare base URL so ryear/week/page_size/page_no each
            # appear exactly once, not appended after the seed request's copies.
            urls.append(f"{base_url}&ryear={year}&week={week}&{paging}")

    return discipline, urls

def process_url(discipline, url, headers):
    """Scrape one rankings page into a dict of column lists.

    The ranking date is the text between the first ``(`` and the following
    ``)`` of the selected ``select#ranking-week`` option. Table rows after the
    header are kept when they have no class or carry the ``row-even`` class.

    Returns:
        ``(discipline, rank_data)``, where ``rank_data`` maps "Rank",
        "Country/Territory", "Name", "Date" and "Points Accumulated" to lists
        of strings with one entry per kept row.
    """
    response = requests.get(url, headers=headers)
    soup = bs(response.content, 'html.parser')

    week_select = soup.find('select', {'id': 'ranking-week'})
    selected_text = week_select.find('option', {'selected': 'selected'}).text.strip()
    date = selected_text.split('(')[1].split(')')[0]  # Extract date within brackets

    rank_data = {
        "Rank": [],
        "Country/Territory": [],
        "Name": [],
        "Date": [],
        "Points Accumulated": []
    }

    body_rows = soup.find('table').find_all('tr')[1:]  # Skip the header row
    for row in body_rows:
        row_classes = row.get('class')
        if row_classes and 'row-even' not in row_classes:
            continue
        cols = row.find_all('td')
        rank_data["Rank"].append(cols[0].text.strip())
        rank_data["Country/Territory"].append(cols[1].find('span').text.strip())
        rank_data["Name"].append(cols[2].find('a').text.strip())
        rank_data["Date"].append(date)
        rank_data["Points Accumulated"].append(cols[4].text.strip())

    return discipline, rank_data

def main(discipline_ids, headers):
    """Scrape every weekly ranking page for every discipline on a thread pool.

    First collects each discipline's weekly URLs with ``fetch_options_url``,
    then fetches each URL with ``process_url`` and merges the rows.

    Returns:
        Dict mapping each discipline code to a dict of column lists ("Rank",
        "Country/Territory", "Name", "Date", "Points Accumulated"). Rows are
        merged in the order the page fetches complete.
    """
    columns = ("Rank", "Country/Territory", "Name", "Date", "Points Accumulated")
    rankings_urls = {}
    rank_data = {
        discipline: {column: [] for column in columns}
        for discipline in discipline_ids.values()
    }

    with ThreadPoolExecutor() as executor:
        # Stage 1: one job per discipline to discover its weekly URLs.
        option_jobs = [
            executor.submit(fetch_options_url, discipline, cat_id, headers)
            for cat_id, discipline in discipline_ids.items()
        ]
        for job in as_completed(option_jobs):
            discipline, urls = job.result()
            rankings_urls[discipline] = urls

        # Stage 2: one job per weekly page.
        page_jobs = []
        for discipline, urls in rankings_urls.items():
            for url in urls:
                page_jobs.append(executor.submit(process_url, discipline, url, headers))
        for job in as_completed(page_jobs):
            discipline, page_rows = job.result()
            for column in columns:
                rank_data[discipline][column].extend(page_rows[column])

    return rank_data

# Scrape the rankings for every discipline; the result maps each discipline
# code to its dict of column lists.
rank_data = main(discipline_ids, headers)

In [23]:
# Create a DataFrame for each discipline
dataframes = {
    discipline: pd.DataFrame(columns)
    for discipline, columns in rank_data.items()
}

# Title-case the player names in each of the five ranking tables.
for code in ('WS', 'MS', 'WD', 'MD', 'XD'):
    dataframes[code]["Name"] = dataframes[code]["Name"].str.title()

WS_rank = dataframes['WS']
MS_rank = dataframes['MS']
WD_rank = dataframes['WD']
MD_rank = dataframes['MD']
XD_rank = dataframes['XD']

In [24]:
# Export DataFrames as CSV
rank_exports = [
    (WS_rank, 'CSV Files/WS_rank.csv'),
    (MS_rank, 'CSV Files/MS_rank.csv'),
    (WD_rank, 'CSV Files/WD_rank.csv'),
    (MD_rank, 'CSV Files/MD_rank.csv'),
    (XD_rank, 'CSV Files/XD_rank.csv'),
]
for rank_frame, csv_path in rank_exports:
    rank_frame.to_csv(csv_path, index=False)

# The match table keeps its index as the first CSV column.
data_df.to_csv('CSV Files/data_df.csv', index=True)