In [None]:
import requests
from bs4 import BeautifulSoup

# URL of the website
url = 'https://gol.gg/tournament/tournament-picksandbans/Worlds%20Main%20Event%202024/'

# Send GET request to the page
response = requests.get(url)

# Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

# Extract win rate, picks, and bans
data = []
for champion in soup.find_all('div', class_='ChampionLine'):
    bans = champion.find('div', class_='bans').text.strip()
    picks = champion.find('div', class_='picks').text.strip()
    winrate = champion.find('div', class_='winrate').text.strip()
    data.append({'bans': bans, 'picks': picks, 'winrate': winrate})

# Output the data
for entry in data:
    print(entry)


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Step 1: URL to scrape
url = "https://gol.gg/players/player-matchlist/5177/season-ALL/split-ALL/tournament-LPL%20Summer%20Playoffs%202024/"

# Step 2: Send HTTP request
headers = {"User-Agent": "Mozilla/5.0"}  # Mimic a browser
response = requests.get(url, headers=headers)
response.raise_for_status()  # Ensure request was successful

# Step 3: Parse HTML content
soup = BeautifulSoup(response.content, "html.parser")

# Step 4: Locate the table
table = soup.find("table", {"class": "table_list"})

# Step 5: Extract data
data = []
rows = table.find_all("tr")

for row in rows[1:]:  # Skip the header row
    cells = row.find_all("td")
    data.append([cell.text.strip() for cell in cells])

# Step 6: Convert to DataFrame
columns = [header.text.strip() for header in rows[0].find_all("th")]  # Extract headers
df = pd.DataFrame(data, columns=columns)

# Step 7: Display or save the data
print(df)
df.to_csv("player_matchlist.csv", index=False)


       Champion   Result  Score Build Duration        Date       Game  \
0         Varus   Defeat  0/2/7          30:32  2024-08-11  FPX vs TT   
1  Miss Fortune  Victory  2/0/6          41:31  2024-08-11  FPX vs TT   
2        Ezreal   Defeat  3/3/5          34:08  2024-08-11  FPX vs TT   
3        Ezreal   Defeat  4/1/2          28:25  2024-08-11  FPX vs TT   
4        Ezreal  Victory  9/2/8          30:37  2024-08-11  FPX vs TT   
5        Ezreal  Victory  4/0/5          25:01  2024-08-05  OMG vs TT   
6  Miss Fortune  Victory  5/0/8          23:20  2024-08-05  OMG vs TT   
7       Smolder  Victory  2/1/9          26:22  2024-08-05  OMG vs TT   

                 Tournament  
0  LPL Summer Playoffs 2024  
1  LPL Summer Playoffs 2024  
2  LPL Summer Playoffs 2024  
3  LPL Summer Playoffs 2024  
4  LPL Summer Playoffs 2024  
5  LPL Summer Playoffs 2024  
6  LPL Summer Playoffs 2024  
7  LPL Summer Playoffs 2024  


In [None]:
import openpyxl

# Load the Excel file
file_path = "player_links.xlsx"  # Replace with the actual file path
wb = openpyxl.load_workbook(file_path)
sheet = wb.active  # Access the active sheet (or specify the sheet name)

# Iterate through rows and extract names and hyperlinks
player_data = []
for row in sheet.iter_rows():
    for cell in row:
        if cell.hyperlink:
            player_name = cell.value
            player_link = cell.hyperlink.target  # Extract the hyperlink
            player_data.append((player_name, player_link))

# Print or save the extracted data
for name, link in player_data:
    print(f"Player: {name}, Link: {link}")

# Optionally save to CSV
import csv
with open("extracted_player_links.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Player Name", "Hyperlink"])
    writer.writerows(player_data)


Player: 909, Link: https://gol.gg/players/player-stats/4768/season-S14/split-ALL/tournament-ALL/
Player: 1xn, Link: https://gol.gg/players/player-stats/5177/season-S14/split-ALL/tournament-ALL/
Player: 369, Link: https://gol.gg/players/player-stats/1922/season-S14/split-ALL/tournament-ALL/
Player: AKi, Link: https://gol.gg/players/player-stats/3298/season-S14/split-ALL/tournament-ALL/
Player: Able, Link: https://gol.gg/players/player-stats/1266/season-S14/split-ALL/tournament-ALL/
Player: Ahn, Link: https://gol.gg/players/player-stats/4403/season-S14/split-ALL/tournament-ALL/
Player: Ale, Link: https://gol.gg/players/player-stats/1968/season-S14/split-ALL/tournament-ALL/
Player: Angel, Link: https://gol.gg/players/player-stats/1708/season-S14/split-ALL/tournament-ALL/
Player: Assum, Link: https://gol.gg/players/player-stats/4767/season-S14/split-ALL/tournament-ALL/
Player: Beichuan, Link: https://gol.gg/players/player-stats/3767/season-S14/split-ALL/tournament-ALL/
Player: Bin, Link: h

In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

# Step 1: Read player data from the extracted file
input_csv = "extracted_player_links.csv"  # Replace with your actual file name
players_df = pd.read_csv(input_csv)

# Extract player IDs and names from the `player-stats` URLs
players = players_df[['Player Name', 'Hyperlink']]
players['Player ID'] = players['Hyperlink'].apply(lambda x: x.split('/')[4])  # Extract player ID from URL

# Step 2: Define matchlist URL format and parameters
base_url = "https://gol.gg/players/player-matchlist/{player_id}/season-{season}/split-{split}/tournament-ALL/"
seasons = [f"S{i}" for i in range(6, 15)]  # Seasons S6 to S14
splits = ["Pre-Season", "Spring", "Summer"]  # Splits

# Step 3: Function to scrape match history for a single player, season, and split
def scrape_matches(player_name, player_id, season, split):
    url = base_url.format(player_id=player_id, season=season, split=split)
    print(f"Scraping: {url}")
    
    headers = {"User-Agent": "Mozilla/5.0"}  # Mimic a browser
    response = requests.get(url, headers=headers)
    
    # Handle request errors
    if response.status_code != 200:
        print(f"Failed to scrape {url}: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", {"class": "table_list"})
    
    # Skip if no table found
    if not table:
        print(f"No data found for {player_name} ({player_id}) - {season} {split}")
        return None

    # Extract match data
    data = []
    rows = table.find_all("tr")
    columns = [header.text.strip() for header in rows[0].find_all("th")]  # Extract headers
    
    for row in rows[1:]:  # Skip the header row
        cells = row.find_all("td")
        data.append([cell.text.strip() for cell in cells])
    
    # Return data as DataFrame
    df = pd.DataFrame(data, columns=columns)
    df['Player Name'] = player_name
    df['Player ID'] = player_id
    df['Season'] = season
    df['Split'] = split
    return df

# Step 4: Loop through all players, seasons, and splits
all_data = []
error_log = []

for _, player in players.iterrows():
    player_name = player['Player Name']
    player_id = player['Player ID']
    
    for season in seasons:
        for split in splits:
            try:
                df = scrape_matches(player_name, player_id, season, split)
                if df is not None:
                    all_data.append(df)
            except Exception as e:
                error_message = f"Error scraping {player_name} ({player_id}) - {season} {split}: {e}"
                print(error_message)
                error_log.append(error_message)

# Step 5: Combine all data and save
if all_data:
    final_df = pd.concat(all_data, ignore_index=True)
    output_file = "all_players_matchlist_S6_S14.csv"
    final_df.to_csv(output_file, index=False)
    print(f"Data saved to {output_file}")

# Step 6: Save error log if any
if error_log:
    with open("error_log.txt", "w") as f:
        for error in error_log:
            f.write(error + "\n")
    print("Errors logged to error_log.txt")


Scraping: https://gol.gg/players/player-matchlist/player-stats/season-S6/split-Pre-Season/tournament-ALL/
Failed to scrape https://gol.gg/players/player-matchlist/player-stats/season-S6/split-Pre-Season/tournament-ALL/: 404
Scraping: https://gol.gg/players/player-matchlist/player-stats/season-S6/split-Spring/tournament-ALL/
Failed to scrape https://gol.gg/players/player-matchlist/player-stats/season-S6/split-Spring/tournament-ALL/: 404
Scraping: https://gol.gg/players/player-matchlist/player-stats/season-S6/split-Summer/tournament-ALL/
Failed to scrape https://gol.gg/players/player-matchlist/player-stats/season-S6/split-Summer/tournament-ALL/: 404
Scraping: https://gol.gg/players/player-matchlist/player-stats/season-S7/split-Pre-Season/tournament-ALL/
Failed to scrape https://gol.gg/players/player-matchlist/player-stats/season-S7/split-Pre-Season/tournament-ALL/: 404
Scraping: https://gol.gg/players/player-matchlist/player-stats/season-S7/split-Spring/tournament-ALL/
Failed to scrape h

KeyboardInterrupt: 

In [16]:
input_csv = "extracted_player_links.csv"  # Replace with your actual file name
players_df = pd.read_csv(input_csv)

# Extract player IDs and names from the `player-stats` URLs
players = players_df[['Player Name', 'Hyperlink']]
players['Player ID'] = players['Hyperlink'].apply(lambda x: x.split('/')[5])  # Extract player ID from URL

players

Unnamed: 0,Player Name,Hyperlink,Player ID
0,909,https://gol.gg/players/player-stats/4768/seaso...,4768
1,1xn,https://gol.gg/players/player-stats/5177/seaso...,5177
2,369,https://gol.gg/players/player-stats/1922/seaso...,1922
3,AKi,https://gol.gg/players/player-stats/3298/seaso...,3298
4,Able,https://gol.gg/players/player-stats/1266/seaso...,1266
...,...,...,...
113,shanji,https://gol.gg/players/player-stats/3312/seaso...,3312
114,sheer,https://gol.gg/players/player-stats/5351/seaso...,5351
115,xiaofang,https://gol.gg/players/player-stats/5360/seaso...,5360
116,xiaohu,https://gol.gg/players/player-stats/347/season...,347


In [17]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Step 1: Read player data from the extracted file
input_csv = "extracted_player_links.csv"  # Replace with your actual file name
players_df = pd.read_csv(input_csv)

# Extract player IDs and names from the `player-stats` URLs
players = players_df[['Player Name', 'Hyperlink']]
players['Player ID'] = players['Hyperlink'].apply(lambda x: x.split('/')[5])  # Extract player ID from URL

# Step 2: Define matchlist URL format and parameters
base_url = "https://gol.gg/players/player-matchlist/{player_id}/season-{season}/split-{split}/tournament-ALL/"
seasons = [f"S{i}" for i in range(6, 15)]  # Seasons S6 to S14
splits = ["Pre-Season", "Spring", "Summer"]  # Splits

# Step 3: Function to scrape match history for a single player, season, and split
def scrape_matches(player_name, player_id, season, split, scraped_urls):
    url = base_url.format(player_id=player_id, season=season, split=split)
    
    # Skip URLs already processed
    if url in scraped_urls:
        print(f"Skipping already scraped URL: {url}")
        return None
    scraped_urls.add(url)

    print(f"Scraping: {url}")
    headers = {"User-Agent": "Mozilla/5.0"}
    
    try:
        response = requests.get(url, headers=headers, timeout=10)  # Timeout added
        if response.status_code != 200:
            print(f"Failed to scrape {url}: {response.status_code}")
            return None
        soup = BeautifulSoup(response.content, "html.parser")
        
        # Check if the table exists; skip parsing if not
        table = soup.find("table", {"class": "table_list"})
        if not table:
            print(f"No data found for {player_name} ({player_id}) - {season} {split}")
            return None

        # Extract match data
        data = []
        rows = table.find_all("tr")
        columns = [header.text.strip() for header in rows[0].find_all("th")]  # Extract headers
        
        for row in rows[1:]:  # Skip the header row
            cells = row.find_all("td")
            data.append([cell.text.strip() for cell in cells])
        
        # Return data as DataFrame
        df = pd.DataFrame(data, columns=columns)
        df['Player Name'] = player_name
        df['Player ID'] = player_id
        df['Season'] = season
        df['Split'] = split
        return df

    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

# Step 4: Main Loop for Scraping
all_data = []
error_log = []
scraped_urls = set()  # Track already processed URLs
total_players = len(players)

for idx, player in players.iterrows():
    player_name = player['Player Name']
    player_id = player['Player ID']
    
    print(f"Processing player {idx + 1} of {total_players}: {player_name} ({player_id})")
    for season in seasons:
        for split in splits:
            df = scrape_matches(player_name, player_id, season, split, scraped_urls)
            if df is not None:
                all_data.append(df)
            time.sleep(1)  # Add delay between requests to avoid overwhelming the server

# Step 5: Combine all data and save
if all_data:
    final_df = pd.concat(all_data, ignore_index=True)
    output_file = "all_players_matchlist_S6_S14.csv"
    final_df.to_csv(output_file, index=False)
    print(f"Data saved to {output_file}")

# Step 6: Save error log if any
if error_log:
    with open("error_log.txt", "w") as f:
        for error in error_log:
            f.write(error + "\n")
    print("Errors logged to error_log.txt")


Processing player 1 of 118: 909 (4768)
Scraping: https://gol.gg/players/player-matchlist/4768/season-S6/split-Pre-Season/tournament-ALL/
Scraping: https://gol.gg/players/player-matchlist/4768/season-S6/split-Spring/tournament-ALL/
Scraping: https://gol.gg/players/player-matchlist/4768/season-S6/split-Summer/tournament-ALL/
Scraping: https://gol.gg/players/player-matchlist/4768/season-S7/split-Pre-Season/tournament-ALL/
Scraping: https://gol.gg/players/player-matchlist/4768/season-S7/split-Spring/tournament-ALL/
Scraping: https://gol.gg/players/player-matchlist/4768/season-S7/split-Summer/tournament-ALL/
Scraping: https://gol.gg/players/player-matchlist/4768/season-S8/split-Pre-Season/tournament-ALL/
Scraping: https://gol.gg/players/player-matchlist/4768/season-S8/split-Spring/tournament-ALL/
Scraping: https://gol.gg/players/player-matchlist/4768/season-S8/split-Summer/tournament-ALL/
Scraping: https://gol.gg/players/player-matchlist/4768/season-S9/split-Pre-Season/tournament-ALL/
Scrap

In [20]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Step 1: Read player data from the extracted file
input_csv = "extracted_player_links.csv"  # Replace with your actual file name
players_df = pd.read_csv(input_csv)

# Extract player IDs and names from the `player-stats` URLs
players = players_df[['Player Name', 'Hyperlink']]
players['Player ID'] = players['Hyperlink'].apply(lambda x: x.split('/')[5])  # Extract player ID from URL

# Step 2: Define matchlist URL format and parameters
base_url = "https://gol.gg/players/player-matchlist/{player_id}/season-ALL/split-ALL/tournament-{tournament}/"
tournaments = [
    "World%20Championship%202016",
    "Mid-Season%20Invitational%202016",
    "World%20Championship%202017",
    "World%20Championship%20Play-In%202017",
    "Mid-Season%20Invitational%202017",
    "MSI%20Play-In%202017",
    "World%20Championship%202018",
    "World%20Championship%20Play-In%202018",
    "Mid-Season%20Invitational%202018",
    "MSI%20Play-In%202018",
    "World%20Championship%202019",
    "World%20Championship%20Play-In%202019",
    "Mid-Season%20Invitational%202019",
    "MSI%20Play-In%202019",
    "World%20Championship%202020",
    "World%20Championship%20Play-In%202020",
    "Mid-Season%20Cup%202020",
    "World%20Championship%202021",
    "World%20Championship%20Play-In%202021",
    "MSI%202021",
    "World%20Championship%202022",
    "World%20Championship%20Play-In%202022",
    "MSI%202022",
    "Worlds%20Main%20Event%202023",
    "Worlds%20Play-In%202023",
    "Worlds%20Qualifying%20Series%202023",
    "MSI%202023",
    "Worlds%20Main%20Event%202024",
    "Worlds%20Play-In%202024",
    "MSI%202024"
]



# Step 3: Function to scrape match history for a single player, season, and split
def scrape_matches(player_name, player_id, tournament, scraped_urls):
    url = base_url.format(player_id=player_id, tournament=tournament)
    
    # Skip URLs already processed
    if url in scraped_urls:
        print(f"Skipping already scraped URL: {url}")
        return None
    scraped_urls.add(url)

    print(f"Scraping: {url}")
    headers = {"User-Agent": "Mozilla/5.0"}
    
    try:
        response = requests.get(url, headers=headers, timeout=10)  # Timeout added
        if response.status_code != 200:
            print(f"Failed to scrape {url}: {response.status_code}")
            return None
        soup = BeautifulSoup(response.content, "html.parser")
        
        # Check if the table exists; skip parsing if not
        table = soup.find("table", {"class": "table_list"})
        if not table:
            print(f"No data found for {player_name} ({player_id}) - {tournament}")
            return None

        # Extract match data
        data = []
        rows = table.find_all("tr")
        columns = [header.text.strip() for header in rows[0].find_all("th")]  # Extract headers
        
        for row in rows[1:]:  # Skip the header row
            cells = row.find_all("td")
            data.append([cell.text.strip() for cell in cells])
        
        # Return data as DataFrame
        df = pd.DataFrame(data, columns=columns)
        df['Player Name'] = player_name
        df['Player ID'] = player_id
        return df

    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

# Step 4: Main Loop for Scraping
all_data = []
error_log = []
scraped_urls = set()  # Track already processed URLs
total_players = len(players)

for idx, player in players.iterrows():
    player_name = player['Player Name']
    player_id = player['Player ID']
    
    print(f"Processing player {idx + 1} of {total_players}: {player_name} ({player_id})")
    for tournament in tournaments:
        df = scrape_matches(player_name, player_id, tournament, scraped_urls)
        if df is not None:
            all_data.append(df)
        time.sleep(0.01)  # Add delay between requests to avoid overwhelming the server

# Step 5: Combine all data and save
if all_data:
    final_df = pd.concat(all_data, ignore_index=True)
    output_file = "all_players_matchlist_worlds.csv"
    final_df.to_csv(output_file, index=False)
    print(f"Data saved to {output_file}")

# Step 6: Save error log if any
if error_log:
    with open("error_log.txt", "w") as f:
        for error in error_log:
            f.write(error + "\n")
    print("Errors logged to error_log.txt")


Processing player 1 of 118: 909 (4768)
Scraping: https://gol.gg/players/player-matchlist/4768/season-ALL/split-ALL/tournament-World%20Championship%202016/
Scraping: https://gol.gg/players/player-matchlist/4768/season-ALL/split-ALL/tournament-Mid-Season%20Invitational%202016/
Scraping: https://gol.gg/players/player-matchlist/4768/season-ALL/split-ALL/tournament-World%20Championship%202017/
Scraping: https://gol.gg/players/player-matchlist/4768/season-ALL/split-ALL/tournament-World%20Championship%20Play-In%202017/
Scraping: https://gol.gg/players/player-matchlist/4768/season-ALL/split-ALL/tournament-Mid-Season%20Invitational%202017/
Scraping: https://gol.gg/players/player-matchlist/4768/season-ALL/split-ALL/tournament-MSI%20Play-In%202017/
Scraping: https://gol.gg/players/player-matchlist/4768/season-ALL/split-ALL/tournament-World%20Championship%202018/
Scraping: https://gol.gg/players/player-matchlist/4768/season-ALL/split-ALL/tournament-World%20Championship%20Play-In%202018/
Scraping: 