# VLR.gg Scraping

## Getting match data

In [74]:
import json
import requests
from bs4 import BeautifulSoup

# Read the list of match page URLs that was saved to the JSON file
with open('./data/match_urls.json', 'r') as url_file:
    urls = json.loads(url_file.read())

In [75]:
# Report how many match URLs were loaded
print(f'Total urls: {len(urls)}')
# Keep the first two URLs as a small sample for trying out the scraper
first_two = urls[:2]
# Trailing bare expression: the notebook displays it as the cell's output
first_two[0]

Total urls: 23722


'https://www.vlr.gg/293948/bar-a-esports-vs-aym-esports-challengers-league-2024-spain-rising-split-1-r9'

In [76]:
def extract_overview(team_div: BeautifulSoup) -> dict:
    """Collect one team's headline numbers for a single game.

    Args:
        team_div: Parsed element holding the team's name and scores.

    Returns:
        A dict with the keys 'name', 'score', 't_side_score' and
        'ct_side_score'. Each value is the whitespace-stripped text of the
        matching element, or None when that element is not found.
    """
    def stripped_text(tag_name, css_class):
        # First descendant with this tag and class, reduced to its text
        node = team_div.find(tag_name, class_=css_class)
        return node.text.strip() if node else None

    return {
        'name': stripped_text('div', 'team-name'),
        'score': stripped_text('div', 'score'),
        # Rounds won on each side are rendered as separate spans
        't_side_score': stripped_text('span', 'mod-t'),
        'ct_side_score': stripped_text('span', 'mod-ct')
    }

In [77]:
# (result key, column locator) for every per-side stat column, in output order.
# An int locator is a positional index into the row's <td> cells: those columns
# share the same class, so position is the only way to tell them apart.
# A str locator is the CSS class of a <td> that can be looked up directly.
_STAT_COLUMNS = (
    ('r_stats', 2),
    ('acs_stats', 3),
    ('kill_stats', 'mod-vlr-kills'),
    ('death_stats', 'mod-vlr-deaths'),
    ('assist_stats', 'mod-vlr-assists'),
    ('kd_diff_stats', 'mod-kd-diff'),
    ('kast_stats', 8),
    ('adr_stats', 9),
    ('headshot_stats', 10),
    ('fk_stats', 11),
    ('fd_stats', 12),
    ('fk_diff_stats', 13),
)

# (result key, CSS class) of the three spans found inside every stat cell.
_SIDE_SPANS = (('both', 'mod-both'), ('t', 'mod-t'), ('ct', 'mod-ct'))


def _text_or_none(node):
    """Return the stripped text of *node*, or None when the node is missing."""
    return node.text.strip() if node else None


def _extract_side_stats(cell):
    """Return {'both', 't', 'ct'} for one stat cell, or None if it has no stats."""
    if not cell:
        return None
    container = cell.find('span', class_='stats-sq')
    if not container:
        return None
    return {
        key: _text_or_none(container.find('span', class_=css_class))
        for key, css_class in _SIDE_SPANS
    }


def extract_player_info(table):
    """Extract one dict of details and stats per player row of a stats table.

    Args:
        table: Parsed <table> element of one team's scoreboard.

    Returns:
        A list with one dict per <tr> inside the table's <tbody> (an empty
        list when there is no <tbody>). A dict may contain:

        * 'country', 'name', 'team_code' - set when the row has a
          'mod-player' cell; each is None if its element is missing.
        * 'agent' - set when the row has a 'mod-agents' cell.
        * one '<stat>_stats' entry per column in _STAT_COLUMNS, each a dict
          with the keys 'both', 't' and 'ct'. An entry is omitted when its
          cell, or the 'stats-sq' span inside it, is absent.

        Rows with fewer cells than expected yield whatever could be read
        instead of raising IndexError.
    """
    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody else []

    player_info = []
    for row in rows:
        player_data = {}

        # Player identity: country flag, display name and team code
        player_cell = row.find('td', class_='mod-player')
        if player_cell:
            flag = player_cell.find('i', class_='flag')
            player_data['country'] = flag.get('title') if flag else None
            player_data['name'] = _text_or_none(
                player_cell.find('div', class_='text-of'))
            player_data['team_code'] = _text_or_none(
                player_cell.find('div', class_='ge-text-light'))

        # The agent name is carried by the image's title attribute
        agent_cell = row.find('td', class_='mod-agents')
        if agent_cell:
            agent_img = agent_cell.find('img')
            player_data['agent'] = agent_img.get('title') if agent_img else None

        # Collect the cells once; positional columns index into this list
        cells = row.find_all('td')
        for key, locator in _STAT_COLUMNS:
            if isinstance(locator, int):
                cell = cells[locator] if locator < len(cells) else None
            else:
                cell = row.find('td', class_=locator)

            stats = _extract_side_stats(cell)
            if stats is not None:
                player_data[key] = stats

        player_info.append(player_data)

    return player_info

In [78]:
def scrape_game(game_div: BeautifulSoup) -> dict:
    """Turn one game (map) section of a match page into a nested dict.

    Args:
        game_div: Parsed element for a single game's stats section.

    Returns:
        None when the section has no 'vm-stats-game-header' element.
        Otherwise a dict with the keys 'map' (name and duration),
        'team_left' and 'team_right'; each team entry holds a
        'team_overview' dict and a 'players' list.
    """
    # Sections without a header carry no per-game details, so skip them
    header = game_div.find('div', class_='vm-stats-game-header')
    if not header:
        return None

    # The first two 'team' divs are treated as the left and right team
    team_divs = game_div.find_all('div', class_='team')
    left_div = team_divs[0]
    right_div = team_divs[1]
    left_overview = extract_overview(left_div)
    right_overview = extract_overview(right_div)

    # Map name and game duration both live inside the 'map' div
    map_div = game_div.find('div', class_='map')
    name_span = map_div.find('span')
    duration_div = map_div.find('div', class_='map-duration')
    map_info = {
        'name': name_span.text.strip() if name_span else None,
        'duration': duration_div.text.strip() if duration_div else None
    }

    # One scoreboard table per team, in the same left/right order
    tables = game_div.find_all('table', class_='wf-table-inset')
    left_players = extract_player_info(tables[0])
    right_players = extract_player_info(tables[1])

    return {
        'map': map_info,
        'team_left': {
            'team_overview': left_overview,
            'players': left_players
        },
        'team_right': {
            'team_overview': right_overview,
            'players': right_players
        }
    }

In [79]:
def scrape_match(url: str, timeout: float = 30) -> dict:
    """Download a match page and extract its header details and per-game stats.

    Args:
        url: Address of the match page to scrape.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang a long scraping run.

    Returns:
        A dict with the keys 'team_1', 'team_2', 'scoreline', 'stage',
        'match_type', 'date', 'time' and 'games'. 'games' is a list with one
        entry per game section for which scrape_game returned data.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        AttributeError, IndexError: If the page lacks the expected header
            elements.
    """
    response = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of trying to parse an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Basic data
    match_header_vs_div = soup.find('div', class_='match-header-vs')
    team_names = [
        div.text.strip()
        for div in match_header_vs_div.find_all('div', class_='wf-title-med')
    ]

    # NOTE(review): the scoreline is always written winner:loser, so it does
    # not show which of team_1/team_2 won - confirm this is intended.
    scoreline_div = soup.find('div', class_='match-header-vs-score')
    winner_score = scoreline_div.find('span', class_='match-header-vs-score-winner').text.strip()
    loser_score = scoreline_div.find('span', class_='match-header-vs-score-loser').text.strip()
    scoreline = f'{winner_score}:{loser_score}'

    # Both notes share one class; look them up once and take them by position
    notes = soup.find_all('div', class_='match-header-vs-note')
    stage = notes[0].text.strip()  # e.g. Final, Semi-final, etc.
    match_type = notes[1].text.strip()  # e.g. Best of 3, Best of 5, etc.

    # Date and time are the first and second 'moment-tz-convert' elements
    match_header_date_div = soup.find('div', class_='match-header-date')
    timestamps = match_header_date_div.find_all('div', class_='moment-tz-convert')
    date = timestamps[0].text.strip()
    time = timestamps[1].text.strip()

    # Map-specific details: parse each game section once, dropping the
    # sections scrape_game reports as empty (None)
    stats_container_div = soup.find('div', class_='vm-stats-container')
    game_divs = stats_container_div.find_all('div', class_='vm-stats-game')
    game_data = [game for game in map(scrape_game, game_divs) if game is not None]

    match_data = {
        'team_1': team_names[0],
        'team_2': team_names[1],
        'scoreline': scoreline,
        'stage': stage,
        'match_type': match_type,
        'date': date,
        'time': time,
        'games': game_data
    }

    return match_data

In [None]:
from pprint import pprint

# Try the scraper on one arbitrary match: the first of the sample URLs
sample_url = first_two[0]
pprint(scrape_match(sample_url))

## Testing on a few URLs

In [None]:
# Fill in the handful of match URLs to test with. A separate name is used so
# the full `urls` list loaded from match_urls.json is not overwritten.
test_urls = []
data = []
BATCH_SIZE = 100  # scraped matches buffered in memory before each write

# Logging the URLs we've scraped
with open('scraped_urls.log', 'a') as log_file:

    for url in test_urls:
        scraped_data = scrape_match(url)
        data.append(scraped_data)
        log_file.write(url + '\n')

        # Save the data every BATCH_SIZE URLs. Each batch is written as one
        # JSON array on its own line, so the file can be read back line by
        # line; arrays appended back to back with no separator cannot be
        # parsed.
        if len(data) >= BATCH_SIZE:
            with open('scraped_data.json', 'a') as data_file:
                json.dump(data, data_file)
                data_file.write('\n')
            data = []

    # Write whatever is left over from the last, partial batch
    if data:
        with open('scraped_data.json', 'a') as data_file:
            json.dump(data, data_file)
            data_file.write('\n')

In [81]:
# VCT game, recent
# pprint(scrape_match('https://www.vlr.gg/286651/team-liquid-vs-edward-gaming-afreecatv-valorant-league-decider-b/'))

In [82]:
# Old game
# pprint(scrape_match('https://www.vlr.gg/191/prodigy-vs-fish123-cooler-cup-playoffs-qf'))