# VLR.gg Scraping

## Getting match data

In [37]:
import json
import requests
from bs4 import BeautifulSoup

# Load the URLs from the JSON file
with open('./data/match_urls.json', 'r') as f:
    urls = json.load(f)

In [38]:
print(f'Total urls: {len(urls)}')
first_two = urls[:2]
first_two[0]

Total urls: 23722


'https://www.vlr.gg/293948/bar-a-esports-vs-aym-esports-challengers-league-2024-spain-rising-split-1-r9'

In [39]:
def extract_overview(team_div: BeautifulSoup) -> dict:
    # Extract the team's score
    team_score_div = team_div.find('div', class_='score')
    team_score = team_score_div.text.strip() if team_score_div else None

    # Extract the team's name
    team_name_div = team_div.find('div', class_='team-name')
    team_name = team_name_div.text.strip() if team_name_div else None

    # Extract the T-side and CT-side rounds won
    t_side_score_div = team_div.find('span', class_='mod-t')
    t_side_score = t_side_score_div.text.strip() if t_side_score_div else None

    ct_side_score_div = team_div.find('span', class_='mod-ct')
    ct_side_score = ct_side_score_div.text.strip() if ct_side_score_div else None

    team_overview = {
        'name': team_name,
        'score': team_score,
        't_side_score': t_side_score,
        'ct_side_score': ct_side_score
    }

    return team_overview

In [40]:
def scrape_game(game_div: BeautifulSoup) -> dict:
    # Check if the div contains details
    if not game_div.find('div', class_='vm-stats-game-header'):
        return None

    # Find the team_left_div and team_right_div
    team_divs = game_div.find_all('div', class_='team')
    team_left_div, team_right_div = team_divs[0], team_divs[1]

    # Extract the game overview
    team_left_overview = extract_overview(team_left_div)
    team_right_overview = extract_overview(team_right_div)

    # Extract the map name and game duration
    map_div = game_div.find('div', class_='map')
    map_name_span = map_div.find('span')
    map_name = map_name_span.text.strip() if map_name_span else None

    game_duration_div = map_div.find('div', class_='map-duration')
    game_duration = game_duration_div.text.strip() if game_duration_div else None

    game_data = {
        'map': {
            'name': map_name,
            'duration': game_duration
        },
        'team_left': team_left_overview,
        'team_right': team_right_overview
    }

    return game_data

In [41]:
def scrape_match(url: str) -> dict:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Basic data
    match_header_vs_div = soup.find('div', class_='match-header-vs')
    team_names = [div.text.strip() for div in match_header_vs_div.find_all('div', class_='wf-title-med')]

    scoreline_div = soup.find('div', class_='match-header-vs-score')
    winner_score = scoreline_div.find('span', class_='match-header-vs-score-winner').text.strip()
    loser_score = scoreline_div.find('span', class_='match-header-vs-score-loser').text.strip()
    scoreline = f'{winner_score}:{loser_score}'

    stage = soup.find('div', class_='match-header-vs-note').text.strip() # e.g. Final, Semi-final, etc.
    match_type = soup.find_all('div', class_='match-header-vs-note')[1].text.strip() # e.g. Best of 3, Best of 5, etc.

    match_header_date_div = soup.find('div', class_='match-header-date')
    date = match_header_date_div.find_all('div', class_='moment-tz-convert')[0].text.strip()
    time = match_header_date_div.find_all('div', class_='moment-tz-convert')[1].text.strip()

    # Map-specific details
    stats_container_div = soup.find('div', class_='vm-stats-container')
    game_divs = stats_container_div.find_all('div', class_='vm-stats-game')
    game_data = [scrape_game(div) for div in game_divs if scrape_game(div) is not None]

    match_data = {
        'team_1': team_names[0],
        'team_2': team_names[1],
        'scoreline': scoreline,
        'stage': stage,
        'match_type': match_type,
        'date': date,
        'time': time,
        'games': game_data
    }

    return match_data

In [42]:
from pprint import pprint

# Test match, random
match_data = scrape_match(first_two[0])

In [43]:
from pprint import pprint

# VCT game, recent
pprint(scrape_match('https://www.vlr.gg/286651/team-liquid-vs-edward-gaming-afreecatv-valorant-league-decider-b/'))

{'date': 'Friday, December 8th',
 'games': [{'map': {'duration': '38:00',
                    'name': 'Haven\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n'
                            '\t\t\t\t\t\t\t\tPICK'},
            'team_left': {'ct_side_score': '9',
                          'name': 'Team Liquid',
                          'score': '13',
                          't_side_score': '4'},
            'team_right': {'ct_side_score': '2',
                           'name': 'EDward Gaming',
                           'score': '5',
                           't_side_score': '3'}},
           {'map': {'duration': '34:22',
                    'name': 'Lotus\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n'
                            '\t\t\t\t\t\t\t\tPICK'},
            'team_left': {'ct_side_score': '11',
                          'name': 'Team Liquid',
                          'score': '13',
                          't_side_score': '2'},
            'team_right': {'ct_side_score': '2',
                           'name'

In [44]:
# Old game
pprint(scrape_match('https://www.vlr.gg/191/prodigy-vs-fish123-cooler-cup-playoffs-qf'))

{'date': 'Saturday, April 25th',
 'games': [{'map': {'duration': '-', 'name': 'Split'},
            'team_left': {'ct_side_score': '3',
                          'name': 'Prodigy',
                          'score': '5',
                          't_side_score': '2'},
            'team_right': {'ct_side_score': '4',
                           'name': 'fish123',
                           'score': '13',
                           't_side_score': '9'}}],
 'match_type': 'Bo1',
 'scoreline': '1:0',
 'stage': 'final',
 'team_1': 'Prodigy',
 'team_2': 'fish123',
 'time': '9:00 PM EEST'}
