# CSGO Liquipedia Scraper

### Attariq Muhammad Azhar (18221043)

### Tugas Seleksi Calon Asisten Lab Basis Data

## Import Libraries

In [8]:
from bs4 import BeautifulSoup
import requests 
import json
import time

## Function Declarations

In [5]:
# Get the soup object from url
def parseHTML(url, timeout=30):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup object.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the scrape forever.

    Returns:
        A BeautifulSoup object built from the response body with the 'lxml' parser.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with a 4xx/5xx status.
    """
    # Send GET request to url with a user-agent header that identifies the scraper
    headers = {'user-agent' : 'Mozilla/5.0 (X11; Linux x86_64); Basis Data/Admin Basis Data/basisdata@std.stei.itb.ac.id'}
    response = requests.get(url, headers=headers, timeout=timeout)

    # Fail loudly on error responses instead of parsing an error page as if it
    # were the requested content
    response.raise_for_status()

    # Parse HTML with BeautifulSoup
    return BeautifulSoup(response.text, 'lxml')

# Build the absolute liquipedia url for a site-relative href
def liquipedia_url(href):
    """Return ``href`` prefixed with the Liquipedia scheme and host.

    Args:
        href: Site-relative path as found in an anchor's ``href`` attribute.

    Returns:
        The string 'https://liquipedia.net' immediately followed by ``href``.
    """
    base = 'https://liquipedia.net'
    return ''.join((base, href))

# Get the name of the first or second placed team of a tournament
def getTeamName(tournament, is_winner):
    """Look up the name of a placed team on that team's own page.

    Args:
        tournament: The 'gridRow' tag of one tournament.
        is_winner: Truthy to use the first place cell, falsy for the second
            place cell.

    Returns:
        The text of the team page's infobox header with its first six
        characters removed.
    """
    # Both placements share the same markup; only the last class name differs
    placement = 'FirstPlace' if is_winner else 'SecondPlace'
    placement_cell = tournament.find('div', class_ = 'gridCell Placement ' + placement)

    # Walk down to the anchor that links to the team page
    short_span = placement_cell.find('span', class_ = 'team-template-team-short')
    text_span = short_span.find('span', class_ = 'team-template-text')
    team_href = text_span.find('a')['href']

    # Fetch and parse the team page
    team_page = parseHTML(liquipedia_url(team_href))

    # The slice skips a fixed six character prefix of the header text
    # (presumably the "[e][h]" edit/history link labels - verify against the page markup)
    header = team_page.find('div', class_='infobox-header wiki-backgroundcolor-light')
    return header.text[6:]

# Get the team data that participated in the tournament
def getTeamData(tournament, is_winner):
    """Scrape name, region and approximate total winnings of a placed team.

    Args:
        tournament: The 'gridRow' tag of one tournament.
        is_winner: Truthy to scrape the first place team, falsy for the
            second place team.

    Returns:
        A dictionary with the keys "team_name", "region" and
        "approx_total_winnings". The winnings are an int and fall back to 0
        when the value is missing from the team page or cannot be parsed.

    Raises:
        ValueError: If the team page link, the team name or the region cannot
            be scraped. The underlying error is chained as the cause.
    """
    # Both placements share the same markup; only the last class name differs
    placement = 'FirstPlace' if is_winner else 'SecondPlace'

    # Get url of the team page. Nothing else can be scraped without it, so
    # stop here instead of failing later on an unbound variable.
    try:
        href = (
            tournament.find('div', class_ = 'gridCell Placement ' + placement)
            .find('span', class_ = 'team-template-team-short')
            .find('span', class_ = 'team-template-text')
            .find('a')['href']
        )
    except Exception as e:
        print("Error while scraping href")
        print(e)
        raise ValueError("Could not scrape the team page link") from e

    # Parse HTML from url of the team and get the team name from subsoup
    try:
        subsoup = parseHTML(liquipedia_url(href))
        # The slice skips a fixed six character prefix of the header text
        # (presumably the "[e][h]" edit/history link labels - verify against the page markup)
        team_name = subsoup.find('div', class_='infobox-header wiki-backgroundcolor-light').text[6:]
    except Exception as e:
        print("Error while scraping team_name")
        print(e)
        raise ValueError("Could not scrape the team name") from e

    # Get team region from subsoup
    # (`string=` is the current name of the filter formerly called `text=`)
    try:
        region_div = subsoup.find('div', class_='infobox-cell-2 infobox-description', string='Region:')
        region = region_div.find_next_sibling('div').text.strip()
    except Exception as e:
        print("Error while scraping region")
        print(e)
        raise ValueError("Could not scrape the team region") from e

    # Get approx. total winnings of the team from subsoup
    try:
        approx_div = subsoup.find('div', class_='infobox-cell-2 infobox-description', string='Approx. Total Winnings:')
        approx = approx_div.find_next_sibling('div').text.strip()

        # Cast to int: drop the leading character (presumably the currency
        # symbol) and the thousands separators
        approx = int(approx[1:].replace(',', ''))
    except (AttributeError, ValueError):
        # If the team has no (parsable) approx. total winnings, the variable is set to 0.
        # Only the expected lookup/parse errors are handled so that e.g. Ctrl-C
        # is not swallowed.
        approx = 0

    # Return the data as a dictionary
    return {
        "team_name" : team_name,
        "region" : region,
        "approx_total_winnings": approx
    }

# Get the text of the infobox cell that follows the given description label
def _infobox_value(subsoup, label):
    # `string=` is the current name of the filter formerly called `text=`
    label_div = subsoup.find('div', class_ = 'infobox-cell-2 infobox-description', string = label)
    return label_div.find_next_sibling('div').text.strip()


# Get the tournament data
def getTournamentData(tournament):
    """Scrape the data of one tournament from its list row and its own page.

    Args:
        tournament: The div tag with class 'gridRow' of one tournament.

    Returns:
        A dictionary with the keys "tournament_name", "start_date",
        "end_date", "city", "country", "type", "is_major", "winner",
        "runner_up", "prize_pool" (float) and "number_of_participants" (int).
        For tournaments whose type is 'Online', "city" and "country" are
        empty strings.

    Raises:
        ValueError: If any required field cannot be scraped. Every failure is
            printed as it happens and the error message lists all of them.
        Exception: Whatever getTeamName raises while resolving the winner and
            the runner up.
    """
    # Names of the fields that could not be scraped
    failed = []

    def report(field, error):
        # Same console output as before, plus bookkeeping for the final check
        print(f"Error while scraping {field}")
        print(error)
        failed.append(field)

    tournament_name = start_date = end_date = type_ = None
    city = country = prize_pool = number_of_participants = None
    is_major = False
    subsoup = None

    # Get tournament name from tournament
    try:
        tournament_name = tournament.find('div', class_ = 'gridCell Tournament Header').b.a.text
    except Exception as e:
        report('tournament_name', e)

    # Parse HTML from the tournament and get the subsoup
    # This parsing is done to get the tournament's start date, end date, and type (online/offline)
    try:
        href = tournament.find('div', class_ = 'gridCell Tournament Header').find('b').find('a')['href']
        subsoup = parseHTML(liquipedia_url(href))
    except Exception as e:
        report('href', e)

    # The following fields live on the tournament page; without the page there
    # is nothing to look at and the 'href' failure above already explains why
    if subsoup is not None:
        # Get the tournament's start date from subsoup
        try:
            start_date = _infobox_value(subsoup, 'Start Date:')
        except Exception as e:
            report('start_date', e)

        # Get the tournament's end date from subsoup
        try:
            end_date = _infobox_value(subsoup, 'End Date:')
        except Exception as e:
            report('end_date', e)

        # Get the tournament's type from subsoup
        try:
            type_ = _infobox_value(subsoup, 'Type:')
        except Exception as e:
            report('type', e)

        # The tournament counts as a major when the page contains a
        # 'valvepremier-highlighted' div
        is_major = subsoup.find('div', class_ = 'valvepremier-highlighted') is not None

    # Get the tournament's city from the gridRow
    try:
        city = tournament.find('div', class_='gridCell EventDetails Location Header').text
        city = city.replace('\xa0', '')
    except Exception as e:
        report('city', e)

    # Get the tournament's country from the gridRow
    try:
        country = tournament.find('div', class_='gridCell EventDetails Location Header').find('span', class_ = 'flag').find('img')['title']
    except Exception as e:
        report('country', e)

    # If the tournament is online, the city and country variables are set to empty strings,
    # so a failure to scrape them does not matter in that case
    location_optional = type_ == 'Online'
    if location_optional:
        city = ''
        country = ''

    # Get the tournament's prize pool from the gridRow
    try:
        prize_pool = tournament.find('div', class_ = 'gridCell EventDetails Prize Header').text
        # Drop the leading character (presumably the currency symbol) and the thousands separators
        prize_pool = float(prize_pool[1:].replace(',', ''))
    except Exception as e:
        report('prize_pool', e)

    # Get the tournament's number of participants from the gridRow
    try:
        number_of_participants = tournament.find('div', class_ = 'gridCell EventDetails PlayerNumber Header').text
        number_of_participants = int(number_of_participants.replace('\u00a0teams', ''))
    except Exception as e:
        report('number_of_participants', e)

    # Stop with one explicit error instead of tripping over a missing value later.
    # This check runs before the team pages are requested, so no requests are
    # spent on a tournament that will be rejected anyway.
    missing = [
        field for field in failed
        if not (location_optional and field in ('city', 'country'))
    ]
    if missing:
        raise ValueError('Could not scrape tournament fields: ' + ', '.join(missing))

    # Return the data as a dictionary
    return {
        "tournament_name" : tournament_name,
        "start_date" : start_date,
        "end_date" : end_date,
        "city" : city,
        "country" : country,
        "type" : type_,
        "is_major" : is_major,
        # Get the tournament's winner and runner up
        "winner" : getTeamName(tournament, True),
        "runner_up" : getTeamName(tournament, False),
        "prize_pool" : prize_pool,
        "number_of_participants" : number_of_participants
    }

## Scraping Data

In [6]:
# Scrape every finished CS:GO tournament listed on the S-Tier page, together
# with the teams that placed first and second.

# Variables declaration
soup = parseHTML('https://liquipedia.net/counterstrike/S-Tier_Tournaments')

# Variable contains all the div tags with class = 'gridRow'
# Each div tag contains the data of a tournament
tournaments = soup.find_all('div', class_ = 'gridRow')

# Lists to store the data
tournament_data = []
team_data = []

# Counters to keep track of the number of tournaments that had been scraped and errors
count = 0
error_count = 0

# Loop through all the tournaments
for tournament in tournaments:
    try:
        # Probe for the link of the first place team; the value itself is not used.
        # If the tournament is not conducted yet, a lookup on the way returns None
        # and the next step raises.
        winner_href = tournament.find('div', class_ = 'gridCell Placement FirstPlace').find('span', class_ = 'team-template-team-short').find('span', class_ = 'team-template-text').find('a')['href']

        # Get the game type of the tournament (CS:GO / CS)
        game = tournament.find('div', class_ = 'gridCell Game Header').find('span', class_ = 'icon-16px').a['title']
    except (AttributeError, TypeError, KeyError):
        # Row without a winner or a game icon: skip it.
        # Only the lookup errors are handled here so that Ctrl-C still stops the run.
        continue

    # If the game type is CS (not CS:GO), the tournament will be skipped
    # (This scraping process is only for CS:GO Tournaments)
    if game != 'Counter-Strike: Global Offensive':
        continue

    try:
        # Get the tournament's data and store it in the tournament_data list
        # NOTE: the tournament stays in the list even if scraping one of its teams fails below
        tourney = getTournamentData(tournament)
        tournament_data.append(tourney)

        # Get the tournament's winner and store it in the team_data list (if it's not already in the list)
        winner = getTeamData(tournament, True)
        if winner not in team_data:
            team_data.append(winner)

        # Get the tournament's runner up and store it in the team_data list (if it's not already in the list)
        runner_up = getTeamData(tournament, False)
        if runner_up not in team_data:
            team_data.append(runner_up)

        # Output to notify the user that the tournament had been scraped
        # and the number of tournaments that had been scraped
        count += 1
        print('\n' + tourney.get('tournament_name') + ' scraped')
        print(f'{count} tournaments had been scraped\n')

    except Exception as e:
        # Exception handler and counter to keep track of the number of errors
        print(e)
        error_count += 1

    # Sleep for 3 seconds to prevent getting blocked by the website.
    # Skipped rows never reach this point: they sent no request, so there is
    # nothing to throttle.
    time.sleep(3)

# Output to notify the user that the scraping process had been completed
if error_count > 0:
    print(f'{error_count} errors occured while scraping')
else:
    print('No errors occured while scraping')
    
          




BLAST Premier: Spring Final 2023 scraped
1 tournaments had been scraped


Intel Extreme Masters Dallas 2023 scraped
2 tournaments had been scraped


BLAST.tv Paris Major 2023 scraped
3 tournaments had been scraped


Intel Extreme Masters Rio 2023 scraped
4 tournaments had been scraped


ESL Pro League Season 17 scraped
5 tournaments had been scraped


Intel Extreme Masters Katowice 2023 scraped
6 tournaments had been scraped


BLAST Premier: World Final 2022 scraped
7 tournaments had been scraped


BLAST Premier: Fall Finals 2022 scraped
8 tournaments had been scraped


Intel Extreme Masters Rio Major 2022 scraped
9 tournaments had been scraped


ESL Pro League Season 16 scraped
10 tournaments had been scraped


Intel Extreme Masters XVII - Cologne scraped
11 tournaments had been scraped


Roobet Cup 2022 scraped
12 tournaments had been scraped


BLAST Premier: Spring Finals 2022 scraped
13 tournaments had been scraped


Intel Extreme Masters XVII - Dallas scraped
14 tournaments had been scraped

## Writing Data to JSON

In [7]:
# Dump both result lists to their JSON files, tournaments first and teams second
exports = (
    ('CSGO_Tournaments.json', tournament_data),
    ('CSGO_Teams.json', team_data),
)

for filename, records in exports:
    # Each file is (over)written with the whole list, indented by 4 spaces
    with open(filename, 'w') as outfile:
        json.dump(records, outfile, indent=4)
    