In [1]:
import numpy as np
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
def get_match_urls(url):
    """Collect the match links listed on a results page.

    Downloads ``url``, looks inside every ``div.results-sublist`` element for
    ``a.a-reset`` anchors that carry an ``href`` attribute, and returns those
    ``href`` values without duplicates.

    Parameters
    ----------
    url : str
        Address of the results page to download.

    Returns
    -------
    list of str
        Unique ``href`` values, in the order they first appear on the page.
    """
    page = requests.get(url)
    parsed = BeautifulSoup(page.content, 'html.parser')
    sublists = parsed.find_all('div', class_='results-sublist')

    hrefs = [
        anchor['href']
        for sublist in sublists
        for anchor in sublist.find_all('a', class_='a-reset', href=True)
    ]

    # dict.fromkeys de-duplicates while keeping first-seen order. A set of
    # strings iterates in an order that changes from one interpreter run to
    # the next, which made the scrape order non-reproducible.
    return list(dict.fromkeys(hrefs))


In [14]:
def _parse_match_info(soup):
    """Return ``[match_type, match_round]`` read from the veto box of a parsed match page.

    Any part that cannot be found is reported as the string ``'NA'`` so the
    returned list always has exactly two entries.
    """
    veto_box = soup.find('div', class_='standard-box veto-box')
    text_box = None
    if veto_box is not None:
        text_box = veto_box.find('div', class_='padding preformatted-text')
    if text_box is None:
        return ['NA', 'NA']

    # Keep only the non-empty lines of the box.
    lines = list(filter(None, text_box.get_text().split('\n')))
    if not lines:
        return ['NA', 'NA']

    match_type = lines[0]

    if len(lines) == 1:
        match_round = 'NA'
    else:
        # Drop the first two characters of the second line (presumably a
        # bullet marker such as "* " - confirm against the page), then keep
        # only the text before the first full stop.
        match_round = lines[1][2:].split('.')[0]

    return [match_type, match_round]


def get_stats_url_and_match_info(match_url):
    """Fetch a match page and return its detailed-stats link plus basic match info.

    Parameters
    ----------
    match_url : str
        Site-relative path of the match page; it is appended to
        ``https://www.hltv.org``.

    Returns
    -------
    tuple
        ``(stats_url, match_info)`` where ``stats_url`` is the ``href`` of the
        first link inside the ``small-padding stats-detailed-stats`` box, or
        the sentinel string ``'ignore'`` when that box or link is absent, and
        ``match_info`` is the two-item list ``[match_type, match_round]``
        (``'NA'`` for anything that could not be read).
    """
    url = f'https://www.hltv.org{match_url}'

    page = requests.get(url)

    soup = BeautifulSoup(page.content, 'html.parser')

    stats_box = soup.find('div', class_='small-padding stats-detailed-stats')
    stats_link = stats_box.find('a', href=True) if stats_box is not None else None

    # 'ignore' tells the caller to skip this match entirely.
    result = stats_link['href'] if stats_link is not None else 'ignore'

    # Parsed defensively: a page without a veto box must not abort the whole
    # scrape, least of all for a match that is going to be ignored anyway.
    match_info = _parse_match_info(soup)

    return result, match_info

In [4]:
def get_match_info(match_url):
    """Download a match page and read the match type and round from its veto box.

    Parameters
    ----------
    match_url : str
        Site-relative path of the match page; it is appended to
        ``https://www.hltv.org``.

    Returns
    -------
    list
        ``[match_type, match_round]``. ``match_type`` is the first non-empty
        line of the veto box text. ``match_round`` is the second non-empty
        line without its first two characters and cut at the first ``'.'``,
        or ``'NA'`` when the box holds a single line.
    """
    response = requests.get(f'https://www.hltv.org{match_url}')
    document = BeautifulSoup(response.content, 'html.parser')

    veto_box = document.find('div', class_='standard-box veto-box')
    raw_text = veto_box.find('div', class_='padding preformatted-text').get_text()

    # Discard blank lines so positions 0 and 1 are the first two real lines.
    lines = [line for line in raw_text.split('\n') if line]

    match_type = lines[0]

    if len(lines) == 1:
        return [match_type, 'NA']

    # partition('.')[0] yields the text before the first full stop, or the
    # whole string when there is none.
    match_round = lines[1][2:].partition('.')[0]

    return [match_type, match_round]

In [5]:
def get_stats_table(stats_url, match_info):
    """Download a stats page and flatten its stats tables into one DataFrame.

    Every output row starts with the same match-level values (text of the
    ``menu-header`` div, text of the first ``span`` in the match info box,
    the entries of ``match_info``, then left team, left score, right team,
    right score), followed by the team the table belongs to and the text of
    each ``<td>`` in the table row.

    Parameters
    ----------
    stats_url : str
        Site-relative path of the stats page; it is appended to
        ``https://www.hltv.org``.
    match_info : list
        Values inserted after the first two columns of every row.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` found in the ``stats-table`` tables, with default
        integer column labels and all cell values as scraped text.
    """
    response = requests.get(f'https://www.hltv.org{stats_url}')
    document = BeautifulSoup(response.content, 'html.parser')

    championship = document.find('div', class_='menu-header').get_text()

    info_box = document.find('div', class_='match-info-box')
    match_time = info_box.find('span').get_text()

    def read_side(css_class):
        # Team name comes from the side's link, score from its first nested div.
        container = info_box.find('div', class_=css_class)
        name = container.find('a', class_='block text-ellipsis').get_text()
        score = container.find('div').get_text()
        return name, score

    left_name, left_score = read_side('team-left')
    right_name, right_score = read_side('team-right')
    teams = [left_name, right_name]

    shared_prefix = (
        [championship, match_time]
        + match_info
        + [left_name, left_score, right_name, right_score]
    )

    rows = []
    # The n-th stats table is attributed to the n-th team (left, then right).
    for position, table in enumerate(document.find_all('table', class_='stats-table')):
        for table_row in table.find('tbody').find_all('tr'):
            cells = [cell.get_text() for cell in table_row.find_all('td')]
            rows.append(shared_prefix + [teams[position]] + cells)

    return pd.DataFrame(rows)

In [11]:
def clean_stats_table(df):
    """Convert a raw scraped stats table into a typed, labelled DataFrame.

    The input is addressed purely by column position and must have exactly
    18 columns, laid out as produced by ``get_stats_table``. The input frame
    is left untouched; a new frame is returned.

    Conversions applied:

    * position 1 is parsed as a datetime with format ``%Y-%m-%d %H:%M``;
    * positions 5, 7, 12 and 14-17 are converted to numbers;
    * position 13 has ``%`` stripped from both ends and is converted to a number;
    * positions 10 and 11 each hold up to two integers in one string
      (for example ``"20 (10)"``) and are split into two numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table whose cells are strings.

    Returns
    -------
    pandas.DataFrame
        Frame with the 20 named columns listed in ``colnames`` and the same
        index as ``df``. A missing second number in a split column is ``NaN``
        for that row only.

    Raises
    ------
    IndexError
        If ``df`` has fewer than 18 columns.
    ValueError
        If ``df`` has more than 18 columns (the names no longer fit), or a
        value cannot be parsed.
    """
    colnames = ['Championship','Date','Match Type','Round','Team 1', 'Team 1 Score','Team 2', 'Team 2 Score','Player for','Player',
                'Deaths','KAST %','KD Diff','ADR','FK Diff','Rating','Assists','Flash Assists','Kills','Headshots']

    def split_pair(series):
        # str.extract keeps the original row index, so each value stays on
        # the row it came from even when a row has no second number. The
        # pattern takes the first run of up to three digits, skips any
        # non-digits, then optionally takes the next run.
        parts = series.str.extract(r'(\d{1,3})\D*(\d{1,3})?')
        return [pd.to_numeric(parts[0]), pd.to_numeric(parts[1])]

    # Work on detached Series so the caller's frame is never modified and
    # every conversion really changes the dtype of the resulting column.
    columns = [df.iloc[:, position] for position in range(df.shape[1])]

    columns[1] = pd.to_datetime(columns[1], format='%Y-%m-%d %H:%M')
    columns[13] = pd.to_numeric(columns[13].str.strip('%'))
    for position in (5, 7, 12, 14, 15, 16, 17):
        columns[position] = pd.to_numeric(columns[position])

    kills_and_headshots = split_pair(columns[10])
    assists_and_flash = split_pair(columns[11])

    # Final layout: untouched/converted columns without the two combined
    # ones, then Assists, Flash Assists, Kills, Headshots.
    kept = columns[:10] + columns[12:]
    df_new = pd.concat(
        kept + assists_and_flash + kills_and_headshots,
        axis=1,
        ignore_index=True,
    )
    df_new.columns = colnames

    return df_new

In [None]:
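# NOTE: legacy driver loop kept for reference only. It calls get_stats_url, which is not
# defined in the cells shown here, and passes get_stats_table a single argument;
# get_match_data below is the current entry point.
# data = pd.DataFrame()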

# for pg in range(3,7):
    
#     page = f'https://www.hltv.org/results?offset={pg}00&stars=1'
    
#     match_urls = get_match_urls(page)
#     print(page)
#     page_data = pd.DataFrame()

#     for url in match_urls:
#         stats_url = get_stats_url(url)
#         if stats_url != 'ignore':
#             stats_table = get_stats_table(stats_url)
#             clean_df = clean_stats_table(stats_table)
#             page_data = page_data.append(clean_df)
    
#     data = data.append(page_data)

#     data.to_csv(f'csgo_scraped_data_page{pg}.csv')

In [15]:
def get_match_data(pages):
    """Scrape every match listed on each results page into one DataFrame.

    For each page URL the match links are collected, each match's stats link
    and match info are fetched, and the cleaned stats table of every match
    that has a stats link is gathered. A progress line is printed after each
    page.

    Parameters
    ----------
    pages : iterable of str
        URLs of the results pages to scrape, processed in the given order.

    Returns
    -------
    pandas.DataFrame
        The cleaned stats tables of all pages stacked vertically, each
        keeping its own row index. An empty DataFrame is returned when no
        match produced a table.
    """
    frames = []

    for i, page in enumerate(pages):

        for url in get_match_urls(page):
            stats_url, match_info = get_stats_url_and_match_info(url)

            # 'ignore' marks a match without a detailed-stats link.
            if stats_url == 'ignore':
                continue

            stats_table = get_stats_table(stats_url, match_info)
            frames.append(clean_stats_table(stats_table))

        print(f'Page {i+1} was scraped successfully')

    # Returning only after the loop lets every requested page be scraped;
    # a return inside the loop would stop after the first page.
    if not frames:
        return pd.DataFrame()

    # One concat at the end replaces repeated DataFrame.append calls, which
    # were removed in pandas 2.0 and copied the whole frame on every call.
    return pd.concat(frames)

In [16]:
# Run the scraper on a single results URL; `data` holds the DataFrame it returns.
data = get_match_data(['https://www.hltv.org/results?stars=1'])

Page 1 was scraped successfully
