## Importing the necessary dependencies

In [3]:
!pip install requests beautifulsoup4 selenium pandas

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install undetected_chromedriver

Defaulting to user installation because normal site-packages is not writeable
Collecting undetected_chromedriver
  Downloading undetected-chromedriver-3.5.5.tar.gz (65 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting selenium>=4.9.0 (from undetected_chromedriver)
  Downloading selenium-4.31.0-py3-none-any.whl.metadata (7.5 kB)
Collecting websockets (from undetected_chromedriver)
  Downloading websockets-15.0.1-cp312-cp312-win_amd64.whl.metadata (7.0 kB)
Collecting trio~=0.17 (from selenium>=4.9.0->undetected_chromedriver)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium>=4.9.0->undetected_chromedriver)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium>=4.9.0->undetected_chromedriver)
  Downloading attrs-25.3.0-py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium>=



In [3]:
import pandas as pd
import requests

import undetected_chromedriver as uc
import time
import pandas as pd 
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

## Predefined Functions

In [4]:
# Convert "X / Y" to percentage
def to_percentage(value):
    if (value == '-'):
        return f"{(0 / 1) :.2f}"
    x, y = map(int, value.split(' / '))
    return f"{(x / y) :.2f}"

# Mapping function
def get_advantage(xvy):
    x, y = map(int, xvy.split('v'))
    return f"+{x - y}" if x - y > 0 else f"{x - y}"

## Scraping the Data

#### Pistol Rounds, First Kills, Clutch%

In [5]:
def scrape_main_data(driver, teams):
    all_stats = {}

    # Find all stat categories
    stat_names = driver.find_elements(By.XPATH, "//div[contains(text(), 'Pistol Rounds Won') or contains(text(), 'First Kills') or contains(text(), 'KAST') or contains(text(), 'Clutches')]")

    for stat in stat_names:
        stat_name = stat.text

        try:
            # Look for numerical values in adjacent elements
            team1_value_elements = stat.find_elements(By.XPATH, "./preceding-sibling::div[contains(@class, 'css-') and string-length(text()) > 0]")
            team2_value_elements = stat.find_elements(By.XPATH, "./following-sibling::div[contains(@class, 'css-') and string-length(text()) > 0]")

            all_stats[stat_name] = [team1_value_elements[0].text.strip().replace('%', '').replace('.', ''), team2_value_elements[0].text.strip().replace('%', '').replace('.', '')]
        
        except Exception as e:
            print(f"Error extracting {stat_name}: {e}")

    # Create a DataFrame
    if all_stats and len(teams) == 2:
        main_df = pd.DataFrame.from_dict(all_stats, orient='index', columns=teams).transpose()
        print(main_df)

        # Converting the clutches into percentages
        main_df["Clutches"] = main_df["Clutches"].str.split(" / ").apply(lambda x: int(x[0]) / int(x[1]) if int(x[1]) != 0 else 0)
        main_df['KAST'] = main_df['KAST'].astype('float').div(100)

    else:
        print("Failed to extract meaningful stat data")

    return main_df

In [None]:
url = "https://www.rib.gg/series/83355?tab=team-stats"
driver = uc.Chrome(version_main=135)
driver.get(url)

# Extract the team names
team_elements = driver.find_elements(By.XPATH, "//p[contains(@class, 'css-1aujtoy') and string-length(text()) > 0]")
teams = [team.text.strip() for team in team_elements if len(team.text.strip()) > 0][:2]  # Extract first two team names
print(f"Teams detected: {teams}")

main_df = scrape_main_data(driver, teams)
driver.quit()

main_df

Teams detected: ['G2 Esports', 'Cloud9']
Results detected: ['Winner', 'Loser']
           Pistol Rounds Won First Kills KAST Clutches
G2 Esports                 3          26   80   3 / 15
Cloud9                     1          14   59   3 / 28


Unnamed: 0,Pistol Rounds Won,First Kills,KAST,Clutches
G2 Esports,3,26,0.8,0.2
Cloud9,1,14,0.59,0.107143


#### Economy Data

In [8]:
def scrape_eco_data(driver, teams):
    all_stats = {}

    # Find all stat categories
    stat_names = driver.find_elements(By.XPATH, "//div[text()='Eco' or text()='Semi-Eco' or text()='Half-Buy' or text()='Full-Buy']")

    for stat in stat_names:
        stat_name = stat.text
    #     print(f"\nFound stat: {stat_name}")

        try:
            # Look for numerical values in adjacent elements
            team1_value_elements = stat.find_elements(By.XPATH, "./parent::div/preceding-sibling::div/child::div[contains(@class, 'css-') and string-length(text()) > 0]")
            team2_value_elements = stat.find_elements(By.XPATH, "./parent::div/following-sibling::div/child::div[contains(@class, 'css-') and string-length(text()) > 0]")

            all_stats[stat_name] = [team1_value_elements[0].text.strip().replace('%', '').replace('.', '').replace('-', '0'), team2_value_elements[0].text.strip().replace('%', '').replace('.', '').replace('-', '0')]
        
        except Exception as e:
            print(f"Error extracting {stat_name}: {e}")

    # Create a DataFrame
    if all_stats and len(teams) == 2:
        eco_df = pd.DataFrame.from_dict(all_stats, orient='index', columns=teams).transpose()
        eco_df = eco_df.astype('float').div(100) # Transpose and divide by 100 to turn percentages into decimals
    else:
        print("Failed to extract meaningful stat data")
    
    return eco_df
    

In [None]:
url = "https://www.rib.gg/series/83355?tab=team-stats"
driver = uc.Chrome(version_main=135)
driver.get(url)

# Extract the team names
team_elements = driver.find_elements(By.XPATH, "//p[contains(@class, 'css-1aujtoy') and string-length(text()) > 0]")
teams = [team.text.strip() for team in team_elements if len(team.text.strip()) > 0][:2]  # Extract first two team names
print(f"Teams detected: {teams}")

eco_df = scrape_eco_data(driver, teams)
driver.quit()

eco_df

Teams detected: ['G2 Esports', 'Cloud9']
Results detected: ['Winner', 'Loser']


Unnamed: 0,Eco,Semi-Eco,Half-Buy,Full-Buy
G2 Esports,0.0,0.0,0.64,0.67
Cloud9,0.0,0.0,0.2,0.52


#### XvY Conversions

In [10]:
def scrape_xvy_data(driver, teams):
    all_stats = {}

    # Find all stat categories
    stat_names = driver.find_elements(By.XPATH, "//div[contains(text(), '5v4') or contains(text(), '4v5') or contains(text(), '4v4') or contains(text(), '4v3') or contains(text(), '3v4') or contains(text(), '3v3') or contains(text(), '5v3') or contains(text(), '3v5') or contains(text(), '3v2') or contains(text(), '2v3') or contains(text(), '4v2') or contains(text(), '2v4') or contains(text(), '2v2') or contains(text(), '3v1') or contains(text(), '1v3') or contains(text(), '1v1') or contains(text(), '2v1') or contains(text(), '1v2') or contains(text(), '1v4') or contains(text(), '4v1') or contains(text(), '5v2') or contains(text(), '2v5') or contains(text(), '5v1') or contains(text(), '1v5')]")

    for stat in stat_names:
        stat_name = stat.text

        try:
            # Look for numerical values in adjacent elements
            team1_value_elements = stat.find_elements(By.XPATH, "./parent::div/parent::div/preceding-sibling::div[contains(@class, 'css-') and string-length(text()) > 0]")
            team2_value_elements = stat.find_elements(By.XPATH, "./parent::div/parent::div/following-sibling::div[contains(@class, 'css-') and string-length(text()) > 0]")

            all_stats[stat_name] = [team1_value_elements[0].text.strip().replace('%', '').replace('.', ''), team2_value_elements[0].text.strip().replace('%', '').replace('.', '')]

        except Exception as e:
            print(f"Error extracting {stat_name}: {e}")

    # Create a DataFrame
    if all_stats and len(teams) == 2:
        xvy_df = pd.DataFrame.from_dict(all_stats, orient='index', columns=teams).transpose()

        # Apply transformation to all columns except 'Team'
        for col in xvy_df.columns:  
            xvy_df[col] = xvy_df[col].apply(to_percentage)

        xvy_df = xvy_df.astype('float')
        
        # Creating the new aggregated df
        # eg: 2v4, 1v3, and 3v5 all get aggregated into "+2"
        agg_data = {}
        counts = {}

        for col in xvy_df.columns:
            category = get_advantage(col)
            if category not in agg_data:
                agg_data[category] = xvy_df[col].copy()
                counts[category] = 1
            else:
                agg_data[category] += xvy_df[col]
                counts[category] += 1

        xvy_df = pd.DataFrame({key: agg_data[key] / counts[key] for key in agg_data})

    else:
        print("Failed to extract meaningful stat data")

    return xvy_df

In [None]:
url = "https://www.rib.gg/series/83355?tab=team-stats"
driver = uc.Chrome(version_main=135)
driver.get(url)

# Extract the team names
team_elements = driver.find_elements(By.XPATH, "//p[contains(@class, 'css-1aujtoy') and string-length(text()) > 0]")
teams = [team.text.strip() for team in team_elements if len(team.text.strip()) > 0][:2]  # Extract first two team names
print(f"Teams detected: {teams}")

xvy_df = scrape_xvy_data(driver, teams)
driver.quit()

xvy_df

Teams detected: ['G2 Esports', 'Cloud9']
Results detected: ['Winner', 'Loser']


Unnamed: 0,+1,-1,0,+2,-2,-3,+3,+4,-4
G2 Esports,0.8975,0.3525,0.5925,1.0,0.11,0.0,1.0,1.0,0.0
Cloud9,0.6475,0.1025,0.4075,0.89,0.0,0.0,0.5,0.0,0.0


---
### Aggregating all the data into one df

In [None]:
def scrape_data(match_id):
    url = f"https://www.rib.gg/series/{match_id}?tab=team-stats"
    driver = uc.Chrome(version_main=135)
    driver.get(url)

    print("Waiting for page to load...")
    # time.sleep(5)  # Give the page time to fully load

    # Extract the team names
    team_elements = driver.find_elements(By.XPATH, "//p[contains(@class, 'css-1aujtoy') and string-length(text()) > 0]")
    teams = [team.text.strip() for team in team_elements if len(team.text.strip()) > 0][:2]  # Extract first two team names

    print(f"Teams detected: {teams}")

    result_elements = driver.find_elements(By.XPATH, "//p[contains(@class, 'css-tesx4r') or contains(@class, 'css-hov9j3') and string-length(text()) > 0]")
    results = [result.text.strip() for result in result_elements if len(result.text.strip()) > 0][:2]  # Extract the win and loss results

    print(f"Results detected: {results}")

    # PISTOL ROUNDS, FIRST KILLS, KAST #
    main_df = scrape_main_data(driver, teams)

    # ECO INFORMATION #
    eco_df = scrape_eco_data(driver, teams)

    # XVY CONVERSIONS #
    xvy_df = scrape_xvy_data(driver, teams)
    
    driver.quit()
    
    # CONCATINATING THE DATAFRAMES #
    df = pd.concat([main_df, eco_df, xvy_df], axis=1)
    df = df.reset_index(drop=True)

    df.insert(0, "Team", teams)  # Ensure "Team" is always the first column

    # Copy stats from row 1 to row 0 and vice versa for opponent fields
    df.loc[0, 'Pistol Rounds Won Opp'], df.loc[1, 'Pistol Rounds Won Opp'] = df.loc[1, 'Pistol Rounds Won'], df.loc[0, 'Pistol Rounds Won']
    df.loc[0, 'First Kills Opp'], df.loc[1, 'First Kills Opp'] = df.loc[1, 'First Kills'], df.loc[0, 'First Kills']
    df.loc[0, 'KAST Opp'], df.loc[1, 'KAST Opp'] = df.loc[1, 'KAST'], df.loc[0, 'KAST']
    df.loc[0, 'Clutches Opp'], df.loc[1, 'Clutches Opp'] = df.loc[1, 'Clutches'], df.loc[0, 'Clutches']
    df.loc[0, 'Eco Opp'], df.loc[1, 'Eco Opp'] = df.loc[1, 'Eco'], df.loc[0, 'Eco']
    df.loc[0, 'Semi-Eco Opp'], df.loc[1, 'Semi-Eco Opp'] = df.loc[1, 'Semi-Eco'], df.loc[0, 'Semi-Eco']
    df.loc[0, 'Half-Buy Opp'], df.loc[1, 'Half-Buy Opp'] = df.loc[1, 'Half-Buy'], df.loc[0, 'Half-Buy']
    df.loc[0, 'Full-Buy Opp'], df.loc[1, 'Full-Buy Opp'] = df.loc[1, 'Full-Buy'], df.loc[0, 'Full-Buy']
    df.loc[0, '+1 Opp'], df.loc[1, '+1 Opp'] = df.loc[1, '+1'], df.loc[0, '+1']
    df.loc[0, '-1 Opp'], df.loc[1, '-1 Opp'] = df.loc[1, '-1'], df.loc[0, '-1']
    df.loc[0, '0 Opp'], df.loc[1, '0 Opp'] = df.loc[1, '0'], df.loc[0, '0']
    df.loc[0, '+2 Opp'], df.loc[1, '+2 Opp'] = df.loc[1, '+2'], df.loc[0, '+2']
    df.loc[0, '-2 Opp'], df.loc[1, '-2 Opp'] = df.loc[1, '-2'], df.loc[0, '-2']
    df.loc[0, '-3 Opp'], df.loc[1, '-3 Opp'] = df.loc[1, '-3'], df.loc[0, '-3']
    df.loc[0, '+3 Opp'], df.loc[1, '+3 Opp'] = df.loc[1, '+3'], df.loc[0, '+3']
    try:
        df.loc[0, '+4 Opp'], df.loc[1, '+4 Opp'] = df.loc[1, '+4'], df.loc[0, '+4']
    except KeyError:
        print("Column '+4' not found, setting default values.")
        df.loc[0, '+4 Opp'], df.loc[1, '+4 Opp'] = 1.0, 1.0  # Default values

    try:
        df.loc[0, '-4 Opp'], df.loc[1, '-4 Opp'] = df.loc[1, '-4'], df.loc[0, '-4']
    except KeyError:
        print("Column '-4' not found, setting default values.")
        df.loc[0, '-4 Opp'], df.loc[1, '-4 Opp'] = 0.0, 0.0  # Default values

    # Map results based on teams
    df["Result"] = df["Team"].map(dict(zip(teams, [1 if r == "Winner" else 0 for r in results])))

    return df

In [None]:
match_id = "83360" # Choose the Match ID from rib.gg
url = f"https://www.rib.gg/series/{match_id}?tab=team-stats"

In [13]:
match_id = 89294
df = scrape_data(match_id)
df

Waiting for page to load...
Teams detected: ['TALON', 'Gen.G Esports']
Results detected: ['Loser', 'Winner']
              Pistol Rounds Won First Kills KAST Clutches
TALON                         3          26   64   2 / 32
Gen.G Esports                 3          27   72   2 / 23


Unnamed: 0,Team,Pistol Rounds Won,First Kills,KAST,Clutches,Eco,Semi-Eco,Half-Buy,Full-Buy,+1,...,+1 Opp,-1 Opp,0 Opp,+2 Opp,-2 Opp,-3 Opp,+3 Opp,+4 Opp,-4 Opp,Result
0,TALON,3,26,0.64,0.0625,0.0,0.0,0.36,0.5,0.7775,...,0.72,0.2225,0.385,0.9,0.0,0.0,1.0,0.0,0.0,0
1,Gen.G Esports,3,27,0.72,0.086957,0.0,0.0,0.62,0.68,0.72,...,0.7775,0.28,0.615,1.0,0.1,0.0,1.0,1.0,0.0,1


In [14]:
temp_df = df.copy()

In [77]:
temp_df = pd.concat([temp_df, df], ignore_index=True)

In [78]:
temp_df

Unnamed: 0,Team,Pistol Rounds Won,First Kills,KAST,Clutches,Eco,Semi-Eco,Half-Buy,Full-Buy,+1,...,+1 Opp,-1 Opp,0 Opp,+2 Opp,-2 Opp,-3 Opp,+3 Opp,+4 Opp,-4 Opp,Result
0,G2 Esports,3,26,0.8,0.2,0.0,0.0,0.64,0.67,0.8975,...,0.6475,0.1025,0.4075,0.89,0.0,0.0,0.5,0.0,0.0,1
1,Cloud9,1,14,0.59,0.107143,0.0,0.0,0.2,0.52,0.6475,...,0.8975,0.3525,0.5925,1.0,0.11,0.0,1.0,1.0,0.0,0
2,G2 Esports,3,26,0.8,0.2,0.0,0.0,0.64,0.67,0.8975,...,0.6475,0.1025,0.4075,0.89,0.0,0.0,0.5,0.0,0.0,1
3,Cloud9,1,14,0.59,0.107143,0.0,0.0,0.2,0.52,0.6475,...,0.8975,0.3525,0.5925,1.0,0.11,0.0,1.0,1.0,0.0,0


In [15]:
# Automating the scraping of data
match_ids = [89295, 89296, 89297, 89298, 89299, 89367, 89368, 89366, 89737, 89921, 89922]

for match_id in match_ids:
    df = scrape_data(match_id)
    temp_df = pd.concat([temp_df, df], ignore_index=True)

temp_df

KeyboardInterrupt: 

---

## Saving and Loading the Datasets

In [None]:
kickoff_df = pd.read_csv('Datasets/kickoff.csv')
kickoff_df = kickoff_df.drop(columns=['Unnamed: 0'])

In [90]:
kickoff_df.to_csv('Datasets/kickoff.csv')

In [45]:
masters_df = pd.read_csv('Datasets/kickoff_with_masters.csv')
masters_df = masters_df.drop(columns=['Unnamed: 0'])

In [63]:
masters_df.to_csv('Datasets/kickoff_with_masters.csv')

In [5]:
stage1_df = pd.read_csv('Datasets/stage1_14_04_25.csv')
stage1_df = stage1_df.drop(columns=['Unnamed: 0'])

In [3]:
playoffs_df = pd.read_csv('Datasets/playoffs.csv')
playoffs_df = playoffs_df.drop(columns=['Unnamed: 0'])

In [40]:
temp_df.to_csv('Datasets/playoffs.csv')

In [47]:
total_df = pd.concat([masters_df, stage1_df], ignore_index=True)

In [57]:
temp_df.to_csv('Datasets/apac_stage1_playoffs.csv')

In [135]:
total_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 36 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Team                   120 non-null    object 
 1   Pistol Rounds Won      120 non-null    int64  
 2   First Kills            120 non-null    int64  
 3   KAST                   120 non-null    float64
 4   Clutches               120 non-null    float64
 5   Eco                    120 non-null    float64
 6   Semi-Eco               120 non-null    float64
 7   Half-Buy               120 non-null    float64
 8   Full-Buy               120 non-null    float64
 9   +1                     120 non-null    float64
 10  -1                     120 non-null    float64
 11  0                      120 non-null    float64
 12  +2                     120 non-null    float64
 13  -2                     120 non-null    float64
 14  -3                     120 non-null    float64
 15  +3    