In [13]:
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

In [14]:
# Read the CSV file of crawled match links
df = pd.read_csv('data/crawled_data/match_link.csv')

# Take every value from the 'link' column
links = df['link']
df.head()

Unnamed: 0,link
0,/match/las/1292816024
1,/match/vn/82488841
2,/match/lan/1382606234
3,/match/sg/6763657
4,/match/eune/3359078179


In [15]:
class GameStatsExtractor:
    """Extract per-match statistics from a parsed match page.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` / ``find_all`` API.
        lane_dict: Mapping of 1-based lane position to lane name
            (e.g. ``{1: "top", 2: "jungle", ...}``); used to label KDA blocks.
        lanes_graphs: Mapping of lane name to the graph identifier whose
            ``<script>`` holds that lane's blue/red data series.
    """

    # Slot names assigned, in order, to the anchors found by get_champion_names.
    _CHAMPION_SLOTS = (
        'top_blue', 'top_red',
        'jungle_blue', 'jungle_red',
        'mid_blue', 'mid_red',
        'ad_blue', 'ad_red',
        'support_blue', 'support_red',
    )

    # Graph identifiers for the "wards placed" / "wards killed" charts per lane.
    _WARDS_PLACED_GRAPHS = {
        "top": 'graphDD45',
        "jungle": 'graphDD103',
        "mid": 'graphDD161',
        "ad": 'graphDD219',
        "support": 'graphDD277',
    }
    _WARDS_KILLED_GRAPHS = {
        "top": 'graphDD48',
        "jungle": 'graphDD106',
        "mid": 'graphDD164',
        "ad": 'graphDD222',
        "support": 'graphDD280',
    }

    def __init__(self, soup, lane_dict, lanes_graphs):
        self.soup = soup
        self.lane_dict = lane_dict
        self.lanes_graphs = lanes_graphs

    def _extract_graph_data(self, graph_identifier):
        """Return the first two ``data: [[...]]`` literals of a graph script.

        Scans every ``<script type="text/javascript">`` whose text contains
        ``graph_identifier`` and returns the first two matches as raw strings
        (they are not parsed). Returns ``(None, None)`` when no identifier is
        given or no script yields at least two series.
        """
        if not graph_identifier:
            return None, None
        # NOTE(review): this is a plain substring test, so 'graphDD29' would also
        # match a script that only mentions 'graphDD290' -- confirm ids cannot collide.
        for tag in self.soup.find_all('script', type='text/javascript'):
            if tag.string and graph_identifier in tag.string:
                data = re.findall(r'data:\s*(\[\[.*?\]\])', tag.string)
                if len(data) >= 2:
                    return data[0], data[1]
        return None, None

    def get_champion_names(self):
        """Map ``"<lane>_<team>"`` slots to champion names.

        Every ``div.relative`` that contains an ``<a>`` consumes the next slot
        of ``_CHAMPION_SLOTS``; the slot is filled with the ``alt`` text of the
        anchor's ``<img>`` when there is one. Anchors beyond the last slot and
        images without an ``alt`` attribute are ignored instead of raising
        ``KeyError``.
        """
        stats = {}
        position = 0
        # Keep only the tags that may carry champion data.
        for div_tag in self.soup.find_all('div', {'class': 'relative'}):
            a_tag = div_tag.find('a')
            if a_tag is None:
                continue
            if position >= len(self._CHAMPION_SLOTS):
                # All ten slots are assigned; later anchors are not players.
                break
            slot = self._CHAMPION_SLOTS[position]
            position += 1
            img_tag = a_tag.find('img')
            if img_tag is None:
                continue
            champion_name = img_tag.get('alt')
            if champion_name is not None:
                stats[slot] = champion_name
        return stats

    def extract_kda_stats(self):
        """Return ``{"<lane>_<team>": {"kills", "deaths", "assists"}}``.

        Only the first ``2 * len(lanes_graphs)`` ``div.kda`` blocks are read;
        odd positions are labelled blue and even positions red. Values that
        are missing or non-numeric become 0.
        """
        stats = {}
        kda_blocks = self.soup.find_all("div", class_="kda")
        for i, kda_block in enumerate(kda_blocks[:len(self.lanes_graphs) * 2], start=1):
            lane_index = (i + 1) // 2
            lane = self.lane_dict.get(lane_index, f"Unknown_Lane_{lane_index}")
            team = "blue" if i % 2 != 0 else "red"
            stats[f"{lane}_{team}"] = {
                "kills": self._safe_int(kda_block.find("span", class_="kills")),
                "deaths": self._safe_int(kda_block.find("span", class_="deaths")),
                "assists": self._safe_int(kda_block.find("span", class_="assists")),
            }
        return stats

    def extract_team_stats(self, graph_id, stat_name):
        """Extract statistics for both teams based on a graph identifier.

        Returns ``{"<stat_name>_blue": ..., "<stat_name>_red": ...}``; both
        values are ``None`` when the graph cannot be found.
        """
        blue, red = self._extract_graph_data(graph_id)
        return {
            f"{stat_name}_blue": blue,
            f"{stat_name}_red": red,
        }

    def extract_all_gold_stats(self):
        """Extract gold statistics for all lanes (keys ``"<lane>_gold_<team>"``)."""
        gold_stats = {}
        for lane, graph_id in self.lanes_graphs.items():
            gold_stats.update(self.extract_team_stats(graph_id, f"{lane}_gold"))
        return gold_stats

    def extract_all_minions_stats(self):
        """Extract minion statistics for all lanes (keys ``"<lane>_minions_<team>"``)."""
        # NOTE(review): this reads the same graph ids as extract_all_gold_stats,
        # so both methods return identical series -- confirm the minion graph ids.
        minions_stats = {}
        for lane, graph_id in self.lanes_graphs.items():
            minions_stats.update(self.extract_team_stats(graph_id, f"{lane}_minions"))
        return minions_stats

    def extract_dragons_kill_stat(self, graph_id=None):
        """Extract dragon kills statistics for both teams.

        ``graph_id`` is optional; without it both values are ``None``.
        """
        return self.extract_team_stats(graph_id, "dragons_killed")

    def extract_turrets_killed_stat(self, graph_id):
        """Extract turret kills statistics for both teams."""
        return self.extract_team_stats(graph_id, "turrets_killed")

    def extract_all_wards_stats(self):
        """Extract wards placed and killed statistics for all lanes.

        Keys are ``"<lane>_wards_placed_<team>"`` and
        ``"<lane>_wards_killed_<team>"``. Lanes without a known ward graph id
        are skipped.
        """
        wards_stats = {}
        for lane in self.lanes_graphs.keys():
            graph_id_placed = self._WARDS_PLACED_GRAPHS.get(lane, '')
            if graph_id_placed:
                wards_stats.update(self.extract_team_stats(graph_id_placed, f"{lane}_wards_placed"))

            graph_id_killed = self._WARDS_KILLED_GRAPHS.get(lane, '')
            if graph_id_killed:
                wards_stats.update(self.extract_team_stats(graph_id_killed, f"{lane}_wards_killed"))

        return wards_stats

    def get_team_win(self):
        """Return ``"blue"``, ``"red"`` or ``"Unknown"`` for the winning team.

        The result is read from the victory/defeat span inside the blue team's
        header cell of the match box; anything ambiguous yields ``"Unknown"``.
        """
        teamWin = "Unknown"
        div_tag = self.soup.find('div', {'class': 'box matchBox'})
        if not div_tag:
            return teamWin

        th_blue_tag = div_tag.find('th', {'class': 'text-left no-padding-right'})
        if not th_blue_tag:
            return teamWin

        blue_victory_tag = th_blue_tag.find('span', {'class': 'victory'})
        blue_defeat_tag = th_blue_tag.find('span', {'class': 'defeat'})

        if blue_victory_tag and not blue_defeat_tag:
            teamWin = "blue"
        elif blue_defeat_tag and not blue_victory_tag:
            teamWin = "red"

        return teamWin

    def extract_all_stats(self, dragons_graph_id=None):
        """Collect every statistic group into one dictionary.

        Args:
            dragons_graph_id: Optional graph identifier of the dragons chart.
                When omitted the ``dragons_killed`` entry holds ``None`` values
                (previously the call raised ``TypeError`` because the required
                argument was never supplied).

        Returns:
            dict with keys ``dragons_killed``, ``champions``, ``kda``,
            ``gold``, ``minions`` and ``wards``.
        """
        return {
            'dragons_killed': self.extract_dragons_kill_stat(dragons_graph_id),
            'champions': self.get_champion_names(),
            'kda': self.extract_kda_stats(),
            'gold': self.extract_all_gold_stats(),
            'minions': self.extract_all_minions_stats(),
            'wards': self.extract_all_wards_stats(),
        }

    def _safe_int(self, tag):
        """Return the integer text of ``tag``, or 0 if it is missing or not numeric."""
        try:
            return int(tag.get_text(strip=True))
        except (AttributeError, ValueError):
            return 0


In [16]:
def initialize_lane_dict():
    """Return the mapping from 1-based lane position to lane name."""
    lane_order = ("top", "jungle", "mid", "ad", "support")
    return dict(enumerate(lane_order, start=1))

def initialize_lanes_graphs():
    """Return the graph identifier associated with each lane name."""
    lane_names = ("top", "jungle", "mid", "ad", "support")
    graph_ids = ('graphDD29', 'graphDD87', 'graphDD145', 'graphDD203', 'graphDD261')
    return dict(zip(lane_names, graph_ids))

def setup_webdriver():
    """Create a Chrome WebDriver using the driver binary installed by webdriver-manager."""
    chrome_options = webdriver.ChromeOptions()
    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

def _build_match_row(all_stats, lane_dict, lanes_graphs, team_win, match_id):
    """Flatten one match's extracted stats into a single ``{column: value}`` row.

    Keys are inserted in output-column order: champions, KDA, gold, minions,
    wards, then ``team_win`` and ``match_id``.
    """
    teams = ("blue", "red")
    row = {}

    champions = all_stats.get('champions') or {}
    for lane in lane_dict.values():
        for team in teams:
            row[f"{lane}_{team}"] = champions.get(f"{lane}_{team}", "")

    kda_stats = all_stats.get('kda', {})
    for lane in lane_dict.values():
        for team in teams:
            player_kda = kda_stats.get(f"{lane}_{team}", {})
            for stat in ("kills", "deaths", "assists"):
                row[f"{lane}_{team}_{stat}"] = player_kda.get(stat, 0)

    # The extractor names graph-based stats "<lane>_<stat>_<team>", while the
    # output columns are "<lane>_<team>_<stat>"; translate between the two here.
    for stat in ("gold", "minions"):
        graph_stats = all_stats.get(stat, {})
        for lane in lanes_graphs.keys():
            for team in teams:
                row[f"{lane}_{team}_{stat}"] = graph_stats.get(f"{lane}_{stat}_{team}", 0)

    wards_stats = all_stats.get('wards', {})
    for lane in lanes_graphs.keys():
        for stat in ("wards_placed", "wards_killed"):
            for team in teams:
                row[f"{lane}_{team}_{stat}"] = wards_stats.get(f"{lane}_{stat}_{team}", 0)

    row['team_win'] = team_win
    row['match_id'] = match_id
    return row


def main(max_links=1, output_path='./test.csv'):
    """Crawl match pages and save one row of statistics per match.

    Args:
        max_links: Stop after this many links were processed successfully.
            The default of 1 keeps the original single-match trial run; pass
            ``None`` to crawl every link in the CSV.
        output_path: CSV file rewritten after every successfully processed
            link, so partial results survive an interrupted run.

    Links that fail are reported and skipped; the browser is always closed.
    """
    lane_dict = initialize_lane_dict()
    lanes_graphs = initialize_lanes_graphs()

    # Read the links before starting a browser so a missing file costs nothing.
    try:
        df_links = pd.read_csv('data/crawled_data/match_link.csv')
        links = df_links['link']
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return

    driver = setup_webdriver()
    records = []
    try:
        for i, link in enumerate(links):
            if max_links is not None and len(records) >= max_links:
                break
            try:
                print(f"Processing link {i + 1}/{len(links)}: {link}")
                driver.get(f'https://www.leagueofgraphs.com{link}')
                # Wait for the result box and the KDA blocks before parsing.
                WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'div.box.matchBox'))
                )
                WebDriverWait(driver, 20).until(
                    EC.presence_of_all_elements_located((By.CLASS_NAME, 'kda'))
                )

                soup = BeautifulSoup(driver.page_source, 'html.parser')
                extractor = GameStatsExtractor(soup, lane_dict, lanes_graphs)
                all_stats = extractor.extract_all_stats()
                team_win = extractor.get_team_win()
                match_id = link.split('/')[-1]

                records.append(
                    _build_match_row(all_stats, lane_dict, lanes_graphs, team_win, match_id)
                )
                # Column order follows the key order of the row dictionaries.
                pd.DataFrame(records).to_csv(output_path, index=False)
            except Exception as e:
                print(f"Error processing link {link}: {e}")
    finally:
        # Close the browser even if the loop is interrupted.
        driver.quit()

# Start the crawl when this cell/script is executed directly.
if __name__ == "__main__":
    main()

Processing link 1/10151: /match/las/1292816024
