In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException

# Images
import urllib
import requests
from PIL import Image
from io import BytesIO

# Performance
import logging
import time

# Path
import os
from pathlib import Path
import re

In [34]:
# ---------------------------------------------------------------------
# General settings shared by the cells below
# ---------------------------------------------------------------------
XPATH = 'xpath'
erwo = '*Error*'          # placeholder stored when a field cannot be scraped
dash, slash = '-', '/'    # separators used when splitting deck lines
logger = logging.getLogger(__name__)

# Site root and the fixed XPaths queried on each post
url_main = 'https://decklist.tistory.com/'
xp_title = '//*[@id="container"]/main/div/div[2]/div[1]/div/div/h2'
xp_man = '//*[@id="container"]/main/div/div[2]/div[2]/div[2]/table/tbody/tr[1]/td[2]'

# Patterns applied to scraped text
regex_rank = r'\d+[위강]'            # digits followed by 위 or 강
regex_nth = r'제\s+(\d+)\s+회\s'     # captures the number between 제 and 회
regex_team = r'(\d+)인팀전'          # captures the number in front of 인팀전

# Working directory of the notebook and its parent (used as the output root)
cwd = Path.cwd()
dir_main = cwd.parent

In [30]:
# Sample title string used to try out the regex patterns in the next cell.
texts = "1923 CS 3인팀전"

In [35]:
# Scratch check: run the `regex_nth` pattern over the sample text.
# NOTE(review): the result is stored as `team_elements` although the pattern
# used here is `regex_nth` — confirm which one was intended.
team_elements = re.compile(regex_nth).findall(texts)
print(team_elements)

None


In [55]:
# Scratch check of both title patterns against a sample title.
# `regex_team` here tolerates optional whitespace around 인.
regex_team = r'(\d+)\s*인\s*팀전'
regex_nth = r'제\s+(\d+)\s+회\s'
texts = '제 1 회 1923 CS 3인팀전'

# Collect every captured number for each pattern.
team_elements = re.compile(regex_team).findall(texts)
nth_elements = re.compile(regex_nth).findall(texts)

# One result list per line, team matches first.
print(team_elements, nth_elements, sep='\n')

['3']
['1']


In [3]:
# Create headless ChromeOptions object
options = Options()
options.add_argument('--headless')

# Create a new ChromeDriverService object with the path to the Chromedriver executable.
# The path can be overridden with the CHROMEDRIVER_PATH environment variable so the
# notebook is not tied to one machine; the original location remains the default.
service = Service(os.environ.get('CHROMEDRIVER_PATH', 'C://chromedriver.exe'))

# Initialize Chrome driver with headless options
driver = webdriver.Chrome(service=service, options=options)

In [4]:
def xp_deck_ind(ind):
    """Return the XPath of the ``ind``-th ``<p>`` element under the fixed container path."""
    template = '//*[@id="container"]/main/div/div[2]/div[2]/div[2]/p[{}]'
    return template.format(ind)

In [15]:
def get_deck_info(driver, url_numb, save_img, img_timeout=30):
    """Scrape one post and return one row per kept image on the page.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url_numb: Integer post number; appended to ``url_main`` to build the
            URL and zero-padded into deck codes and log messages.
        save_img: When truthy, every kept image is written to
            ``<dir_main>/Deck_Log/<deck_code>.jpg``.
        img_timeout: Timeout in seconds for each image download, so a stalled
            connection cannot block the crawl indefinitely.

    Returns:
        A list of rows, one per kept image, each of the form
        ``[deck_code, game_cs, game_date, man_element, rank_element,
        deck_element, title_element, deck_elements_text]``.
        Fields that could not be obtained hold the ``erwo`` placeholder.

    Raises:
        Errors from loading the page, downloading an image or decoding it are
        not caught here and propagate to the caller.
    """
    driver.get(url_main + str(url_numb))

    #####################################################################
    # Get title of the Game: text before "(" and text after ")"
    try:
        title_element = driver.find_element(By.XPATH, xp_title).text
        game_cs = title_element.split("(")[0].strip()
        game_date = title_element.split(")")[1].strip()
    except Exception:  # missing element or a title without "(...)"
        title_element = erwo
        game_cs = erwo
        game_date = erwo
        logger.error(f'{url_numb:04d} | Failed to get | Title info')

    #####################################################################
    # Get Number of Players
    try:
        man_element = driver.find_element(By.XPATH, xp_man).text
    except Exception:
        man_element = erwo
        logger.error(f'{url_numb:04d} | Failed to get | Man info')

    #####################################################################
    # Get Name of the Decks: read p[1], p[2], ... until one is missing,
    # dropping line breaks inside each paragraph
    deck_element_scrap = []
    ind = 1
    while True:
        try:
            paragraph = driver.find_element(By.XPATH, xp_deck_ind(ind))
        except NoSuchElementException:
            break
        deck_element_scrap.append(paragraph.text.replace('\n', ''))
        ind += 1

    #####################################################################
    # Modify Name of the Decks: skip blank paragraphs, keep the part after
    # the dash, then split on slashes into individual names
    deck_elements = []
    for line in deck_element_scrap:
        if line.strip() == '':
            continue
        if dash in line:
            # NOTE(review): only the segment between the first and second dash
            # is kept, so a name that itself contains a dash is cut short —
            # confirm this matches the page format.
            line = line.split(dash)[1]
        deck_elements.extend(part.strip() for part in line.split(slash))

    deck_elements_text = ' '.join(deck_element_scrap)
    rank_elements = re.findall(regex_rank, deck_elements_text)

    #####################################################################
    # Get the Images
    db_comb = []
    img_elements = driver.find_elements(By.TAG_NAME, 'img')
    for i, img_element in enumerate(img_elements):
        deck_code = f"{url_numb:04d}-{i+1:03d}"
        img_url = img_element.get_attribute('src')

        # Skip <img> tags without a src and inline GIF images
        if not img_url or img_url.startswith('data:image/gif'):
            continue

        # Download once; the bytes are reused for sizing and for saving
        response = requests.get(img_url, timeout=img_timeout)
        with Image.open(BytesIO(response.content)) as img:
            width, height = img.size

        # Skip small images and the fixed 343x353 image
        if (width == 343 and height == 353) or width < 50:
            continue

        if save_img:
            filename = Path(dir_main) / "Deck_Log" / f"{deck_code}.jpg"
            filename.parent.mkdir(parents=True, exist_ok=True)
            with open(filename, 'wb') as f:
                f.write(response.content)

        # NOTE(review): names and ranks are looked up with the index over *all*
        # <img> tags, including the skipped ones — confirm this alignment is
        # intended.
        deck_element = deck_elements[i] if i < len(deck_elements) else erwo
        rank_element = rank_elements[i] if i < len(rank_elements) else erwo

        db_comb.append([
            deck_code, game_cs, game_date, man_element, rank_element,
            deck_element, title_element, deck_elements_text
        ])

    return db_comb

In [6]:
# Scrape the posts numbered run_s .. run_e - 1, saving the deck images,
# then write one tab-separated line per row to Output/deck_stack.txt.
run_s = 1359
run_e = 1360

master_comb = []
for i, url_numb in enumerate(range(run_s, run_e)):
    print(f'{url_numb:04d} | Running')
    try:
        db_comb = get_deck_info(driver, url_numb, True)
    except Exception:
        # Stop at the first failing page but still write what was collected.
        # KeyboardInterrupt is deliberately not caught, so Ctrl-C aborts the cell.
        print(f'{url_numb:04d} | Unexpected Error')
        logger.exception('%04d | get_deck_info failed', url_numb)
        break
    master_comb.append(db_comb)
    if i % 5 == 0:
        # Pause on every fifth page
        time.sleep(2)
        print("-- Waiting to cool down --")

# Flatten pages -> rows and join each row's fields with tabs
master_comb_clean = [
    "\t".join(row)
    for page_rows in master_comb
    for row in page_rows
]

# Built with Path so the separators are correct on every platform
dir_write = str(Path(dir_main) / "Output" / "deck_stack.txt")
with open(dir_write, 'w', encoding="utf-8") as fp:
    for item in master_comb_clean:
        fp.write("%s\n" % item)

# os.startfile exists only on Windows
if hasattr(os, "startfile"):
    os.startfile(dir_write)

print('____ | Finished')

1359 | Running
-- Waiting to cool down --
____ | Finished


In [6]:
import numpy as np

In [14]:
# File listing the post numbers to re-scrape.
dir_deck_db = 'C:/ML_YGO/data/update_deck.txt'
# ndmin=1 keeps the result iterable even when the file holds a single entry
# (np.loadtxt would otherwise return a 0-d array for it).
deck_code_list = np.loadtxt(dir_deck_db, dtype='str', ndmin=1)

In [17]:
# Re-scrape every post listed in deck_code_list (without saving images),
# then write one tab-separated line per row to Output/deck_stack.txt.
master_comb = []
for i, url_numb in enumerate(deck_code_list):
    url_numb = int(url_numb)
    print(f'{url_numb:04d} | Running')
    try:
        db_comb = get_deck_info(driver, url_numb, False)
    except Exception:
        # Stop at the first failing page but still write what was collected.
        # KeyboardInterrupt is deliberately not caught, so Ctrl-C aborts the cell.
        print(f'{url_numb:04d} | Unexpected Error')
        logger.exception('%04d | get_deck_info failed', url_numb)
        break
    master_comb.append(db_comb)
    if i % 5 == 0:
        # Pause on every fifth page
        time.sleep(2)
        print("-- Waiting to cool down --")

# Flatten pages -> rows and join each row's fields with tabs
master_comb_clean = [
    "\t".join(row)
    for page_rows in master_comb
    for row in page_rows
]

# Built with Path so the separators are correct on every platform
dir_write = str(Path(dir_main) / "Output" / "deck_stack.txt")
with open(dir_write, 'w', encoding="utf-8") as fp:
    for item in master_comb_clean:
        fp.write("%s\n" % item)

# os.startfile exists only on Windows
if hasattr(os, "startfile"):
    os.startfile(dir_write)

print('____ | Finished')

0156 | Running
-- Waiting to cool down --
0158 | Running
0193 | Running
0195 | Running
0218 | Running
0220 | Running
-- Waiting to cool down --
0231 | Running
0234 | Running
0236 | Running
0239 | Running
0247 | Running
-- Waiting to cool down --
0252 | Running
0262 | Running
0264 | Running
0269 | Running
0273 | Running
-- Waiting to cool down --
0275 | Running
0285 | Running
0287 | Running
0291 | Running


291 | Failed to get | Man info


0292 | Running


292 | Failed to get | Man info


-- Waiting to cool down --
0294 | Running
0298 | Running
0305 | Running
0307 | Running
0308 | Running
-- Waiting to cool down --
0311 | Running
0315 | Running


315 | Failed to get | Man info


0319 | Running
0322 | Running
0324 | Running
-- Waiting to cool down --
0340 | Running


340 | Failed to get | Man info


0344 | Running


344 | Failed to get | Man info


0351 | Running
0355 | Running
0360 | Running
-- Waiting to cool down --
0363 | Running


363 | Failed to get | Man info


0372 | Running
0377 | Running


377 | Failed to get | Man info


0378 | Running
0381 | Running
-- Waiting to cool down --
0382 | Running
0385 | Running
0392 | Running
0397 | Running
0400 | Running


400 | Failed to get | Man info


-- Waiting to cool down --
0401 | Running


401 | Failed to get | Man info


0405 | Running
0406 | Running
0409 | Running
0422 | Running
-- Waiting to cool down --
0423 | Running
0425 | Running
0426 | Running
0427 | Running
0428 | Running


428 | Failed to get | Man info


-- Waiting to cool down --
0445 | Running
0447 | Running
0449 | Running
0450 | Running


450 | Failed to get | Man info


0451 | Running
-- Waiting to cool down --
0452 | Running


452 | Failed to get | Man info


0463 | Running
0467 | Running
0469 | Running
0476 | Running
-- Waiting to cool down --
0500 | Running
0504 | Running
0505 | Running
0524 | Running
0525 | Running
-- Waiting to cool down --
0526 | Running
0529 | Running
0531 | Running
0540 | Running
0542 | Running
-- Waiting to cool down --
0543 | Running
0544 | Running


544 | Failed to get | Man info


0553 | Running
0556 | Running
0558 | Running
-- Waiting to cool down --
0559 | Running
0560 | Running
0561 | Running
0564 | Running


564 | Failed to get | Man info


0572 | Running
-- Waiting to cool down --
0579 | Running
0580 | Running
0590 | Running
0591 | Running
0593 | Running
-- Waiting to cool down --
1219 | Running
1220 | Running
1221 | Running
1235 | Running
1236 | Running


1236 | Failed to get | Man info


-- Waiting to cool down --
1238 | Running
1239 | Running
1243 | Running
1247 | Running
1249 | Running
-- Waiting to cool down --
1250 | Running


1250 | Failed to get | Man info


1252 | Running
1256 | Running
1257 | Running
1258 | Running
-- Waiting to cool down --
1260 | Running
1271 | Running
1272 | Running


1272 | Failed to get | Man info


1275 | Running
1279 | Running
-- Waiting to cool down --
1280 | Running


1280 | Failed to get | Man info


1285 | Running
1286 | Running
1294 | Running
1298 | Running
-- Waiting to cool down --
1301 | Running
1313 | Running
1315 | Running
1316 | Running
1318 | Running
-- Waiting to cool down --
1319 | Running
1320 | Running
1322 | Running
1323 | Running
1328 | Running
-- Waiting to cool down --
1332 | Running
____ | Finished
