In [127]:
import csv
import json
import re
import threading

import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [128]:
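# Alternative without Selenium (kept for reference only): fetch the page with requests and parse it with BeautifulSoup (bs)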
# r = requests.get(url)

# if r.status_code == requests.codes.ok:
#     soup = bs(r.text, "html.parser")
#     # print(soup.prettify())

In [129]:
class Crawler:
    """Headless Chrome driver that loads pages and returns them as BeautifulSoup trees.

    The instance owns a Chrome process, so it must be shut down with `quit()`.
    It can also be used as a context manager, which calls `quit()` on exit even
    when the body raises:

        with Crawler() as crawler:
            soup = crawler.getSoup(url, locator)
    """

    # Seconds every explicit wait is allowed to poll before giving up.
    WAIT_SECONDS = 10

    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # To run Chrome in headless mode
        chrome_options.add_argument("--disable-gpu")  # Required for Windows
        self.driver = webdriver.Chrome(options=chrome_options)

    def __enter__(self):
        return self

    def __exit__(self, excType, excValue, excTraceback):
        self.quit()
        # Never suppress an exception raised inside the `with` body.
        return False

    def _parsePage(self):
        """Parse the driver's current page source with the html.parser backend."""
        return bs(self.driver.page_source, 'html.parser')

    def getSoup(self, url, locator):
        """Load `url` and return the parsed page.

        Waits up to WAIT_SECONDS for an element matching `locator` (a
        `(By, value)` tuple) to be present. A timeout is reported on stdout
        and the page is parsed anyway, so callers must cope with a page that
        lacks the expected element.
        """
        self.driver.get(url)
        try:
            WebDriverWait(self.driver, self.WAIT_SECONDS).until(
                EC.presence_of_element_located(locator), "not found")
        except TimeoutException:
            # Only the expected "element never appeared" case is tolerated;
            # KeyboardInterrupt and genuine driver failures still propagate.
            print("locator not found")
        return self._parsePage()

    def getSoupWithButton(self, url, locator, button):
        """Load `url`, keep clicking a button until it is gone, return the parsed page.

        `button` is the class name of the element to click. The first wait is
        outside the loop, so a page where the button never becomes clickable
        raises TimeoutException. After that the button is clicked through
        JavaScript until waiting for it (or clicking it) fails with a Selenium
        error, which is taken to mean there is nothing more to click. Finally
        the method waits for `locator` (raising TimeoutException if it never
        appears) and parses the page.
        """
        self.driver.get(url)
        clickable = EC.element_to_be_clickable((By.CLASS_NAME, button))
        WebDriverWait(self.driver, self.WAIT_SECONDS).until(clickable, "not found")
        print("clicking button")
        while True:
            try:
                # until() returns the element the condition resolved to, so no
                # separate find_element call is needed.
                buttonElement = WebDriverWait(self.driver, self.WAIT_SECONDS).until(clickable, "not found")
                self.driver.execute_script("arguments[0].click();", buttonElement)
            except WebDriverException:
                # Timeout, stale element, script error: stop clicking.
                # Non-Selenium errors are programming bugs and propagate.
                break
        print("finished")
        WebDriverWait(self.driver, self.WAIT_SECONDS).until(
            EC.presence_of_element_located(locator), "not found")
        return self._parsePage()

    def quit(self):
        """Shut down the underlying Chrome driver."""
        self.driver.quit()

In [130]:
def getAllCompetition(crawler):
    """Scrape the competition listing and the tags of every listed competition.

    Reads the module-level globals `url` (listing page to load) and `baseUrl`
    (prefix handed to genCompetitionUrl). The listing is loaded through
    `crawler.getSoupWithButton`; each competition's detail page is then
    scraped by getOneCompetition in its own thread, with a semaphore limiting
    the number of workers running at once to two.

    A listing card is skipped when it has no title link, no usable href, no
    prize block, no "is-processing" timeline entry, or a link from which no
    competition URL can be derived.

    Returns a tuple `(competitions, tags, relations)`:
      * competitions - {"competitions": [dict, ...]} where each dict has
        "title", "dueTime", "link" and, when amounts were found,
        "totalPrize" / "highiestPrize" as ints;
      * tags, relations - lists filled in by the getOneCompetition workers.
    """
    button = "bh-more-block"
    locator = (By.CLASS_NAME, "bh-card-item")
    soup = crawler.getSoupWithButton(url, locator, button)

    print("start parsing")

    competitions = {"competitions": []}
    tags = []
    relations = []

    threads = []
    semaphore = threading.Semaphore(2)
    lock = threading.Lock()
    for c in soup.find_all("div", class_="bh-wrapper"):
        com = {}

        # get title and link
        title = c.find("a", class_="bh-title")
        if title is None:
            continue
        link = title.get("href")
        if not link:
            # An anchor without href cannot lead to a detail page.
            continue
        com["title"] = title.text

        # get prize: the first amount is the total, any later one the highest
        prizeBlock = c.find("div", class_="bh-prize-block")
        if prizeBlock is None:
            continue
        for inx, p in enumerate(prizeBlock.find_all("span", class_="bh-amount")):
            amount = re.sub(r"\D", "", p.text)
            if amount == "":
                continue
            # The misspelled key is kept on purpose: it is part of the saved data format.
            key = "highiestPrize" if inx >= 1 else "totalPrize"
            com[key] = int(amount)

        # get time
        timeline = c.find("span", class_="bh-item is-processing")
        if timeline is None:
            continue
        com["dueTime"] = timeline.text

        # go to the competition page and get tags, relations
        try:
            comUrl = genCompetitionUrl(baseUrl, link)
        except (TypeError, ValueError):
            # One malformed link must not abort the crawl while worker
            # threads are already running; skip it like other incomplete cards.
            print("skipping competition with unusable link:", link)
            continue
        thread = threading.Thread(
            target=getOneCompetition,
            args=(comUrl, tags, relations, title.text, semaphore, lock))
        threads.append(thread)
        thread.start()

        com["link"] = comUrl

        # add the competition info
        competitions["competitions"].append(com)

    for thread in threads:
        thread.join()
    print("finished")

    return competitions, tags, relations

In [131]:
def genCompetitionUrl(baseUrl, link):
    """Build a competition URL from `baseUrl` and a listing-page `link`.

    Everything that follows the first occurrence of "competitions" in `link`
    (up to the end of the line) is appended to `baseUrl`.

    Raises:
        ValueError: `link` has nothing after "competitions", or does not
            contain it at all, so no URL can be derived.
        TypeError: `link` is not a string.
    """
    match = re.search(r"(?<=competitions).+", link)
    if match is None:
        # Previously this fell through to `baseUrl + None` and failed with an
        # unrelated-looking TypeError.
        raise ValueError("cannot derive a competition URL from link {!r}".format(link))
    return baseUrl + match.group(0)

In [132]:
def getOneCompetition(url, curTags, relations, title, semaphore, lock):
    """Scrape one competition page and record its category tags.

    Intended to run as a thread target. `semaphore` bounds how many of these
    workers run at once; `lock` guards the shared `curTags` and `relations`
    lists.

    For every tag link found on the page, `[tagText, title]` is appended to
    `relations`, and the tag text is appended to `curTags` if it is not
    already there. If the page does not contain exactly one category-tag
    block, "sth wrong" is printed and nothing is recorded.
    """
    locator = (By.CSS_SELECTOR, ".bh-block.bh-content-block")

    # `with` guarantees the permit is returned on every exit path. With a bare
    # acquire()/release(), an exception here left the permit taken and could
    # block the remaining workers (and the caller's join()) forever.
    with semaphore:
        crawler = Crawler()
        try:
            soup = crawler.getSoup(url, locator)
        finally:
            # Shut the browser down even when loading the page fails.
            crawler.quit()

        containers = soup.find_all("div", class_="bh-block bh-category-tag-block")
        if len(containers) != 1:
            print("sth wrong")
            return

        tagNames = [t.text for t in containers[0].find_all("a", class_="bh-value")]
        with lock:
            for tagName in tagNames:
                relations.append([tagName, title])
                if tagName not in curTags:
                    curTags.append(tagName)

In [133]:
def saveDataAsCSV(data, header, fileName):
    """Write `header` followed by the rows in `data` to `fileName` as CSV.

    The file is overwritten and always encoded as UTF-8, so non-ASCII text is
    written identically on every platform instead of depending on the locale's
    default encoding. Saving is best-effort: any error is printed rather than
    raised.

    Args:
        data: iterable of rows, each row an iterable of cell values.
        header: iterable of column names, written as the first row.
        fileName: path of the file to create or overwrite.
    """
    try:
        # newline="" lets the csv module control line endings itself.
        with open(fileName, "w", newline="", encoding="utf-8") as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(header)
            csvwriter.writerows(data)
    except Exception as e:
        print(e)

def saveDataAsJson(filename, data):
    """Dump `data` to `filename` as JSON indented by four spaces.

    An existing file is overwritten. Errors from opening the file or from
    serializing `data` are not caught.
    """
    with open(filename, mode='w') as outFile:
        json.dump(data, fp=outFile, indent=4)

In [134]:
# Listing page to crawl; getAllCompetition also passes it to genCompetitionUrl
# as the prefix of every per-competition URL.
baseUrl = "https://bhuntr.com/tw/competitions"
# Optional category filter (query string); currently not applied, see `url`.
cate = "?category=111,112"
# Page that getAllCompetition loads. Append `cate` to crawl only the filtered listing.
url = baseUrl #  + cate

In [135]:
crawler = Crawler()
try:
    competitions, tags, relations = getAllCompetition(crawler)
finally:
    # Shut the browser down even if scraping raises; otherwise the headless
    # Chrome process is never told to quit.
    crawler.quit()

clicking button
finished
start parsing
locator not found
sth wrong
locator not found
sth wrong
finished


In [136]:
# Persist the scraped data: one CSV of tag names, one CSV of (tag, competition)
# pairs, and one JSON file with the competition details.

# csv.writer expects a sequence per row, so wrap each tag in a one-element list.
tagsShaped = [[tagName] for tagName in tags]
saveDataAsCSV(data=tagsShaped, header=["tagName"], fileName="tags.csv")

saveDataAsCSV(
    data=relations,
    header=["tagName", "competitionTitle"],
    fileName="relations.csv",
)

saveDataAsJson(filename="competitions.json", data=competitions)