In [31]:
import requests
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup as bs
import threading
import csv
import json
import pandas as pd

In [32]:
# using bs
# r = requests.get(url)

# if r.status_code == requests.codes.ok:
#     soup = bs(r.text, "html.parser")
#     # print(soup.prettify())

In [33]:
class Crawler:
    """Headless-Chrome wrapper that loads a page and returns it parsed.

    Each instance owns one WebDriver session created in ``__init__``;
    call ``quit()`` when finished so the browser process is shut down.
    """

    # Seconds each explicit wait may block before it times out.
    WAIT_SECONDS = 10

    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # To run Chrome in headless mode
        chrome_options.add_argument("--disable-gpu")  # Required for Windows
        self.driver = webdriver.Chrome(options=chrome_options)

    def _waitFor(self, condition):
        """Block until ``condition`` holds and return whatever it yields."""
        return WebDriverWait(self.driver, self.WAIT_SECONDS).until(condition, "not found")

    def _parsePage(self):
        """Parse the driver's current page source with BeautifulSoup."""
        return bs(self.driver.page_source, 'html.parser')

    def getSoup(self, url, locator):
        """Load ``url``, wait for ``locator`` to be present, return the parsed HTML.

        Args:
            url: Address to open in the browser.
            locator: ``(By, value)`` tuple identifying the element to wait for.

        Returns:
            The parsed page. If the wait fails, "locator not found" is printed
            and whatever the page contains at that moment is still returned.
        """
        self.driver.get(url)
        try:
            self._waitFor(EC.presence_of_element_located(locator))
        except Exception:
            # Best effort: report the miss but still hand back the page.
            # `Exception` (not a bare except) lets Ctrl-C / SystemExit through.
            print("locator not found")
        return self._parsePage()

    def getSoupWithButton(self, url, locator, button):
        """Load ``url``, click ``button`` for as long as it is clickable, then parse.

        Args:
            url: Address to open in the browser.
            locator: ``(By, value)`` tuple that must be present before parsing.
            button: Class name of the button to click repeatedly.

        Returns:
            The parsed page after the button stopped being clickable.

        Raises:
            Whatever the final wait for ``locator`` raises; unlike ``getSoup``
            that wait is not guarded, so a timeout propagates to the caller.
        """
        self.driver.get(url)
        print("clicking button")
        while True:
            try:
                # A page without the button simply ends the loop on the first
                # wait instead of raising before the loop is ever entered.
                buttonElement = self._waitFor(
                    EC.element_to_be_clickable((By.CLASS_NAME, button))
                )
                # Click through JavaScript rather than WebElement.click().
                self.driver.execute_script("arguments[0].click();", buttonElement)
            except Exception:
                # Button no longer clickable within the timeout (or the click
                # failed): stop clicking and parse what has been loaded so far.
                break
        print("finished")
        self._waitFor(EC.presence_of_element_located(locator))
        return self._parsePage()

    def quit(self):
        """Shut down the WebDriver session owned by this crawler."""
        self.driver.quit()

In [34]:
# build a competition page url from the base url and a listing link
def genCompetitionUrl(baseUrl, link):
    """Append the part of ``link`` that follows "competitions" to ``baseUrl``.

    Args:
        baseUrl: Prefix of the resulting url.
        link: Link text searched for the first occurrence of "competitions"
            that is followed by at least one character.

    Returns:
        ``baseUrl`` plus everything after that occurrence, or ``baseUrl``
        unchanged when ``link`` has no such occurrence.
    """
    found = re.search(r"(?<=competitions).+", link)
    suffix = found.group(0) if found else ""
    return baseUrl + suffix

In [35]:
# translate one tag's text into a tag name
def _resolveTagName(text, cMap, inx):
    """Return the category name for a numeric tag, or ``text`` itself otherwise.

    A numeric ``text`` is looked up at ``cMap[number - inx]``. Numbers that
    fall outside ``cMap`` (including ones below ``inx``, which would otherwise
    index from the end of the list) are reported and kept as their raw text.
    """
    try:
        number = int(text)
    except ValueError:
        # Not a category number: the text already is the tag name.
        return text

    offset = number - inx
    if 0 <= offset < len(cMap) and cMap[offset]:
        return cMap[offset]

    print("unknown tag number")
    return text


# given a competition page url, collect its tags into the shared tag and relation lists
def getOneCompetition(url, curTags, relations, title, semaphore, lock, cMap, inx):
    """Crawl one competition page and record its tags.

    Intended to run as a thread target; results are written into the shared
    lists rather than returned.

    Args:
        url: Competition page to load.
        curTags: Shared list of distinct tag names; new names are appended.
        relations: Shared list; one ``[tagName, title]`` pair is appended per tag.
        title: Competition title stored in each relation.
        semaphore: Limits how many calls crawl at the same time.
        lock: Guards the updates to ``curTags`` and ``relations``.
        cMap: Category names used to translate numeric tags.
        inx: Category number that corresponds to ``cMap[0]``.

    Returns:
        None. Nothing is recorded unless the page holds exactly one tag block.
    """
    # `with` hands the slot back even when the crawl raises, so one failing
    # page cannot leave the remaining worker threads waiting forever.
    with semaphore:
        crawler = Crawler()
        try:
            locator = (By.CSS_SELECTOR, ".bh-block.bh-content-block")
            soup = crawler.getSoup(url, locator)
        finally:
            # Always close the browser, including on errors.
            crawler.quit()

        containers = soup.find_all("div", class_="bh-block bh-category-tag-block")
        if len(containers) != 1:
            return

        anchors = containers[0].find_all("a", class_="bh-value")
        tagNames = [_resolveTagName(anchor.text, cMap, inx) for anchor in anchors]

        with lock:
            for tagName in tagNames:
                relations.append([tagName, title])
                if tagName not in curTags:
                    curTags.append(tagName)

In [36]:
# crawl the listing page and gather every competition's data
def getAllCompetition(crawler):
    """Scrape the competition listing and return everything collected.

    Args:
        crawler: Object providing ``getSoupWithButton(url, locator, button)``.

    Returns:
        A tuple ``(competitions, tags, relations)``:
        ``competitions`` is ``{"competitions": [ {...}, ... ]}`` with one dict
        per listed competition (title, optional prizes, due time, link);
        ``tags`` and ``relations`` are the lists filled in by the
        ``getOneCompetition`` worker threads.
    """
    baseUrl = "https://bhuntr.com/tw/competitions"
    # cate = "?category=111,112"
    listingUrl = baseUrl  # + cate

    # Category names handed to getOneCompetition together with startInx
    # (presumably category number 107 is the first entry; confirm against the site).
    startInx = 107
    cateMapping = [
        "繪畫比賽", "平面設計比賽", "產品設計比賽", "綜合設計",
        "攝影比賽", "影片比賽",
        "文學獎", "創意寫作",
        "企劃競賽", "創業競賽", "程式競賽",
        "音樂大賽", "歌唱比賽", "詞曲創作",
        "運動", "選秀", "其他"
    ]

    soup = crawler.getSoupWithButton(
        listingUrl,
        (By.CLASS_NAME, "bh-card-item"),
        "bh-more-block",
    )

    print("start parsing")

    records = []    # one dict per competition
    tags = []       # filled by the worker threads
    relations = []  # filled by the worker threads

    # One worker thread per competition; the semaphore lets two crawl at a time.
    workers = []
    semaphore = threading.Semaphore(2)
    lock = threading.Lock()

    for card in soup.find_all("div", class_="bh-wrapper"):
        # Title and link: cards without a title anchor are skipped.
        titleAnchor = card.find("a", class_="bh-title")
        if titleAnchor is None:
            continue
        link = titleAnchor.get("href")
        com = {"title": titleAnchor.text}

        # Prizes: the first amount is the total, any later one the highest.
        prizeBlock = card.find("div", class_="bh-prize-block")
        if prizeBlock is None:
            continue
        for position, amountSpan in enumerate(prizeBlock.find_all("span", class_="bh-amount")):
            digits = re.sub(r"\D", "", amountSpan.text)
            if digits == "":
                continue
            prizeKey = "totalPrize" if position < 1 else "highiestPrize"
            com[prizeKey] = int(digits)

        # Due time: cards without the "is-processing" item are skipped.
        dueSpan = card.find("span", class_="bh-item is-processing")
        if not dueSpan:
            continue
        com["dueTime"] = dueSpan.text

        # Fetch tags and relations from the competition's own page in a thread.
        comUrl = genCompetitionUrl(baseUrl, link)
        worker = threading.Thread(
            target=getOneCompetition,
            args=(comUrl, tags, relations, titleAnchor.text, semaphore, lock, cateMapping, startInx),
        )
        workers.append(worker)
        worker.start()

        com["link"] = comUrl
        records.append(com)

    # Wait for every worker before handing the shared lists back.
    for worker in workers:
        worker.join()
    print("finished")

    return {"competitions": records}, tags, relations

In [37]:
# save a list of rows as a csv file
def saveDataAsCSV(data, header, fileName, encoding="utf-8"):
    """Write ``header`` followed by the rows in ``data`` to ``fileName``.

    Args:
        data: Iterable of rows, each row an iterable of cell values.
        header: Row written first.
        fileName: Path of the csv file; it is overwritten if it exists.
        encoding: Text encoding of the file. Defaults to UTF-8 so non-ASCII
            text is written the same way on every platform instead of
            depending on the locale's default encoding.

    Returns:
        None. Any error raised while writing is printed, not propagated.
    """
    try:
        # newline="" lets the csv module control line endings itself.
        with open(fileName, "w", newline="", encoding=encoding) as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(header)
            csvwriter.writerows(data)
    except Exception as e:
        # Deliberately best-effort: report the problem and carry on.
        print(e)

# dump a dict to a json file
def saveDataAsJson(filename, data):
    """Serialize ``data`` as JSON, indented by four spaces, into ``filename``.

    Args:
        filename: Path of the json file; it is overwritten if it exists.
        data: JSON-serializable object to store.
    """
    with open(filename, mode='w') as target:
        json.dump(data, fp=target, indent=4)

In [38]:
# Run the crawl with a single browser session for the listing page.
crawler = Crawler()
try:
    competitions, tags, relations = getAllCompetition(crawler)
finally:
    # Close the browser even if the crawl fails part-way, so no Chrome
    # process is left running behind the notebook.
    crawler.quit()

clicking button
finished
start parsing
finished


In [39]:
# Write the crawl results to disk: tags and relations as csv, competitions as json.
tagsShaped = [[tagName] for tagName in tags]  # one single-cell row per tag
saveDataAsCSV(tagsShaped, ["tagName"], "tags.csv")
saveDataAsCSV(relations, ["tagName", "competitionTitle"], "relations.csv")
saveDataAsJson("competitions.json", competitions)

The cells below are used to check the saved data.

In [40]:
# Spot-check: reload the saved tag/competition relations and preview the first rows.
relaData = pd.read_csv("relations.csv")
relaData.head()

Unnamed: 0,tagName,competitionTitle
0,企劃競賽,2024集點子大賽
1,創業競賽,2024集點子大賽
2,創意,2024集點子大賽
3,競賽,2024集點子大賽
4,公益,2024集點子大賽


In [41]:
# Spot-check: reload the saved tag names and preview the first rows.
tagData = pd.read_csv("tags.csv")
tagData.head()

Unnamed: 0,tagName
0,企劃競賽
1,創業競賽
2,創意
3,競賽
4,公益


In [42]:
# Spot-check: reload the saved competitions and pretty-print them.
with open("competitions.json", "r", encoding="utf-8") as f:
    comData = json.load(f)

# ensure_ascii=False shows non-ASCII titles as text rather than \u escapes.
print(json.dumps(comData, indent=4, ensure_ascii=False))

{
    "competitions": [
        {
            "title": "Taiwan YOU & I 綠能大未來 臺灣再生能源短影音創作競賽",
            "totalPrize": 295000,
            "highiestPrize": 50000,
            "dueTime": "投稿中：還有 3 個月",
            "link": "https://bhuntr.com/tw/competitions/renewableenrgy"
        },
        {
            "title": "2024集點子大賽",
            "totalPrize": 81000,
            "highiestPrize": 10000,
            "dueTime": "投稿中：還有 14 天",
            "link": "https://bhuntr.com/tw/competitions/6autnbu1g2ic2y4e36"
        },
        {
            "title": "113 年網路成癮預防推廣活動 創意短片.平面設計徵件比賽",
            "totalPrize": 14000,
            "highiestPrize": 5000,
            "dueTime": "投稿中：還有 大約 2 個月",
            "link": "https://bhuntr.com/tw/competitions/8g2r7zul8x4aun4knw"
        },
        {
            "title": "E.WEDDING 女神製造營",
            "dueTime": "投稿中：還有 6 個月",
            "link": "https://bhuntr.com/tw/competitions/jul7diyz0o9hs2s550"
        },
        {
            "title": "｢2分鐘 翻轉司法廉潔