#### * hw3_extend.ipynb grabs the competition description data used for hw4 and stores it in a Google Sheet

In [4]:
import csv
import json
import os
import re
import threading

import pandas as pd
import pygsheets
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [5]:
class Crawler:
    """Headless Chrome wrapper that loads a page and returns it parsed by BeautifulSoup.

    Each instance starts its own Chrome driver in ``__init__``; call ``quit()``
    when finished so the driver is shut down.
    """

    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # To run Chrome in headless mode
        chrome_options.add_argument("--disable-gpu")  # Required for Windows
        self.driver = webdriver.Chrome(options=chrome_options)

    def getSoup(self, url, locator, timeout=15):
        """Load ``url``, wait for ``locator`` to be present and return the parsed html.

        Args:
            url: page to open.
            locator: ``(By, value)`` tuple identifying the element to wait for.
            timeout: seconds to wait for ``locator`` before giving up.

        Returns:
            BeautifulSoup of the page source. When ``locator`` never shows up the
            page is still parsed and returned (best effort) after printing a note.
        """
        self.driver.get(url)
        try:
            WebDriverWait(self.driver, timeout).until(EC.presence_of_element_located(locator), "not found")
        except TimeoutException:
            # Only the wait timing out is expected here; anything else
            # (including KeyboardInterrupt) must not be silently swallowed.
            print("locator not found")
        return bs(self.driver.page_source, 'html.parser')

    def getSoupWithButton(self, url, locator, button, timeout=10):
        """Load ``url``, click the ``button`` element for as long as it stays clickable,
        then wait for ``locator`` and return the parsed html.

        Args:
            url: page to open.
            locator: ``(By, value)`` tuple that must be present before parsing.
            button: class name of the button to click repeatedly.
            timeout: seconds to wait for the button / locator on each attempt.

        Returns:
            BeautifulSoup of the page source after the last click.

        Raises:
            TimeoutException: if ``locator`` is not present after the clicking stops.
        """
        self.driver.get(url)
        buttonLocator = (By.CLASS_NAME, button)
        print("clicking button")
        while True:
            try:
                # until() returns the clickable element, so no second lookup is needed.
                buttonElement = WebDriverWait(self.driver, timeout).until(
                    EC.element_to_be_clickable(buttonLocator), "not found")
                # Click through JavaScript rather than a native WebElement.click().
                self.driver.execute_script("arguments[0].click();", buttonElement)
            except WebDriverException:
                # Timed out waiting for the button (or it went away mid-click):
                # stop clicking. A page without any button also ends up here
                # instead of aborting the whole call.
                print("no button")
                break
        print("finished")
        WebDriverWait(self.driver, timeout).until(EC.presence_of_element_located(locator), "not found")
        return bs(self.driver.page_source, 'html.parser')

    def quit(self):
        """Shut down the underlying Chrome driver."""
        self.driver.quit()

In [6]:
# generate competition url by base and the id from link
def genCompetitionUrl(baseUrl, link):
    """Build a competition page url from ``baseUrl`` and the id part of ``link``.

    Everything that follows the first ``competitions`` in ``link`` (for example
    ``/abc123``) is appended to ``baseUrl``.

    Args:
        baseUrl: url prefix that already ends with ``competitions``.
        link: href taken from the listing page; may be ``None`` or empty.

    Returns:
        ``baseUrl`` plus the extracted suffix, or ``baseUrl`` unchanged when
        ``link`` is missing or does not contain ``competitions``.
    """
    if not link:
        # An anchor without href gives None; treat it like "no id found".
        return baseUrl
    match = re.search(r"(?<=competitions).+", link)
    suffix = match.group(0) if match else ""
    return baseUrl + suffix

In [7]:
# map one tag text from a competition page to the tag name that gets stored
def _resolveTagName(text, cMap, inx):
    """Return the tag name for ``text``.

    ``text`` is either already a name, or a category number that is looked up
    in ``cMap`` at position ``number - inx``. Numbers outside ``cMap`` are
    reported and the raw text is kept, so no empty or wrong tag is recorded.
    """
    try:
        number = int(text)
    except ValueError:
        return text
    offset = number - inx
    # Explicit bounds check: a negative offset must not wrap around to the
    # end of the list and pick an unrelated category.
    if 0 <= offset < len(cMap) and cMap[offset]:
        return cMap[offset]
    print("unknown tag number")
    return text


# given a competition page url, get its description and tags, and update the shared tags and relations lists
def getOneCompetition(url, curTags, relations, title, semaphore, lock, cMap, inx, comps, comInx):
    """Scrape one competition page and record its description and tags.

    Intended to run as a worker thread; ``semaphore`` bounds how many workers
    scrape at once and ``lock`` guards the shared containers.

    Args:
        url: competition page url.
        curTags: shared list of distinct tag names (appended to).
        relations: shared list of ``[tagName, title]`` pairs (appended to).
        title: competition title used in ``relations``.
        semaphore: limits the number of concurrently running workers.
        lock: protects ``curTags``, ``relations`` and ``comps``.
        cMap: category names used to translate numeric tags.
        inx: category number that corresponds to ``cMap[0]``.
        comps: shared list of competition dicts.
        comInx: index in ``comps`` of the competition being scraped.

    Returns:
        None. Tags are only recorded when the page has exactly one tag block.
    """
    # `with` hands the slot back on every exit path (early return or exception),
    # so a failing page can no longer stall the remaining workers.
    with semaphore:
        crawler = Crawler()
        try:
            locator = (By.CSS_SELECTOR, "div.bh-block.bh-content-block")
            soup = crawler.getSoup(url, locator)
        finally:
            # Always close the browser, even if loading the page failed.
            crawler.quit()

        # for hw4, get competition description
        container = soup.find("div", class_="bh-guideline ck-content")
        if container is None:
            print("no guide")
        else:
            allText = container.get_text(strip=True)
            if allText == "":
                print(container)
                print("no description")
            with lock:
                comps[comInx]["description"] = allText

        # get tags
        containers = soup.find_all("div", class_="bh-block bh-category-tag-block")
        if len(containers) != 1:
            return

        # Resolve names before taking the lock so it is held only while the
        # shared lists are being modified.
        tagNames = [
            _resolveTagName(t.text, cMap, inx)
            for t in containers[0].find_all("a", class_="bh-value")
        ]
        with lock:
            for tTag in tagNames:
                relations.append([tTag, title])
                if tTag not in curTags:
                    curTags.append(tTag)

In [8]:
# for testing
# url = "https://bhuntr.com/tw/competitions/2imncb1q228vrrscul"
# title = "測試"
# comps = [{"title":"test"}]
# comInx = 0
# getOneCompetition_test(url, [], [], title, comps, comInx)

In [9]:
# print(comps)

In [10]:
# collect every listed competition together with the tags and tag/competition relations
def getAllCompetition(crawler):
    """Scrape the competition listing and every competition page behind it.

    Args:
        crawler: object whose ``getSoupWithButton(url, locator, button)``
            returns the parsed listing page.

    Returns:
        A tuple ``(competitions, tags, relations)``: ``competitions`` is
        ``{"competitions": [ {...}, ... ]}`` with one dict per competition,
        ``tags`` and ``relations`` are the lists filled in by the
        ``getOneCompetition`` worker threads.
    """
    baseUrl = "https://bhuntr.com/tw/competitions"
    # a category filter such as "?category=111,112" could be appended here
    listingUrl = baseUrl

    # category numbers on the site start at startInx and map onto this list in order
    startInx = 107
    cateMapping = [
        "繪畫比賽", "平面設計比賽", "產品設計比賽", "綜合設計",
        "攝影比賽", "影片比賽",
        "文學獎", "創意寫作",
        "企劃競賽", "創業競賽", "程式競賽",
        "音樂大賽", "歌唱比賽", "詞曲創作",
        "運動", "選秀", "其他"
    ]

    soup = crawler.getSoupWithButton(listingUrl, (By.CLASS_NAME, "bh-card-item"), "bh-more-block")

    print("start parsing")

    competitions = {"competitions": []}  # {"competitions": [{title: t, ...}, ...]}
    tags = []
    relations = []

    # one worker thread per competition; at most 2 of them scrape at a time
    workers = []
    semaphore = threading.Semaphore(2)
    lock = threading.Lock()

    records = competitions["competitions"]
    for card in soup.find_all("div", class_="bh-wrapper"):
        # a card is only kept when it has a title, a prize block and a running timeline
        titleLink = card.find("a", class_="bh-title")
        if titleLink is None:
            continue
        prizeBlock = card.find("div", class_="bh-prize-block")
        if prizeBlock is None:
            continue
        timeline = card.find("span", class_="bh-item is-processing")
        if not timeline:
            continue

        com = {"title": titleLink.text}

        # first amount is the total prize, any later amount the highest single prize
        for position, span in enumerate(prizeBlock.find_all("span", class_="bh-amount")):
            digits = re.sub(r"\D", "", span.text)
            if digits == "":
                continue
            prizeKey = "highiestPrize" if position >= 1 else "totalPrize"
            com[prizeKey] = int(digits)

        com["dueTime"] = timeline.text

        comUrl = genCompetitionUrl(baseUrl, titleLink.get("href"))
        com["link"] = comUrl

        records.append(com)
        curInx = len(records) - 1  # position of this competition in the shared list

        # visit the competition page in the background to get tags and relations
        worker = threading.Thread(
            target=getOneCompetition,
            args=(comUrl, tags, relations, titleLink.text, semaphore, lock,
                  cateMapping, startInx, records, curInx),
        )
        workers.append(worker)
        worker.start()

    # wait for all threads to end
    for worker in workers:
        worker.join()
    print("finished")

    return competitions, tags, relations

In [11]:
# save list as csv file
def saveDataAsCSV(data, header, fileName, encoding="utf-8"):
    """Write ``header`` followed by the rows in ``data`` to ``fileName`` as CSV.

    Args:
        data: iterable of rows (each row an iterable of cell values).
        header: iterable of column names written as the first row.
        fileName: path of the file to create or overwrite.
        encoding: text encoding of the file. Set explicitly so non-ASCII text
            is written the same way on every platform instead of depending on
            the locale default.

    Errors are printed rather than raised (best effort).
    """
    try:
        with open(fileName, "w", newline="", encoding=encoding) as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(header)
            csvwriter.writerows(data)
    except Exception as e:
        print(e)

# save dict as json file
def saveDataAsJson(filename, data):
    """Serialize ``data`` as JSON indented by 4 spaces and write it to ``filename``.

    Args:
        filename: path of the file to create or overwrite.
        data: JSON-serializable object.
    """
    with open(filename, 'w') as jsonFile:
        jsonFile.write(json.dumps(data, indent=4))

In [12]:
crawler = Crawler()
try:
    competitions, tags, relations = getAllCompetition(crawler)
finally:
    # Close the browser even when scraping fails part-way, so no headless
    # Chrome process is left behind.
    crawler.quit()

clicking button
no button
finished
start parsing
finished


Store the results in Google Sheets  
ref: [Python 讀寫 Google Sheets 教學](https://hackmd.io/@Yun-Cheng/GoogleSheets)

In [15]:
# One row per scraped competition; the columns come from the keys of the competition dicts.
df = pd.DataFrame(competitions["competitions"])
# Show the first rows as the cell output.
df.head()

Unnamed: 0,title,totalPrize,highiestPrize,dueTime,link,description
0,2024 臺灣文學獎徵獎,3700000.0,1000000.0,投稿中：還有 2 個月,https://bhuntr.com/tw/competitions/2imncb1q228...,壹、活動簡介為了向當年度優秀文學作品致敬，同時激發各類原創書寫，以彰顯臺灣價值、在地特色，進...
1,Taiwan YOU & I 綠能大未來 臺灣再生能源短影音創作競賽,295000.0,50000.0,投稿中：還有 3 個月,https://bhuntr.com/tw/competitions/renewableenrgy,Taiwan YOU & I 綠能大未來臺灣再生能源短影音創作競賽競賽辦法為呼應全球淨零趨...
2,2024GenAI Stars生成式AI百工百業應用選拔,7000000.0,1500000.0,投稿中：還有 2 個月,https://bhuntr.com/tw/competitions/3th7d58bc38...,競賽宗旨AI技術為人類世界帶來翻天覆地的變革，生成式AI更從去年起為產業帶來新一輪破壞式創新...
3,E.WEDDING 女神製造營,,,投稿中：還有 5 個月,https://bhuntr.com/tw/competitions/jul7diyz0o9...,E.WEDDING 女神製造營冬季官方代言人徵選活動初衷 About Event：【E.WE...
4,2024集點子大賽,81000.0,10000.0,投稿中：還有 6 天,https://bhuntr.com/tw/competitions/6autnbu1g2i...,前言信義公益基金會致力於銀髮族的關懷與扶持，創會至今辦理許多高齡服務計畫；「集點子大賽」是一...


In [23]:
# authorization
# The service-account key path and the target sheet can be overridden through
# environment variables so they need not be edited in (or committed with) the
# notebook; the previous values stay as defaults.
serviceFile = os.environ.get("GSHEET_SERVICE_FILE", '../../pl-421203-a74b8641e6bc.json')
sheetUrl = os.environ.get(
    "GSHEET_URL",
    "https://docs.google.com/spreadsheets/d/1WdlcEpxjWRB0Xz9RsRALppZYU6_sVQyTgkTX4xeMsEA/",
)
gc = pygsheets.authorize(service_file=serviceFile)
sh = gc.open_by_url(sheetUrl)

In [33]:
def dfToSheet(df, sh, title):
    """Write ``df`` into the worksheet named ``title`` of spreadsheet ``sh``.

    Args:
        df: DataFrame to upload; NaN cells are written as empty strings.
        sh: opened pygsheets spreadsheet.
        title: worksheet title. The worksheet is created when it does not
            exist yet, instead of failing later on an unbound variable.

    Any other error from the lookup (authentication, network, ...) propagates
    unchanged so the real cause is visible.
    """
    try:
        ws = sh.worksheet_by_title(title)
    except pygsheets.exceptions.WorksheetNotFound as e:
        print(e)
        ws = sh.add_worksheet(title)
    # NOTE(review): pygsheets documents the default start as (1, 1); confirm
    # that (0, 0) is really intended here.
    ws.set_dataframe(df, start=(0,0), nan='')

In [35]:
# Upload the competitions table to the worksheet titled "competitions".
dfToSheet(df, sh, "competitions")

In [38]:
# Single-column table of the tag names collected while scraping.
tg = pd.DataFrame(tags, columns=["tagName"])
# One row per [tag, competition title] pair collected while scraping.
tg_comps = pd.DataFrame(relations, columns=["tagName", "competitionTitle"])

In [39]:
# Upload the tag list and the tag/competition pairs to their own worksheets.
dfToSheet(tg, sh, "tags")
dfToSheet(tg_comps, sh, "tag_comps")