# Selenium


- 官方網站介紹
    - [Selenium](https://www.selenium.dev/)
    - [下載Driver](https://www.selenium.dev/selenium/docs/api/py/index.html)
- 克服Colab使用Selenium問題
    - https://stackoverflow.com/questions/51046454/how-can-we-use-selenium-webdriver-in-colab-research-google-com
    - https://medium.com/@yanweiliu/python%E7%88%AC%E8%9F%B2%E5%AD%B8%E7%BF%92%E7%AD%86%E8%A8%98-%E4%BA%8C-selenium%E8%87%AA%E5%8B%95%E5%8C%96-ab0a27a94ff2
- 從舊版Selenium過渡到4.0版注意事項
  - `webdriver.Chrome('chromedriver',chrome_options=chrome_options)`參數用法簡化，現在只要
  `wd = webdriver.Chrome(options=chrome_options)`

選擇元素的各種方法
```python
find_element(By.ID, "fruits")
find_element(By.NAME, "q")
find_element(By.CLASS_NAME, "tomatoes")
find_elements(By.TAG_NAME, "li")          #html標籤
find_element(By.LINK_TEXT, "More information...")
find_element(By.PARTIAL_LINK_TEXT, "More info")     #連結文字部分符合即可
find_element(By.XPATH, "//ul[@id='fruits']/li")     #最常用    #/html開頭為絕對路徑；//相對路徑
find_element(By.CSS_SELECTOR,"#fruits .tomatoes")

```


In [29]:
#下載安裝相關套件，以下為Colab的安裝方式
!pip install -Uq selenium

### 常用的`webdriver.ChromeOptions()`參數
- start-maximized: Chrome開啟最大化視窗
- incognito: 無痕模式開啟
- headless: 無頭模式，不開啟Chrome視窗（在背景執行）
- disable-extensions: 停用Chrome擴充功能
- disable-popup-blocking: 停用Chrome的彈出視窗封鎖功能（允許彈出視窗）
- make-default-browser: 設為預設瀏覽器
- version: 顯示瀏覽器版本
- disable-infobars: 阻止 Chrome 顯示通知“Chrome 由自動化軟體控制”



---



In [30]:
#@title 以 Selenium 搜尋 PTT 指定列表

from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

# Scrape the PTT "movie" board index with headless Chrome and print every
# post (title + absolute URL) except announcements.

# Chrome flags used throughout this notebook for running without a display.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Pass the options by keyword: the first positional parameter of Chrome()
# has not been the options object in every Selenium 4 release.
wd = webdriver.Chrome(options=chrome_options)
url = 'https://www.ptt.cc/bbs/movie/index.html'

try:
    wd.get(url)

    # Alternative without BeautifulSoup:
    # elem = wd.find_element(By.CLASS_NAME,"title")
    # print(elem.text)

    # Hand the rendered HTML to BeautifulSoup for parsing.
    soup = BeautifulSoup(wd.page_source, "html.parser")
finally:
    # quit() ends the whole browser session; close() would only close the
    # current window. Running it in `finally` avoids leaking the browser
    # when the page load fails.
    wd.quit()

links = soup.select('div.title > a')

# Skip posts whose title contains '公告' (announcement); hrefs on the page
# are site-relative, so prefix the host.
res = [{
    'title': link.get_text(),
    'href': f'https://www.ptt.cc{link.get("href")}'
} for link in links if '公告' not in link.get_text()]


print(res)

[{'title': '[討論] 第四台電影頻道是不是快不行了', 'href': 'https://www.ptt.cc/bbs/movie/M.1693663765.A.7C7.html'}, {'title': 'Re: [討論] 國片的髒話為什麼很出戲？', 'href': 'https://www.ptt.cc/bbs/movie/M.1693664353.A.7EB.html'}, {'title': '[情報] 橋本愛、仲野太賀主演《在車上》製片新作《熱のあとに》', 'href': 'https://www.ptt.cc/bbs/movie/M.1693667070.A.01B.html'}, {'title': 'Re: [討論] 第四台電影頻道是不是快不行了', 'href': 'https://www.ptt.cc/bbs/movie/M.1693668442.A.FC7.html'}, {'title': '[LIVE] 年代much38台 23:00 恐怖份子(楊德昌)', 'href': 'https://www.ptt.cc/bbs/movie/M.1693669519.A.803.html'}, {'title': '[討論] 台灣電影院怎不流行送特典吸引多刷???', 'href': 'https://www.ptt.cc/bbs/movie/M.1693669725.A.A8C.html'}, {'title': '[新聞] 經典恐怖再度回歸 德古拉嚇不了現代觀眾', 'href': 'https://www.ptt.cc/bbs/movie/M.1693671462.A.AF2.html'}, {'title': '[好雷] 王室緋聞守則的台詞', 'href': 'https://www.ptt.cc/bbs/movie/M.1693674408.A.8F9.html'}, {'title': '[問片] 找一部鄭秀文的片', 'href': 'https://www.ptt.cc/bbs/movie/M.1693674692.A.99D.html'}, {'title': '[普雷] 《之前的我們》-膚淺的因緣 ', 'href': 'https://www.ptt.cc/bbs/movie/M.1693677964.A.8

In [31]:
#@title 以 Selenium 搜尋 PTT 指定列表

from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup


def make_webdriver() -> Chrome:
  """Start and return a Chrome session configured by ``create_options()``.

  The caller owns the returned driver and is responsible for quitting it.
  """
  return Chrome(options=create_options())


def create_options() -> Options:
  """Return the Chrome ``Options`` shared by this notebook's scrapers.

  The flags are added in a fixed order: headless, no-sandbox,
  disable-dev-shm-usage, start-maximized.
  """
  chrome_flags = (
      "--headless",
      '--no-sandbox',
      '--disable-dev-shm-usage',
      "--start-maximized",
  )
  opts = Options()
  for flag in chrome_flags:
    opts.add_argument(flag)
  return opts


def get_ptt_board_list(board) -> list:
  """Return the posts listed on the index page of a PTT board.

  Args:
    board: Board name as it appears in the URL path, e.g. ``'movie'``.

  Returns:
    A list of dicts with keys ``'title'`` (link text) and ``'href'``
    (absolute URL), one per post link on the page. Posts whose title
    contains '公告' (announcement) are left out.

  Raises:
    Whatever Selenium raises when the browser cannot be started or the
    page cannot be loaded; the browser is shut down before the exception
    propagates.
  """
  url = f"https://www.ptt.cc/bbs/{board}/index.html"
  wd = make_webdriver()
  try:
    wd.get(url=url)
    soup = BeautifulSoup(wd.page_source, "html.parser")
  finally:
    # Always end the browser session, even when loading or parsing fails,
    # so no headless Chrome process is left behind.
    wd.quit()

  links = soup.select('div.title > a')
  # hrefs on the page are site-relative, so prefix the host.
  return [{
      'title': link.get_text(),
      'href': f'https://www.ptt.cc{link.get("href")}'
  } for link in links if '公告' not in link.get_text()]

# Entry point: fetch the 'movie' board and print the result. `res` is bound at
# module level on purpose; later cells in this notebook read it
# (e.g. `pd.DataFrame(res)`).
if __name__ == '__main__':
  res = get_ptt_board_list('movie')
  print(res)


[{'title': '[討論] 第四台電影頻道是不是快不行了', 'href': 'https://www.ptt.cc/bbs/movie/M.1693663765.A.7C7.html'}, {'title': 'Re: [討論] 國片的髒話為什麼很出戲？', 'href': 'https://www.ptt.cc/bbs/movie/M.1693664353.A.7EB.html'}, {'title': '[情報] 橋本愛、仲野太賀主演《在車上》製片新作《熱のあとに》', 'href': 'https://www.ptt.cc/bbs/movie/M.1693667070.A.01B.html'}, {'title': 'Re: [討論] 第四台電影頻道是不是快不行了', 'href': 'https://www.ptt.cc/bbs/movie/M.1693668442.A.FC7.html'}, {'title': '[LIVE] 年代much38台 23:00 恐怖份子(楊德昌)', 'href': 'https://www.ptt.cc/bbs/movie/M.1693669519.A.803.html'}, {'title': '[討論] 台灣電影院怎不流行送特典吸引多刷???', 'href': 'https://www.ptt.cc/bbs/movie/M.1693669725.A.A8C.html'}, {'title': '[新聞] 經典恐怖再度回歸 德古拉嚇不了現代觀眾', 'href': 'https://www.ptt.cc/bbs/movie/M.1693671462.A.AF2.html'}, {'title': '[好雷] 王室緋聞守則的台詞', 'href': 'https://www.ptt.cc/bbs/movie/M.1693674408.A.8F9.html'}, {'title': '[問片] 找一部鄭秀文的片', 'href': 'https://www.ptt.cc/bbs/movie/M.1693674692.A.99D.html'}, {'title': '[普雷] 《之前的我們》-膚淺的因緣 ', 'href': 'https://www.ptt.cc/bbs/movie/M.1693677964.A.8

In [32]:
import pandas as pd

# Display the scraped posts (`res`, a list of {'title', 'href'} dicts produced
# by the previous cell) as a table with one row per post.
# Alternative kept for reference, building the frame from two separate lists:
# pd.DataFrame({"title":title, "href":href})
pd.DataFrame(res)


Unnamed: 0,title,href
0,[討論] 第四台電影頻道是不是快不行了,https://www.ptt.cc/bbs/movie/M.1693663765.A.7C...
1,Re: [討論] 國片的髒話為什麼很出戲？,https://www.ptt.cc/bbs/movie/M.1693664353.A.7E...
2,[情報] 橋本愛、仲野太賀主演《在車上》製片新作《熱のあとに》,https://www.ptt.cc/bbs/movie/M.1693667070.A.01...
3,Re: [討論] 第四台電影頻道是不是快不行了,https://www.ptt.cc/bbs/movie/M.1693668442.A.FC...
4,[LIVE] 年代much38台 23:00 恐怖份子(楊德昌),https://www.ptt.cc/bbs/movie/M.1693669519.A.80...
5,[討論] 台灣電影院怎不流行送特典吸引多刷???,https://www.ptt.cc/bbs/movie/M.1693669725.A.A8...
6,[新聞] 經典恐怖再度回歸 德古拉嚇不了現代觀眾,https://www.ptt.cc/bbs/movie/M.1693671462.A.AF...
7,[好雷] 王室緋聞守則的台詞,https://www.ptt.cc/bbs/movie/M.1693674408.A.8F...
8,[問片] 找一部鄭秀文的片,https://www.ptt.cc/bbs/movie/M.1693674692.A.99...
9,[普雷] 《之前的我們》-膚淺的因緣,https://www.ptt.cc/bbs/movie/M.1693677964.A.8F...


In [33]:
# Display only the 'href' column of the scraped posts as a pandas Series.
pd.DataFrame(res)['href']


0     https://www.ptt.cc/bbs/movie/M.1693663765.A.7C...
1     https://www.ptt.cc/bbs/movie/M.1693664353.A.7E...
2     https://www.ptt.cc/bbs/movie/M.1693667070.A.01...
3     https://www.ptt.cc/bbs/movie/M.1693668442.A.FC...
4     https://www.ptt.cc/bbs/movie/M.1693669519.A.80...
5     https://www.ptt.cc/bbs/movie/M.1693669725.A.A8...
6     https://www.ptt.cc/bbs/movie/M.1693671462.A.AF...
7     https://www.ptt.cc/bbs/movie/M.1693674408.A.8F...
8     https://www.ptt.cc/bbs/movie/M.1693674692.A.99...
9     https://www.ptt.cc/bbs/movie/M.1693677964.A.8F...
10    https://www.ptt.cc/bbs/movie/M.1693679507.A.45...
11    https://www.ptt.cc/bbs/movie/M.1693680704.A.58...
12    https://www.ptt.cc/bbs/movie/M.1693694419.A.84...
13    https://www.ptt.cc/bbs/movie/M.1693701477.A.DC...
14    https://www.ptt.cc/bbs/movie/M.1693702087.A.41...
15    https://www.ptt.cc/bbs/movie/M.1693707833.A.19...
16    https://www.ptt.cc/bbs/movie/M.1693708670.A.E4...
17    https://www.ptt.cc/bbs/movie/M.1693708942.



---



In [34]:
#@title Yahoo搜尋 "霍華德"
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup


# Search Yahoo Taiwan for a fixed keyword with headless Chrome and print the
# text of every <h3> found inside the results container (div#web).
key_word = '霍華德'
url = 'https://tw.yahoo.com/'

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

wd = webdriver.Chrome(options=chrome_options)
try:
    wd.get(url)

    # Interact with the page: type the keyword and press the search button.
    element = wd.find_element(By.ID, 'header-search-input')
    element.send_keys(key_word)
    wd.find_element(By.ID, 'header-desktop-search-button').click()

    # Wait (up to 5 s) until the div with id 'web' is present before
    # reading the page source.
    WebDriverWait(wd, 5).until(
        expected_conditions.presence_of_element_located((By.ID, 'web')))

    # From here on it is BeautifulSoup's job: parse the current page.
    soup = BeautifulSoup(wd.page_source, "html.parser")
finally:
    # End the browser session even when an element is missing or the wait
    # times out, so no headless Chrome process is leaked.
    wd.quit()

links = soup.select('div#web h3')

for link in links:
    print(link.get_text())

zh.wikipedia.org › zh-tw › 迪韋特·侯活德懷特·霍華德 - 維基百科，自由的百科全書
頭條新聞
udn.com › news › story影／魔獸霍華德離台流下不捨淚水 球迷送機合照簽名來者不拒 | 籃 ... 
udn.com › news › story魔獸專訪／喊話帶著更佳狀態回來 霍華德籲球迷讓雲豹知道 | 籃球 ... 
udn.com › news › storyT1聯盟／魔獸退燒？霍華德雙腳「翹高高」掛高鐵頭墊引反感 | 籃 ... 
news.tvbs.com.tw › sports › 2226974魔獸罕曬全家福 遭爆私生子數量驚人可組「2支球隊」│霍華德│籃球 ... 
tw.news.yahoo.com › 放棄中國來台-魔獸-霍華德首放棄中國來台！「魔獸」霍華德首鬆口「關鍵內幕」全因十年前1承 ... 
udn.com › news › storyT1聯盟／霍華德面子、裡子都贏林書豪 還對他喊話：下季來T1 | 籃 ... 
為何選擇台灣？「魔獸」霍華德：為了10年前的承諾 | ETtoday運動雲 | …
霍格華茲魔法與巫術學院 - 維基百科，自由的百科全書
為何選擇台灣？「魔獸」霍華德：為了10年前的承諾 | ETtoday運動雲 | …
德懷特·霍華德 - 維基百科，自由的百科全書
圖片


In [35]:
#@title YAHOO搜尋
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup


# Search Yahoo Taiwan for the keyword chosen in the Colab form and print the
# text of every <h3> found inside the results container (div#web).

# Kept at top level and unchanged so the Colab form annotation keeps working.
key_word = '\u7709\u6EAA' #@param {type:"string"}
url = 'https://tw.yahoo.com/'

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

wd = webdriver.Chrome(options=chrome_options)
try:
    wd.get(url)

    # Interact with the page: type the keyword and press the search button.
    element = wd.find_element(By.ID, 'header-search-input')
    element.send_keys(key_word)
    wd.find_element(By.ID, 'header-desktop-search-button').click()

    # Wait (up to 5 s) until the div with id 'web' is present before
    # reading the page source.
    WebDriverWait(wd, 5).until(
        expected_conditions.presence_of_element_located((By.ID, 'web')))

    # From here on it is BeautifulSoup's job: parse the current page.
    soup = BeautifulSoup(wd.page_source, "html.parser")
finally:
    # End the browser session even when an element is missing or the wait
    # times out, so no headless Chrome process is leaked.
    wd.quit()

links = soup.select('div#web h3')

for link in links:
    print(link.get_text())

zh.wikipedia.org › wiki › 眉溪眉溪 - 维基百科，自由的百科全书簡
zh.wikipedia.org › zh-tw › 眉溪眉溪 - 維基百科，自由的百科全書
www.tipp.org.tw › tribe_detail3臺灣原住民族資訊資源網 - 認識原住民族 - 部落介紹
www.walkerland.com.tw › article › view【南投埔里住宿】眉溪曉莊田園風民宿，綠意盎然的禪意莊園 (家庭 ... 
blog.niceday.tw › 2018/06/13 › 【有片】日月潭還可以【有片】日月潭還可以這樣玩！眉溪部落體驗賽德克族射箭、家屋、 ... 
眉溪 - 维基百科，自由的百科全书
媚麗埔里-旅遊導覽-眉溪曉莊
你有所不知的「賽德克．巴萊」 ｜ 林益仁 ／ 南島話聲 ｜ 獨立評論
臺灣原住民族資訊資源網 - 認識原住民族 - 部落介紹
trail.tacp.gov.tw › zh-hant › Keyword眉溪 | 尋路.循路-臺灣原住民族古道空間資訊網 - TACP
www.kidsplay.com.tw › visitspot › content眉溪曉莊鄉村渡假別墅 - 景點 - 親子旅遊 - KidsPlay親子就醬玩
圖片


In [None]:
#@title Google搜尋
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup


# Search Google for the keyword chosen in the Colab form and print the text
# of every result title (<h3> directly inside a link).

# Kept at top level and unchanged so the Colab form annotation keeps working.
key_word = '\u881F\u7B46\u5C0F\u65B0' #@param {type:"string"}
url = 'https://www.google.com/'

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless') # run in the background
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

wd = webdriver.Chrome(options=chrome_options)
try:
    wd.get(url)

    # Locate the search box by its form field name instead of an absolute
    # XPath, which breaks as soon as the page layout shifts.
    # NOTE(review): assumes Google's search field is still named 'q' - confirm.
    element = wd.find_element(By.NAME, 'q')
    element.send_keys(key_word)

    # Submit the enclosing form rather than clicking a button found by an
    # absolute XPath.
    element.submit()

    # Wait (up to 5 s) for at least one <h3> before reading the page source;
    # without a wait the source may still be the page that was just left.
    WebDriverWait(wd, 5).until(
        expected_conditions.presence_of_element_located((By.TAG_NAME, 'h3')))

    # Parse with BeautifulSoup - result titles are <h3> elements.
    soup = BeautifulSoup(wd.page_source, "html.parser")
finally:
    # End the browser session even when locating or waiting fails.
    wd.quit()

links = soup.select('div > a > h3')

for link in links:
    print(link.get_text())

In [None]:
#@title PChome 搜尋 PS5
# https://blog.jiatool.com/posts/pchome_spider01/
# 實際搶單可參考: https://github.com/ywchiu/largitdata/blob/master/code/Course_137.ipynb
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

# Search PChome for a keyword with headless Chrome and collect every product
# link (<a> directly inside an <h5>) as {'title', 'link'} dicts.
key_word = 'ps5'
url = f'https://ecshweb.pchome.com.tw/search/v3.3/?q={key_word}'

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

wd = webdriver.Chrome(options=chrome_options)
try:
    wd.get(url)

    # Wait (up to 5 s) until at least one <h5> is present before parsing.
    WebDriverWait(wd, 5).until(
        expected_conditions.presence_of_element_located((By.TAG_NAME, 'h5')))

    # Parse the rendered page with BeautifulSoup.
    soup = BeautifulSoup(wd.page_source, "html.parser")
finally:
    # End the browser session even when the wait times out.
    wd.quit()

elems = soup.select('h5 > a')


def _absolute_link(href):
    """Return *href* with an explicit https scheme when it is protocol-relative."""
    # NOTE(review): assumes PChome emits protocol-relative hrefs
    # ('//host/path'); the previous code prefixed a bare 'https', which
    # produced 'https//host/path' (missing colon).
    if href and href.startswith('//'):
        return f'https:{href}'
    return href


items = [{
    'title': elem.get_text(),
    'link': _absolute_link(elem.get("href"))}
    for elem in elems
    ]

items

In [None]:
#@title MOMO搶PS5
# https://ithelp.ithome.com.tw/articles/10262268

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Log in to momo, open a product page and keep refreshing it until the buy
# button appears, then click it. Checkout itself is not implemented here.

# Imported here because only this cell needs it. WebDriverException is the
# base class of Selenium's own errors, including the wait's TimeoutException.
from selenium.common.exceptions import WebDriverException

# The options object must be bound to `chrome_options`, the name every line
# below uses; binding it to another name raises NameError in a fresh runtime
# (or silently reuses options left over from an earlier cell).
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
# presumably 2 = block notification prompts - confirm against Chrome prefs
prefs = {'profile.default_content_setting_values':{'notifications': 2}}
chrome_options.add_experimental_option('prefs', prefs)
driver = webdriver.Chrome(options=chrome_options)

driver.get("https://m.momoshop.com.tw/mymomo/login.momo") # go to the login page

driver.find_element(By.ID,'memId').send_keys('帳號') # enter the account (placeholder)
driver.find_element(By.ID,'passwd').send_keys('密碼') # enter the password (placeholder)
driver.find_element(By.CLASS_NAME,'login').click()
driver.get("https://www.momoshop.com.tw/goods/GoodsDetail.jsp?i_code=9435324")

while True:
  try:
    # Explicit wait: up to 1 s, polling every 0.5 s, for the buy button.
    buy = WebDriverWait(driver, 1, 0.5).until(EC.presence_of_element_located((By.ID, 'buy_yes')))
    buy.click() # click as soon as the buy button is detected
    print ('可以購買!')
    break # the checkout steps are intentionally not written here
  except WebDriverException:
    # Catch Selenium failures only (timeout, click errors) so that
    # KeyboardInterrupt can still stop the loop; a bare `except:` could not
    # be interrupted.
    print("還不能購買! 重新整理!")
    driver.refresh() # reload the page and try again