In [52]:
import pandas as pd
import re
import requests
from random import randint
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from fake_useragent import UserAgent
import time

### 分解各個環節程式碼

In [154]:
# amazon website
product = "kitty"
url = "https://www.amazon.com/s?k={}&ref=nb_sb_noss".format(product)

# 加入headless模式 => 注意：使用時google版本可能不一樣
options = webdriver.ChromeOptions()
#options.add_argument("--headless")
# 設定fake_agent
ua = UserAgent()
options.add_argument(ua.random)

# 將參數帶入
driver = webdriver.Chrome(options=options, executable_path="./chromedriver")
driver.get(url)
time.sleep(2)

# 取得頁面的html
soup = BeautifulSoup(driver.page_source, "html.parser")

# 關閉頁面
#driver.close()

In [64]:
# 查看總共有幾個
product_info = soup.select("div[class='a-section a-spacing-none']")
len(product_info)

81

In [65]:
product_info[0]

<div class="a-section a-spacing-none"><div class="a-section a-spacing-none a-spacing-top-small"><div class="a-row a-spacing-micro"><span class="a-size-mini a-color-secondary">Featured from our brands</span></div><h2 class="a-size-mini a-spacing-none a-color-base s-line-clamp-4"><a class="a-link-normal a-text-normal" href="/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&amp;adId=A00224332ETVFBEILU3CA&amp;url=%2FChanging-Dimmbale-Lighting-Decoration-Backlight%2Fdp%2FB08HQ66VXQ%2Fref%3Dsr_1_1_sspa%3Fdchild%3D1%26keywords%3Dled%26qid%3D1630649022%26sr%3D8-1-spons%26psc%3D1&amp;qualifier=1630649022&amp;id=4640386571308055&amp;widgetName=sp_atf"><span class="a-size-base-plus a-color-base a-text-normal">sylvwin LED Strip Lights 16.4FT,RGB Strip Lights with Color Changing,SMD 5050 Dimmable Lighting with Remote Control for Home Kitchen,Bedroom Decoration,Party,TV Backlight</span> </a> </h2></div><div class="a-section a-spacing-none a-spacing-top-micro"><div class="a-row a

In [66]:
# 取得產品名稱
title = product_info[0].select("span[class='a-size-base-plus a-color-base a-text-normal']")[0].text
title

'sylvwin LED Strip Lights 16.4FT,RGB Strip Lights with Color Changing,SMD 5050 Dimmable Lighting with Remote Control for Home Kitchen,Bedroom Decoration,Party,TV Backlight'

In [21]:
# 取得價格
price = product_info[0].select(".a-offscreen")[0].text   #("span[class='a-price'] > span[class='a-offscreen']")[0].text
price

'$21.99'

In [22]:
# 取得星星＆評分數
star, rated_amount = [i.text for i in product_info[0].select("div[class='a-row a-size-small'] > span")]
star, rated_amount

('4.6 out of 5 stars ', '14,348  ')

### 將以上程式碼整理為一個fucntion

In [178]:
def amazon_crawler(product, pages):
    url = "https://www.amazon.com/s?k={}&ref=nb_sb_noss".format(product)
    names = []
    prices = []
    stars = []
    rated_amounts = []

    # 要爬幾頁
    for p in range(1, pages): 

        # 參數設定 (headless, useragent)
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        ua = UserAgent()
        options.add_argument(ua.random)

        # 開始爬取
        driver = webdriver.Chrome(executable_path="./chromedriver", options=options)
        driver.get(url)
        time.sleep(4)

        # 取得soup以及各個元素
        soup = BeautifulSoup(driver.page_source, features="html.parser")
        product_infos = soup.select("div[class='a-section a-spacing-none']")
        for pi in product_infos:
            try:
                name = pi.select("span[class='a-size-base-plus a-color-base a-text-normal']")[0].text
                price = pi.select(".a-offscreen")[0].text
                star, rated_amount = [i.text for i in pi.select("div[class='a-row a-size-small'] > span")]
                
                names.append(name)
                prices.append(price)
                stars.append(star)
                rated_amounts.append(rated_amount)
            except:
                pass
            
        
        # 翻頁 => 利用qid組成下一頁的url，透過新連結換個UserAgent重新跑一次selenium
        qids = soup.select("a")
        for _ in range(len(qids)):
            n = randint(1, len(qids))
            try:
                qid = re.findall(".*qid=(\d{10})", qids[n]["href"])
                if qid:
                    break
            except:
                pass        
        url = 'https://www.amazon.com/' + f'/s?k=led&page={p}&qid={qid}&ref=sr_pg_{p}'
        
        # 隨機sleep幾秒
        time.sleep(randint(5,15))
        
    # 爬取完成，創造dataframe來裝資料
    df = pd.DataFrame({"Name":names, "Pirce":prices, "Star":stars, "Rated":rated_amounts})

    return df

In [179]:
df_output = amazon_crawler("led", 5)

### 爬取結果

In [180]:
df_output

Unnamed: 0,Name,Pirce,Star,Rated
0,40ft Led Strip Lights Keepsmile RGB Color Chan...,$13.99,4.6 out of 5 stars,3770
1,"QZYL 75ft LED Lights for Bedroom, RGB LED Stri...",$25.99,4.4 out of 5 stars,1849
2,"YORMICK LED Strip Lights, 32.8 Feet/10M 300LED...",$15.99,4.2 out of 5 stars,825
3,"LED Lights for Bedroom 32.8ft, RGB LED Light S...",$18.99,4.5 out of 5 stars,13790
4,"Tenmiro 65.6ft Led Strip Lights, Ultra Long RG...",$28.99,4.6 out of 5 stars,47743
...,...,...,...,...
186,"Govee RGBIC Pro LED Strip Lights, 32.8ft Color...",$52.99,5.0 out of 5 stars,1
187,"Smart LED Strip Lights 50ft,Sync Music Color C...",$19.97,3.9 out of 5 stars,82
188,"50ft Bluetooth LED Strip Lights, Music Sync 50...",$19.98,4.5 out of 5 stars,4185
189,"Magnetic Pickup Tool with LED Lights, Gifts fo...",$12.95,4.6 out of 5 stars,11
