In [2]:
import pandas as pd
import re
import requests
from random import randint
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from fake_useragent import UserAgent
import time

### 分解各個環節程式碼

In [3]:
# Amazon search-results page for a sample keyword
product = "kitty"
url = "https://www.amazon.com/s?k={}&ref=nb_sb_noss".format(product)

# Chrome options. Headless mode is left disabled in this exploratory cell
# (note: the chromedriver binary has to match the installed Chrome version).
options = webdriver.ChromeOptions()
#options.add_argument("--headless")
# Spoof a random User-Agent. Chrome takes the override through the
# --user-agent=<value> switch, so the UA string must not be passed on its own.
ua = UserAgent()
options.add_argument("--user-agent={}".format(ua.random))

# Launch the browser, load the page, and capture the rendered HTML.
driver = webdriver.Chrome(options=options, executable_path="./chromedriver")
try:
    driver.get(url)
    time.sleep(2)  # give the page a moment to finish rendering

    # Parse the rendered page source
    soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # Always shut the browser down so re-running the cell does not pile up
    # orphaned Chrome processes.
    driver.quit()

In [4]:
# Collect every result container on the page and show how many were matched
card_selector = "div[class='a-section a-spacing-none']"
product_info = soup.select(card_selector)
len(product_info)

68

In [5]:
# Inspect the raw HTML of the first matched container
product_info[0]

<div class="a-section a-spacing-none"><div class="a-section a-spacing-none a-spacing-top-small"><div class="a-row a-spacing-micro"><span class="a-declarative" data-a-popover='{"closeButton":"true","dataStrategy":"preload","name":"sp-info-popover-mobile"}' data-action="a-popover"><span class="s-sponsored-label-text"><span class="a-color-secondary">贊助</span></span></span><div class="a-popover-preload" id="a-popover-sp-info-popover-mobile"><span class="a-color-secondary">贊助</span></div></div><h2 class="a-size-mini a-spacing-none a-color-base s-line-clamp-4"><a class="a-link-normal a-text-normal" href="/-/zh_TW/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&amp;adId=A09086251N3SD36GS2TZF&amp;url=%2Fdp%2FB096DYXVM5%2Fref%3Dsr_1_1_sspa%3Fdchild%3D1%26keywords%3Dkitty%26qid%3D1630662166%26sr%3D8-1-spons%26psc%3D1&amp;qualifier=1630662166&amp;id=1630205119401451&amp;widgetName=sp_atf"><span class="a-size-base-plus a-color-base a-text-normal">Streyant Mixed Cat Litter, Qu

In [6]:
# Product name: text of the first title <span> inside the first container
title_spans = product_info[0].select("span[class='a-size-base-plus a-color-base a-text-normal']")
title = title_spans[0].text
title

'Streyant Mixed Cat Litter, Quick Clumping, Deodorize, Flushable, Dust Free & Unscented for Kitty 6LBs / Pack'

In [7]:
# Price: text of the first ".a-offscreen" element inside the first container
# (a narrower alternative selector: "span[class='a-price'] > span[class='a-offscreen']")
price_nodes = product_info[0].select(".a-offscreen")
price = price_nodes[0].text
price

'US$19.99'

In [11]:
# Star rating and number of ratings: the two <span> children of the small rating row.
# Unpacking raises ValueError unless exactly two spans are matched.
# NOTE(review): this reads product_info[1] while the title/price cells read
# product_info[0] - confirm which container is intended.
rating_spans = product_info[1].select("div[class='a-row a-size-small'] > span")
star, rated_amount = (span.text for span in rating_spans)
star, rated_amount

('4.6 顆星，最高 5 顆星 ', '32,055  ')

### 將以上程式碼整理為一個function

In [15]:
def amazon_crawler(product, pages):
    """Scrape Amazon search results for ``product`` into a DataFrame.

    For each result page a fresh headless Chrome session (driven by
    ``./chromedriver``) is started with a new random User-Agent, the rendered
    HTML is parsed, and every result card that exposes a name, a price and a
    star/rating-count pair is recorded. Cards missing any of these are skipped.

    Parameters
    ----------
    product : str
        Search keyword. It is inserted into the URL as-is, so it should
        already be URL-safe.
    pages : int
        Number of result pages to crawl (pages 1..``pages``). Values <= 0
        produce an empty DataFrame without starting a browser.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name``, ``Price``, ``Star``, ``Rated``; all values are the
        raw text scraped from the page.
    """
    search_url = "https://www.amazon.com/s"
    url = "{}?k={}&ref=nb_sb_noss".format(search_url, product)
    names = []
    prices = []
    stars = []
    rated_amounts = []

    # Crawl exactly `pages` pages: 1, 2, ..., pages
    for p in range(1, pages + 1):

        # Per-page browser settings (headless + random User-Agent).
        # The UA has to go through Chrome's --user-agent=<value> switch.
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--user-agent={}".format(UserAgent().random))

        # NOTE(review): `executable_path` is deprecated in Selenium 4 - confirm
        # the Selenium version this notebook is pinned to.
        driver = webdriver.Chrome(executable_path="./chromedriver", options=options)
        try:
            driver.get(url)
            time.sleep(4)  # wait for the page to render
            soup = BeautifulSoup(driver.page_source, features="html.parser")
        finally:
            # Always release the browser; a new one is started for every page.
            driver.quit()

        # Extract the fields from every result card on this page
        for card in soup.select("div[class='a-section a-spacing-none']"):
            record = _parse_product_card(card)
            if record is None:
                continue
            name, price, star, rated_amount = record
            names.append(name)
            prices.append(price)
            stars.append(star)
            rated_amounts.append(rated_amount)

        if p == pages:
            # Last requested page: no next URL to build, no need to wait.
            break

        # Pagination: the pager links themselves change unpredictably, but all
        # links on a page share the same qid, so reuse it to build the next URL.
        next_page = p + 1
        qid = _find_qid(soup)
        # Without a qid the parameter is simply omitted - presumably the search
        # still paginates on `page` alone; verify against the live site.
        qid_param = f"&qid={qid}" if qid else ""
        url = f"{search_url}?k={product}&page={next_page}{qid_param}&ref=sr_pg_{next_page}"

        # Random pause between pages
        time.sleep(randint(5, 15))

    # Crawl finished: assemble the collected columns
    df = pd.DataFrame({"Name": names, "Price": prices, "Star": stars, "Rated": rated_amounts})

    return df


def _parse_product_card(card):
    """Return ``(name, price, star, rated_amount)`` texts for one result card, or None if any is missing."""
    try:
        name = card.select("span[class='a-size-base-plus a-color-base a-text-normal']")[0].text
        price = card.select(".a-offscreen")[0].text
        # Exactly two spans (star text, rating count) are expected here;
        # any other count raises ValueError on unpacking.
        star, rated_amount = [i.text for i in card.select("div[class='a-row a-size-small'] > span")]
    except (IndexError, ValueError):
        return None
    return name, price, star, rated_amount


def _find_qid(soup):
    """Return the first 10-digit ``qid`` value found in any link's href, or None."""
    for anchor in soup.select("a"):
        match = re.search(r"qid=(\d{10})", anchor.get("href") or "")
        if match:
            return match.group(1)
    return None

In [16]:
# Crawl the "led" search results; the second argument is the page count passed to amazon_crawler
df_output = amazon_crawler("led", 5)

### 爬取結果

In [14]:
# Show the crawl result.
# NOTE(review): the saved output has "Unnamed: 0" and "Pirce" columns, which
# amazon_crawler does not produce - presumably this frame came from a saved
# CSV in an earlier session; confirm and make the load step explicit.
df_output

Unnamed: 0,Name,Pirce,Star,Rated
0,40ft Led Strip Lights Keepsmile RGB Color Chan...,$13.99,4.6 out of 5 stars,3770
1,"Lepro 50ft LED Strip Lights, Ultra-Long RGB 50...",$21.99,4.6 out of 5 stars,14348
2,82Ft/25m LED Strip Lights，Micomlan Music Sync ...,$27.99,4.6 out of 5 stars,4123
3,"LED Strip Lights, Hiromeco 82Ft Led Light Stri...",$19.99,4.6 out of 5 stars,1085
4,"Tenmiro 65.6ft Led Strip Lights, Ultra Long RG...",$28.99,4.6 out of 5 stars,47743
...,...,...,...,...
213,LEDMO 120W LED Wall Pack Light with Dusk-to-Da...,$99.99,4.6 out of 5 stars,616
214,LED Desk Lamp Eye-Caring Table Light with USB ...,$39.99,4.4 out of 5 stars,37
215,LEDMO LED Parking Lot Lights 19500LM - [500W H...,$429.99,4.6 out of 5 stars,124
216,"Smart Table Lamp, LED Warm & White Bedside Lam...",$39.99,4.5 out of 5 stars,16


### Data cleansing 

In [19]:
# Fix the misspelled "Pirce" column header. Reassigning instead of using
# inplace=True keeps the step chainable; rename ignores a missing key by
# default, so the cell stays safe to re-run.
df_output = df_output.rename(columns={"Pirce": "Price"})

In [25]:
# Price -> float.
# Drop everything except digits and the decimal point, so "$13.99",
# "US$19.99" and "$1,299.00" all parse. Casting to str first keeps the cell
# re-runnable after the column has already been converted.
df_output["Price"] = (
    df_output["Price"]
    .astype(str)
    .str.replace(r"[^\d.]", "", regex=True)
    .astype("float64")
)

In [28]:
# Star -> float.
# Take the first number in the text (e.g. "4.6 out of 5 stars" -> 4.6) by
# pattern rather than by fixed character positions. Casting to str first
# keeps the cell re-runnable after the column has already been converted.
df_output["Star"] = (
    df_output["Star"]
    .astype(str)
    .str.extract(r"(\d+(?:\.\d+)?)", expand=False)
    .astype("float64")
)

In [34]:
# Rated -> int.
# Strip thousands separators and whitespace (e.g. "32,055  " -> 32055).
# Casting to str first keeps the cell re-runnable after the column has
# already been converted.
df_output["Rated"] = (
    df_output["Rated"]
    .astype(str)
    .str.replace(r"[,\s]", "", regex=True)
    .astype("int")
)

In [40]:
# Preview the first rows of the cleaned frame
df_output.head()

Unnamed: 0,Name,Price,Star,Rated
0,"Lepro 50ft LED Strip Lights, Ultra-Long RGB 50...",21.99,4.6,14357
1,65.6ft Led Strip Lights Jadisi Music Sync RGB ...,21.99,4.6,3872
2,"Lepro 65.6ft LED Strip Lights, Ultra-Long RGB ...",26.99,4.6,8645
3,"50 FT Long LED Strip Lights, GUPUP LED Lights ...",18.99,4.6,1399
4,"Tenmiro 65.6ft Led Strip Lights, Ultra Long RG...",28.99,4.6,47861
