In [1]:
pip install selenium==4.24.0

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import Proxy, ProxyType
import time
import pandas as pd
import os
import random

  from pandas.core import (


### Parameter Setting

In [3]:
# Query source and output result path
student_id = "M11307510"
query_path = f"./queries/{student_id}_queries.txt"
results_path = "./results"

# Web scraping target URL
search_url = "https://www.tw.coupang.com/search?q="

# Scraping parameter settings
short_time_sleep = 1
medium_time_sleep = 3
long_time_sleep = 5

### Helpful Funtions

In [4]:
# Read queries from file
def read_queries(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
        lines = [line.strip() for line in lines]
    return lines

# Check if the webpage is accessible
def check_access(driver):
    # Check if the page is accessible
    denied = driver.find_elements(By.XPATH, '/html/body/h1')
    if denied:
        if denied[0].text == 'Access Denied':
            return False
        else: 
            return True
    # Check if the page has results
    no_result = driver.find_elements(By.XPATH, '/html/body/div[1]/div/main/div/div/div[3]/div[3]')
    if no_result:
        if no_result[0].text == '無相關搜索結果':
            return False
        else: 
            return True
    return True

# Scroll the webpage to the bottom
def scroll_to_bottom(driver, pause_time=3):
    # Get the current scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    
    # Scroll to the bottom of the page
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    
    # Pause to allow any new content to load
    time.sleep(pause_time)
    
    # Get the new scroll height after scrolling
    new_height = driver.execute_script("return document.body.scrollHeight")
    
    # If the new height is greater than the last height, new content has loaded
    if not (new_height > last_height):
        driver.execute_script("window.scrollBy(0, -100);")
        return False
    else :
        return True

# Get all items from pages   
def get_all_items(driver):
    try:
        items = driver.find_elements(By.XPATH, '/html/body/div[1]/div[2]/main/div/div/div[3]/div/div/a')
        return items
    except:
        print("No items found.")

# Extract all item informations to a dataframe   
def extract_item_info(items):
    print("Extracting item information...")
    data = []
    for i, item in enumerate(items):
        try:
            item_name = item.find_element(By.XPATH, 'div[2]').text
            item_url = item.get_attribute('href')
            # price is in one of two possible XPaths
            try:
                item_price = item.find_element(By.XPATH, 'div[3]/div[2]/span/span').text # on sale.
            except:
                try:
                    item_price = item.find_element(By.XPATH, 'div[3]/div[1]/span/span').text # not on sale.
                except:
                    item_price = "null"
            
            data.append({
                'product_name': item_name,
                'product_price': item_price,
                'product_url': item_url
            })
        except:
            print(f"Error extracting item {i}.")

    df = pd.DataFrame(data)
    return df

def start_browser_with_proxy(proxy_ip):
    chrome_options = webdriver.ChromeOptions()
    
    # 使用 ChromeOptions 添加代理設置
    chrome_options.add_argument(f'--proxy-server={proxy_ip}')
    
    # 啟動 Chrome 瀏覽器並應用代理
    driver = webdriver.Chrome(options=chrome_options)
    
    return driver

### Set up Chrome options

In [5]:
import requests
import re
 
 
response = requests.get("https://www.sslproxies.org/")
proxy_ips = re.findall('\d+\.\d+\.\d+\.\d+:\d+', response.text)  #「\d+」代表數字一個位數以上
print(proxy_ips)

['178.128.113.118:23128', '43.153.207.93:3128', '154.94.5.241:7001', '72.10.160.92:5635', '160.86.242.23:8080', '211.104.20.205:8080', '72.10.160.170:2657', '51.83.62.245:8080', '4.159.28.85:8080', '72.10.160.173:29439', '35.220.254.137:8080', '67.43.227.230:4961', '104.248.98.31:3128', '4.158.237.61:8080', '43.153.237.252:3128', '58.240.211.250:7890', '67.43.228.250:26991', '51.178.149.106:8080', '202.188.211.11:800', '45.119.133.218:3128', '4.159.29.241:8080', '8.148.23.202:4006', '67.43.227.229:16401', '8.219.97.248:80', '67.43.236.21:8307', '72.10.160.174:13093', '72.10.160.90:1365', '13.87.97.69:8080', '114.129.2.82:8081', '87.247.186.40:1081', '103.166.8.228:1080', '171.244.60.55:8080', '5.202.149.241:8080', '4.158.2.131:8080', '43.134.1.40:3128', '67.43.236.18:1853', '36.72.245.209:8080', '47.243.92.199:3128', '47.89.184.18:3128', '8.213.151.128:3128', '20.44.188.17:3129', '148.64.110.245:3129', '20.44.189.184:3129', '20.204.214.23:3129', '20.204.214.79:3129', '122.152.4.135:600

### Start web scraping.
#### (During scraping, you may open other windows, but do not close or minimize the Chrome window that is performing the scraping.)
#### (Make sure the screen remains on while the scraper is running)

In [6]:
queries = read_queries(query_path)

for query in queries:
    # 確保每次使用新的代理 IP
    ip = random.choice(proxy_ips)
    print(f"Use IP: [{ip}]")
    
    driver = start_browser_with_proxy(ip)
    time.sleep(short_time_sleep)

    # 檢查結果是否已存在
    csv_files = [f for f in os.listdir(results_path) if f.endswith('.csv')]
    search_string = query
    all_contain_string = any(search_string in file_name for file_name in csv_files)
    if all_contain_string:
        print(f"Results for {query} have already been scraped. Skipping...\n")
        driver.quit()
        continue

    # 搜索查詢
    try:
        driver.get(search_url + query)
        time.sleep(medium_time_sleep)
        status = check_access(driver)
        if status:
            print(f"Start scraping {query}...")
        else:
            print(f"Some error occurred while scraping {query}.")
            driver.quit()
            continue
        
        while status:
            status = check_access(driver)
            if status:  # Only proceed to scroll if check_access is True
                status = scroll_to_bottom(driver, medium_time_sleep + random.random())  # add time noise

        # 處理項目
        items = get_all_items(driver)
        items_df = extract_item_info(items)

        # 保存結果到 CSV 文件
        file_path = os.path.join(results_path, f"{student_id}_{query}.csv")
        items_df.to_csv(file_path, index=False, encoding='utf-8-sig')
        print(f"Results for {query} have been saved to {file_path}")

    finally:
        # 關閉瀏覽器
        driver.quit()
    
    # 隨機延遲，避免被封
    time.sleep(long_time_sleep + random.random() * 10)
    print("Sleeping for a while...")
    print("-" * 80)

use IP:[20.204.214.79:3129]


TypeError: start_browser_with_proxy() missing 1 required positional argument: 'proxy_ip'