/
spdemo.py
61 lines (43 loc) · 2.27 KB
/
spdemo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# -*- coding: utf-8 -*-
import scrapy
import spiderone
from selenium import webdriver
class SpdemoSpider(scrapy.Spider):
    """Spider that walks a Shopee shop's paginated search-result pages.

    Each response is turned into a ``spDemoItem`` holding the seller name
    (``name``) and the product links found on the page (``plist``).  After
    the start URL has been handled, follow-up result pages are requested one
    at a time until a page yields no product links.

    The spider owns a Selenium Chrome instance exposed as ``self.browser``.
    NOTE(review): presumably ``spiderone.middlewares.SeleniumMiddleware``
    uses that browser to render pages -- confirm against the middleware.
    """

    name = 'spdemo'
    allowed_domains = ['shopee.com.my']
    start_urls = ['https://shopee.com.my/shop/178840654/search?page=0&sortBy=relevancy']
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {'spiderone.middlewares.SeleniumMiddleware': 543},
    }
    # Index of the most recently requested follow-up result page.
    cur_page = 0

    def __init__(self, *args, **kwargs):
        """Initialise the spider and start the Chrome browser.

        Args:
            *args: Positional arguments forwarded to ``scrapy.Spider``.
            **kwargs: Keyword arguments (e.g. ``-a key=value`` spider
                arguments) forwarded to ``scrapy.Spider``.
        """
        # Initialise the base class first so that a failure there cannot
        # leave an orphaned browser process behind.
        super(SpdemoSpider, self).__init__(*args, **kwargs)
        chrome_options = webdriver.ChromeOptions()
        # chrome_options.add_argument('--headless')    # run without a visible window
        # chrome_options.add_argument('--no-sandbox')  # disable the Chrome sandbox
        # NOTE(review): ``chrome_options=`` and ``executable_path=`` are
        # deprecated in newer Selenium releases; kept unchanged so the
        # installed version keeps working -- revisit when upgrading.
        self.browser = webdriver.Chrome(chrome_options=chrome_options, executable_path='E:\\work_data\\soft src\\chromedriver_win32_76.0.3809.68\\chromedriver.exe')

    def closed(self, reason):
        """Shut the browser down when the spider closes.

        Args:
            reason: Close reason supplied by Scrapy (unused).
        """
        # quit() ends the whole WebDriver session, including the chromedriver
        # process; close() would only close the current window.
        self.browser.quit()

    def _load_shop_item(self, response):
        """Build a ``spDemoItem`` from *response* and print debug output.

        Args:
            response: The downloaded shop search-result page.

        Returns:
            The loaded item; ``name`` / ``plist`` are absent when their CSS
            selectors match nothing.
        """
        # Import the submodules explicitly: ``import scrapy`` / ``import
        # spiderone`` alone do not guarantee that ``.loader`` / ``.items``
        # are available as attributes.
        from scrapy.loader import ItemLoader
        from spiderone.items import spDemoItem

        loader = ItemLoader(item=spDemoItem(), response=response)
        loader.add_css('name', '.shopee-seller-portrait__name::text')
        loader.add_css('plist', '.shop-search-result-view__item a::attr(href)')
        info = loader.load_item()
        print("=============================================" + str(self.cur_page))
        # .get() avoids a KeyError when the seller name was not found.
        print(info.get('name'))
        return info

    def _next_page_request(self):
        """Advance ``cur_page`` and return the request for that result page."""
        self.cur_page = self.cur_page + 1
        # NOTE(review): this shop id (71665063) and sort order differ from
        # the ones in ``start_urls`` (178840654, relevancy) -- confirm that
        # this is intended.
        return scrapy.Request(
            url='https://shopee.com.my/shop/71665063/search?page=' + str(self.cur_page) + '&sortBy=sales',
            callback=self.parse_detail,
            meta={'data': self.cur_page},
        )

    def parse(self, response):
        """Handle the start URL: yield its item, then request the next page.

        Args:
            response: Response for one of ``start_urls``.

        Yields:
            The scraped item, followed by the request for the next page.
        """
        yield self._load_shop_item(response)
        print("***************************")
        yield self._next_page_request()

    def parse_detail(self, response):
        """Handle a follow-up result page and continue the pagination.

        Args:
            response: Response for a paginated search-result page.

        Yields:
            The scraped item and the request for the next page; nothing when
            the page contains no product links (end of pagination).
        """
        info = self._load_shop_item(response)
        if 'plist' not in info:
            # No product links on this page: stop paginating.
            return
        yield info
        yield self._next_page_request()