## 加载URL

In [4]:
from pyquery import PyQuery as pq
import requests

response = requests.get("https://webscraper.io/test-sites/")
source = pq(response.content)
type(source)

pyquery.pyquery.PyQuery

## 元素遍历、属性和伪类

### 查找元素

In [6]:
source.find('title')

[<title>]

In [7]:
source.find('title').text()

'Test Sites | Web Scraper'

### 获取属性值

In [8]:
source.find("meta[name='description']").attr('content')

'You need to train your web scraper? We have created simple test sites that allow you to try all corner cases and proof test your scraper. Try it now.'

In [9]:
source.find("meta[name='keywords']").attr('content')

'web scraping,Web Scraper,Chrome extension,Crawling,Cross platform scraper'

### 伪类

In [10]:
source.find('a:eq(0)').text()

'Toggle navigation'

In [11]:
source.find('a.menuitm:first').text()

'Web Scraper'

In [12]:
source.find('a.menuitm:last').text()

'Pricing'

In [13]:
source.find('a.menuitm:eq(1)').attr('href')

'/cloud-scraper'

In [14]:
source.find(':input')

[<button.navbar-toggler.float-end.collapsed>, <button#dropdownMenuLink.menuitm.nav-link.dropdown-toggle>]

In [15]:
source.find(':header')

[<h1>, <h2.site-heading>, <h2.site-heading>, <h2.site-heading>, <h2.site-heading>, <h2.site-heading>, <h2.site-heading>]

In [16]:
source.find(':empty')

[<meta>, <meta>, <meta>, <meta>, <link>, <meta>, <link>, <link>, <link>, <link>, <link>, <link>, <link>, <link>, <script>, <iframe>, <span.icon-bar.top-bar>, <span.icon-bar.middle-bar>, <span.icon-bar.bottom-bar>, <span.icon-bar.extra-bottom-bar>, <img>, <div.crta>, <div.crta>, <div.crta>, <span.crta>, <hr.test-site-divider>, <img>, <hr.test-site-divider>, <img>, <hr.test-site-divider>, <img>, <hr.test-site-divider>, <img>, <hr.test-site-divider>, <img>, <hr.test-site-divider>, <img>, <div.clearfix>, <div.push>, <br>, <i.ws-icon.ws-icon-facebook-f>, <i.ws-icon.ws-icon-twitter>, <i.ws-icon.ws-icon-linkedin>, <i.ws-icon.ws-icon-youtube>, <i.ws-icon.ws-icon-chrome-dark>]

In [17]:
source.find('meta:empty')

[<meta>, <meta>, <meta>, <meta>, <meta>]

In [18]:
source.find('meta:empty:odd')

[<meta>, <meta>]

In [19]:
source.find('meta:empty:even')

[<meta>, <meta>, <meta>]

In [20]:
source.find(':header:odd')

[<h2.site-heading>, <h2.site-heading>, <h2.site-heading>]

In [21]:
source.find("a:contains('Web')")

[<a.nav-link.menuitm>, <a>, <a>, <a>]

In [42]:
source.find('a:contains("Web")').eq(-1)

[<a>]

In [43]:
source.find("a:contains('Web')").eq(-1).text()

'Website Privacy Policy'

### 迭代

In [44]:
[item.attr('href') for item in source.find('a.menuitm').items()]

['/', '/cloud-scraper', '/pricing']

In [45]:
[item.attr('href') for item in source.find("a.menuitm, a[class*='btn-menu']").items()]

['/',
 '/cloud-scraper',
 '/pricing',
 'https://chromewebstore.google.com/detail/web-scraper-free-web-scra/jnhgnonknehpejjnehehllkliplmbmhn?hl=en',
 'https://cloud.webscraper.io/']

## 使用PyQuery进行网络爬虫

### 示例一：书籍详细信息

导入库、定义 URLs、默认分页值和一个空数据集

In [46]:
from pyquery import PyQuery as pq
import requests
import math

category = 'Childrens'
siteUrl = 'http://books.toscrape.com/'
baseUrl = siteUrl + 'catalogue/category/books/childrens_11/index.html'
pageUrl = siteUrl + 'catalogue/category/books/childrens_11/page-'
dataSet = []
page = 1
totalPagesCount = 1

循环处理每一页

In [47]:
while page <= totalPagesCount:
    response = requests.get(pageUrl + str(page) + '.html')
    source = pq(response.content)

    if page == 1:
        pageValues =  [value.text() for value in source.find('form.form-horizontal strong').items()]
        if len(pageValues) > 0:
            pageValues = list(map(int, pageValues))
            totalPagesCount = math.ceil(pageValues[0] / pageValues[2])
    print(f'Page {page} from Total {totalPagesCount}...')     

    books = source.find('article.product_pod')
    for book in books.items():
        image = book.find('.image_container a img').attr('src')
        rating = book.find('p.star-rating').attr('class').split()[1]
        title = book.find('h3 a').attr('title')
        url = book.find('h3 a').attr('href')
        price = book.find('p.price_color').text()
        stock = book.find('p.availability').attr('class').split()[0]

        dataSet.append({
            'name': title,
            'price': price,
            'stock': stock,
            'rating': rating,
            'image': image.replace(
                '../../../../', 'http://books.toscrape.com/catalogue'
            ),
            'url': url.replace(
                '../../../', 'http://books.toscrape.com/catalogue/'
            )
        })

    page += 1

Page 1 from Total 2...
Page 2 from Total 2...


将收集的数据写入 JSON 文件

In [48]:
import json

with open('downloads/childrens_books.json', 'w') as fp:
    json.dump(dataSet, fp, indent=2, sort_keys=False)

### 示例二：站点地图

导入库、定义 URL 以及 CSV 文件的字段名

In [49]:
from pyquery import PyQuery as pq
import requests
import csv

url = "https://www.schools.com/sitemap.xml"
columns=['loc', 'lastmod', 'changefreq', 'priority'] 
xmlFile = requests.get(url).content
urlXML = pq(xmlFile, parser='html')
print(f'Children Length: {len(urlXML.children())}')    # Child-Length: 530

Children Length: 530


循环处理每个 `<url>` 节点

In [52]:
dataSet = []

for loop in range(len(urlXML.children())):
    child = urlXML.children().eq(loop)
    dataSet.append({
        child.find('loc').text(),
        child.find('lastmod').text(),
        child.find('changefreq').text(),
        child.find('priority').text()
    })

将收集的数据写入 CSV 文件

In [54]:
def write_to_csv(data, filename, columns):
    with open(filename, 'w+', newline='', encoding="utf-8") as fp:
        writer = csv.DictWriter(fp, fieldnames=columns)
        writer.writeheader()
        writer = csv.writer(fp)
        for element in dataSet:
            writer.writerow(element)

write_to_csv(dataSet, 'downloads/schoolXML.csv', columns)

### 示例三：带有作者详细信息的名言

变量定义

In [35]:
from pyquery import PyQuery as pq
import requests
import csv

url = 'http://quotes.toscrape.com/tag/books/page/'
columns = ['id', 'author', 'quote', 'tags', 'quote_length', 'born_date', 'born_location', 'author_url'] 
authorSet = {}
dataSet = []
page = 1
nextPage = True
uid = 0  # 编号

循环处理每一页

In [36]:
while nextPage:
    print(url + str(page))
    response = requests.get(url + str(page))
    source = pq(response.content)
    print(source.find('title').text())

    if source.find("ul.pager li.next:contains('Next')"):    # 当前是否是最后一页
        nextPage = True
    else:
        nextPage = False
    
    print(f"Processing {page}...") 
    for quote in source.find('.quote').items():
        quoteText = quote.find("[itemprop='text']").text()
        author = quote.find("[itemprop='author']").text()
        tags = quote.find("[itemprop='keywords']").attr('content').replace(',', '|')
        authorUrl = quote.find("a[href*='/author/']").attr('href')

        if authorUrl:
            # print(authorUrl)
            authorKey = author.replace('.', '_').replace(' ', '_')
        
        if authorUrl and authorKey not in authorSet.keys():
            authorUrl = 'http://quotes.toscrape.com' + authorUrl  # 完整URL
            source_author = pq(requests.get(authorUrl).content)
            bornDate = source_author.find('.author-born-date').text()
            bornLocation = source_author.find('.author-born-location').text().replace('in', '').strip()
            authorSet[authorKey] = {
                'name': author,
                'url': authorUrl,
                'date': bornDate,
                'location': bornLocation
            }
        else:
            print(f'Author ({authorKey}) details already found!')

        uid += 1

        dataSet.append([
            uid, author, quoteText, tags, len(quoteText),
            authorSet[authorKey]['date'],
            authorSet[authorKey]['location'],
            authorSet[authorKey]['url']
        ])
        
    page += 1

http://quotes.toscrape.com/tag/books/page/1
Quotes to Scrape
Processing 1...
Author (Mark_Twain) details already found!
Author (Jane_Austen) details already found!
http://quotes.toscrape.com/tag/books/page/2
Quotes to Scrape
Processing 2...


In [39]:
authorSet

{'Jane_Austen': {'name': 'Jane Austen',
  'url': 'http://quotes.toscrape.com/author/Jane-Austen',
  'date': 'December 16, 1775',
  'location': 'Steventon Rectory, Hampshire, The United Kgdom'},
 'Mark_Twain': {'name': 'Mark Twain',
  'url': 'http://quotes.toscrape.com/author/Mark-Twain',
  'date': 'November 30, 1835',
  'location': 'Florida, Missouri, The United States'},
 'Jorge_Luis_Borges': {'name': 'Jorge Luis Borges',
  'url': 'http://quotes.toscrape.com/author/Jorge-Luis-Borges',
  'date': 'August 24, 1899',
  'location': 'Buenos Aires, Argenta'},
 'C_S__Lewis': {'name': 'C.S. Lewis',
  'url': 'http://quotes.toscrape.com/author/C-S-Lewis',
  'date': 'November 29, 1898',
  'location': 'Belfast, Ireland'},
 'Haruki_Murakami': {'name': 'Haruki Murakami',
  'url': 'http://quotes.toscrape.com/author/Haruki-Murakami',
  'date': 'January 12, 1949',
  'location': 'Kyoto, Japan'},
 'Ernest_Hemingway': {'name': 'Ernest Hemingway',
  'url': 'http://quotes.toscrape.com/author/Ernest-Hemingwa

将收集的数据写入 JSON 文件

In [38]:
import json

with open('downloads/quotes_author.json', 'w') as fp:
    json.dump(authorSet, fp, indent=2, sort_keys=False)