## 读取XML文件

In [4]:
from lxml import etree

# Read the raw bytes inside a context manager so the file handle is closed
# as soon as the content has been read, instead of being left open.
with open('downloads/breakfast.xml', 'rb') as xml_file:
    xml = xml_file.read()

# etree.XML parses the byte string; the cell's last expression displays the result.
tree = etree.XML(xml)
tree

<Element breakfast_menu at 0x1d526f22a80>

In [8]:
# Inspect the Python type of the object returned by etree.XML.
type(tree)

lxml.etree._Element

In [10]:
# Walk everything yielded by tree.iter() and print each node's tag next to
# its text. Container elements show only the whitespace before their first child.
for node in tree.iter():
    print(f'{node.tag} - {node.text}')

breakfast_menu - 
  
food - 
    
name - Belgian Waffles
price - $5.95
description - Two of our famous Belgian Waffles with plenty of real maple syrup
calories - 650
food - 
    
name - Strawberry Belgian Waffles
price - $7.95
description - Light Belgian waffles covered with strawberries and whipped cream
calories - 900
food - 
    
name - Berry-Berry Belgian Waffles
price - $8.95
description - Light Belgian waffles covered with an assortment of fresh berries and whipped cream
calories - 900
food - 
    
name - French Toast
price - $4.50
description - Thick slices made from our homemade sourdough bread
calories - 600
food - 
    
name - Homestyle Breakfast
price - $6.95
description - Two eggs, bacon or sausage, toast, and our ever-popular hash browns
calories - 950


### XPath表达式

In [14]:
# Print the name and calorie count of every <food> entry below 800 calories.
for food in tree.xpath('//food'):
    # Look the calories text up once and reuse it for both the numeric
    # comparison and the printed line.
    calories = food.xpath('./calories/text()')[0]
    if int(calories) < 800:
        name = food.xpath('./name/text()')[0]
        print(f"{name} - {calories}")

Belgian Waffles - 650
French Toast - 600


## 读取HTML文件

In [16]:
from lxml import html
from urllib.request import urlopen

url = 'http://httpbin.org/forms/post'

# Open the HTTP response in a context manager so the connection is closed
# once the document has been parsed.
with urlopen(url) as response:
    tree = html.parse(response)
type(tree)

lxml.etree._ElementTree

In [18]:
# html.parse produced an ElementTree wrapper; getroot() yields the root
# element that the element-level queries below are run against.
root = tree.getroot()
type(root)

lxml.html.HtmlElement

### 查找单个元素

In [22]:
# find() returns a single match for the path; text_content() then gives the
# text of that <p> including the text of its descendants.
tagP = root.find('.//p')
tagP.text_content()

'Customer name: '

In [29]:
# findtext() returns the text of the first element matching the path.
# NOTE(review): the trailing '/' looks significant - lxml's ElementPath appears
# to treat './/p/' as './/p/*' (a child of <p>), which would explain why the
# label text is printed. Confirm before "cleaning up" the path.
tagP1 = root.findtext('.//p/')
print(tagP1)

Customer name: 


### 查找所有元素

In [30]:
# findall() collects every <p> under the root; print the text of each one.
for paragraph in root.findall('.//p'):
    print(paragraph.text_content())

Customer name: 
Telephone: 
E-mail address: 
  Small 
  Medium 
  Large 
  Bacon 
  Extra Cheese 
  Onion 
  Mushroom 
Preferred delivery time: 
Delivery instructions: 
Submit order


### XPath表达式

In [37]:
# `value` attribute of every <input> nested as p > label > input.
print(root.xpath('//p/label/input/@value'))
# Text nodes that sit directly inside each <legend>.
print(root.xpath('//legend/text()'))
# Stripped text of every <p> that is a direct child of a <form>.
print([
    paragraph.text_content().strip()
    for paragraph in root.xpath('//form/p')
])

['small', 'medium', 'large', 'bacon', 'cheese', 'onion', 'mushroom']
[' Pizza Size ', ' Pizza Toppings ']
['Customer name:', 'Telephone:', 'E-mail address:', 'Preferred delivery time:', 'Delivery instructions:', 'Submit order']


### CSS选择器

In [45]:
# Same three queries as the XPath cell, expressed as CSS selectors.
# `value` attribute of inputs (that have one) nested under p > label.
print([
    field.get('value')
    for field in root.cssselect('p label input[value]')
])
# Text of every <legend>.
print([
    legend.text_content()
    for legend in root.cssselect('legend')
])
# Stripped text of each <p> that is a direct child of a <form>.
print([
    paragraph.text_content().strip()
    for paragraph in root.cssselect('form > p')
])

['small', 'medium', 'large', 'bacon', 'cheese', 'onion', 'mushroom']
[' Pizza Size ', ' Pizza Toppings ']
['Customer name:', 'Telephone:', 'E-mail address:', 'Preferred delivery time:', 'Delivery instructions:', 'Submit order']


## 使用lxml进行网络爬虫

In [46]:
import lxml.html as web
from lxml.etree import XPath
import math
import csv

# Site root; relative links and image paths are rebuilt against it.
baseUrl = "http://books.toscrape.com/"
# Category landing page (not referenced by the other visible cells).
bookUrl = baseUrl + "catalogue/category/books/childrens_11/index.html"
# Prefix for the paginated listing; the page number and '.html' are appended.
pageUrl = baseUrl + "catalogue/category/books/childrens_11/page-"  # page-1, page-2
columns = ['title', 'price', 'stock', 'imageUrl', 'rating', 'url']  # field names for the CSV file

# Mutable scraping state shared with the page-processing cell.
dataSet = []          # one list per book, filled by the scraping loop
page = 1              # page number to fetch next
totalPagesCount = 1   # placeholder; recomputed from the first page

### 处理每一页

In [51]:
# Compile the XPath expressions once, before the loop: none of them depends
# on the page being processed, so rebuilding them for every page is wasted work.
# NOTE: XPath positions start at 1, so `position() > 0` matches every <li>.
articles = XPath("//ol[contains(@class, 'row')]/li[position() > 0]")
titlePath = XPath(".//article[contains(@class, 'product_pod')]/h3/a/@title")
linkPath = XPath(".//article[contains(@class, 'product_pod')]/h3/a/@href")
pricePath = XPath(".//article/div[2]/p[contains(@class, 'price_color')]/text()")
# The [normalize-space()] predicate skips whitespace-only text nodes.
stockPath = XPath(
    ".//article/div[2]/p[contains(@class, 'availability')]/text()[normalize-space()]"
)
imagePath = XPath(".//article/div[1][contains(@class,'image_container')]/a/img/@src")
ratingPath = XPath(".//article/p[contains(@class, 'star-rating')]/@class")

# Fetch listing pages one by one until the page count derived from the first
# page is exhausted, appending one row per book to dataSet.
while page <= totalPagesCount:
    source = web.parse(f'{pageUrl}{page}.html').getroot()
    if page == 1:
        # The first page determines how many pages to fetch: the total is
        # divided by the per-page count, both read from the results form.
        # Presumably strong[1] is the total and strong[3] the last index
        # shown on the page - verify against the site's markup.
        perpageArticlesCount = source.xpath(
            "//form[@class='form-horizontal']/strong[3]/text()"
        )
        totalArticlesCount = source.xpath(
            "//form[@class='form-horizontal']/strong[1]/text()"
        )
        totalPagesCount = math.ceil(
            int(totalArticlesCount[0]) / int(perpageArticlesCount[0])
        )
        print(f"TotalPages found: {totalPagesCount}")
    print(f'Processing page {page} from {totalPagesCount}...')

    for row in articles(source):
        title = titlePath(row)[0]
        # Links and image paths are relative on the listing page; rewrite
        # their '../' prefixes into absolute URLs.
        link = linkPath(row)[0].replace('../../../', baseUrl + 'catalogue/')
        price = pricePath(row)[0]
        availability = stockPath(row)[0].strip()
        image = imagePath(row)[0].replace('../../../../', baseUrl)
        # The class attribute holds 'star-rating' plus the rating word;
        # keep only the rating word.
        rating = ratingPath(row)[0].replace('star-rating', '').strip()

        # Skip entries with an empty title. Values are stored in the order
        # title, price, availability, image, rating, link.
        if title:
            dataSet.append([title, price, availability, image, rating, link])

    print(f'Rows in Dataset: {len(dataSet)}')

    page += 1

In [53]:
# Display the rows collected by the scraping loop (this shows the whole list).
dataSet

[['Birdsong: A Story in Pictures',
  '£54.64',
  'In stock',
  'http://books.toscrape.com/media/cache/af/6e/af6e796160fe63e0cf19d44395c7ddf2.jpg',
  'Three',
  'http://books.toscrape.com/catalogue/birdsong-a-story-in-pictures_975/index.html'],
 ['The Bear and the Piano',
  '£36.89',
  'In stock',
  'http://books.toscrape.com/media/cache/cf/bb/cfbb5e62715c6d888fd07794c9bab5d6.jpg',
  'One',
  'http://books.toscrape.com/catalogue/the-bear-and-the-piano_967/index.html'],
 ['The Secret of Dreadwillow Carse',
  '£56.13',
  'In stock',
  'http://books.toscrape.com/media/cache/c4/a2/c4a2a1a026c67bcceb5a411c724d7d0c.jpg',
  'One',
  'http://books.toscrape.com/catalogue/the-secret-of-dreadwillow-carse_944/index.html'],
 ['The White Cat and the Monk: A Retelling of the Poem “Pangur Bán”',
  '£58.08',
  'In stock',
  'http://books.toscrape.com/media/cache/26/32/2632a1e12f2c085fabbe022ae4cd6933.jpg',
  'Four',
  'http://books.toscrape.com/catalogue/the-white-cat-and-the-monk-a-retelling-of-the-poe

In [54]:
def write_to_csv(data, filename, columns):
    """Write a header line followed by the given rows to a CSV file.

    Args:
        data: Iterable of rows; each row is a sequence of cell values given
            in the same order as ``columns``.
        filename: Path of the file to create or overwrite.
        columns: Sequence of column names written as the first line.
    """
    # The file is only written, so plain 'w' is enough. newline='' lets the
    # csv module control line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as fp:
        # A single plain writer handles both the header and the data rows.
        writer = csv.writer(fp)
        writer.writerow(columns)
        writer.writerows(data)
            
# Save the scraped rows to disk; `columns` supplies the header line.
write_to_csv(dataSet, 'downloads/books.csv', columns)