<img style="float:center;" src="./img/aqi_review.png">

<br/><br/><br/><br/>
# 对空气质量历史数据的爬取

&emsp;&emsp;<font size=4>**1、创建工程，在命令行终端创建一个名为air_quality的工程,并进入该工程目录**</font>  
<br/>
&emsp;&emsp;&emsp;&emsp;<font size=4 color='red'>*c:>scrapy startproject air_quality*</font>
<br/>
&emsp;&emsp;&emsp;&emsp;<font size=4 color='red'>*c:>cd air_quality*</font>  
<br/>
&emsp;&emsp;<font size=4>**2、编写spider**</font>  
<br/>
&emsp;&emsp;&emsp;&emsp;<font size=4 color='red'>*c:\air_quality>scrapy genspider api_history_spider https://www.aqistudy.cn/historydata/index.php*</font>

<font size=4>**文件目录如图所示**</font>  
<img style="float:left;" src="./img/aqi_filefolder.png">

In [None]:
# settings.py

# Register the project's item pipeline so every scraped item is passed to it.
# The integer is the pipeline's order value used by Scrapy when several
# pipelines are enabled (only one is enabled here).
ITEM_PIPELINES = {'air_quality.pipelines.AirQualityPipeline': 300}

In [None]:
# items.py

import scrapy

class AirQualityItem(scrapy.Item):
    """One daily air-quality record for a city.

    Each field is filled by the spider from a single row of a city's
    daily-data table; values are stored as extracted (text or None).
    """
    # One scrapy.Field() per column of the scraped table, plus the city name.
    city_name = scrapy.Field()      # city name
    record_date = scrapy.Field()    # measurement date
    aqi_val = scrapy.Field()        # AQI
    range_val = scrapy.Field()      # range
    quality_level = scrapy.Field()  # air-quality level
    pm2_5_val = scrapy.Field()      # PM2.5
    pm10_val = scrapy.Field()       # PM10
    so2_val = scrapy.Field()        # SO2
    co_val = scrapy.Field()         # CO
    no2_val = scrapy.Field()        # NO2
    o3_val = scrapy.Field()         # O3
    rank = scrapy.Field()           # rank

In [None]:
# pipelines.py

from scrapy.exporters import CsvItemExporter

class AirQualityPipeline:
    """Item pipeline that exports every scraped item to ``air_quality.csv``.

    The output file and its ``CsvItemExporter`` are created when the spider
    opens and released when it closes; each item passing through
    ``process_item`` is appended to the CSV and then handed on unchanged.
    """

    def open_spider(self, spider):
        """Open the CSV file and start the exporter.

        Args:
            spider: The spider being opened (unused).

        Raises:
            Exception: Whatever the exporter setup raises; the file handle
                is closed before the error propagates.
        """
        # Opened in binary mode, which is what the exporter is given to write to.
        self.file = open('air_quality.csv', 'wb')
        try:
            self.exporter = CsvItemExporter(self.file)
            self.exporter.start_exporting()
        except Exception:
            # Do not leak the handle when the exporter cannot be set up.
            self.file.close()
            raise

    def close_spider(self, spider):
        """Finish the export and close the CSV file.

        Args:
            spider: The spider being closed (unused).
        """
        try:
            self.exporter.finish_exporting()
        finally:
            # Always release the file, even if finishing the export fails.
            self.file.close()

    def process_item(self, item, spider):
        """Write one item to the CSV file.

        Args:
            item: The scraped item to export.
            spider: The spider that produced the item (unused).

        Returns:
            The same item, so any later pipeline still receives it.
        """
        self.exporter.export_item(item)
        return item

In [None]:
# api_history_spider.py

# -*- coding: utf-8 -*-
import scrapy
from urllib import parse
from air_quality.items import AirQualityItem

# Base against which the relative links found on the site are resolved.
base_url = 'https://www.aqistudy.cn/historydata/'


class ApiHistorySpiderSpider(scrapy.Spider):
    """Crawl historical daily air-quality records from aqistudy.cn.

    The crawl walks three page levels: the index page listing cities
    (``parse``), each city's page listing months (``parse_city_month``) and
    each month's page holding the daily-record table (``parse_city_day``),
    which yields one ``AirQualityItem`` per table row.
    """

    name = 'api_history_spider'
    allowed_domains = ["aqistudy.cn"]
    start_urls = ['https://www.aqistudy.cn/historydata/']

    # XPath of the data table used on both the month-list and daily pages.
    _TABLE_XPATH = ('//table[@class="table table-condensed '
                    'table-bordered table-striped table-hover '
                    'table-responsive"]')

    # (item field, XPath relative to the <td>) in table-column order.
    _COLUMNS = (
        ('record_date', 'text()'),              # measurement date
        ('aqi_val', 'text()'),                  # AQI
        ('range_val', 'text()'),                # range
        ('quality_level', './/div/text()'),     # air-quality level
        ('pm2_5_val', 'text()'),                # PM2.5
        ('pm10_val', 'text()'),                 # PM10
        ('so2_val', 'text()'),                  # SO2
        ('co_val', 'text()'),                   # CO
        ('no2_val', 'text()'),                  # NO2
        ('o3_val', 'text()'),                   # O3
        ('rank', 'text()'),                     # rank
    )

    @staticmethod
    def _city_name_from_url(url):
        """Return the decoded value of the first query parameter of *url*.

        The value is the text between the first ``=`` and the following
        ``&`` (or the end of the URL when there is no ``&``), percent-decoded
        so that Chinese city names are readable.
        """
        raw_value = url.partition('=')[2].partition('&')[0]
        return parse.unquote(raw_value)

    def parse(self, response):
        """Parse the index page and request every city's month-list page.

        Args:
            response: Response for the start URL.

        Yields:
            scrapy.Request: One request per city link, handled by
            ``parse_city_month``.
        """
        city_hrefs = response.xpath(
            '//div[@class="all"]//div[@class="bottom"]//a//@href').extract()

        for href in city_hrefs:
            # urljoin (not string concatenation) so absolute or root-relative
            # hrefs still resolve to a valid URL.
            yield scrapy.Request(parse.urljoin(base_url, href),
                                 callback=self.parse_city_month)

    def parse_city_month(self, response):
        """Parse a city's page and request each month's daily-data page.

        Args:
            response: Response for one city's month-list page.

        Yields:
            scrapy.Request: One request per month link, handled by
            ``parse_city_day``.
        """
        month_hrefs = response.xpath(self._TABLE_XPATH + '//a//@href').extract()

        for href in month_hrefs:
            yield scrapy.Request(parse.urljoin(base_url, href),
                                 callback=self.parse_city_day)

    def parse_city_day(self, response):
        """Parse one month's table of daily records for a city.

        Args:
            response: Response for one city/month daily-data page.

        Yields:
            AirQualityItem: One new item per complete data row, carrying the
            city name taken from the page URL plus the table columns.
        """
        city_name = self._city_name_from_url(response.url)
        rows = response.xpath(self._TABLE_XPATH + '//tr')

        # The first row is the table header.
        for row in rows[1:]:
            cells = row.xpath('.//td')
            if len(cells) < len(self._COLUMNS):
                # Incomplete row: skip it instead of failing with IndexError
                # and losing the remaining rows of the page.
                self.logger.warning('Skipping row with %d cells on %s',
                                    len(cells), response.url)
                continue

            # A fresh item per row: reusing one mutable item would make every
            # yielded record alias the same object.
            item = AirQualityItem()
            item['city_name'] = city_name
            for cell, (field, cell_xpath) in zip(cells, self._COLUMNS):
                item[field] = cell.xpath(cell_xpath).extract_first()

            yield item

&emsp;&emsp;<font size=4>**3、运行spider**</font>  
<br/>
&emsp;&emsp;&emsp;&emsp;<font size=4 color='red'>*scrapy crawl api_history_spider*</font>

<br/><br/><br/>

<font size=4>**得到csv文件，部分如下图所示：**</font>  
<img style="float:left;" src="./img/aqi_result.png">