In [4]:
#  !pip install bs4
import requests
from bs4 import BeautifulSoup

def build_url(city_coding, year=None, month=None):
    """
    Build the URL of a city's AQI page on tianqihoubao.com.

    Parameters:
        city_coding: city name in Latin letters as it appears in the
            site's URLs (e.g. 'hangzhou')
        year: optional year; only used when month is also given
        month: optional month number, as an int or a numeric string;
            it is zero-padded to two digits

    Returns:
        url: the monthly page ('<city>-<year><mm>.html') when both year
            and month are given, otherwise the city page ('<city>.html')
    """
    base = 'http://www.tianqihoubao.com/aqi/'

    # The monthly URL needs both parts; if either is missing fall back
    # to the plain city page.
    if year is None or month is None:
        return '{}{}.html'.format(base, city_coding)

    # int() lets callers pass '3' or '03' as well as 3; the format spec
    # does the zero-padding.
    return '{}{}-{}{:02d}.html'.format(base, city_coding, year, int(month))


def parse(url, city_name, timeout=30):
    """
    Download a page and extract the rows of its first table.

    Parameters:
        url: address of the page to fetch
        city_name: label prepended to every row after the first
            (identifies which city the data belongs to)
        timeout: seconds to wait for the server; requests raises its
            own timeout exception when this is exceeded

    Returns:
        result: list of tuples, one per table row. Each tuple is a label
            followed by the whitespace-separated text of the row. The
            first row (the column headers on the pages scraped here) is
            labelled '城市', every other row is labelled city_name.
            None is returned when the response is not OK or the page
            contains no table.
    """
    # Without a timeout a stalled connection would block forever.
    response = requests.get(url, timeout=timeout)

    if not response.ok:
        if response.status_code == 403:
            print('403 Forbidden! 抓取太快你被拉黑啦~')
        else:
            # Report other failures too instead of returning silently.
            print('请求失败, 状态码 {}: {}'.format(response.status_code, url))
        return None

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')
    data_table = soup.table
    if data_table is None:
        print('页面中没有找到数据表格: {}'.format(url))
        return None

    result = []
    # Select the rows by tag rather than by position among the table's
    # children, which depends on the whitespace between the tags.
    for index, row in enumerate(data_table.find_all('tr')):
        label = '城市' if index == 0 else city_name
        result.append(tuple([label] + row.text.split()))
    return result
            

In [10]:
all_data = []  # every scraped page is appended to this list

# Scrape Hangzhou, January through December 2019.
# These values are the same for every month, so set them once.
city_code = 'hangzhou'
city_name = '杭州'
year = 2019

for month in range(1, 13):
    # Build the monthly URL, then fetch and parse it.
    url_bj = build_url(city_code, year, month)
    result = parse(url_bj, city_name)

    # parse() gives back None when the request fails; keep such
    # placeholders out of all_data so it only holds row lists.
    if result is None:
        print('{}年{}月 数据抓取失败, 已跳过'.format(year, month))
        continue

    all_data.append(result)

# print data
print(all_data)

[[('城市', '日期', '质量等级', 'AQI指数', '当天AQI排名', 'PM2.5', 'PM10', 'So2', 'No2', 'Co', 'O3'), ('杭州', '2019-01-01', '良', '73', '205', '53', '72', '8', '39', '0.90', '20'), ('杭州', '2019-01-02', '良', '90', '202', '66', '90', '9', '48', '0.95', '21'), ('杭州', '2019-01-03', '轻度污染', '126', '249', '95', '127', '8', '61', '1.26', '6'), ('杭州', '2019-01-04', '良', '79', '205', '58', '83', '7', '53', '1.39', '5'), ('杭州', '2019-01-05', '优', '31', '43', '21', '27', '6', '42', '1.18', '6'), ('杭州', '2019-01-06', '良', '55', '131', '38', '51', '7', '42', '1.67', '10'), ('杭州', '2019-01-07', '良', '55', '106', '38', '58', '8', '55', '1.25', '4'), ('杭州', '2019-01-08', '良', '64', '161', '45', '62', '8', '47', '1.04', '18'), ('杭州', '2019-01-09', '良', '66', '209', '47', '66', '9', '51', '1.06', '17'), ('杭州', '2019-01-10', '优', '36', '82', '22', '31', '7', '49', '0.82', '6'), ('杭州', '2019-01-11', '优', '26', '30', '15', '21', '7', '48', '0.92', '3'), ('杭州', '2019-01-12', '优', '39', '53', '26', '37', '7', '37', '1.12', '