## Building a Web Crawler
- 给出一个种子URL
- 对于一个URl对应的网页，获取该网页上的所有链接（a标签）列表，递归获取所有的URL（可以使用Redis存储和判重）
- 下载有效网页的内容到本地
- 对于给定的网页内容，爬取有效文本内容（有效文本筛选？）
- URL递归获取可以采用DFS或者BFS

### 使用Redis作为一个缓存服务，用来记录已经下载的网页（默认有效期90天），还可用来判断某个网页是否被下载

In [1]:
import redis    # 导入redis模块，通过python操作redis 也可以直接在redis主机的服务端操作缓存数据库


redis_pool = redis.ConnectionPool(host='127.0.0.1', port=6379, decode_responses=True) 
redis_url_visit_pre = 'url_visited:'

def append_visited(url, filename):
    if not filename:
        return False
    
    redis_conn = redis.Redis(connection_pool = redis_pool)
    key = redis_url_visit_pre + url
    if not redis_conn.exists(key):
        redis_conn.set(key, filename, ex = 90 * 24 * 60 * 60)
        return True
    
    return False

def is_visited(url):
    redis_conn = redis.Redis(connection_pool = redis_pool)
    return redis_conn.exists(redis_url_visit_pre + url)

### 此方法访问一个URL，如果成功则获取对应网页的所有有效href，同时保存网页到本地并且更新缓存
#### todo：还可以在此方法中增加对PageRank的支持，实现两个目的
- 当前PageRank分数高的未下载页面优先访问
- 爬取完毕后搜索算法需要用到PageRank
- 但是本作业暂时不需要用PageRank，先不实现

In [5]:
from bs4 import BeautifulSoup, SoupStrainer
import requests
import os
from urllib.parse import urlparse
from urllib.request import pathname2url
import urllib.parse
import logging


def build_logger():
    log_base = 'log'
    if not os.path.exists(log_base):
        os.mkdir(log_base)

    if not os.path.exists(os.path.join(log_base,'error')):
        os.mkdir(os.path.join(log_base,'error'))
    logging.basicConfig(format='%(asctime)s %(message)s',
                        filename='log/error/crawler0104.log', 
                        level = logging.ERROR,
                        datefmt='%Y-%m-%d %H:%M:%S')
    
    if not os.path.exists(os.path.join(log_base,'warning')):
        os.mkdir(os.path.join(log_base,'warning'))
    logging.basicConfig(format='%(asctime)s %(message)s',
                        filename='log/warning/crawler0104.log', 
                        level = logging.WARNING,
                        datefmt='%Y-%m-%d %H:%M:%S')

build_logger()

def setup_custom_logger(name):
    formatter = logging.Formatter(fmt='%(asctime)s %(levelname)-8s %(message)s',
                                  datefmt='%Y-%m-%d %H:%M:%S')
    handler = logging.FileHandler('.txt', mode='w')
    handler.setFormatter(formatter)
    screen_handler = logging.StreamHandler(stream=sys.stdout)
    screen_handler.setFormatter(formatter)
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    logger.addHandler(handler)
    logger.addHandler(screen_handler)
    return logger
    


def visit(url, limit_hostname = ''):
    url_set = set()
    
    if is_visited(url):
        return url_set
    
    try:
        page = requests.get(url) 
    except requests.ConnectionError:
        print("Error: ConnectionError when request "+ url)
        return url_set
    except Exception as e:
        logging.warning('request error for ' + url + ', see error info.')
        logging.error(str(e))
        return url_set
    
    if not page.ok:
        print(url + ' return ' + str(page.status_code))
        return url_set
    
    parsed_uri = urlparse(url)
    folder = urllib.parse.quote(parsed_uri.hostname, safe='')
    if not os.path.isdir(folder):
        os.mkdir(folder)
    filename = os.path.join(folder, urllib.parse.quote(url, safe=''))
    
    page_html = page.text
    if not is_visited(url):
        page_encode = page.encoding
        try:
            with open(filename,'w',encoding = page_encode) as file_write:
                file_write.write(page_html)
        except Exception as e:
            logging.warning('writing file error for ' + url + ', see error info.')
            logging.error(str(e))
    
    append_visited(url, filename)
    
    soup = BeautifulSoup(page_html)
    for link in soup.find_all('a'):
        href = link.get("href")
        if href and href != '#' and href != './' and href != 'javascript:;' and 'http' in href and not is_visited(href):
            if not limit_hostname or limit_hostname in href:
                url_set.add(href)
            
    return url_set

In [8]:
from IPython.display import display, clear_output


def recurrent_visit(seed_url, limit_hostname = ''):
    all_urls = set([seed_url])
    count = 0
    while all_urls:
        print(len(all_urls))
        clear_output(wait = True)

        url = all_urls.pop()
        next_urls = visit(url, limit_hostname, all_urls)
        if next_urls:
            all_urls.update(next_urls)
            display(next_urls)
        count += 1
        if count >


In [9]:
recurrent_visit("http://www.cmiw.cn/","cmiw")

UnboundLocalError: local variable 'page' referenced before assignment