## Building a Web Crawler
- 给出一个种子URL
- 对于一个URl对应的网页，获取该网页上的所有链接（a标签）列表，递归获取所有的URL（可以使用Redis存储和判重）
- 下载有效网页的内容到本地
- 对于给定的网页内容，爬取有效文本内容（有效文本筛选？）
- URL递归获取可以采用DFS或者BFS

In [12]:
import redis    # 导入redis模块，通过python操作redis 也可以直接在redis主机的服务端操作缓存数据库


redis_pool = redis.ConnectionPool(host='127.0.0.1', port=6379, decode_responses=True) 
redis_url_visit_pre = 'url_visited:'
def append_visited(url, filename):
    redis_conn = redis.Redis(connection_pool = redis_pool)
    key = redis_url_visit_pre + url
    if not redis_conn.exists(key):
        redis_conn.set(key, filename, ex = 90 * 24 * 60 * 60)
        return True
    return False

def is_visited(url):
    redis_conn = redis.Redis(connection_pool = redis_pool)
    return redis_conn.exists(redis_url_visit_pre + url)

In [55]:
from bs4 import BeautifulSoup, SoupStrainer
import requests
import os
from urllib.parse import urlparse
from urllib.request import pathname2url
import urllib.parse


def visit(url, limit_hostname = ''):
    url_set = set()
    
    if is_visited(url):
        return url_set
    
    page = requests.get(url) 
    if not page.ok:
        print(url + ' return ' + page.status_code)
        return url_set
    
    parsed_uri = urlparse(url)
    folder = urllib.parse.quote(parsed_uri.hostname, safe='')
    if not os.path.isdir(folder):
        os.mkdir(folder)
    filename = os.path.join(folder, urllib.parse.quote(url, safe=''))
    
    page_html = page.text
    with open(filename,'w') as file_write:
        file_write.write(page_html)
    
    append_visited(url, filename)
    
    soup = BeautifulSoup(page_html)
    for link in soup.find_all('a'):
        href = link.get("href")
        if href and href != '#' and href != './' and href != 'javascript:;' and 'http' in href:
            if not limit_hostname or limit_hostname in href:
                url_set.add(href)
            
    return url_set

In [56]:
from IPython.display import display, clear_output


def recurrent_visit(seed_url, limit_hostname = ''):
    all_urls = set([seed_url])
    while all_urls:
        url = all_urls.pop()
        next_urls = visit(url, limit_hostname)
        if next_urls:
            all_urls.update(next_urls)
            clear_output(wait = True)
            display(next_urls)

In [57]:
recurrent_visit("http://www.cmiw.cn/","cmiw")

{'http://bbs.cmiw.cn/forum.php?mod=viewthread&tid=150386',
 'http://bbs.cmiw.cn/forum.php?mod=viewthread&tid=204466',
 'http://bbs.cmiw.cn/forum.php?mod=viewthread&tid=264013',
 'http://license.comsenz.com/?pid=1&host=www.cmiw.cn',
 'http://www.cmiw.cn',
 'http://www.cmiw.cn/',
 'http://www.cmiw.cn/archiver/',
 'http://www.cmiw.cn/forum.php?mobile=yes',
 'http://www.cmiw.cn/forum.php?mod=collection',
 'http://www.cmiw.cn/forum.php?mod=guide',
 'http://www.cmiw.cn/forum.php?mod=misc&action=showdarkroom',
 'http://www.cmiw.cn/home.php',
 'http://www.cmiw.cn/home.php?mod=follow&view=other',
 'http://www.cmiw.cn/portal.php'}

UnicodeEncodeError: 'gbk' codec can't encode character '\xe5' in position 337: illegal multibyte sequence