In [1]:
import os
import re
import urllib.request as ulb

from bs4 import BeautifulSoup

In [2]:
class WikiReptile(object):
    """Depth-limited crawler that stores English Wikipedia pages as plain text.

    Starting from ``base_key`` the crawler follows article links found in the
    page body.  For every link it appends a ``source<TAB>-><TAB>target`` line to
    the relation file, and for every page not seen before it appends the page
    title, headings, paragraphs and reference list to the context file.

    Failures (network errors, unparsable pages, unwritable files) are reported
    on stdout and the affected page is skipped; ``KeyboardInterrupt`` and
    ``SystemExit`` are deliberately *not* swallowed so a crawl can be stopped.

    Args:
        base_key: Path of the start page, e.g. ``'/wiki/Finance'`` (without the
            Wikipedia base address).
        deep: Number of link levels to follow from the start page.
        relation_path: File that receives the link relations (appended).
        context_path: File that receives the page text (appended).
    """

    # Href filter for links worth following: paths that start with /wiki/ and
    # contain no ':' character.
    _ARTICLE_HREF = re.compile("^(/wiki/)((?!:).)*$")

    # Tags whose text is copied into the context file.
    _TEXT_TAGS = ["h1", "h2", "h3", "h4", "h5", "h6", "p"]

    def __init__(self, base_key, deep, relation_path, context_path):
        self.__urlsList = set()  # page paths (fragment removed) already stored
        self.__baseUrl = 'https://en.wikipedia.org'
        self.__baseKey = base_key
        self.__deep = deep
        self.__contextPath = context_path
        self.__relationPath = relation_path

    def __add_url(self, url):
        """Mark ``url`` as visited."""
        self.__urlsList.add(url)

    def get_urls_len(self):
        """Return the number of distinct pages marked as visited."""
        return len(self.__urlsList)

    @staticmethod
    def __strip_fragment(url):
        """Return ``url`` without its ``#fragment`` part.

        ``/wiki/Management#Training`` and ``/wiki/Management`` are the same
        page, so only the part before ``#`` identifies a page.
        """
        return url.split('#', 1)[0]

    # Level-by-level traversal: every link of one page is handled before the
    # crawler descends into the pages those links point to.
    def save_list(self, key_url, start_urls, deep_count=0):
        """Record the links of one page and store the pages they point to.

        Args:
            key_url: Path of the page the links were found on.
            start_urls: Link paths found on ``key_url``.
            deep_count: Depth of ``key_url`` relative to the start page; the
                method returns immediately once the configured depth is
                exceeded.
        """
        deep_count += 1
        if self.__deep < deep_count:
            return
        # [6:] drops the leading '/wiki/' for display and for the relation file.
        print('初始网页 %s 包含链接数 %s' % (key_url[6:], str(len(start_urls))))
        # Outgoing links are only needed when the next level will be crawled;
        # at the last level extracting them would be wasted work.
        expand = deep_count < self.__deep
        listMap = {}
        for count, url in enumerate(start_urls, 1):
            string = key_url[6:] + "\t->\t" + url[6:]
            print('\t%s 保存关系 %s' % (str(count), string))
            self.__apend_relation(self.__relationPath, string)
            page = self.__strip_fragment(url)
            if page in self.__urlsList:
                continue
            print('\t\t保存网页 %s' % url[6:])
            # The page is downloaded and parsed once; the same parsed document
            # serves both the text export and the link extraction.
            the_html = self.__fetch_html(self.__baseUrl + page)
            if the_html is None:
                continue
            if not self.__append_page(the_html, self.__contextPath):
                continue
            self.__urlsList.add(page)
            if expand:
                url_List = self.__extract_urls(the_html)
                if url_List:
                    listMap[page] = url_List
        for key, value in listMap.items():
            self.save_list(key, value, deep_count)

    def __open_url(self, url):
        """Download ``url`` and return the raw body, or ``None`` on failure."""
        try:
            # The context manager closes the connection even if read() fails.
            with ulb.urlopen(url, timeout=10) as the_link:
                return the_link.read()
        except Exception:
            # Exception (not BaseException) so Ctrl-C still stops the crawl.
            print('当前连接失败...放弃并继续')
            return None

    def __fetch_html(self, url):
        """Download ``url`` and return it as a parsed document, or ``None``."""
        the_read = self.__open_url(url)
        if the_read is None:
            return None
        # Undecodable bytes are replaced rather than aborting the whole page.
        return BeautifulSoup(the_read.decode("utf-8", errors="replace"), "lxml")

    def __apend_relation(self, path, string):
        """Append one relation line to ``path``; failures are only reported."""
        try:
            list_file = open(path, "a+", encoding='utf-8')
        except Exception:
            print('%s打开失败...放弃并继续' % path)
            return
        try:
            with list_file:
                list_file.write(string + "\n")
        except Exception:
            print('关系写入失败...放弃并继续')

    def __append_page(self, the_html, the_path):
        """Append the text of a parsed page to ``the_path``.

        The record consists of a ``'1 <title>'`` line, one line per heading or
        paragraph, the numbered reference list right after the
        ``'2 References'`` heading, and a ``'='`` separator.

        Returns:
            ``True`` when the record was written, ``False`` when the page lacks
            the expected structure or the file could not be written.
        """
        title = self.__get_title(the_html)
        content = self.__get_content(the_html)
        if title is None or content is None:
            print('网页解析失败...放弃并继续')
            return False

        # Build the whole record first so that a page that cannot be parsed
        # never leaves a half-written entry in the file.
        lines = ['1 ' + title]
        for ele in content:
            lines.append(ele)
            if '2 References' in ele:
                lines.extend(self.__get_references(the_html))
        record = "".join(line + "\n" for line in lines) + "\n" + "=" * 20 + "\n\n"

        try:
            file = open(the_path, "a+", encoding='utf-8')
        except Exception:
            print('%s打开失败...放弃并继续' % the_path)
            return False
        try:
            with file:
                file.write(record)
        except Exception:
            print('正文写入失败...放弃并继续')
            return False
        return True

    def __get_title(self, the_html):
        """Return the page title with '/' replaced by '_', or ``None``."""
        heading = the_html.find("h1", {"id": "firstHeading"})
        if heading is None:
            return None
        return heading.get_text().replace('/', '_')

    def __get_content(self, the_html):
        """Return the body's headings and paragraphs as prefixed text lines.

        Each line is ``'<c> <text>'`` where ``<c>`` is the third character of
        the tag's markup: the level digit for ``<hN>`` tags, and whatever
        follows the ``p`` for ``<p>`` tags.  Returns ``None`` when the page has
        no ``bodyContent`` container.
        """
        body = the_html.find("div", {"id": "bodyContent"})
        if body is None:
            return None
        return [str(ele)[2] + " " + ele.get_text()
                for ele in body.find_all(self._TEXT_TAGS)]

    def __get_references(self, the_html):
        """Return the reference list as ``'[n] text'`` lines (possibly empty)."""
        listing = the_html.find("ol", {"class": "references"})
        if listing is None:
            return []
        return ["[" + str(number) + "] " + item.get_text()
                for number, item in enumerate(listing.find_all("li"), 1)]

    def __extract_urls(self, the_html):
        """Return the article links of a parsed page that are not yet visited.

        Links keep their original form (including any ``#fragment``) and
        duplicates are kept, so every link yields its own relation line.
        """
        body = the_html.find("div", {"id": "bodyContent"})
        if body is None:
            return []
        urls = []
        for link in body.find_all("a", href=self._ARTICLE_HREF):
            href = link.attrs['href']
            if self.__strip_fragment(href) not in self.__urlsList:
                urls.append(href)
        return urls

    def run(self):
        """Store the start page, then crawl its links down to the set depth."""
        start_page = self.__strip_fragment(self.__baseKey)
        self.__add_url(start_page)
        the_html = self.__fetch_html(self.__baseUrl + start_page)
        if the_html is None:
            return
        # A failed write of the start page does not stop the crawl.
        self.__append_page(the_html, self.__contextPath)
        urlList = self.__extract_urls(the_html)
        if urlList:
            self.save_list(self.__baseKey, urlList)

In [4]:
# Crawl the text content of English Wikipedia pages.
baseKey = '/wiki/Finance'
webDeep = 1
# NOTE: machine-specific output directory; point it at a directory on your system.
saveUrl = "C:/Users/yuqia/python-project-chatbot-codes/WikiReptile-master/"
relationFileName = "url_link_list.txt"
contextFileName = "wiki.txt"

# Create the output directory up front: the crawler only prints a message when
# a file cannot be opened, so a missing directory would otherwise let the whole
# crawl run without storing anything.
os.makedirs(saveUrl, exist_ok=True)

# WikiReptile arguments:
#   base_key         start page path (without the Wikipedia base address)
#   deep             crawl depth (one page-to-page hop counts as one level)
#   relation_path    file that receives the link relations
#   context_path     file that receives the page text
wiki_reptile = WikiReptile(
    base_key=baseKey,
    deep=webDeep,
    relation_path=os.path.join(saveUrl, relationFileName),
    context_path=os.path.join(saveUrl, contextFileName),
)
wiki_reptile.run()
print("\t所有网页数： " + str(wiki_reptile.get_urls_len()))

初始网页 Finance 包含链接数 295
	1 保存关系 Finance	->	Finance_(game)
		保存网页 Finance_(game)
	2 保存关系 Finance	->	The_Financial
		保存网页 The_Financial
	3 保存关系 Finance	->	Financial_market
		保存网页 Financial_market
	4 保存关系 Finance	->	Financial_asset
		保存网页 Financial_asset
	5 保存关系 Finance	->	Bond_market
		保存网页 Bond_market
	6 保存关系 Finance	->	Commodity_market
		保存网页 Commodity_market
	7 保存关系 Finance	->	Derivatives_market
		保存网页 Derivatives_market
	8 保存关系 Finance	->	Foreign_exchange_market
		保存网页 Foreign_exchange_market
	9 保存关系 Finance	->	Money_market
		保存网页 Money_market
	10 保存关系 Finance	->	Over-the-counter_(finance)
		保存网页 Over-the-counter_(finance)
	11 保存关系 Finance	->	Private_equity
		保存网页 Private_equity
	12 保存关系 Finance	->	Real_estate
		保存网页 Real_estate
	13 保存关系 Finance	->	Spot_market
		保存网页 Spot_market
	14 保存关系 Finance	->	Stock_market
		保存网页 Stock_market
	15 保存关系 Finance	->	Financial_market_participants
		保存网页 Financial_market_participants
	16 保存关系 Finance	->	Investor
		保存网页 Investor
	17 保存关系 Finance	->	Inst

	133 保存关系 Finance	->	Valuation_(finance)
		保存网页 Valuation_(finance)
	134 保存关系 Finance	->	Asset_allocation
		保存网页 Asset_allocation
	135 保存关系 Finance	->	Time_value_of_money
		保存网页 Time_value_of_money
	136 保存关系 Finance	->	Present_value
		保存网页 Present_value
	137 保存关系 Finance	->	Required_rate_of_return
		保存网页 Required_rate_of_return
	138 保存关系 Finance	->	Financial_economics
		保存网页 Financial_economics
	139 保存关系 Finance	->	Mathematical_finance
		保存网页 Mathematical_finance
	140 保存关系 Finance	->	Valuation_(finance)
	141 保存关系 Finance	->	Management#Training
		保存网页 Management#Training
	142 保存关系 Finance	->	Financial_economics
	143 保存关系 Finance	->	Economics
		保存网页 Economics
	144 保存关系 Finance	->	Accountancy
		保存网页 Accountancy
	145 保存关系 Finance	->	Applied_mathematics
		保存网页 Applied_mathematics
	146 保存关系 Finance	->	Outline_of_finance#Education
		保存网页 Outline_of_finance#Education
	147 保存关系 Finance	->	List_of_unsolved_problems_in_finance
		保存网页 List_of_unsolved_problems_in_finance
	148 保存关系 Finance	->	Finan

	277 保存关系 Finance	->	Collier%27s_Encyclopedia
		保存网页 Collier%27s_Encyclopedia
	278 保存关系 Finance	->	New_York_University_Stern_School_of_Business
		保存网页 New_York_University_Stern_School_of_Business
	279 保存关系 Finance	->	Financial_technology
		保存网页 Financial_technology
	280 保存关系 Finance	->	Computational_finance
	281 保存关系 Finance	->	Experimental_finance
	282 保存关系 Finance	->	Financial_economics
	283 保存关系 Finance	->	Financial_institution
		保存网页 Financial_institution
	284 保存关系 Finance	->	Financial_management
	285 保存关系 Finance	->	Financial_market
	286 保存关系 Finance	->	Investment_management
	287 保存关系 Finance	->	Mathematical_finance
	288 保存关系 Finance	->	Personal_finance
	289 保存关系 Finance	->	Public_finance
	290 保存关系 Finance	->	Quantitative_behavioral_finance
	291 保存关系 Finance	->	Quantum_finance
		保存网页 Quantum_finance
	292 保存关系 Finance	->	Statistical_finance
		保存网页 Statistical_finance
	293 保存关系 Finance	->	LCCN_(identifier)
		保存网页 LCCN_(identifier)
	294 保存关系 Finance	->	National_Archives_and_Records_A