# Description
分析维基百科（Wikipedia）页面，并使用线程（thread）和进程（process）来加速分析。

In [2]:
import os

# List the entries of the local "wiki" directory (the HTML pages analysed below).
os.listdir("wiki")

['Ronald_McCaffer.html',
 'Communities_of_Tulu_Nadu.html',
 'Mountune_Racing.html',
 'Tim_Spencer_(singer).html',
 'Nathaniel_Merriman.html',
 'One_Night_of_Sin.html',
 'Middle_Park,_Victoria.html',
 'Zgornji_Otok.html',
 'Josef_Mik.html',
 'Gaston_Lane.html',
 '2008_Fed_Cup_World_Group_II.html',
 'Phenacobius_catostomus.html',
 'Dowell_Philip_O%27Reilly.html',
 'Hebden_Bridge_Picture_House.html',
 'Plze%C5%88_Zoo.html',
 'Lower_Blackburn_Grade_Bridge.html',
 'DWTE-TV.html',
 'HD_90156.html',
 'Ordinary,_Virginia.html',
 'Cyclohexane_conformation.html',
 'Bifidocarpus.html',
 'Terry_Cox.html',
 'Furubira_District,_Hokkaido.html',
 'Kentucky_Theater.html',
 'Smeaton,_East_Lothian.html',
 'Alexander_Rizzoni.html',
 'Charged_Records.html',
 'Kate_Harwood.html',
 'Goodnight%E2%80%93Loving_Trail.html',
 'Aniavan.html',
 'Athletics_at_the_1994_Commonwealth_Games_%E2%80%93_Men%27s_pole_vault.html',
 'Doumanaba.html',
 'East_Down_(Northern_Ireland_Parliament_constituency).html',
 'Coenaculum_s

In [3]:
# Number of entries in the "wiki" directory.
len(os.listdir("wiki"))

999

In [4]:
# Print one raw page to see what the markup looks like before any parsing.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding (the page itself declares UTF-8).
with open("wiki/Liebig_Peak.html", encoding="utf-8") as f:
    print(f.read())

<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Liebig Peak - Wikipedia</title>
<script>document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );</script>
<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Liebig_Peak","wgTitle":"Liebig Peak","wgCurRevisionId":691722083,"wgRevisionId":691722083,"wgArticleId":29733531,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Orphaned articles from December 2010","All orphaned articles","Coordinates on Wikidata","Wikipedia articles incorporating text from the USGS Geographic Names Information System","All stub articles","Mountains of Graham Land","Loubet Coast","Loubet Coast geography stubs"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageCo

## Reading in the data

In [5]:
import concurrent.futures
import time

# Thread pool with four workers; used further down in this cell to read the
# article files concurrently.
pool = concurrent.futures.ThreadPoolExecutor(max_workers=4)

def read_data(filename):
    """Return the entire contents of a text file as one string.

    The file is decoded as UTF-8 explicitly. Without an explicit encoding,
    ``open`` falls back to the platform's locale encoding, which can raise
    ``UnicodeDecodeError`` or silently mis-decode non-ASCII text.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filename, encoding="utf-8") as f:
        return f.read()

# perf_counter() is a monotonic clock meant for measuring elapsed time.
start = time.perf_counter()
filenames = ["wiki/{}".format(f) for f in os.listdir("wiki")]
# Using the executor as a context manager shuts it down on exit, so its
# worker threads are released instead of lingering for the life of the kernel.
with pool:
    # list() forces the lazy map iterator, so every file is read inside the
    # timed region.
    content = list(pool.map(read_data, filenames))

end = time.perf_counter()
print(end - start)
# Article name = file name without the "wiki/" prefix and ".html" suffix.
articles = [f.replace(".html", "").replace("wiki/", "") for f in filenames]

0.46535301208496094


## Remove Extraneous Markup

In [6]:
from bs4 import BeautifulSoup

def parse_html(html):
    """Return the markup of the page's ``<div id="content">`` element.

    Args:
        html: HTML document as a string.

    Returns:
        The first ``div`` whose ``id`` is ``"content"``, serialised back to a
        string (the element itself plus everything nested inside it).

    Raises:
        IndexError: If the document has no ``<div id="content">`` element.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # find() stops at the first match; find_all() would collect every match
    # in the document only for the first one to be used.
    content_div = soup.find("div", id="content")
    if content_div is None:
        # Same exception type that indexing an empty result list would raise,
        # but with a message that says what is actually missing.
        raise IndexError('no <div id="content"> element found in document')
    return str(content_div)

# perf_counter() is a monotonic clock meant for measuring elapsed time.
start = time.perf_counter()
# The `with` block shuts the pool down once every page has been parsed, so the
# three worker processes do not outlive this cell.
with concurrent.futures.ProcessPoolExecutor(max_workers=3) as pool:
    # list() forces the lazy map iterator while the pool is still open.
    parsed = list(pool.map(parse_html, content))
end = time.perf_counter()

print(end - start)

34.87950682640076


In [7]:
# Inspect the first parsed page: it should start at the <div id="content"> element.
parsed[0]

'<div class="mw-body" id="content" role="main">\n<a id="top"></a>\n<div id="siteNotice"><!-- CentralNotice --></div>\n<div class="mw-indicators">\n</div>\n<h1 class="firstHeading" id="firstHeading" lang="en">Ronald McCaffer</h1>\n<div class="mw-body-content" id="bodyContent">\n<div id="siteSub">From Wikipedia, the free encyclopedia</div>\n<div id="contentSub"></div>\n<div class="mw-jump" id="jump-to-nav">\n\t\t\t\t\tJump to:\t\t\t\t\t<a href="#mw-head">navigation</a>, \t\t\t\t\t<a href="#p-search">search</a>\n</div>\n<div class="mw-content-ltr" dir="ltr" id="mw-content-text" lang="en"><table class="plainlinks metadata ambox ambox-content ambox-multiple_issues compact-ambox" role="presentation">\n<tr>\n<td class="mbox-image">\n<div style="width:52px"><img alt="" data-file-height="40" data-file-width="40" height="40" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b4/Ambox_important.svg/40px-Ambox_important.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b4/Ambox_

## Finding Common Tags

In [9]:
from bs4 import BeautifulSoup

def count_tags(html):
    """Tally how many times each tag name occurs in an HTML string.

    Args:
        html: HTML markup as a string.

    Returns:
        A dict mapping tag name to the number of elements with that name,
        with keys in order of first appearance.
    """
    tally = {}
    for element in BeautifulSoup(html, 'html.parser').find_all():
        name = element.name
        tally[name] = tally.get(name, 0) + 1
    return tally

# perf_counter() is a monotonic clock meant for measuring elapsed time.
start = time.perf_counter()
# The `with` block shuts the pool down after the map completes, so the worker
# processes do not stay alive after this cell.
with concurrent.futures.ProcessPoolExecutor(max_workers=3) as pool:
    tags = list(pool.map(count_tags, parsed))

# Merge the per-page tallies into a single total per tag name.
tag_counts = {}
for tag in tags:
    for k, v in tag.items():
        tag_counts[k] = tag_counts.get(k, 0) + v
end = time.perf_counter()

# Ascending by count, so the most common tags come last.
sorted_tag_counts = sorted(tag_counts.items(), key=lambda x: x[1])
print(end - start)
sorted_tag_counts

19.0537428855896


[('pre', 1),
 ('h6', 1),
 ('audio', 2),
 ('source', 2),
 ('math', 2),
 ('semantics', 2),
 ('mrow', 2),
 ('mstyle', 2),
 ('mo', 2),
 ('annotation', 2),
 ('map', 2),
 ('samp', 2),
 ('del', 2),
 ('bdi', 4),
 ('h5', 4),
 ('s', 10),
 ('ruby', 16),
 ('rb', 16),
 ('rt', 16),
 ('rp', 32),
 ('area', 39),
 ('font', 40),
 ('hr', 51),
 ('u', 51),
 ('blockquote', 58),
 ('center', 64),
 ('big', 75),
 ('q', 76),
 ('wbr', 85),
 ('code', 108),
 ('h4', 117),
 ('sub', 151),
 ('caption', 200),
 ('dt', 334),
 ('dl', 457),
 ('strong', 599),
 ('h3', 777),
 ('ol', 858),
 ('h1', 999),
 ('noscript', 999),
 ('dd', 1376),
 ('small', 3272),
 ('cite', 3563),
 ('abbr', 3665),
 ('table', 4010),
 ('h2', 4045),
 ('br', 4986),
 ('img', 6701),
 ('p', 7998),
 ('ul', 10972),
 ('sup', 11157),
 ('b', 14455),
 ('th', 14472),
 ('i', 18246),
 ('tr', 27300),
 ('div', 28581),
 ('td', 57673),
 ('span', 67350),
 ('li', 85779),
 ('a', 161065)]

## Finding Common Words
从每个页面中选取出现次数最多的十个单词（只统计长度不少于 5 个字符的单词），然后统计每个单词进入了多少个页面的前十名。

In [12]:
from bs4 import BeautifulSoup
from collections import Counter
import re

def count_words(html, min_length=5, top_n=10):
    """Return the most frequent words in the visible text of an HTML string.

    The text is lower-cased and every run of non-word characters is replaced
    by a single space before splitting into words.

    Args:
        html: HTML markup as a string.
        min_length: Shortest word length (in characters) that is counted.
        top_n: How many of the most frequent words to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first, containing at
        most ``top_n`` entries.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Raw string: "\W" in a plain string literal is an invalid escape sequence,
    # which Python warns about and intends to reject in the future.
    text = re.sub(r"\W+", " ", soup.get_text().lower())
    # split() without an argument drops the empty strings that leading or
    # trailing spaces would otherwise produce.
    words = [w for w in text.split() if len(w) >= min_length]
    return Counter(words).most_common(top_n)

# perf_counter() is a monotonic clock meant for measuring elapsed time.
start = time.perf_counter()
# The `with` block shuts the pool down after the map completes, so the worker
# processes do not stay alive after this cell.
with concurrent.futures.ProcessPoolExecutor(max_workers=3) as pool:
    words = list(pool.map(count_words, parsed))

# For every word, count the number of pages whose top-word list contains it.
# The per-page frequency is not used: each page adds exactly 1 per listed word.
word_counts = {}
for wc in words:
    for word, _freq in wc:
        word_counts[word] = word_counts.get(word, 0) + 1
end = time.perf_counter()
# Descending, so the words that appear in the most pages' lists come first.
sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
print(end - start)
sorted_word_counts

17.583903312683105


[('wikipedia', 431),
 ('retrieved', 169),
 ('articles', 132),
 ('article', 85),
 ('species', 69),
 ('county', 64),
 ('categories', 58),
 ('united', 50),
 ('university', 47),
 ('family', 45),
 ('states', 43),
 ('national', 39),
 ('world', 38),
 ('school', 37),
 ('district', 35),
 ('sources', 34),
 ('career', 33),
 ('coordinates', 33),
 ('football', 33),
 ('encyclopedia', 32),
 ('american', 32),
 ('state', 32),
 ('december', 30),
 ('album', 28),
 ('january', 28),
 ('september', 28),
 ('population', 27),
 ('north', 27),
 ('which', 27),
 ('february', 26),
 ('march', 26),
 ('november', 26),
 ('south', 25),
 ('german', 25),
 ('village', 25),
 ('april', 25),
 ('station', 24),
 ('october', 24),
 ('league', 24),
 ('french', 23),
 ('august', 22),
 ('television', 21),
 ('women', 21),
 ('music', 20),
 ('australian', 20),
 ('season', 20),
 ('released', 20),
 ('british', 19),
 ('party', 19),
 ('series', 19),
 ('california', 18),
 ('other', 18),
 ('india', 18),
 ('history', 18),
 ('company', 17),
 ('