In [36]:
%load_ext autoreload
%autoreload 2

import re
import os
import sys
import requests


from bs4 import BeautifulSoup

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from name import wiki
from name.named_entity import NamedEntity

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# The Wiki List

Starting from the Wiki list page: https://en.wikipedia.org/wiki/List_of_Japanese_people



In [16]:
res = requests.get('https://en.wikipedia.org/wiki/List_of_Japanese_people')
page = BeautifulSoup(res.text)

In [17]:
list_links = page.select('.mw-parser-output>ul li a:first-child[href]')
print('list_links', len(list_links))
list_links[:4]

list_links 336


[<a href="/wiki/Yoshisuke_Aikawa" title="Yoshisuke Aikawa">Yoshisuke Aikawa</a>,
 <a href="/wiki/Takeo_Fujisawa" title="Takeo Fujisawa">Takeo Fujisawa</a>,
 <a href="/wiki/Hirotoshi_Honda" title="Hirotoshi Honda">Hirotoshi Honda</a>,
 <a class="mw-redirect" href="/wiki/Konosuke_Matsushita" title="Konosuke Matsushita">Konosuke Matsushita</a>]

In [18]:
col_links = page.select('.div-col>ul li a:first-child[href]')
print('col_links', len(col_links))
col_links[:4]

col_links 381


[<a href="/wiki/Daiki_Arioka" title="Daiki Arioka">Daiki Arioka</a>,
 <a href="/wiki/Goro_Inagaki" title="Goro Inagaki">Goro Inagaki</a>,
 <a href="/wiki/Hikaru_Yaotome" title="Hikaru Yaotome">Hikaru Yaotome</a>,
 <a href="/wiki/Hiroki_Uchi" title="Hiroki Uchi">Hiroki Uchi</a>]

In [19]:
table_links = page.select('.wikitable tr td b>a[href]')
print('table_links', len(table_links))
table_links[:4]

table_links 92


[<a href="/wiki/Tokugawa_Ieyasu" title="Tokugawa Ieyasu">Tokugawa Ieyasu</a>,
 <a href="/wiki/Tokugawa_Hidetada" title="Tokugawa Hidetada">Tokugawa Hidetada</a>,
 <a href="/wiki/Tokugawa_Iemitsu" title="Tokugawa Iemitsu">Tokugawa Iemitsu</a>,
 <a href="/wiki/Tokugawa_Ietsuna" title="Tokugawa Ietsuna">Tokugawa Ietsuna</a>]

In [22]:
links = []
links += ['https://en.wikipedia.org' + a.get('href') for a in list_links]
links += ['https://en.wikipedia.org' + a.get('href') for a in col_links]
links += ['https://en.wikipedia.org' + a.get('href') for a in table_links]

links = list(set(links))
print('links', len(links))
links[:5]

links 760


['https://en.wikipedia.org/wiki/Koji_Murofushi',
 'https://en.wikipedia.org/wiki/Shoko_Asahara',
 'https://en.wikipedia.org/wiki/Kazuhide_Uekusa',
 'https://en.wikipedia.org/wiki/Masayoshi_%22Mabo%22_Kabe',
 'https://en.wikipedia.org/wiki/Taigen_Sessai']

In [73]:
downloaded_filename = '../data/people.csv'
downloaded_names = {}

try:
    with open(downloaded_filename, 'r') as fp:
        names = NamedEntity.from_csv(fp)
        for n in names:
            downloaded_names[n.src_url] = n
except:
    pass

print('downloaded_names', len(downloaded_names))

downloaded_names 0


In [74]:
def save_downloaded_names(downloaded_filename=downloaded_filename, 
                          downloaded_names=downloaded_names):
    
    with open(downloaded_filename, 'w') as fp:
        NamedEntity.to_csv(downloaded_names.values(), fp)

In [75]:
current_size = len(downloaded_names)
print('Continue downloading from the %d-th entry' % (current_size + 1))

for l in links:
    if l in downloaded_names:
        continue
        
    name = wiki.extract_name_from_wiki_page(l)
    if not name:
        print("Can't read name", l)
        continue
    
    downloaded_names[l] = name
    
    if len(downloaded_names) >= current_size + 50:
        current_size = len(downloaded_names)
        print('Saving the file...', current_size)
        save_downloaded_names()
    
    
    

Continue downloading from the 1-th entry
Can't read name https://en.wikipedia.org/w/index.php?title=Era_Fusahide&action=edit&redlink=1
Can't read name https://en.wikipedia.org/w/index.php?title=Kuwana_Tarozaemon&action=edit&redlink=1
Can't read name https://en.wikipedia.org/w/index.php?title=Sait%C5%8D_Musashibo_Benkei&action=edit&redlink=1
Can't read name https://en.wikipedia.org/wiki/Francis_Xavier_Kaname_Shimamoto
Can't read name https://en.wikipedia.org/w/index.php?title=Oda_Hiroyoshi&action=edit&redlink=1
Can't read name https://en.wikipedia.org/w/index.php?title=Oda_Nobuyasu&action=edit&redlink=1
Can't read name https://en.wikipedia.org#cite_note-164
Can't read name https://en.wikipedia.org/wiki/Emi_Maria
Can't read name https://en.wikipedia.org/wiki/Ribbon
Saving the file... 50
Can't read name https://en.wikipedia.org/w/index.php?title=Isshiki_Fujinaga&action=edit&redlink=1
Can't read name https://en.wikipedia.org/w/index.php?title=Ujiie_Naomoto&action=edit&redlink=1
Can't read 

In [76]:
save_downloaded_names()