In [1]:
import requests
from bs4 import BeautifulSoup
import re

# CK-12 subject landing pages whose <h3> headings link to individual topics.
urls = [
    'http://www.ck12.org/earth-science/',
    'http://www.ck12.org/life-science/',
    'http://www.ck12.org/physical-science/',
    'http://www.ck12.org/biology/',
    'http://www.ck12.org/chemistry/',
    'http://www.ck12.org/physics/',
]

# Seconds to wait on each page before failing, so a stalled connection
# cannot hang the cell indefinitely.
REQUEST_TIMEOUT = 30

# Build the set of topic names from the last path segment of each topic link,
# e.g. '/physics/Thermal-Energy/' -> 'Thermal Energy'.
topics = set()
for url in urls:
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    html = response.text
    for h3 in BeautifulSoup(html, 'html.parser').find_all('h3'):
        # Not every <h3> necessarily wraps an <li><a href=...>; skip the ones
        # that do not rather than raising AttributeError on None.
        li = h3.li
        link = li.a if li is not None else None
        href = link.get('href') if link is not None else None
        if not href:
            continue
        slug = href.strip('/').split('/')[-1]
        topic = ' '.join(map(str.strip, slug.split('-')))
        topics.add(topic)

In [2]:
# Matches a whole topic string, capturing everything that precedes an optional
# trailing " in <Subject>" qualifier (case-insensitive, verbose pattern).
plucker = re.compile(r'''
    (.*?)
    (?:
      \ in\ Earth\ Sciences?
    | \ in\ Life\ Sciences?
    | \ in\ Physical\ Sciences?
    | \ in\ Biology
    | \ in\ Chemistry
    | \ in\ Physics
    )?
''', flags=re.IGNORECASE | re.VERBOSE)

# Replace each topic with its captured prefix, i.e. the name without the
# subject qualifier; duplicates collapse because the result is a set.
topics = set(
    plucker.fullmatch(raw_topic).group(1)
    for raw_topic in topics
)

topics

{'Polar Climates',
 'Natural Resource Conservation',
 'Terrestrial Biomes',
 'Pedigree Analysis',
 'Metallic and Nonmetallic Character',
 'Mirrors',
 'Technological Design Process',
 'Mutations',
 'Particle Physics',
 'Effect of Altitude and Mountains on Climate',
 'Seafloor Spreading Hypothesis',
 'Conservation of Momentum in One Dimension',
 'Diffraction Gratings',
 'Cnidarian Reproduction',
 'Keplers Laws of Planetary Motion',
 'Batteries',
 'Importance of Mammals',
 'Thermal Energy',
 'Plant Life Cycle Overview',
 'Viruses in Research and Medicine',
 'Bond Polarity',
 'Scientific Method',
 'Nonvascular Plants',
 'Combined Gas Law',
 'Solenoid',
 'Igneous Rock Classification',
 'Importance of Seedless Plants',
 'Metamorphic Rocks',
 'Half life and Radioactive Dating',
 'Frogs and Toads',
 'Pathogens',
 'Habitat Destruction',
 'Effect of Pressure',
 'Seawater Chemistry',
 'Mass Mole Stoichiometry',
 'Stock System Naming',
 'Timeline of Evolution',
 'Quantization of Energy',
 'Carbohy

In [3]:
import sys

import wikipedia as wiki
from tqdm import tqdm

# Gather candidate Wikipedia article titles by running a search for every
# topic (passing 5 as the result-count argument) and pooling the hits.
page_titles = set()
for topic in tqdm(topics):
    while True:
        try:
            page_titles.update(wiki.search(topic, 5))
            break
        except wiki.exceptions.HTTPTimeoutError:
            # Transient timeout: retry the same topic.
            continue
        except Exception as e:
            # Any other failure is reported and the topic is skipped.
            # Without this break the loop would retry the failing topic
            # forever, since nothing else exits the `while True`.
            print(e, file=sys.stderr)
            break

page_titles



{'OOPARTS (Shun album)',
 'Baetoidea',
 'Atmospheric wave',
 'Nilssonia (turtle)',
 'Bone',
 'Acid rain',
 'Biarc',
 'IPod Touch',
 'Static discharger',
 'Extraocular muscles',
 'Acidic oxide',
 'Heat transfer',
 'Diffraction-limited system',
 'Leveling effect',
 'Bending Science',
 'Evolution of spiders',
 'Unsaturated hydrocarbon',
 'Hemimelia',
 'Chemical bond',
 'Gene Likens',
 'Natural kind',
 'Angular frequency',
 'Ovary',
 'Map projection',
 'Satellite (biology)',
 'Seasonal thermal energy storage',
 'Anthropogenic hazard',
 'Jamestown Community College',
 'Sponge (material)',
 'Accretion disk',
 'Rock cycle',
 'Atomistix ToolKit',
 'IUPAC nomenclature of inorganic chemistry',
 'Solenoid',
 'Acetyl chloride',
 'Osteichthyes',
 'Xenacoelomorpha',
 'Gunning fog index',
 'Carboxylate',
 'Oceanography',
 'Thermometer',
 'Eusuchia',
 'Ferromagnetic material properties',
 'Leatherjacket fish',
 "Faraday's laws of electrolysis",
 'Helioseismology',
 'Vesicle-associated membrane protein

In [7]:
# Fetch the summary text of every collected page title. Timeouts are retried;
# any other error is printed to stderr and that title is skipped.
page_summaries = set()
for page_title in tqdm(page_titles):
    finished = False
    while not finished:
        try:
            page = wiki.page(page_title)
            page_summaries.add(page.summary)
        except wiki.exceptions.HTTPTimeoutError:
            # Leave `finished` False so the same title is attempted again.
            pass
        except Exception as err:
            print(err, file=sys.stderr)
            finished = True
        else:
            finished = True

page_summaries



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "html.parser")

  markup_type=markup_type))
"Heat-transfer fluid" may refer to: 
Heat-transfer oil
Coolant
  2%|▏         | 106/6702 [01:47<2:24:57,  0.76it/s]"Natural Selection (disambiguation)" may refer to: 
Natural Selection (film)
Natural Selection (Fuel album)
Natural Selection (Frank Gambale album)
Natural Selection (group)
Natural Selection (manuscript)
"Natural Selection" (song)
Natural Selection (Sounds from the Ground album)
"Natural Selection" (The Spectacular Spider-Man)
"Natural Selection" (The Unit)
Natural Selection (video game)
Natural Selection 2
Teresa's Tattoo
  2%|▏         | 107/6702 [01:48<2:34:47,  0.71it/s]"Melting temperature" may refer to: 
Melting point
Nucleic acid thermodynamics
  2%|▏         | 140/6702 [02:37<2:32:39,  0.72it/s]"Plasma" may refer to: 
Blood plasma
Cytoplasm
Germ plasm
Germplasm
Milk plasma
Nucleoplasm
Protoplasm
KDE Plasma Workspaces
Plasma effect
Plasma (album)
Pla

KeyboardInterrupt: 

In [None]:
import pickle

# Persist the collected summaries so the slow Wikipedia crawl need not be
# repeated. `pickle.dump(obj, file)` writes `obj` to the open file; the
# previous `pickle.dumps(file)` tried to serialize the file handle itself
# and never wrote `page_summaries`.
with open('page_summaries.pkl', 'wb') as file:
    pickle.dump(page_summaries, file)