In [125]:
import urllib.request as urlreq
import urllib.error as urlerr
import urllib.parse as urlparse
import urllib.robotparser as urlrp
from bs4 import BeautifulSoup
import re
import datetime
import time
import sys
sys.path.append('../')
from common.utils import *

In [126]:
# Download one country page and print the text of each value cell using BeautifulSoup.
# NOTE(review): `download` is presumably supplied by the `common.utils` star import — confirm.
url  = "http://example.webscraping.com/places/default/view/Argentina-11"
html = download(url)
soup = BeautifulSoup(html, "lxml")
# Select every element whose id matches places_.*__row.
trs = soup.find_all(attrs={'id':re.compile('places_.*__row')})
for tr in trs:
    # Within each matched row, take the first element with class w2p_fw and print its text.
    td = tr.find(attrs={'class':'w2p_fw'})
    value = td.text
    print(value)

Downloading:  http://example.webscraping.com/places/default/view/Argentina-11

2,766,890 square kilometres
41,343,201
AR
Argentina
Buenos Aires
SA
.ar
ARS
Peso
54
@####@@@
^([A-Z]\d{4}[A-Z]{3})$
es-AR,en,it,de,fr,gn
CL BO UY PY BR 


In [127]:
import lxml.html

# Parse the previously downloaded `html` with lxml and extract a single value via a CSS selector.
tree = lxml.html.fromstring(html)
# cssselect returns a list of matches; [0] takes the first td.w2p_fw directly under the area row.
td = tree.cssselect('tr#places_area__row > td.w2p_fw')[0]
area = td.text_content()
print(area)

2,766,890 square kilometres


In [128]:
# Field names scraped from a country page. The scrapers below look each one up
# through a table row whose id is "places_<field>__row".
FIELDS = ('area', 'population', 'iso', 'country', 'capital', 'continent',
          'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format',
          'postal_code_regex', 'languages', 'neighbours')

def re_scraper(html):
    """Extract every field in FIELDS from a country page with regular expressions.

    Args:
        html: page markup, either bytes (decoded with the default UTF-8 codec)
            or an already-decoded str.

    Returns:
        dict mapping each field name to the raw markup found between the
        opening and closing tags of that row's ``td.w2p_fw`` cell.

    Raises:
        AttributeError: if a field's row is not present (``re.search`` returns
            ``None``).
    """
    # Decode once up front rather than once per field; accept text as-is.
    text = html.decode() if isinstance(html, (bytes, bytearray)) else html
    results = {}
    for field in FIELDS:
        # Raw string: the pattern must not contain invalid string escapes.
        # '.' does not match newlines here, so a row and its value cell must
        # sit on a single line of the markup.
        pattern = r'<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' % field
        results[field] = re.search(pattern, text).group(1)

    return results

def bs_scraper(html):
    """Extract every field in FIELDS from a country page with BeautifulSoup.

    Args:
        html: page markup, passed straight to ``BeautifulSoup(html, "lxml")``.

    Returns:
        dict mapping each field name to the ``.text`` of the ``td.w2p_fw``
        cell inside the row whose id is ``places_<field>__row``.

    Raises:
        AttributeError: if the table, a row, or a value cell is not found
            (``find`` returns ``None``).
    """
    soup = BeautifulSoup(html, "lxml")
    # The first <table> is the same for every field, so look it up once
    # instead of once per field.
    table = soup.find('table')
    results = {}
    for field in FIELDS:
        row = table.find('tr', id='places_%s__row' % field)
        results[field] = row.find('td', class_='w2p_fw').text

    return results

def lxml_scraper(html):
    """Extract every field in FIELDS from a country page with lxml CSS selectors.

    Args:
        html: page markup, passed straight to ``lxml.html.fromstring``.

    Returns:
        dict mapping each field name to the ``text_content()`` of the first
        element matching ``table > tr#places_<field>__row > td.w2p_fw``.

    Raises:
        IndexError: if the selector for a field matches nothing.
    """
    document = lxml.html.fromstring(html)

    def cell_text(field_name):
        # First match of the value cell for this field's row.
        selector = 'table > tr#places_%s__row > td.w2p_fw' % field_name
        matches = document.cssselect(selector)
        return matches[0].text_content()

    return {field_name: cell_text(field_name) for field_name in FIELDS}

In [129]:
import time

NUM_ITERATIONS = 1000  # how many times each scraper parses the same page

# Time each scraper over NUM_ITERATIONS runs on the already-downloaded `html`.
for name, scraper in [('Regular expressions', re_scraper),
                      ('BeautifulSoup', bs_scraper),
                      ('Lxml', lxml_scraper)]:
    # perf_counter is a monotonic high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    for i in range(NUM_ITERATIONS):
        if scraper is re_scraper:
            # Clear the re module's compiled-pattern cache so every iteration
            # pays the compile cost and the comparison is not skewed by caching.
            re.purge()
        result = scraper(html)
        # Sanity check that the scraper actually extracted the expected value.
        assert result['area'] == '2,766,890 square kilometres'
    end = time.perf_counter()
    print("%s: %.2f seconds" % (name, end-start))

Regular expressions: 1.61 seconds
BeautifulSoup: 10.67 seconds
Lxml: 2.07 seconds


In [118]:
def scrape_callback(url, html):
    """Print the scraped FIELDS values for country detail pages.

    Pages whose URL does not contain ``/view/`` are ignored.

    Args:
        url: address of the downloaded page.
        html: page markup, passed straight to ``lxml.html.fromstring``.
    """
    if not re.search('/view/', url):
        return
    document = lxml.html.fromstring(html)
    row = []
    for field_name in FIELDS:
        selector = 'table > tr#places_{}__row > td.w2p_fw'.format(field_name)
        # First matching value cell for this field.
        row.append(document.cssselect(selector)[0].text_content())
    print(url, row)

In [119]:
import csv 

class ScrapeCallback:
    """Callable that appends scraped country rows to a CSV file.

    The header row is written on construction. Each call with a ``/view/`` URL
    appends one row and flushes it, so data reaches disk even if the caller
    never calls ``close()``. The instance can also be used as a context
    manager, which closes the file on exit.

    Args:
        path: destination CSV file; defaults to ``'countries.csv'``.
    """

    def __init__(self, path='countries.csv'):
        self.fields = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')
        # Keep the handle so it can be flushed and closed. newline='' is what
        # the csv module requires to avoid extra blank lines on platforms that
        # translate line endings; utf-8 makes the output platform-independent.
        self._file = open(path, 'w', newline='', encoding='utf-8')
        self.writer = csv.writer(self._file)
        self.writer.writerow(self.fields)
        self._file.flush()

    def __call__(self, url, html):
        """Scrape one page and append its row; non-``/view/`` URLs are ignored.

        Raises:
            IndexError: if the selector for a field matches nothing.
        """
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
            self.writer.writerow(row)
            # Flush per row so a crawl that dies midway keeps what it scraped.
            self._file.flush()

    def close(self):
        """Close the underlying CSV file; safe to call more than once."""
        self._file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Returning False lets any exception from the with-body propagate.
        return False

In [122]:
def link_crawler(seed_url, link_regex, max_depth=2, scrape_callback=None,
                 user_agent='victor', delay=3):
    """Crawl from ``seed_url``, following links that match ``link_regex``.

    Args:
        seed_url: URL the crawl starts from; also the base used to resolve
            links and to locate the site's robots.txt.
        link_regex: pattern (``re.search``) a link must match to be followed.
        max_depth: pages at this depth are downloaded but their links are not
            followed. The seed is depth 0.
        scrape_callback: optional callable invoked as
            ``scrape_callback(url, html)`` for every downloaded page.
        user_agent: agent name used for the robots.txt check and passed to
            ``download``.
        delay: value handed to ``Throttle``. Presumably the minimum number of
            seconds between requests — confirm against common.utils.

    Returns:
        dict mapping every URL discovered to the depth it was found at.
    """
    crawl_queue = [seed_url]
    seen = {seed_url: 0}
    throttle = Throttle(delay)
    rp = urlrp.RobotFileParser()
    # Use the robots.txt of the site actually being crawled rather than a
    # hard-coded host.
    rp.set_url(urlparse.urljoin(seed_url, '/robots.txt'))
    rp.read()
    while crawl_queue:
        # pop() takes from the end of the list, so the most recently
        # discovered link is visited first.
        url = crawl_queue.pop()
        depth = seen[url]
        if not rp.can_fetch(user_agent, url):
            print('blocked by robots.txt, ', url)
            continue
        throttle.wait(url)
        html = download(url, user_agent)
        if html is None:
            # NOTE(review): download's failure contract is not visible here;
            # if it signals failure with None, skip the page instead of
            # crashing in the callback or on .decode().
            continue
        if scrape_callback:
            scrape_callback(url, html)
        if depth == max_depth:
            # Depth limit reached: keep the page but do not follow its links.
            continue
        for link in get_links(html.decode()):
            # skip all login pages
            if re.search('login|register', link):
                continue
            if re.search(link_regex, link):
                # form absolute link
                link = urlparse.urljoin(seed_url, link)
                # check if this link is already seen
                if link not in seen:
                    seen[link] = depth + 1
                    crawl_queue.append(link)

    return seen

In [123]:
# Crawl from the site root, following links that match /(index|view)/ with the default max_depth;
# scrape_callback prints the url and scraped row for each /view/ page.
all_links = link_crawler('http://example.webscraping.com', '/(index|view)/', scrape_callback=scrape_callback)

Downloading:  http://example.webscraping.com
Downloading:  http://example.webscraping.com/places/default/index/1
Downloading:  http://example.webscraping.com/places/default/index/2
Downloading:  http://example.webscraping.com/places/default/index/0
Downloading:  http://example.webscraping.com/places/default/view/Barbados-20
http://example.webscraping.com/places/default/view/Barbados-20 ['431 square kilometres', '285,653', 'BB', 'Barbados', 'Bridgetown', 'NA', '.bb', 'BBD', 'Dollar', '+1-246', 'BB#####', '^(?:BB)*(\\d{5})$', 'en-BB', ' ']
Downloading:  http://example.webscraping.com/places/default/view/Bangladesh-19
http://example.webscraping.com/places/default/view/Bangladesh-19 ['144,000 square kilometres', '156,118,464', 'BD', 'Bangladesh', 'Dhaka', 'AS', '.bd', 'BDT', 'Taka', '880', '####', '^(\\d{4})$', 'bn-BD,en', 'MM IN ']
Downloading:  http://example.webscraping.com/places/default/view/Bahrain-18
http://example.webscraping.com/places/default/view/Bahrain-18 ['665 square kilometr

In [124]:
# Same crawl, but each /view/ page's row is written to countries.csv by a ScrapeCallback instance.
# NOTE(review): ScrapeCallback opens countries.csv in its constructor and nothing here closes it.
all_links = link_crawler('http://example.webscraping.com', '/(index|view)/', scrape_callback=ScrapeCallback())

Downloading:  http://example.webscraping.com
Downloading:  http://example.webscraping.com/places/default/index/1
Downloading:  http://example.webscraping.com/places/default/index/2
Downloading:  http://example.webscraping.com/places/default/index/0
Downloading:  http://example.webscraping.com/places/default/view/Barbados-20
Downloading:  http://example.webscraping.com/places/default/view/Bangladesh-19
Downloading:  http://example.webscraping.com/places/default/view/Bahrain-18
Downloading:  http://example.webscraping.com/places/default/view/Bahamas-17
Downloading:  http://example.webscraping.com/places/default/view/Azerbaijan-16
Downloading:  http://example.webscraping.com/places/default/view/Austria-15
Downloading:  http://example.webscraping.com/places/default/view/Australia-14
Downloading:  http://example.webscraping.com/places/default/view/Aruba-13
Downloading:  http://example.webscraping.com/places/default/view/Armenia-12
Downloading:  http://example.webscraping.com/places/default/