In [6]:
import json
import string
import sys
sys.path.append('../')
from common.utils import *
from common.db_cache import DBCache
import time

# Column names of a country record on the example site (not used in this cell).
FIELDS = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')

# AJAX search endpoint; the placeholders are (page index, search term).
template_url = 'http://example.webscraping.com/places/ajax/search.json?page={}&page_size=10&search_term={}'
countries = set()
download = Downloader(cache=DBCache())

# Search for every lowercase ASCII letter and walk all result pages of each
# search, collecting the unique country names that come back.
start = time.time()
for letter in string.ascii_lowercase:
    page = 0
    while True:
        html = download(template_url.format(page, letter))
        try:
            ajax = json.loads(html)
        except (ValueError, TypeError) as e:
            # ValueError: the body was not valid JSON.
            # TypeError: the body was not a string at all (e.g. None) --
            # presumably what the downloader yields on a failed request;
            # TODO confirm against common.utils.Downloader.
            print(e)
            ajax = None
        else:
            for record in ajax['records']:
                countries.add(record['country'])
        page += 1
        # Stop on a bad response or once the reported page count is reached.
        if ajax is None or page >= ajax['num_pages']:
            break

# The context manager guarantees the file is flushed and closed, even if the
# write raises.
with open('countries.txt', 'w') as countries_file:
    countries_file.write('\n'.join(sorted(countries)))
end = time.time()
print("1st download: %.2f seconds" % (end-start))

Downloading:  http://example.webscraping.com/places/ajax/search.json?page=0&page_size=10&search_term=a
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=1&page_size=10&search_term=a
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=2&page_size=10&search_term=a
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=3&page_size=10&search_term=a
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=4&page_size=10&search_term=a
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=5&page_size=10&search_term=a
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=6&page_size=10&search_term=a
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=7&page_size=10&search_term=a
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=8&page_size=10&search_term=a
Downloading:  http://example.webscraping.com/places/ajax/search.json?page

Downloading:  http://example.webscraping.com/places/ajax/search.json?page=13&page_size=10&search_term=i
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=14&page_size=10&search_term=i
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=15&page_size=10&search_term=i
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=0&page_size=10&search_term=j
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=0&page_size=10&search_term=k
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=1&page_size=10&search_term=k
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=2&page_size=10&search_term=k
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=0&page_size=10&search_term=l
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=1&page_size=10&search_term=l
Downloading:  http://example.webscraping.com/places/ajax/search.json?p

Downloading:  http://example.webscraping.com/places/ajax/search.json?page=1&page_size=10&search_term=t
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=2&page_size=10&search_term=t
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=3&page_size=10&search_term=t
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=4&page_size=10&search_term=t
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=5&page_size=10&search_term=t
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=6&page_size=10&search_term=t
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=7&page_size=10&search_term=t
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=8&page_size=10&search_term=t
Downloading:  http://example.webscraping.com/places/ajax/search.json?page=9&page_size=10&search_term=t
Downloading:  http://example.webscraping.com/places/ajax/search.json?page

In [13]:
import csv

# Fetch every country with a single request (page_size=1000, search term '.')
# and write one country name per CSV row.
start = time.time()
# Download and parse before opening the output file, so a failed request does
# not leave behind a truncated, empty countries.csv.
html = download('http://example.webscraping.com/places/ajax/search.json?page=0&page_size=1000&search_term=.')
ajax = json.loads(html)
# The context manager closes the file even if writing a row raises.
# NOTE(review): on Python 3 the csv docs recommend open(..., newline='') to
# avoid blank rows on Windows -- not applied here in case this notebook still
# has to run on Python 2.
with open('countries.csv', 'w') as f:
    writer = csv.writer(f)
    for record in ajax['records']:
        writer.writerow([record['country']])
end = time.time()
print("1st download: %.2f seconds" % (end-start))

1st download: 0.00 seconds


In [1]:
try:
    from PySide.QtGui import QApplication
    from PySide.QtCore import QUrl, QEventLoop, QTimer
    from PySide.QtWebKit import QWebView
except ImportError:
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtCore import QUrl, QEventLoop, QTimer
    from PyQt5.QtWebKitWidgets import QWebView
    
# Create the Qt application and a WebKit view, then run a local event loop
# that quits when the view emits loadFinished for the search page.
app = QApplication([])
webview = QWebView()
loop = QEventLoop()
webview.loadFinished.connect(loop.quit)
webview.load(QUrl('http://example.webscraping.com/places/default/search'))
loop.exec_()

# Show the view and drive the search form through the page's main frame:
# set the search term to '.', rewrite the text of the currently selected
# page-size option to '1000', then click the search button via JavaScript.
webview.show()
frame = webview.page().mainFrame()
frame.findFirstElement('#search_term').setAttribute('value', '.')
frame.findFirstElement('#page_size option:checked').setPlainText('1000')
frame.findFirstElement('#search').evaluateJavaScript('this.click()')
# NOTE(review): this enters the application's main event loop; presumably it
# does not return until the window is closed or the app quits, so the polling
# below would only start afterwards -- confirm this is intended.
app.exec_()

# Poll the frame for result links, pumping Qt events between attempts.
# NOTE(review): there is no timeout -- if '#results a' never matches anything
# this loop spins forever.
elements = None
while not elements:
    app.processEvents()
    elements = frame.findAllElements('#results a')
# Extract the stripped link text of each result element.
countries = [e.toPlainText().strip() for e in elements]
print(countries)

['Afghanistan', 'Aland Islands', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla', 'Antarctica', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia', 'Bonaire, Saint Eustatius and Saba', 'Bosnia and Herzegovina', 'Botswana', 'Bouvet Island', 'Brazil', 'British Indian Ocean Territory', 'British Virgin Islands', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands', 'Central African Republic', 'Chad', 'Chile', 'China', 'Christmas Island', 'Cocos Islands', 'Colombia', 'Comoros', 'Cook Islands', 'Costa Rica', 'Croatia', 'Cuba', 'Curacao', 'Cyprus', 'Czech Republic', 'Democratic Republic of the Congo', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'East Timor', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', '