-# This is a template for a Python scraper on morph.io (https://morph.io)
-# including some code snippets below that you should find helpful
+###############################################################################
+# START HERE: Tutorial 2: Basic scraping and saving to the data store.
+# Follow the actions listed in BLOCK CAPITALS below.
+###############################################################################
-# import scraperwiki
-# import lxml.html
-#
-# # Read in a page
-# html = scraperwiki.scrape("http://foo.com")
-#
-# # Find something on the page using css selectors
-# root = lxml.html.fromstring(html)
-# root.cssselect("div[align='left']")
-#
-# # Write out to the sqlite database using scraperwiki library
-# scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"})
-#
-# # An arbitrary query against the database
-# scraperwiki.sql.select("* from data where 'name'='peter'")
+import scraperwiki
+html = scraperwiki.scrape('https://inmo.ie/6022')  # fetch the page as a string
+print("Click on the ...more link to see the whole page")
+print(html)
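+# -----------------------------------------------------------------------------
+# Aside: scraperwiki.scrape is a thin convenience wrapper around a plain HTTP
+# fetch. A roughly equivalent version using only the standard library might
+# look like the commented-out sketch below (the utf-8 decode is an assumption
+# about the page's encoding):
+# -----------------------------------------------------------------------------
+# import urllib.request
+# html = urllib.request.urlopen('https://inmo.ie/6022').read().decode('utf-8')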
-# You don't have to do things with the ScraperWiki and lxml libraries.
-# You can use whatever libraries you want: https://morph.io/documentation/python
-# All that matters is that your final data is written to an SQLite database
-# called "data.sqlite" in the current working directory which has at least a table
-# called "data".
+# -----------------------------------------------------------------------------
+# 1. Parse the raw HTML to get the interesting bits - the part inside <td> tags.
+# -- UNCOMMENT THE 6 LINES BELOW (i.e. delete the # at the start of the lines)
+# -- CLICK THE 'RUN' BUTTON BELOW
+# Check the 'Console' tab again, and you'll see how we're extracting
+# the HTML that was inside <td></td> tags.
+# We use lxml, a Python library designed for parsing HTML.
+# -----------------------------------------------------------------------------
+
+import lxml.html
+root = lxml.html.fromstring(html)    # turn our HTML into an lxml object
+tds = root.cssselect('td')           # get all the <td> tags
+for td in tds:
+    print(lxml.html.tostring(td))    # the full HTML tag
+    print(td.text)                   # just the text inside the HTML tag
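+# -----------------------------------------------------------------------------
+# css selectors can match much more than a bare tag name. The selectors below
+# are generic examples, not ones checked against this particular page - just a
+# sketch of what's possible:
+# -----------------------------------------------------------------------------
+# rows = root.cssselect('table tr')       # every row of every table
+# links = root.cssselect('a[href]')       # every link that has an href
+# for link in links:
+#     print(link.get('href'), link.text)  # attribute value and link text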
+
+# -----------------------------------------------------------------------------
+# 2. Save the data in the ScraperWiki datastore.
+# -- UNCOMMENT THE THREE LINES BELOW
+# -- CLICK THE 'RUN' BUTTON BELOW
+# Check the 'Data' tab - here you'll see the data saved in the ScraperWiki store.
+# -----------------------------------------------------------------------------
+
+for td in tds:
+    record = {"td": td.text}                 # column name and value
+    scraperwiki.sqlite.save(["td"], record)  # save records one by one; ["td"] is the unique-key list
+
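+# -----------------------------------------------------------------------------
+# To check what ended up in the datastore, you can query it back out. The data
+# is saved into a table called "data" - this query is a sketch, assuming the
+# "td" column we saved above:
+# -----------------------------------------------------------------------------
+# rows = scraperwiki.sql.select("* from data")
+# for row in rows:
+#     print(row["td"])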
+# -----------------------------------------------------------------------------
+# Go back to the Tutorials page and continue to Tutorial 3 to learn about
+# more complex scraping methods.
+# -----------------------------------------------------------------------------