Skip to content

Commit

Permalink
no message
Browse files Browse the repository at this point in the history
  • Loading branch information
torhagl committed Apr 6, 2017
1 parent 2286f78 commit 3755af7
Show file tree
Hide file tree
Showing 6 changed files with 292 additions and 4 deletions.
11 changes: 11 additions & 0 deletions .idea/doin_the_do_on_gat.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

234 changes: 234 additions & 0 deletions .idea/workspace.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

39 changes: 35 additions & 4 deletions scraper.py
@@ -1,21 +1,52 @@
# This is a template for a Python scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful

import scraperwiki
from bs4 import BeautifulSoup
import json

# Scrape one news article from gat.no, extract its metadata and body text,
# and persist the record to the morph.io sqlite database.
result = {}
html = scraperwiki.scrape("http://www.gat.no/nyheter/tok-ni-for-fart-og-en-for-mobilbruk-1.1802906")
bs = BeautifulSoup(html, "html5lib")
article = bs.find("article", {"class": "full_article"})

# "index" is constant so re-running the scraper overwrites the same row —
# it is the unique key in the sqlite.save() call below.
result["index"] = "1"
header = article.find("header")
result["overskrift"] = header.find("h1").text.strip()  # headline
result["ingress"] = header.find("h2", {"class": "light"}).text.strip()  # lede
result["publiseringsdato"] = header.find("time", {"class": "op-published"})["datetime"]  # published timestamp
result["oppdateringsdato"] = header.find("time", {"class": "op-modified"})["datetime"]  # last-modified timestamp
result["forfatter"] = header.find("ul", {"class": "author_details"}).text.strip()  # author byline

# Remove the header so the remaining <p> tags belong to the article body only.
header.extract()
# Collect paragraph texts and join once: repeated "body += ..." is quadratic.
# (The old "elif p.string:" branch was unreachable — NavigableString content
# is already included in .text — so it is dropped; the spurious trailing
# space the old accumulator left on the stored body is dropped too.)
paragraphs = [p.text.strip() for p in article("p") if p.text]
body = " ".join(paragraphs)
if body:
    result["body"] = body
print(json.dumps(result, indent=1))

# Write out to the sqlite database using the scraperwiki library.
scraperwiki.sqlite.save(unique_keys=["index"], data=result, table_name="data")

# Read everything back as a quick sanity check of what was stored.
rows = scraperwiki.sql.select("* from data")
print(rows)

# You don't have to do things with the ScraperWiki and lxml libraries.
# You can use whatever libraries you want: https://morph.io/documentation/python
Expand Down
Binary file added scraperwiki.sqlite
Binary file not shown.

0 comments on commit 3755af7

Please sign in to comment.