Permalink
Browse files

Merge pull request #1 from erochest/semantic-markup

Updated the scraper to match Trove's change to more semantic markup (`<em>` and `<strong>` elements in place of `<i>` and `<b>`).
  • Loading branch information...
wragge committed Mar 6, 2012
2 parents 44f3e76 + bddefd3 commit f659fc7ab49b1eed74bda3e1523c47d3c9fa8393
Showing 1 changed file with 7 additions and 7 deletions.
  1. +7 −7 scrape.py
View
@@ -381,26 +381,26 @@ def extract_details(self, result):
article['id'] = article_id
article['url'] = url
publication_fields = result.find('dd', 'sourcedate')
if '(' in publication_fields.i.string:
if ')' in publication_fields.i.string:
if '(' in publication_fields.em.string:
if ')' in publication_fields.em.string:
newspaper_title, newspaper_details = (re.search(r'(.*?) \((.*?)\)',
publication_fields
.i.string.strip())
.em.string.strip())
.groups())
# Sometimes long titles are truncated so the full details aren't there
# Remove the fragment after the open bracket
else:
newspaper_title = (re.search(r'(.*?) \(',
publication_fields
.i.string.strip())
.em.string.strip())
.group(1))
newspaper_details = ''
else:
newspaper_title = publication_fields.i.string.strip()
newspaper_title = publication_fields.em.string.strip()
newspaper_details = ''
article['newspaper_title'] = newspaper_title
article['newspaper_details'] = newspaper_details
article['issue_date'] = publication_fields.b.string.strip()
article['issue_date'] = publication_fields.strong.string.strip()
article['issue_year'], article['issue_month'], article['issue_day'] = extract_date(article['issue_date'])
article['page'], article['type'] = (re.search(r'(\d+) (.*)',
publication_fields.contents[3]
@@ -561,4 +561,4 @@ class ServerError(Exception):
print np.total_results
print np.results

0 comments on commit f659fc7

Please sign in to comment.