Commit

Added more fields
Nicolas Vila committed May 6, 2015
1 parent 1edfcc3 commit fad51f3
Showing 1 changed file with 67 additions and 55 deletions.
122 changes: 67 additions & 55 deletions scraper.py
@@ -15,64 +15,76 @@
email_regex = re.compile(r'(\b[\w.]+@+[\w.]+.+[\w.]\b)')
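# Example (assumed onclick shape): for an attribute value containing
# "window.location='mailto:info@example.com';", email_regex.findall() returns ['info@example.com'].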

def get_url(url):
    """Fetches a page with the scraper's headers and returns it as a PyQuery document."""
    req = urllib2.Request(url, None, header)
    response = urllib2.urlopen(req)
    root = pq(response.read().decode('utf-8'))
    return root

def parse_field(element):
    """Returns the part of the element's HTML after the first ":", or the whole HTML if there is none."""
    field_string = element.html()
    if ":" in field_string:
        field_string = field_string.split(":")[1]
    return field_string
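# Example: if element.html() is "Phone: +1 555 0100", parse_field returns " +1 555 0100".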

def strip_tags(value):
    """Returns the given HTML with all tags stripped."""
    if value:
        return re.sub(r'<[^>]*?>', '', value)
    return ""

def parse_list(root):
    """Takes a listing page and indexes all the listings in it."""
    for el in root(".listing a.property_title"):
        page_url = "http://www.tripadvisor.com" + el.get("href")
        print "Url: %s" % page_url
        page = get_url(page_url)

        # Extract the fields of interest from the detail page.
        name = strip_tags(page("#HEADING_GROUP h1").html())
        ranking = page(".sprite-ratings").attr("content")
        #activity = strip_tags(page(".row-fluid *[itemprop=title]").html())
        address = strip_tags(page(".format_address").html())
        #url = strip_tags(page(".row-fluid .row-fluid *[itemprop=url] a").attr("href"))
        telephone = strip_tags(page(".sprite-greenPhone").next().html())
        email_raw = strip_tags(page(".sprite-greenEmail").next().attr("onclick"))
        email = email_regex.findall(email_raw)
        if email:
            email = email[0]
        description = strip_tags(page(".listing_description").html()).strip()[:1200]

        print email
        data = {
            'name': name,
            'source_url': page_url,
            #'url': url,
            'ranking': ranking,
            'email': email,
            #'activity': activity,
            'address': address,
            'telephone': telephone,
            'description': description,
        }
        # Save the record; unique_keys makes repeated runs update the same row instead of duplicating it.
        scraperwiki.sqlite.save(unique_keys=['source_url'], data=data, table_name="tripadvisor_SOT")


def parse_listing_pages(start_url):
    # Now iterate over the pages
    count = 0
    while True:
        url = start_url % (count)  # targets each page in the list
        print "On page %s" % url
        root = get_url(url)

        # Check if there are items; if not, stop since we have gone past the last page
        if not root(".listing"):
            print "Reached end at page %s" % count
            break

        # Parse every listing on the current page
        parse_list(root)
        print "Finished page %s" % count
        count = count + 30

start_url = "http://www.tripadvisor.com/AttractionsAjax-g186378?cat=25&o=a%s&sortOrder=popularity"
parse_listing_pages(start_url)
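As a rough illustration (not part of the commit): the %s placeholder in start_url is the Ajax offset that parse_listing_pages advances by 30 per page. A minimal standalone sketch, in the same Python 2 style as the scraper, that only prints the first few URLs the loop would request:

# Illustrative only: preview the paginated Ajax URLs without fetching anything.
preview_url = "http://www.tripadvisor.com/AttractionsAjax-g186378?cat=25&o=a%s&sortOrder=popularity"
for offset in (0, 30, 60):
    print "Would request: %s" % (preview_url % offset)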

