Skip to content

Commit

Permalink
code to crawl database
Browse files Browse the repository at this point in the history
  • Loading branch information
vijayp committed Jun 15, 2012
1 parent befcdc7 commit 39b9500
Showing 1 changed file with 35 additions and 0 deletions.
35 changes: 35 additions & 0 deletions crawl_moviepostersdb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from harvestman.apps.spider import HarvestMan
from harvestman.lib.common.macros import *
import re

class MyCustomCrawler(HarvestMan):
    """HarvestMan crawler that restricts saving to poster-related URLs.

    The decision is made in :meth:`save_this_url`, which is meant to be
    registered as a callback for HarvestMan's 'save_url_data' event.
    """

    # Substring that must appear in a URL for it to be considered at all.
    _POSTER_MARKER = '/poster'

    # Thumbnail file names of the form ".../t_<part>_<part>.jpg".  The dot
    # before "jpg" is escaped so that it matches a literal "." only; an
    # unescaped dot would accept any character (e.g. "t_1_2xjpg").
    # Compiled once here instead of on every callback invocation.
    _THUMBNAIL_RE = re.compile(r'/t_[^_]+_[^_.]*\.jpg')

    def save_this_url(self, event, *args, **kwargs):
        """Decide whether the URL carried by ``event`` should be saved.

        Args:
            event: Event object passed by the crawler; only its ``url``
                attribute is read.
            *args: Ignored; accepted so the callback signature stays
                compatible with whatever the crawler passes.
            **kwargs: Ignored, for the same reason.

        Returns:
            bool: ``False`` when the URL string does not contain
            '/poster'.  Otherwise ``True`` when the URL is a document, or
            is an image whose URL matches the thumbnail pattern, and
            ``False`` in every other case.

        NOTE(review): presumably a ``True`` return lets HarvestMan go
        ahead with saving and ``False`` suppresses it -- confirm against
        the HarvestMan event documentation.
        """
        # Get the url object and its string form.
        url = event.url
        ustr = str(url)

        # URLs outside the poster section are rejected outright.
        if self._POSTER_MARKER not in ustr:
            return False

        # Documents under the poster section are always accepted.
        if url.is_document():
            return True

        # Images are accepted only when they look like poster thumbnails.
        return bool(url.is_image() and self._THUMBNAIL_RE.search(ustr))


def _run_poster_crawl():
    """Create the custom crawler, hook up its save filter and start it."""
    spider = MyCustomCrawler()
    spider.initialize()
    # The configuration object is fetched as in the original script; its
    # return value is not needed here because nothing modifies it.
    spider.get_config()
    # 'save_url_data' callbacks are invoked just before a URL is saved to
    # disk, so this is where the poster filter gets its say.
    spider.register('save_url_data', spider.save_this_url)
    # Start crawling.
    spider.main()


# Run the crawl only when executed as a script.
if __name__ == "__main__":
    _run_poster_crawl()

0 comments on commit 39b9500

Please sign in to comment.