run.py
from scrapy.command import ScrapyCommand
from scrapy.core.manager import scrapymanager
from scrapy.core.engine import scrapyengine
from scrapy.spider import spiders
from scrapy.http import Request
from scrapy import log
from datetime import datetime, timedelta, date
import djangoscraper.utils.timetext as timetext
from djangoscraper.models import Task
from scrapy.xlib.pydispatch import dispatcher
from scrapy.core import signals
import subprocess, os, time, signal
from scrapy.conf import settings
if settings.get('MEMDEBUG_WITH_GUPPY', False):
    try:
        import guppy
    except ImportError:
        guppy = False
        log.msg('Could not import Guppy module.', level=log.ERROR)
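# Enabling the optional guppy/heapy memory debugging above is controlled by a
# project setting. A minimal sketch of the (hypothetical) settings.py entry,
# using the setting name read in this file:
#
#     MEMDEBUG_WITH_GUPPY = True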
interrupted = False

def interrupt(signum, frame):
    # SIGINT handler: let the current task finish, then stop the polling loop
    global interrupted
    print '\nReceived cancel request, waiting for item to finish execution'
    interrupted = True
class Command(ScrapyCommand):

    def syntax(self):
        return "[options] <domain>"

    def short_desc(self):
        return "Run scraper for specific domain."

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("--server", dest="server", action="store_true", help="Run a scraper server that polls for tasks and runs them.")
        parser.add_option("--demonize", dest="demonize", action="store_true", help="Run scrapy as a daemon process, continuously processing the spider's tasks.")
        parser.add_option("--all", dest="all", action="store_true", help="Run all available tasks for the spider.")
        parser.add_option('--interval', dest='interval', help="Polling interval for executing the spider in daemon mode.")
        parser.add_option('--priority', dest='priority', help="Priority of the task that you want to execute.")
        parser.add_option('--task-name', dest='task_name', help="Name of the task that you would like to execute.", default=None)
        parser.add_option('--task-id', dest='task_id', help="Id of the task that you would like to execute.", default=None)
        parser.add_option('--child-logfile', dest='child_logfile', help="Log the output of child processes to this file.", default=None)
        parser.add_option("--child", dest="child", action="store_true", help="Make this process a child.")
    def run(self, args, opts):
        if opts.server:
            # Server mode: poll for tasks until a SIGINT is received
            global interrupted
            signal.signal(signal.SIGINT, interrupt)
            while not interrupted:
                self._loop(args, opts)
        else:
            self.execute(args, opts)
    def _loop(self, args, opts):
        if settings.get('MEMDEBUG_WITH_GUPPY', False) and guppy:
            heapy = guppy.hpy()
        # Grab the next unlocked, uncompleted task, if any
        task = Task().next(locked=0, completed=0)
        if task:
            task.lock()
            # Run the task in a child process so a crash does not take the server down
            cmd = ['python', os.path.join(os.getcwd(), 'scrapy-ctl.py'), 'run']
            cmd.append('--task-id=%s' % task.id)
            if opts.child_logfile:
                cmd.append('--logfile=%s' % opts.child_logfile)
            cmd.append('--child')
            task.start = datetime.now()
            process = subprocess.Popen(cmd, shell=False, stderr=subprocess.PIPE, stdout=subprocess.PIPE, close_fds=True)
            task.result, task.errors = process.communicate()
            task.finish = datetime.now()
            task.completed = 1
            task.save()
            timetext.LANG = 'en'
            total = task.finish - task.start
            log.msg('Finished: %s(%s) in %s' % (task.name, task.id, timetext.stringify(total)), level=log.INFO, domain=task.domain)
            if settings.get('MEMDEBUG_WITH_GUPPY', False) and guppy:
                log.msg(heapy.heap(), level=log.DEBUG)
                heapy.setref()
        else:
            # No pending tasks: wait before polling again
            time.sleep(30)
    def execute(self, args, opts):
        task = None
        if opts.task_id:
            task = Task().load(id=opts.task_id)
        if opts.task_name:
            task = Task().next(name=opts.task_name)
        if task or len(args):
            if task:
                domain = task.domain
            else:
                domain = args[0]
            spider = spiders.fromdomain(domain)
            scrapymanager.configure()
            if opts.child:
                def _stop():
                    pass
                # Monkeypatch stop() to prevent the child from stopping the engine prematurely
                scrapymanager.stop = _stop
            # Only lock when a task was actually found; running with a bare domain leaves task as None
            if task and not task.locked:
                task.lock()
            self.crawl(spider, task)
            scrapyengine.start()
        else:
            log.msg('You must specify at least one domain', level=log.ERROR)
    def crawl(self, spider, task):
        ''' Crawl task on specific spider '''
        spider.load(task)
        scrapymanager.crawl(*spider.start_urls)
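
# Example invocations (a sketch; assumes the project's scrapy-ctl.py entry point
# referenced above and the options defined in add_options -- the domain, task id,
# and log file name are hypothetical placeholders):
#
#     python scrapy-ctl.py run example.com                           # crawl one domain
#     python scrapy-ctl.py run --task-id=42 --child                  # run a stored task as a child process
#     python scrapy-ctl.py run --server --child-logfile=child.log    # poll for tasks until Ctrl-C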