-
Notifications
You must be signed in to change notification settings - Fork 0
/
dispatch.py
executable file
·184 lines (156 loc) · 6.48 KB
/
dispatch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/usr/bin/env python
import sys, os, signal
import subprocess
import logging
import argparse, ConfigParser
from multiprocessing import Process, active_children
# from threading import Thread
import notify
def runInOtherThread(function, args):
    """Run *function(*args)* in a separate process (fire-and-forget).

    Despite the name, this spawns a multiprocessing.Process, not a thread
    (see the commented-out Thread import at the top of the file).

    Returns the started Process so callers can join() or monitor it;
    existing call sites that ignore the return value are unaffected.
    """
    worker = Process(target=function, args=args)
    worker.start()
    return worker
def checkChildren(sig, frame):
    """SIGCHLD handler.

    Calling active_children() has the documented side effect of joining
    every already-finished child process, which reaps the notifier
    workers spawned via runInOtherThread.
    """
    # The returned list is deliberately discarded; only the join side
    # effect matters here.
    active_children()
def signal_handler(sig, frame):
    """SIGINT handler: log and announce the cancellation, terminate the
    currently running child command, then exit with status 0."""
    # Build the notice once and use it for both the log and the console.
    notice = "job was cancelled by user at %s" % hostname
    logger.debug(notice)
    print(notice)
    # 'process' is the module-level subprocess.Popen handle of the
    # command currently executing in the main loop.
    process.terminate()
    sys.exit(0)
def signal_kill_handler(sig, frame):
    """SIGTERM handler: record the kill in the log, send a notification
    mail, then exit with status 0."""
    msg = "job was kill at %s" % hostname
    logger.debug(msg)
    notify.sendMail("job killed", msg, "dspsr@pacifix")
    sys.exit(0)
def parseCommand(commandString):
    """Split the comma-separated 'commands' config value into a list of
    command template strings."""
    return commandString.split(",")
def parseOptions():
    """Parse and validate the dispatcher's command-line options.

    Every value-taking option uses nargs=1, so each parsed attribute is a
    one-element list and callers index it with [0].  Exits via
    parser.error() when any of the five mandatory options is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--group', '-g', nargs=1, metavar="num",
                        help='number of files procesesd together')
    parser.add_argument('--total', '-t', nargs=1, metavar="num",
                        help="total parts")
    parser.add_argument('--part', '-p', nargs=1, metavar="num",
                        help="part index")
    parser.add_argument('--job', '-j', nargs=1, metavar="jobFile",
                        help="job file for the pipeline")
    parser.add_argument('--section', '-s', nargs=1, metavar="section",
                        help="section name inside the job file")
    parser.add_argument('--continue', '-c', action="store_true",
                        help="wether to continue on previous process")
    parser.add_argument('--thread', nargs=1, metavar="num",
                        help="thread number, default: none")
    parsed = parser.parse_args()
    mandatory = (parsed.group, parsed.total, parsed.part, parsed.job,
                 parsed.section)
    if any(value is None for value in mandatory):
        parser.error("insufficient arguments")
    return parsed
# ---- configuration and logging setup (top-level script code) ----
# NOTE: this script targets Python 2 (ConfigParser module; integer "/" is
# relied on for list slicing further below).
args = parseOptions()
section = args.section[0]  # nargs=1 => every option value is a 1-element list
# Read the chosen [section] of the INI-style job file into a plain dict.
config = ConfigParser.ConfigParser()
config.read(args.job[0])
configItems = dict(config.items(section))
dataPath = configItems['data_path']
candiatePath = configItems['candidate_path']  # sic: "candiate" typo kept; renaming would touch every use
intermediatePath = configItems['intermediate_path']
logPath = configItems['log_path']
pathPrefix = configItems['path_prefix']
instance = configItems['instance']
sendNotify = configItems['notify']  # string flag, compared to 'true' below
dataFileList = configItems.get('file_list', None)  # optional explicit input-file list
commands = parseCommand(configItems['commands'])  # comma-separated command templates
stackInterval = args.group[0]  # files processed together per command invocation
thisPart = args.part[0]        # index of this node's share of the input
totalParts = args.total[0]     # total number of parts the input is split into
outputPath = candiatePath + '/' + thisPart
timeConsistance = False  # hard-coded off: the per-timestamp segmentation below never runs
# Rotate the log once it grows past ~2 MB, keeping a single .bak generation.
logsFileName = logPath + '/log'
if os.path.isfile(logsFileName):
    if os.path.getsize(logsFileName) > 2000000:
        os.rename(logsFileName, logsFileName + '.bak')
loggerFormat = '%(asctime)-15s %(filename)s %(message)s'
logging.basicConfig(filename=logsFileName , format = loggerFormat, level=logging.DEBUG)
logger = logging.getLogger(__name__)
hostname = os.uname()[1]
# NOTE(review): "hostname + thisPart" concatenates without a separator,
# e.g. "node07" + "3" -> "node073" — presumably an intentional tag; confirm.
logger.debug("job started at %s with parameters %s" % (hostname + thisPart,
             " ".join([stackInterval, thisPart, totalParts, section])))
# if(not os.path.isdir(outputPath)):
# os.mkdir(outputPath)
# os.chdir(outputPath)
if dataFileList is not None:
with open(dataFileList, 'r') as fileListObj:
allFiles = [line.strip() for line in fileListObj]
else:
allFiles = [os.path.join(dataPath, fileName) for fileName in os.listdir(dataPath)]
allFiles.sort()
fileNum = len(allFiles)
groupSize = fileNum/int(totalParts) + (1 if (fileNum % int(totalParts)) > 0 else 0)
groupStart = int(thisPart)*groupSize
groupEnd = groupStart + groupSize
if groupEnd > fileNum:
groupEnd = fileNum
thisGroup = allFiles[groupStart:groupEnd]
segments = []
if timeConsistance == True:
currentTime = ''
for dataFile in thisGroup:
dataTime = dataFile.split('/')[-1].split('_')[0]
if dataTime != currentTime:
segments.append([])
currentTime = dataTime
segments[-1].append(dataFile)
else:
segments = [thisGroup]
# ---- signal wiring and the main processing loop ----
signal.signal(signal.SIGINT, signal_handler)        # Ctrl-C: log + terminate child
signal.signal(signal.SIGTERM, signal_kill_handler)  # kill: log + mail notification
signal.signal(signal.SIGCHLD, checkChildren)        # reap finished worker processes
retryCount = 0  # NOTE(review): never used; 'attempt' below tracks retries
processCount = 0
retryLimit = 10
sender = "gpsearch@numerix0"
if sendNotify == 'true':
    runInOtherThread(notify.sendWeb,
        (thisPart + "@gpsearch", "job {} started".format(section), instance))
for segment in segments:
    stackNum = len(segment)
    stackIdx = 0
    # Walk the segment in stacks of stackInterval files.
    while stackIdx < stackNum:
        stackEnd = stackIdx + int(stackInterval)
        if stackEnd < stackNum:
            stackList = segment[stackIdx:stackEnd]
        else:
            stackList = segment[stackIdx:]  # final, possibly short, stack
        stackIdx = stackEnd
        thisFileList = ' '.join(stackList)
        # Run every configured command template over this stack of files,
        # retrying each command up to retryLimit times.
        for command in commands:
            for attempt in range(retryLimit):
                # Template placeholders: {0}=file list, {1}=intermediate path,
                # {2}=candidate path, {3}=path prefix, {4}=part index.
                cmd = command.format(thisFileList, intermediatePath,
                    candiatePath, pathPrefix, thisPart)
                process = subprocess.Popen(cmd,stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE, shell=True)
                # Stream child stdout line by line until EOF (Python 2 str '').
                for line in iter(process.stdout.readline, ''):
                    sys.stdout.write(line)
                outs, errs = process.communicate()
                if process.returncode != 0:
                    message = "job exits unexpected at %s" % hostname
                    logger.debug(message)
                    logger.debug(errs)
                    logger.debug("retry the command")
                    print(errs)
                else:
                    '''files are successfully procesesd.'''
                    break
            else:
                # for/else: all retryLimit attempts failed — give up on this
                # stack, mail a notification, and skip the remaining commands.
                message = ("max retry reached, skip files: %s at %s"
                    "with command: %s") % (thisFileList, hostname, command)
                logger.debug(message)
                runInOtherThread(notify.sendMail, ("job skipped", message, sender))
                break
        # Progress accounting and optional per-stack web notification.
        processCount += len(stackList)
        message = ("procesesd: {}, remain: {}".format(
            processCount, len(thisGroup) - processCount))
        if sendNotify == 'true':
            runInOtherThread(notify.sendWeb, (thisPart + "@gpsearch", message, instance))
        print(message)
# Final summary: log it and, when enabled, notify by mail and web.
message = ("job stopped at %s after processing %d files "
    "with parameters %s and command: %s ") % (
    hostname, processCount," ".join([stackInterval, thisPart, totalParts, section]), " ".join(commands))
logger.debug(message)
if sendNotify == 'true':
    notify.sendMail("job stopped", message, sender)
    runInOtherThread(notify.sendWeb,
        (thisPart + "@gpsearch", "job {} stopped".format(section), instance))