@@ -35,7 +35,7 @@ public interface IJobDispatcher : IRunnerService
35
35
// This implementation of IJobDispatcher is not thread safe.
36
36
// It is based on the fact that the current design of the runner is a dequeue
37
37
// and processes one message from the message queue at a time.
38
- // In addition, it only executes one job every time,
38
+ // In addition, it only executes one job every time,
39
39
// and the server will not send another job while this one is still running.
40
40
public sealed class JobDispatcher : RunnerService , IJobDispatcher
41
41
{
@@ -546,13 +546,27 @@ await processChannel.SendAsync(
546
546
Trace . Info ( $ "Return code { returnCode } indicate worker encounter an unhandled exception or app crash, attach worker stdout/stderr to JobRequest result.") ;
547
547
548
548
var jobServer = await InitializeJobServerAsync ( systemConnection ) ;
549
- await LogWorkerProcessUnhandledException ( jobServer , message , detailInfo ) ;
550
-
551
- // Go ahead to finish the job with result 'Failed' if the STDERR from worker is System.IO.IOException, since it typically means we are running out of disk space.
552
- if ( detailInfo . Contains ( typeof ( System . IO . IOException ) . ToString ( ) , StringComparison . OrdinalIgnoreCase ) )
549
+ var unhandledExceptionIssue = new Issue ( ) { Type = IssueType . Error , Message = detailInfo } ;
550
+ unhandledExceptionIssue . Data [ Constants . Runner . InternalTelemetryIssueDataKey ] = Constants . Runner . WorkerCrash ;
551
+ switch ( jobServer )
553
552
{
554
- Trace . Info ( $ "Finish job with result 'Failed' due to IOException.") ;
555
- await ForceFailJob ( jobServer , message , detailInfo ) ;
553
+ case IJobServer js :
554
+ {
555
+ await LogWorkerProcessUnhandledException ( js , message , unhandledExceptionIssue ) ;
556
+ // Go ahead to finish the job with result 'Failed' if the STDERR from worker is System.IO.IOException, since it typically means we are running out of disk space.
557
+ if ( detailInfo . Contains ( typeof ( System . IO . IOException ) . ToString ( ) , StringComparison . OrdinalIgnoreCase ) )
558
+ {
559
+ Trace . Info ( $ "Finish job with result 'Failed' due to IOException.") ;
560
+ await ForceFailJob ( js , message ) ;
561
+ }
562
+
563
+ break ;
564
+ }
565
+ case IRunServer rs :
566
+ await ForceFailJob ( rs , message , unhandledExceptionIssue ) ;
567
+ break ;
568
+ default :
569
+ throw new NotSupportedException ( $ "JobServer type '{ jobServer . GetType ( ) . Name } ' is not supported.") ;
556
570
}
557
571
}
558
572
@@ -644,7 +658,7 @@ await processChannel.SendAsync(
644
658
}
645
659
}
646
660
647
- // wait worker to exit
661
+ // wait worker to exit
648
662
// if worker doesn't exit within timeout, then kill worker.
649
663
completedTask = await Task . WhenAny ( workerProcessTask , Task . Delay ( - 1 , workerCancelTimeoutKillToken ) ) ;
650
664
@@ -1131,86 +1145,70 @@ private async Task CompleteJobRequestAsync(int poolId, Pipelines.AgentJobRequest
1131
1145
}
1132
1146
1133
1147
// log an error issue to job level timeline record
1134
- private async Task LogWorkerProcessUnhandledException ( IRunnerService server , Pipelines . AgentJobRequestMessage message , string detailInfo )
1148
+ private async Task LogWorkerProcessUnhandledException ( IJobServer jobServer , Pipelines . AgentJobRequestMessage message , Issue issue )
1135
1149
{
1136
- if ( server is IJobServer jobServer )
1150
+ try
1137
1151
{
1138
- try
1139
- {
1140
- var timeline = await jobServer . GetTimelineAsync ( message . Plan . ScopeIdentifier , message . Plan . PlanType , message . Plan . PlanId , message . Timeline . Id , CancellationToken . None ) ;
1141
- ArgUtil . NotNull ( timeline , nameof ( timeline ) ) ;
1152
+ var timeline = await jobServer . GetTimelineAsync ( message . Plan . ScopeIdentifier , message . Plan . PlanType , message . Plan . PlanId , message . Timeline . Id , CancellationToken . None ) ;
1153
+ ArgUtil . NotNull ( timeline , nameof ( timeline ) ) ;
1142
1154
1143
- TimelineRecord jobRecord = timeline . Records . FirstOrDefault ( x => x . Id == message . JobId && x . RecordType == "Job" ) ;
1144
- ArgUtil . NotNull ( jobRecord , nameof ( jobRecord ) ) ;
1155
+ TimelineRecord jobRecord = timeline . Records . FirstOrDefault ( x => x . Id == message . JobId && x . RecordType == "Job" ) ;
1156
+ ArgUtil . NotNull ( jobRecord , nameof ( jobRecord ) ) ;
1145
1157
1146
- var unhandledExceptionIssue = new Issue ( ) { Type = IssueType . Error , Message = detailInfo } ;
1147
- unhandledExceptionIssue . Data [ Constants . Runner . InternalTelemetryIssueDataKey ] = Constants . Runner . WorkerCrash ;
1148
- jobRecord . ErrorCount ++ ;
1149
- jobRecord . Issues . Add ( unhandledExceptionIssue ) ;
1150
1158
1151
- if ( message . Variables . TryGetValue ( "DistributedTask.MarkJobAsFailedOnWorkerCrash" , out var markJobAsFailedOnWorkerCrash ) &&
1152
- StringUtil . ConvertToBoolean ( markJobAsFailedOnWorkerCrash ? . Value ) )
1153
- {
1154
- Trace . Info ( "Mark the job as failed since the worker crashed" ) ;
1155
- jobRecord . Result = TaskResult . Failed ;
1156
- // mark the job as completed so service will pickup the result
1157
- jobRecord . State = TimelineRecordState . Completed ;
1158
- }
1159
+ jobRecord . ErrorCount ++ ;
1160
+ jobRecord . Issues . Add ( issue ) ;
1159
1161
1160
- await jobServer . UpdateTimelineRecordsAsync ( message . Plan . ScopeIdentifier , message . Plan . PlanType , message . Plan . PlanId , message . Timeline . Id , new TimelineRecord [ ] { jobRecord } , CancellationToken . None ) ;
1161
- }
1162
- catch ( Exception ex )
1162
+ if ( message . Variables . TryGetValue ( "DistributedTask.MarkJobAsFailedOnWorkerCrash" , out var markJobAsFailedOnWorkerCrash ) &&
1163
+ StringUtil . ConvertToBoolean ( markJobAsFailedOnWorkerCrash ? . Value ) )
1163
1164
{
1164
- Trace . Error ( "Fail to report unhandled exception from Runner.Worker process" ) ;
1165
- Trace . Error ( ex ) ;
1165
+ Trace . Info ( "Mark the job as failed since the worker crashed" ) ;
1166
+ jobRecord . Result = TaskResult . Failed ;
1167
+ // mark the job as completed so service will pickup the result
1168
+ jobRecord . State = TimelineRecordState . Completed ;
1166
1169
}
1170
+
1171
+ await jobServer . UpdateTimelineRecordsAsync ( message . Plan . ScopeIdentifier , message . Plan . PlanType , message . Plan . PlanId , message . Timeline . Id , new TimelineRecord [ ] { jobRecord } , CancellationToken . None ) ;
1167
1172
}
1168
- else
1173
+ catch ( Exception ex )
1169
1174
{
1170
- Trace . Info ( "Job server does not support handling unhandled exception yet, error message: {0}" , detailInfo ) ;
1171
- return ;
1175
+ Trace . Error ( "Fail to report unhandled exception from Runner.Worker process" ) ;
1176
+ Trace . Error ( ex ) ;
1172
1177
}
1173
1178
}
1174
1179
1175
1180
// raise job completed event to fail the job.
1176
- private async Task ForceFailJob ( IRunnerService server , Pipelines . AgentJobRequestMessage message , string detailInfo )
1181
+ private async Task ForceFailJob ( IJobServer jobServer , Pipelines . AgentJobRequestMessage message )
1177
1182
{
1178
- if ( server is IJobServer jobServer )
1183
+ try
1179
1184
{
1180
- try
1181
- {
1182
- var jobCompletedEvent = new JobCompletedEvent ( message . RequestId , message . JobId , TaskResult . Failed ) ;
1183
- await jobServer . RaisePlanEventAsync < JobCompletedEvent > ( message . Plan . ScopeIdentifier , message . Plan . PlanType , message . Plan . PlanId , jobCompletedEvent , CancellationToken . None ) ;
1184
- }
1185
- catch ( Exception ex )
1186
- {
1187
- Trace . Error ( "Fail to raise JobCompletedEvent back to service." ) ;
1188
- Trace . Error ( ex ) ;
1189
- }
1185
+ var jobCompletedEvent = new JobCompletedEvent ( message . RequestId , message . JobId , TaskResult . Failed ) ;
1186
+ await jobServer . RaisePlanEventAsync < JobCompletedEvent > ( message . Plan . ScopeIdentifier , message . Plan . PlanType , message . Plan . PlanId , jobCompletedEvent , CancellationToken . None ) ;
1190
1187
}
1191
- else if ( server is IRunServer runServer )
1188
+ catch ( Exception ex )
1192
1189
{
1193
- try
1194
- {
1195
- var unhandledExceptionIssue = new Issue ( ) { Type = IssueType . Error , Message = detailInfo } ;
1196
- var unhandledAnnotation = unhandledExceptionIssue . ToAnnotation ( ) ;
1197
- var jobAnnotations = new List < Annotation > ( ) ;
1198
- if ( unhandledAnnotation . HasValue )
1199
- {
1200
- jobAnnotations . Add ( unhandledAnnotation . Value ) ;
1201
- }
1190
+ Trace . Error ( "Fail to raise JobCompletedEvent back to service." ) ;
1191
+ Trace . Error ( ex ) ;
1192
+ }
1193
+ }
1202
1194
1203
- await runServer . CompleteJobAsync ( message . Plan . PlanId , message . JobId , TaskResult . Failed , outputs : null , stepResults : null , jobAnnotations : jobAnnotations , environmentUrl : null , CancellationToken . None ) ;
1204
- }
1205
- catch ( Exception ex )
1195
+ private async Task ForceFailJob ( IRunServer runServer , Pipelines . AgentJobRequestMessage message , Issue issue )
1196
+ {
1197
+ try
1198
+ {
1199
+ var annotation = issue . ToAnnotation ( ) ;
1200
+ var jobAnnotations = new List < Annotation > ( ) ;
1201
+ if ( annotation . HasValue )
1206
1202
{
1207
- Trace . Error ( "Fail to raise job completion back to service." ) ;
1208
- Trace . Error ( ex ) ;
1203
+ jobAnnotations . Add ( annotation . Value ) ;
1209
1204
}
1205
+
1206
+ await runServer . CompleteJobAsync ( message . Plan . PlanId , message . JobId , TaskResult . Failed , outputs : null , stepResults : null , jobAnnotations : jobAnnotations , environmentUrl : null , CancellationToken . None ) ;
1210
1207
}
1211
- else
1208
+ catch ( Exception ex )
1212
1209
{
1213
- throw new NotSupportedException ( $ "Server type { server . GetType ( ) . FullName } is not supported.") ;
1210
+ Trace . Error ( "Fail to raise job completion back to service." ) ;
1211
+ Trace . Error ( ex ) ;
1214
1212
}
1215
1213
}
1216
1214
0 commit comments