Skip to content

Commit

Permalink
Add configuration for retryable errors for CRI executor
Browse files Browse the repository at this point in the history
Retry after the error "create containerd task", which should be transient and
is most likely caused by a well-known race in containerd.

Link: containerd/containerd#9160

---
9a6df3eda1f35d458641d6b01b3ef3916e52ca03

Pull Request resolved: #556
  • Loading branch information
koct9i authored and robot-piglet committed May 25, 2024
1 parent 910fad8 commit f17997e
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 6 deletions.
8 changes: 8 additions & 0 deletions yt/yt/library/containers/cri/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,14 @@ void TCriExecutorConfig::Register(TRegistrar registrar)

registrar.Parameter("cpu_period", &TThis::CpuPeriod)
.Default(TDuration::MilliSeconds(100));

registrar.Parameter("retry_error_prefixes", &TThis::RetryErrorPrefixes)
.Default({
// https://github.com/containerd/containerd/pull/9565
"server is not initialized yet",
// https://github.com/containerd/containerd/issues/9160
"failed to create containerd task: failed to create shim task: OCI runtime create failed: runc create failed: unable to create new parent process: namespace path: lstat /proc/0/ns/ipc: no such file or directory: unknown",
});
}

////////////////////////////////////////////////////////////////////////////////
Expand Down
3 changes: 3 additions & 0 deletions yt/yt/library/containers/cri/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ class TCriExecutorConfig
//! Cpu quota period for cpu limits.
TDuration CpuPeriod;

//! Retry requests on generic error with these message prefixes.
std::vector<TString> RetryErrorPrefixes;

REGISTER_YSON_STRUCT(TCriExecutorConfig);

static void Register(TRegistrar registrar);
Expand Down
19 changes: 13 additions & 6 deletions yt/yt/library/containers/cri/cri_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -741,14 +741,21 @@ class TCriExecutor
}
}

//! Builds the callback that decides whether a failed request should be retried.
// NB: this hunk shows removed and added diff lines together without +/- markers;
// the two signatures below are the removed (static) and the added (non-static) one.
static TRetryChecker GetRetryChecker()
TRetryChecker GetRetryChecker()
{
// Removed variant: a function-local static callback that, besides IsRetriableError,
// accepted only a Generic error whose message was exactly the literal below.
static const auto Result = BIND_NO_PROPAGATE([] (const TError& error) {
return IsRetriableError(error) ||
(error.GetCode() == NYT::EErrorCode::Generic &&
error.GetMessage() == "server is not initialized yet");
// Added variant: the callback captures Config_ so that the retryable message
// prefixes are taken from RetryErrorPrefixes instead of being hard-coded.
return BIND_NO_PROPAGATE([config = Config_] (const TError& error) {
// Anything IsRetriableError accepts is retried regardless of the configured prefixes.
if (IsRetriableError(error)) {
return true;
}
// Prefix matching is applied only to errors carrying the Generic error code.
if (error.GetCode() == NYT::EErrorCode::Generic) {
for (const auto& prefix: config->RetryErrorPrefixes) {
// A prefix match (StartsWith), not an exact comparison, is sufficient.
if (error.GetMessage().StartsWith(prefix)) {
return true;
}
}
}
// Neither retriable by IsRetriableError nor matching a configured prefix.
return false;
});
return Result;
}
};

Expand Down

0 comments on commit f17997e

Please sign in to comment.