From 290367f4cbafd8276dc8e250f2aa3f236c668d90 Mon Sep 17 00:00:00 2001 From: Michal Simon Date: Mon, 7 Jun 2021 17:47:32 +0200 Subject: [PATCH] [XrdCl] Implement xrdcp --retry. --- docs/man/xrdcp.1 | 5 +++++ src/XrdApps/XrdCpConfig.cc | 2 +- src/XrdCl/XrdClCopy.cc | 3 +++ src/XrdCl/XrdClCopyProcess.cc | 24 +++++++++++++++++++++--- src/XrdCl/XrdClStatus.hh | 5 +++++ 5 files changed, 35 insertions(+), 4 deletions(-) diff --git a/docs/man/xrdcp.1 b/docs/man/xrdcp.1 index 50df7a8d9fc..07c7a57b8de 100644 --- a/docs/man/xrdcp.1 +++ b/docs/man/xrdcp.1 @@ -91,6 +91,11 @@ use \fIproxyaddr\fB:\fIproxyport\fR as a SOCKS4 proxy. Only numerical addresses .RS 5 recursively copy all files starting at the given source directory. +.RE +\fB--retry\fR +.RS 5 +retry failed copy-jobs. + .RE \fB--server\fR .RS 5 diff --git a/src/XrdApps/XrdCpConfig.cc b/src/XrdApps/XrdCpConfig.cc index e67882753e0..6822c2dfccd 100644 --- a/src/XrdApps/XrdCpConfig.cc +++ b/src/XrdApps/XrdCpConfig.cc @@ -955,7 +955,7 @@ void XrdCpConfig::Usage(int rc) "-r | --recursive recursively copies all source files\n" " --rm-bad-cksum remove the target file if checksum verification failed\n" " (enables also POSC semantics)\n" - "-t | --retry maximum number of times to retry rejected connections\n" + "-t | --retry maximum number of times to retry failed copy-jobs\n" " --server runs in a server environment with added operations\n" "-s | --silent produces no output other than error messages\n" "-y | --sources uses up to the number of sources specified in parallel\n" diff --git a/src/XrdCl/XrdClCopy.cc b/src/XrdCl/XrdClCopy.cc index db745e09867..bc871590451 100644 --- a/src/XrdCl/XrdClCopy.cc +++ b/src/XrdCl/XrdClCopy.cc @@ -601,6 +601,9 @@ int main( int argc, char **argv ) if( config.nStrm != 0 ) env->PutInt( "SubStreamsPerChannel", config.nStrm + 1 /*stands for the control stream*/ ); + if( config.Retry != -1 ) + env->PutInt( "retry", config.Retry ); + if( config.Want( XrdCpConfig::DoNoTlsOK ) ) env->PutInt( "NoTlsOK", 1 ); diff --git a/src/XrdCl/XrdClCopyProcess.cc b/src/XrdCl/XrdClCopyProcess.cc index 3827980e8af..183a012ed96 100644 --- a/src/XrdCl/XrdClCopyProcess.cc +++ b/src/XrdCl/XrdClCopyProcess.cc @@ -54,9 +54,11 @@ namespace XrdSysSemaphore *sem = 0 ): pJob(job), pProgress(progress), pCurrentJob(currentJob), pTotalJobs(totalJobs), pSem(sem), - pRetryCnt( XrdCl::DefaultRetryWrtAtLBLimit ) + pWrtRetryCnt( XrdCl::DefaultRetryWrtAtLBLimit ), + pRetryCnt( 0 ) { - XrdCl::DefaultEnv::GetEnv()->GetInt( "RetryWrtAtLBLimit", pRetryCnt ); + XrdCl::DefaultEnv::GetEnv()->GetInt( "RetryWrtAtLBLimit", pWrtRetryCnt ); + XrdCl::DefaultEnv::GetEnv()->GetInt( "retry", pRetryCnt ); } //------------------------------------------------------------------------ @@ -92,7 +94,10 @@ namespace while( true ) { st = pJob->Run( pProgress ); - if( !st.IsOK() && st.code == XrdCl::errRetry && pRetryCnt > 0 ) + //-------------------------------------------------------------------- + // Retry due to write-recovery + //-------------------------------------------------------------------- + if( !st.IsOK() && st.code == XrdCl::errRetry && pWrtRetryCnt > 0 ) { std::string url; pJob->GetResults()->Get( "LastURL", url ); @@ -124,6 +129,18 @@ namespace pJob->Init(); // we have a new job, let's try again + --pWrtRetryCnt; + continue; + } + //-------------------------------------------------------------------- + // Copy job retry + //-------------------------------------------------------------------- + if( !st.IsOK() && pRetryCnt > 0 && + ( XrdCl::Status::IsSocketError( st.code ) || + st.code == XrdCl::errOperationExpired || + st.code == XrdCl::errThresholdExceeded ) ) + { + pJob->GetProperties()->Set( "force", true ); --pRetryCnt; continue; } @@ -164,6 +181,7 @@ namespace uint16_t pCurrentJob; uint16_t pTotalJobs; XrdSysSemaphore *pSem; + int pWrtRetryCnt; int pRetryCnt; }; }; diff --git a/src/XrdCl/XrdClStatus.hh b/src/XrdCl/XrdClStatus.hh index 69ee70c0214..233a377caf7 100644 --- a/src/XrdCl/XrdClStatus.hh +++ b/src/XrdCl/XrdClStatus.hh @@ -131,6 +131,11 @@ namespace XrdCl return (code/100)+50; } + inline static bool IsSocketError( uint16_t code ) + { + return int( code / 100 ) == 1; + } + //-------------------------------------------------------------------------- //! Create a string representation //--------------------------------------------------------------------------