Matching wrf results aarch64 x86 with gcc10.2 (#1773)

This update resolves differing simulation results on aarch64 and x86 for the same dataset (caused by rounding error in the last position for some transcendental functions). TYPE: bug fix KEYWORDS: round up error, transcendental, compiler optimization SOURCE: Jun Tang, Amazon DESCRIPTION OF CHANGES: Problem: After investigation, the different simulation results on aarch64 and x86 for the same dataset with gcc 10.2 were found to be caused by rounding error in the last position for some transcendental functions, and by several optimization flags (FMA, inverse square root, ...). Solution: Using double precision for the computation of transcendental functions at selected locations ensures that exactly the same output models are produced on aarch64 and x86 (this patch only covers a few PBL and cumulus schemes). Also, disabling some risky optimizations guarantees the same output as a lower optimization level. LIST OF MODIFIED FILES: list of changed files (use `git diff --name-status master` to get formatted list) M arch/configure.defaults M phys/module_cu_tiedtke.F M phys/module_sf_myjsfc.F TESTS CONDUCTED: 1. The mods fix the correctness problem between aarch64 and x86 for two WRF models (conus2.5km and conus12km). With the patch, the output models match, bit for bit, on the two specified platforms. 2. The regression tests have passed - as indicated, the change should not affect their results. RELEASE NOTE: Fix numerical divergence on x86 and arm64. For best performance for WRF on arm64 please use armclang. (https://github.com/juntangc/notes/blob/main/release-note.pdf).
wrf-model · Dec 9, 2022 · 541fa56 · 541fa56
1 parent 05a00cf
commit 541fa56
Show file tree

Hide file tree

Showing 3 changed files with 62 additions and 0 deletions.
diff --git a/arch/configure.defaults b/arch/configure.defaults
@@ -789,6 +789,8 @@ LD              =       $(FC)
 RWORDSIZE       =       CONFIGURE_RWORDSIZE
 PROMOTION       =       #-fdefault-real-8
 ARCH_LOCAL      =       -DNONSTANDARD_SYSTEM_SUBR  CONFIGURE_D_CTSM
+# To validate WRF correctness on aarch64 against x86_64, compile on x86 with this alternate ARCH_LOCAL option; this achieves identical model output on Ubuntu 18.04 with gcc 10.2 (on Graviton and Intel processors)
+# ARCH_LOCAL      =       -DNONSTANDARD_SYSTEM_SUBR  CONFIGURE_D_CTSM -DAARCH64_X86_CORRECTNESS_FIX
 CFLAGS_LOCAL    =       -w -O3 -c  # -DRSL0_ONLY
 LDFLAGS_LOCAL   =       
 CPLUSPLUSLIB    =       
@@ -2068,6 +2070,49 @@ NETCDFPAR_BUILD	=      CONFIGURE_NETCDFPAR_BUILD
 
 #insert new stanza here
 
+###########################################################
+#ARCH    Linux aarch64, GCC compiler OpenMPI # serial smpar dmpar dm+sm
+#
+DESCRIPTION     =      GCC ($SFC/$SCC): Aarch64
+DMPARALLEL      =
+OMPCPP          =      -fopenmp
+OMP             =      -fopenmp
+OMPCC           =      -fopenmp
+SFC             =      gfortran
+SCC             =      gcc
+CCOMP           =      gcc
+DM_FC           =      mpif90
+DM_CC           =      mpicc -DMPI2_SUPPORT
+FC              =      CONFIGURE_FC
+CC              =      CONFIGURE_CC
+LD              =      $(FC)
+RWORDSIZE       =      CONFIGURE_RWORDSIZE
+PROMOTION       =
+ARCH_LOCAL      =      -DAARCH64_X86_CORRECTNESS_FIX
+CFLAGS_LOCAL    =      -w -O3 -c
+LDFLAGS_LOCAL   =      -fopenmp
+FCOPTIM         =      -Ofast -mcpu=native -funroll-loops -fno-expensive-optimizations -fno-reciprocal-math -fsigned-zeros -fno-unsafe-math-optimizations
+# for Graviton 2 use the following flags
+#FCOPTIM         =      -Ofast -march=armv8.2-a+fp16+rcpc+dotprod -funroll-loops -fno-expensive-optimizations -fno-reciprocal-math -fsigned-zeros -fno-unsafe-math-optimizations
+FCREDUCEDOPT    =      $(FCOPTIM)
+FCNOOPT         =      -O0 -fopenmp -frecursive
+FCDEBUG         =      -g $(FCNOOPT)
+FORMAT_FIXED    =      -ffixed-form -ffixed-line-length-0 -fallow-argument-mismatch -fallow-invalid-boz
+FORMAT_FREE     =      -ffree-form -ffree-line-length-0 -fallow-argument-mismatch -fallow-invalid-boz
+FCSUFFIX        =
+BYTESWAPIO      =      -fconvert=big-endian -frecord-marker=4
+FCBASEOPTS      =      -w $(FORMAT_FREE) $(BYTESWAPIO)
+MODULE_SRCH_FLAG=      -I$(WRF_SRC_ROOT_DIR)/main
+TRADFLAG        =      -traditional-cpp
+CPP             =      /lib/cpp CONFIGURE_CPPFLAGS
+AR              =      ar
+ARFLAGS         =      ru
+M4              =      m4 -B 14000
+RANLIB          =      ranlib
+RLFLAGS         =
+CC_TOOLS        =      $(SCC)
+
+
 ###########################################################
 #ARCH    Fujitsu FX10/FX100 Linux x86_64 SPARC64IXfx/SPARC64Xlfx, mpifrtpx and mpifccpx compilers #serial smpar dmpar dm+sm
 #

diff --git a/phys/module_cu_tiedtke.F b/phys/module_cu_tiedtke.F
@@ -2935,7 +2935,11 @@ subroutine cuentr_new                              &
             zzmzk = -(pgeoh(jl,ikh)-pgeoh(jl,kk))*zrg
             ztmzk = -(pgeoh(jl,ikh)-pgeoh(jl,ikt))*zrg
             arg = 3.1415*(zzmzk/ztmzk)*0.5
+#ifndef AARCH64_X86_CORRECTNESS_FIX
             zorgde = tan(arg)*3.1415*0.5/ztmzk
+#else
+            zorgde = real(tan(dble(arg))*3.1415*0.5/dble(ztmzk))
+#endif
             zdprho = (paph(jl,kk+1)-paph(jl,kk))*(zrg*zrrho)
             zodetr(jl,kk) = min(zorgde,1.e-3)*pmfu(jl,kk+1)*zdprho
           end if

diff --git a/phys/module_sf_myjsfc.F b/phys/module_sf_myjsfc.F
@@ -1088,6 +1088,9 @@ SUBROUTINE MYJSFCINIT(LOWLYR,USTAR,Z0                            &
 !     INTEGER :: MPI_INTEGER,MPI_MAX
 !
       REAL :: SM,X,ZETA1,ZETA2,ZRNG1,ZRNG2
+#ifdef AARCH64_X86_CORRECTNESS_FIX
+      DOUBLE PRECISION :: X2
+#endif
 !
       REAL :: PIHF=3.1415926/2.,EPS=1.E-6
 !----------------------------------------------------------------------
@@ -1209,7 +1212,12 @@ SUBROUTINE MYJSFCINIT(LOWLYR,USTAR,Z0                            &
 !----------------------------------------------------------------------
           X=SQRT(SQRT(1.-16.*ZETA1))
 !
+#ifndef AARCH64_X86_CORRECTNESS_FIX
           PSIM1(K)=-2.*LOG((X+1.)/2.)-LOG((X*X+1.)/2.)+2.*ATAN(X)-PIHF
+#else
+          X2 = dble(X)
+          PSIM1(K)=real(-2.d0 * log((X2+1.d0)/2.d0)-log((X2*X2+1.d0)/2.d0)+2.d0*atan(X2)-dble(PIHF))
+#endif
           PSIH1(K)=-2.*LOG((X*X+1.)/2.)
 !
 !----------------------------------------------------------------------
@@ -1246,7 +1254,12 @@ SUBROUTINE MYJSFCINIT(LOWLYR,USTAR,Z0                            &
 !
           X=SQRT(SQRT(1.-16.*ZETA2))
 !
+#ifndef AARCH64_X86_CORRECTNESS_FIX
           PSIM2(K)=-2.*LOG((X+1.)/2.)-LOG((X*X+1.)/2.)+2.*ATAN(X)-PIHF
+#else
+          X2 = dble(X)
+          PSIM2(K)=real(-2.d0*LOG((X2+1.d0)/2.d0)-LOG((X2*X2+1.d0)/2.d0)+2.*ATAN(X2)-dble(PIHF))
+#endif
           PSIH2(K)=-2.*LOG((X*X+1.)/2.)
 !----------------------------------------------------------------------
 !***  STABLE RANGE