From c66960763a572a0c387699b0057844903a1f53af Mon Sep 17 00:00:00 2001
From: Andy Ross
Date: Wed, 24 Feb 2021 14:47:35 -0800
Subject: [PATCH 01/17] kernel/swap: Move arch_cohere_stacks() back under the lock

Commit 6b84ab383050 ("kernel/sched: Adjust locking in z_swap()") moved
the call to arch_cohere_stacks() out of the scheduler lock while doing
some reorganizing.  On further reflection, this is incorrect.  When
done outside the lock, the two arch_cohere_stacks() calls will race
against each other.

Signed-off-by: Andy Ross
---
 kernel/include/kswap.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/include/kswap.h b/kernel/include/kswap.h
index 81bfd89281ec91..75f298cdbec2f8 100644
--- a/kernel/include/kswap.h
+++ b/kernel/include/kswap.h
@@ -106,6 +106,8 @@ static ALWAYS_INLINE unsigned int do_swap(unsigned int key,
 		z_spin_lock_set_owner(&sched_spinlock);
 #endif
 
+		arch_cohere_stacks(old_thread, NULL, new_thread);
+
 #ifdef CONFIG_SMP
 		/* Add _current back to the run queue HERE. After
 		 * wait_for_switch() we are guaranteed to reach the
@@ -121,7 +123,6 @@ static ALWAYS_INLINE unsigned int do_swap(unsigned int key,
 			new_thread->switch_handle = NULL;
 		}
 		k_spin_release(&sched_spinlock);
-		arch_cohere_stacks(old_thread, NULL, new_thread);
 		arch_switch(newsh, &old_thread->switch_handle);
 	} else {
 		k_spin_release(&sched_spinlock);

From 4d61afff251289adecf5145849371eb073e87652 Mon Sep 17 00:00:00 2001
From: Andy Ross
Date: Sat, 13 Feb 2021 10:32:42 -0800
Subject: [PATCH 02/17] arch/xtensa: General cleanup, remove dead code

There was a bunch of dead historical cruft floating around in the
arch/xtensa tree, left over from older code versions.  It's time to do
a cleanup pass.  This is entirely refactoring and size optimization;
there should be no behavior changes on any in-tree devices.

Among the more notable changes:

+ xtensa_context.h offered an elaborate API to deal with a stack frame
  and context layout that we no longer use.

+ xtensa_rtos.h was entirely dead code.

+ xtensa_timer.h was a parallel abstraction layer implementing in the
  architecture layer what we're already doing in our timer driver.

+ The architecture thread structs (_callee_saved and _thread_arch)
  aren't used by current code, and their dead fields have been removed.
  Unfortunately for standards compliance and C++ compatibility it's
  not possible to leave an empty struct here, so they have a single
  byte field.

+ xtensa_api.h was really just some interrupt management inlines used
  by irq.h, so fold that code into the outer header.

+ Remove the stale assembly offsets.  This architecture doesn't use
  that facility.

All told, more than a thousand lines have been removed.  Not bad.
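To make the empty-struct point concrete (this only mirrors the new
definitions in include/arch/xtensa/thread.h in the diff below, nothing
beyond them): a struct with no named members is a constraint violation
in ISO C, and C++ gives even an empty class a nonzero size anyway, so
the otherwise-unused arch structs keep a single dummy byte:

    struct _callee_saved {
	char dummy;	/* placeholder so the type is valid C and has a defined size */
    };

    typedef struct _callee_saved _callee_saved_t;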
Signed-off-by: Andy Ross --- arch/xtensa/core/atomic.S | 41 ++- arch/xtensa/core/crt1.S | 2 +- arch/xtensa/core/fatal.c | 13 - arch/xtensa/core/irq_manage.c | 3 +- arch/xtensa/core/irq_offload.c | 1 - arch/xtensa/core/offsets/offsets.c | 51 +--- arch/xtensa/core/window_vectors.S | 2 +- arch/xtensa/include/kernel_arch_data.h | 54 ---- arch/xtensa/include/kernel_arch_func.h | 14 +- arch/xtensa/include/offsets_short_arch.h | 42 +-- drivers/timer/cavs_timer.c | 1 - drivers/timer/xtensa_sys_timer.c | 1 - include/arch/xtensa/exc.h | 11 - include/arch/xtensa/irq.h | 42 ++- include/arch/xtensa/thread.h | 96 +------ include/arch/xtensa/xtensa_api.h | 67 ----- include/arch/xtensa/xtensa_config.h | 132 --------- include/arch/xtensa/xtensa_context.h | 323 ----------------------- include/arch/xtensa/xtensa_rtos.h | 204 -------------- include/arch/xtensa/xtensa_timer.h | 155 ----------- soc/xtensa/intel_adsp/common/soc.c | 1 - soc/xtensa/intel_s1000/soc.c | 1 - 22 files changed, 102 insertions(+), 1155 deletions(-) delete mode 100644 arch/xtensa/include/kernel_arch_data.h delete mode 100644 include/arch/xtensa/xtensa_api.h delete mode 100644 include/arch/xtensa/xtensa_config.h delete mode 100644 include/arch/xtensa/xtensa_context.h delete mode 100644 include/arch/xtensa/xtensa_rtos.h delete mode 100644 include/arch/xtensa/xtensa_timer.h diff --git a/arch/xtensa/core/atomic.S b/arch/xtensa/core/atomic.S index c7c3d7777af8af..c5ab64714e5de3 100644 --- a/arch/xtensa/core/atomic.S +++ b/arch/xtensa/core/atomic.S @@ -3,7 +3,46 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include +/* + * MACROS TO HANDLE ABI SPECIFICS OF FUNCTION ENTRY AND RETURN + * + * Convenient where the frame size requirements are the same for both ABIs. + * ENTRY(sz), RET(sz) are for framed functions (have locals or make calls). + * ENTRY0, RET0 are for frameless functions (no locals, no calls). + * + * where size = size of stack frame in bytes (must be >0 and aligned to 16). + * For framed functions the frame is created and the return address saved at + * base of frame (Call0 ABI) or as determined by hardware (Windowed ABI). For + * frameless functions, there is no frame and return address remains in + * a0. + * + * Note: Because CPP macros expand to a single line, macros requiring + * multi-line expansions are implemented as assembler macros. + */ + +#ifdef __XTENSA_CALL0_ABI__ +/* Call0 */ +#define ENTRY(sz) entry1 sz +.macro entry1 size=0x10 +addi sp, sp, -\size +s32i a0, sp, 0 +.endm +#define ENTRY0 +#define RET(sz) ret1 sz +.macro ret1 size=0x10 +l32i a0, sp, 0 +addi sp, sp, \size +ret +.endm +#define RET0 ret +#else +/* Windowed */ +#define ENTRY(sz) entry sp, sz +#define ENTRY0 entry sp, 0x10 +#define RET(sz) retw +#define RET0 retw +#endif /* __XTENSA_CALL0_ABI__ */ + /** * * @brief Atomically clear a memory location diff --git a/arch/xtensa/core/crt1.S b/arch/xtensa/core/crt1.S index 2f125b2811c5c5..b012f29197d1ed 100644 --- a/arch/xtensa/core/crt1.S +++ b/arch/xtensa/core/crt1.S @@ -7,7 +7,7 @@ * Control arrives here at _start from the reset vector or from crt0-app.S. 
*/ -#include +#include /* Exports */ .global _start diff --git a/arch/xtensa/core/fatal.c b/arch/xtensa/core/fatal.c index 4f749b0e51373e..cc9e411fb4914c 100644 --- a/arch/xtensa/core/fatal.c +++ b/arch/xtensa/core/fatal.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #if defined(CONFIG_XTENSA_ENABLE_BACKTRACE) @@ -105,18 +104,6 @@ void z_xtensa_fatal_error(unsigned int reason, const z_arch_esf_t *esf) z_fatal_error(reason, esf); } -XTENSA_ERR_NORET void FatalErrorHandler(void) -{ - z_xtensa_fatal_error(K_ERR_CPU_EXCEPTION, NULL); -} - -XTENSA_ERR_NORET void ReservedInterruptHandler(unsigned int intNo) -{ - LOG_ERR("INTENABLE = 0x%x INTERRUPT = 0x%x (%x)", - get_sreg(INTENABLE), (1 << intNo), intNo); - z_xtensa_fatal_error(K_ERR_SPURIOUS_IRQ, NULL); -} - void exit(int return_code) { #ifdef XT_SIMULATOR diff --git a/arch/xtensa/core/irq_manage.c b/arch/xtensa/core/irq_manage.c index b38d8d95aabe35..34209240ca2181 100644 --- a/arch/xtensa/core/irq_manage.c +++ b/arch/xtensa/core/irq_manage.c @@ -5,8 +5,7 @@ #include #include -#include -#include +#include #include /* * @internal diff --git a/arch/xtensa/core/irq_offload.c b/arch/xtensa/core/irq_offload.c index a3d1a3cc619786..4aeb5446e875a6 100644 --- a/arch/xtensa/core/irq_offload.c +++ b/arch/xtensa/core/irq_offload.c @@ -6,7 +6,6 @@ #include #include #include -#include /* * Xtensa core should support software interrupt in order to allow using diff --git a/arch/xtensa/core/offsets/offsets.c b/arch/xtensa/core/offsets/offsets.c index ee78f56723a0b3..a853b876b5dbfd 100644 --- a/arch/xtensa/core/offsets/offsets.c +++ b/arch/xtensa/core/offsets/offsets.c @@ -1,55 +1,14 @@ /* - * Copyright (c) 2013-2014 Wind River Systems, Inc. - * Copyright (c) 2016 Cadence Design Systems, Inc. + * Copyright (c) 2021 Intel Corporation * SPDX-License-Identifier: Apache-2.0 */ -/** - * @file - * @brief Xtensa kernel structure member offset definition file - * - * This module is responsible for the generation of the absolute symbols whose - * value represents the member offsets for various Xtensa kernel - * structures. - * - * All of the absolute symbols defined by this module will be present in the - * final kernel or kernel ELF image (due to the linker's reference to - * the _OffsetAbsSyms symbol). - * - * INTERNAL - * It is NOT necessary to define the offset for every member of a structure. - * Typically, only those members that are accessed by assembly language routines - * are defined; however, it doesn't hurt to define all fields for the sake of - * completeness. - */ - -#include -#include #include #include -/* Xtensa-specific k_thread structure member offsets */ -GEN_OFFSET_SYM(_callee_saved_t, topOfStack); -GEN_OFFSET_SYM(_callee_saved_t, retval); - -GEN_OFFSET_SYM(_thread_arch_t, preempCoprocReg); -#if XCHAL_CP_NUM > 0 -GEN_OFFSET_SYM(tPreempCoprocReg, cpStack); -#endif - -/* Xtensa-specific _thread_arch_t structure member offsets */ -GEN_OFFSET_SYM(_thread_arch_t, flags); - -/* Xtensa-specific ESF structure member offsets */ -GEN_OFFSET_SYM(__esf_t, sp); -GEN_OFFSET_SYM(__esf_t, pc); - -/* size of the entire __esf_t structure */ -GEN_ABSOLUTE_SYM(____esf_t_SIZEOF, sizeof(__esf_t)); - -/* size of the struct k_thread structure without save area for coproc regs */ -GEN_ABSOLUTE_SYM(_K_THREAD_NO_FLOAT_SIZEOF, - sizeof(struct k_thread) - sizeof(tCoopCoprocReg) - - sizeof(tPreempCoprocReg) + XT_CP_DESCR_SIZE); +/* No offsets required in Xtensa, but this file must be present for + * the build. 
Usage is the same as other architectures if you want to + * add some. + */ GEN_ABS_SYM_END diff --git a/arch/xtensa/core/window_vectors.S b/arch/xtensa/core/window_vectors.S index 5423cbaface121..8bded3a7360b6e 100644 --- a/arch/xtensa/core/window_vectors.S +++ b/arch/xtensa/core/window_vectors.S @@ -2,7 +2,7 @@ * Copyright (c) 2016 Cadence Design Systems, Inc. * SPDX-License-Identifier: Apache-2.0 */ -#include +#include /* WINDOW OVERFLOW AND UNDERFLOW EXCEPTION VECTORS AND ALLOCA EXCEPTION * HANDLER diff --git a/arch/xtensa/include/kernel_arch_data.h b/arch/xtensa/include/kernel_arch_data.h deleted file mode 100644 index cb56d819316ab0..00000000000000 --- a/arch/xtensa/include/kernel_arch_data.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2016 Wind River Systems, Inc. - * Copyright (c) 2016 Cadence Design Systems, Inc. - * SPDX-License-Identifier: Apache-2.0 - */ - -/** - * @file - * @brief Private kernel definitions (XTENSA) - * - * This file contains private kernel structures definitions and various - * other definitions for the XTENSA processors family architecture. - * - * This file is also included by assembly language files which must #define - * _ASMLANGUAGE before including this header file. Note that kernel - * assembly source files obtains structure offset values via "absolute symbols" - * in the offsets.o module. - */ - -#ifndef ZEPHYR_ARCH_XTENSA_INCLUDE_KERNEL_ARCH_DATA_H_ -#define ZEPHYR_ARCH_XTENSA_INCLUDE_KERNEL_ARCH_DATA_H_ - -#include -#include -#include - -#if !defined(_ASMLANGUAGE) && !defined(__ASSEMBLER__) -#include /* public kernel API */ -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* Bitmask definitions for the struct k_thread->flags bit field */ - -/* executing context is interrupt handler */ -#define INT_ACTIVE (1 << 1) -/* executing context is exception handler */ -#define EXC_ACTIVE (1 << 2) -/* thread uses floating point unit */ -#define USE_FP 0x010 - -typedef struct __esf __esf_t; - -#ifdef __cplusplus -} -#endif - -#endif /*! _ASMLANGUAGE && ! __ASSEMBLER__ */ - -#endif /* ZEPHYR_ARCH_XTENSA_INCLUDE_KERNEL_ARCH_DATA_H_ */ diff --git a/arch/xtensa/include/kernel_arch_func.h b/arch/xtensa/include/kernel_arch_func.h index 47cef92358370e..53c6661c3e5586 100644 --- a/arch/xtensa/include/kernel_arch_func.h +++ b/arch/xtensa/include/kernel_arch_func.h @@ -12,20 +12,14 @@ #ifndef _ASMLANGUAGE #include -#include #include #ifdef __cplusplus extern "C" { #endif -extern void FatalErrorHandler(void); -extern void ReservedInterruptHandler(unsigned int intNo); extern void z_xtensa_fatal_error(unsigned int reason, const z_arch_esf_t *esf); -/* Defined in xtensa_context.S */ -extern void z_xt_coproc_init(void); - extern K_KERNEL_STACK_ARRAY_DEFINE(z_interrupt_stacks, CONFIG_MP_NUM_CPUS, CONFIG_ISR_STACK_SIZE); @@ -117,15 +111,15 @@ static inline void arch_cohere_stacks(struct k_thread *old_thread, } #endif -#ifdef __cplusplus -} -#endif - static inline bool arch_is_in_isr(void) { return arch_curr_cpu()->nested != 0U; } +#ifdef __cplusplus +} +#endif + #endif /* _ASMLANGUAGE */ #endif /* ZEPHYR_ARCH_XTENSA_INCLUDE_KERNEL_ARCH_FUNC_H_ */ diff --git a/arch/xtensa/include/offsets_short_arch.h b/arch/xtensa/include/offsets_short_arch.h index 41bab37923dace..34a4a5842cf753 100644 --- a/arch/xtensa/include/offsets_short_arch.h +++ b/arch/xtensa/include/offsets_short_arch.h @@ -1,43 +1,5 @@ /* - * Copyright (c) 2016 Wind River Systems, Inc. - * Copyright (c) 2016 Cadence Design Systems, Inc. 
+ * Copyright (c) 2021 Intel Corporation * SPDX-License-Identifier: Apache-2.0 */ - -#ifndef ZEPHYR_ARCH_XTENSA_INCLUDE_OFFSETS_SHORT_ARCH_H_ -#define ZEPHYR_ARCH_XTENSA_INCLUDE_OFFSETS_SHORT_ARCH_H_ - -#include - -/* kernel */ -#define KERNEL_OFFSET(field) _kernel_offset_to_##field - -#define _kernel_offset_to_flags \ - (___kernel_t_arch_OFFSET + ___kernel_arch_t_flags_OFFSET) - -/* end - kernel */ - -/* threads */ -#define THREAD_OFFSET(field) _thread_offset_to_##field - -#define _thread_offset_to_sp \ - (___thread_t_callee_saved_OFFSET + ___callee_saved_t_topOfStack_OFFSET) - -#define _thread_offset_to_retval \ - (___thread_t_callee_saved_OFFSET + ___callee_saved_t_retval_OFFSET) - -#define _thread_offset_to_coopCoprocReg \ - (___thread_t_arch_OFFSET + ___thread_arch_t_coopCoprocReg_OFFSET) - -#define _thread_offset_to_preempCoprocReg \ - (___thread_t_arch_OFFSET + ___thread_arch_t_preempCoprocReg_OFFSET) - -#define _thread_offset_to_cpStack \ - (_thread_offset_to_preempCoprocReg + __tPreempCoprocReg_cpStack_OFFSET) - -#define _thread_offset_to_cpEnable \ - (_thread_offset_to_cpStack + XT_CPENABLE) - -/* end - threads */ - -#endif /* ZEPHYR_ARCH_XTENSA_INCLUDE_OFFSETS_SHORT_ARCH_H_ */ +/* Empty File */ diff --git a/drivers/timer/cavs_timer.c b/drivers/timer/cavs_timer.c index 33f461eeb47cbd..71d558fcb5b632 100644 --- a/drivers/timer/cavs_timer.c +++ b/drivers/timer/cavs_timer.c @@ -7,7 +7,6 @@ #include #include #include -#include /** * @file diff --git a/drivers/timer/xtensa_sys_timer.c b/drivers/timer/xtensa_sys_timer.c index 6d1c1875ca3d4b..2b605f66c51511 100644 --- a/drivers/timer/xtensa_sys_timer.c +++ b/drivers/timer/xtensa_sys_timer.c @@ -6,7 +6,6 @@ #include #include #include -#include #define TIMER_IRQ UTIL_CAT(XCHAL_TIMER, \ UTIL_CAT(CONFIG_XTENSA_TIMER_ID, _INTERRUPT)) diff --git a/include/arch/xtensa/exc.h b/include/arch/xtensa/exc.h index 9ea111689cc37b..e207261a0278c1 100644 --- a/include/arch/xtensa/exc.h +++ b/include/arch/xtensa/exc.h @@ -20,17 +20,6 @@ extern "C" { #endif #ifndef _ASMLANGUAGE -/** - * @brief Exception Stack Frame - * - * A pointer to an "exception stack frame" (ESF) is passed as an argument - * to exception handlers registered via nanoCpuExcConnect(). - */ -struct __esf { - /* FIXME - not finished yet */ - sys_define_gpr_with_alias(a1, sp); - uint32_t pc; -}; /* Xtensa uses a variable length stack frame depending on how many * register windows are in use. This isn't a struct type, it just diff --git a/include/arch/xtensa/irq.h b/include/arch/xtensa/irq.h index 6bf00a31fa9007..fd114516973deb 100644 --- a/include/arch/xtensa/irq.h +++ b/include/arch/xtensa/irq.h @@ -6,11 +6,51 @@ #ifndef ZEPHYR_INCLUDE_ARCH_XTENSA_XTENSA_IRQ_H_ #define ZEPHYR_INCLUDE_ARCH_XTENSA_XTENSA_IRQ_H_ -#include #include #define CONFIG_GEN_IRQ_START_VECTOR 0 +/* + * Call this function to enable the specified interrupts. + * + * mask - Bit mask of interrupts to be enabled. + */ +static inline void z_xt_ints_on(unsigned int mask) +{ + int val; + + __asm__ volatile("rsr.intenable %0" : "=r"(val)); + val |= mask; + __asm__ volatile("wsr.intenable %0; rsync" : : "r"(val)); +} + + +/* + * Call this function to disable the specified interrupts. + * + * mask - Bit mask of interrupts to be disabled. + */ +static inline void z_xt_ints_off(unsigned int mask) +{ + int val; + + __asm__ volatile("rsr.intenable %0" : "=r"(val)); + val &= ~mask; + __asm__ volatile("wsr.intenable %0; rsync" : : "r"(val)); +} + +/* + * Call this function to set the specified (s/w) interrupt. 
+ */ +static inline void z_xt_set_intset(unsigned int arg) +{ +#if XCHAL_HAVE_INTERRUPTS + __asm__ volatile("wsr.intset %0; rsync" : : "r"(arg)); +#else + ARG_UNUSED(arg); +#endif +} + #ifdef CONFIG_MULTI_LEVEL_INTERRUPTS /* for _soc_irq_*() */ diff --git a/include/arch/xtensa/thread.h b/include/arch/xtensa/thread.h index 5b85424f21794d..d6def98a81c1c9 100644 --- a/include/arch/xtensa/thread.h +++ b/include/arch/xtensa/thread.h @@ -1,109 +1,27 @@ /* - * Copyright (c) 2017 Intel Corporation + * Copyright (c) 2021 Intel Corporation * * SPDX-License-Identifier: Apache-2.0 */ -/** - * @file - * @brief Per-arch thread definition - * - * This file contains definitions for - * - * struct _thread_arch - * struct _callee_saved - * - * necessary to instantiate instances of struct k_thread. - */ - #ifndef ZEPHYR_INCLUDE_ARCH_XTENSA_THREAD_H_ #define ZEPHYR_INCLUDE_ARCH_XTENSA_THREAD_H_ #ifndef _ASMLANGUAGE -#include -#include -/* - * The following structure defines the set of 'non-volatile' integer registers. - * These registers must be preserved by a called C function. These are the - * only registers that need to be saved/restored when a cooperative context - * switch occurs. +/* Xtensa doesn't use these structs, but Zephyr core requires they be + * defined so they can be included in struct _thread_base. Dummy + * field exists for sizeof compatibility with C++. */ + struct _callee_saved { - /* - * The following registers are considered non-volatile, i.e. - * callee-saved, but their values are pushed onto the stack rather than - * stored in the k_thread structure: - */ - uint32_t retval; /* a2 */ - XtExcFrame *topOfStack; /* a1 = sp */ + char dummy; }; typedef struct _callee_saved _callee_saved_t; -/* - * The following structure defines the set of 'non-volatile' x87 FPU/MMX/SSE - * registers. These registers must be preserved by a called C function. - * These are the only registers that need to be saved/restored when a - * cooperative context switch occurs. - */ -typedef struct s_coopCoprocReg { - - /* - * This structure intentionally left blank. Coprocessor's registers are - * all 'volatile' and saved using the lazy context switch mechanism. - */ - -} tCoopCoprocReg; - -/* - * The following structure defines the set of 'volatile' x87 FPU/MMX/SSE - * registers. These registers need not be preserved by a called C function. - * Given that they are not preserved across function calls, they must be - * save/restored (along with s_coopCoprocReg) when a preemptive context switch - * occurs. - */ -typedef struct s_preempCoprocReg { - /* - * This structure reserved coprocessor control and save area memory. - */ -#if XCHAL_CP_NUM > 0 - char __aligned(4) cpStack[XT_CP_SIZE]; -#endif -} tPreempCoprocReg; - -/* - * The thread control structure definition. It contains the - * various fields to manage a _single_ thread. - */ struct _thread_arch { - /* - * See the above flag definitions above for valid bit settings. This - * field must remain near the start of struct k_thread, specifically - * before any #ifdef'ed fields since the host tools currently use a - * fixed offset to read the 'flags' field. - */ - uint32_t flags; - - /* - * The location of all floating point related structures/fields MUST be - * located at the end of struct k_thread. This way only the threads - * that actually utilize non-integer capabilities need to account for - * the increased memory required for storing FP state when sizing - * stacks. 
- * - * Given that stacks "grow down" on Xtensa, and the k_thread is located - * at the start of a thread's "workspace" memory, the stacks of threads - * that do not utilize floating point instruction can effectively - * consume the memory occupied by the 'tCoopCoprocReg' and - * 'tPreempCoprocReg' structures without ill effect. - */ - - /* non-volatile coprocessor's register storage */ - tCoopCoprocReg coopCoprocReg; - - /* volatile coprocessor's register storage */ - tPreempCoprocReg preempCoprocReg; + char dummy; }; typedef struct _thread_arch _thread_arch_t; diff --git a/include/arch/xtensa/xtensa_api.h b/include/arch/xtensa/xtensa_api.h deleted file mode 100644 index 67cd578deb834d..00000000000000 --- a/include/arch/xtensa/xtensa_api.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2016 Cadence Design Systems, Inc. - * SPDX-License-Identifier: Apache-2.0 - */ - -#ifndef ZEPHYR_ARCH_XTENSA_INCLUDE_XTENSA_API_H_ -#define ZEPHYR_ARCH_XTENSA_INCLUDE_XTENSA_API_H_ - -#include -#include "xtensa_rtos.h" -#include "xtensa_context.h" - -/* - * Call this function to enable the specified interrupts. - * - * mask - Bit mask of interrupts to be enabled. - */ -static inline void z_xt_ints_on(unsigned int mask) -{ - int val; - - __asm__ volatile("rsr.intenable %0" : "=r"(val)); - val |= mask; - __asm__ volatile("wsr.intenable %0; rsync" : : "r"(val)); -} - - -/* - * Call this function to disable the specified interrupts. - * - * mask - Bit mask of interrupts to be disabled. - */ -static inline void z_xt_ints_off(unsigned int mask) -{ - int val; - - __asm__ volatile("rsr.intenable %0" : "=r"(val)); - val &= ~mask; - __asm__ volatile("wsr.intenable %0; rsync" : : "r"(val)); -} - -/* - * Call this function to set the specified (s/w) interrupt. - */ -static inline void z_xt_set_intset(unsigned int arg) -{ -#if XCHAL_HAVE_INTERRUPTS - __asm__ volatile("wsr.intset %0; rsync" : : "r"(arg)); -#else - ARG_UNUSED(arg); -#endif -} - - -/* Call this function to clear the specified (s/w or edge-triggered) - * interrupt. - */ -static inline void _xt_set_intclear(unsigned int arg) -{ -#if XCHAL_HAVE_INTERRUPTS - __asm__ volatile("wsr.intclear %0; rsync" : : "r"(arg)); -#else - ARG_UNUSED(arg); -#endif -} - -#endif /* ZEPHYR_ARCH_XTENSA_INCLUDE_XTENSA_API_H_ */ diff --git a/include/arch/xtensa/xtensa_config.h b/include/arch/xtensa/xtensa_config.h deleted file mode 100644 index fdfb47220eb99f..00000000000000 --- a/include/arch/xtensa/xtensa_config.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2016 Cadence Design Systems, Inc. - * SPDX-License-Identifier: Apache-2.0 - */ - -#ifndef ZEPHYR_ARCH_XTENSA_INCLUDE_XTENSA_CONFIG_H_ -#define ZEPHYR_ARCH_XTENSA_INCLUDE_XTENSA_CONFIG_H_ - -#include -#include -#include /* required for XSHAL_CLIB */ - -#include "xtensa_context.h" - -/* - * STACK REQUIREMENTS - * - * This section defines the minimum stack size, and the extra space required to - * be allocated for saving coprocessor state and/or C library state information - * (if thread safety is enabled for the C library). The sizes are in bytes. - * - * Stack sizes for individual threads should be derived from these minima based - * on the maximum call depth of the task and the maximum level of interrupt - * nesting. A minimum stack size is defined by XT_STACK_MIN_SIZE. This minimum - * is based on the requirement for a task that calls nothing else but can be - * interrupted. This assumes that interrupt handlers do not call more than a - * few levels deep. If this is not true, i.e. 
one or more interrupt handlers - * make deep calls then the minimum must be increased. - * - * If the Xtensa processor configuration includes coprocessors, then space is - * allocated to save the coprocessor state on the stack. - * - * If thread safety is enabled for the C runtime library, - * (XT_USE_THREAD_SAFE_CLIB is defined) then space is allocated to save the C - * library context in the TCB. - * - * Allocating insufficient stack space is a common source of hard-to-find - * errors. During development, it is best to enable the FreeRTOS stack - * checking features. - * - * Usage: - * - * XT_USE_THREAD_SAFE_CLIB -- Define this to a nonzero value to enable - * thread-safe use of the C library. This will require extra stack space to be - * allocated for threads that use the C library reentrant functions. See below - * for more information. - * - * NOTE: The Xtensa toolchain supports multiple C libraries and not all of them - * support thread safety. Check your core configuration to see which C library - * was chosen for your system. - * - * XT_STACK_MIN_SIZE -- The minimum stack size for any task. It is - * recommended that you do not use a stack smaller than this for any task. In - * case you want to use stacks smaller than this size, you must verify that the - * smaller size(s) will work under all operating conditions. - * - * XT_STACK_EXTRA -- The amount of extra stack space to allocate for a - * task that does not make C library reentrant calls. Add this to the amount of - * stack space required by the task itself. - * - * XT_STACK_EXTRA_CLIB -- The amount of space to allocate for C library - * state. - */ - -/* Extra space required for interrupt/exception hooks. */ -#ifdef XT_INTEXC_HOOKS - #ifdef __XTENSA_CALL0_ABI__ - #define STK_INTEXC_EXTRA 0x200 - #else - #define STK_INTEXC_EXTRA 0x180 - #endif -#else - #define STK_INTEXC_EXTRA 0 -#endif - -/* Check C library thread safety support and compute size of C library save - * area. - */ -#if XT_USE_THREAD_SAFE_CLIB > 0u - #if XSHAL_CLIB == XTHAL_CLIB_XCLIB - #define XT_HAVE_THREAD_SAFE_CLIB 0 - #error "Thread-safe operation is not yet supported for the XCLIB C library." - #elif XSHAL_CLIB == XTHAL_CLIB_NEWLIB - #define XT_HAVE_THREAD_SAFE_CLIB 1 - #if !defined __ASSEMBLER__ - #include - #define XT_CLIB_CONTEXT_AREA_SIZE ((sizeof(struct _reent) + 15) + (-16)) - #define XT_CLIB_GLOBAL_PTR _impure_ptr - #endif - #else - #define XT_HAVE_THREAD_SAFE_CLIB 0 - #error "The selected C runtime library is not thread safe." - #endif -#else - #define XT_CLIB_CONTEXT_AREA_SIZE 0 -#endif - -/* Extra size -- interrupt frame plus coprocessor save area plus hook space. - * - * NOTE: Make sure XT_INTEXC_HOOKS is undefined unless you really need the - * hooks. - */ -#ifdef __XTENSA_CALL0_ABI__ - #define XT_XTRA_SIZE (XT_STK_FRMSZ + STK_INTEXC_EXTRA + 0x10 + XT_CP_SIZE) -#else - #define XT_XTRA_SIZE (XT_STK_FRMSZ + STK_INTEXC_EXTRA + 0x20 + XT_CP_SIZE) -#endif - -/* - * Space allocated for user code -- function calls and local variables. - * - * NOTE: This number can be adjusted to suit your needs. You must verify that - * the amount of space you reserve is adequate for the worst-case conditions in - * your application. NOTE: The windowed ABI requires more stack, since space - * has to be reserved for spilling register windows. - */ -#ifdef __XTENSA_CALL0_ABI__ - #define XT_USER_SIZE 0x200 -#else - #define XT_USER_SIZE 0x400 -#endif - -/* Minimum recommended stack size. 
*/ -#define XT_STACK_MIN_SIZE \ - ((XT_XTRA_SIZE + XT_USER_SIZE) / sizeof(unsigned char)) - -/* OS overhead with and without C library thread context. */ -#define XT_STACK_EXTRA (XT_XTRA_SIZE) -#define XT_STACK_EXTRA_CLIB (XT_XTRA_SIZE + XT_CLIB_CONTEXT_AREA_SIZE) - - -#endif /* ZEPHYR_ARCH_XTENSA_INCLUDE_XTENSA_CONFIG_H_ */ diff --git a/include/arch/xtensa/xtensa_context.h b/include/arch/xtensa/xtensa_context.h deleted file mode 100644 index a2dcf5517d3d18..00000000000000 --- a/include/arch/xtensa/xtensa_context.h +++ /dev/null @@ -1,323 +0,0 @@ -/* - * Copyright (c) 2016 Cadence Design Systems, Inc. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * XTENSA CONTEXT FRAMES AND MACROS FOR RTOS ASSEMBLER SOURCES - * - * This header contains definitions and macros for use primarily by Xtensa RTOS - * assembly coded source files. It includes and uses the Xtensa hardware - * abstraction layer (HAL) to deal with config specifics. It may also be - * included in C source files. - * - * Supports only Xtensa Exception Architecture 2 (XEA2). XEA1 not supported. - * - * NOTE: The Xtensa architecture requires stack pointer alignment to 16 bytes. - */ - -#ifndef ZEPHYR_ARCH_XTENSA_INCLUDE_XTENSA_CONTEXT_H_ -#define ZEPHYR_ARCH_XTENSA_INCLUDE_XTENSA_CONTEXT_H_ - -#include -#include -#include -#include - -#ifdef __ASSEMBLER__ -#include -#else /* __ASSEMBLER__ */ -#ifdef __cplusplus -extern "C" { -#endif -#endif /* __ASSEMBLER__ */ - -/* Align a value up to nearest n-byte boundary, where n is a power of 2. */ -#define ALIGNUP(n, val) (((val) + (n)-1) & -(n)) - -/* - * INTERRUPT/EXCEPTION STACK FRAME FOR A THREAD OR NESTED INTERRUPT - * - * A stack frame of this structure is allocated for any interrupt or exception. - * It goes on the current stack. If the RTOS has a system stack for handling - * interrupts, every thread stack must allow space for just one interrupt stack - * frame, then nested interrupt stack frames go on the system stack. - * - * The frame includes basic registers (explicit) and "extra" registers - * introduced by user TIE or the use of the MAC16 option in the user's Xtensa - * config. The frame size is minimized by omitting regs not applicable to - * user's config. - * - * For Windowed ABI, this stack frame includes the interruptee's base save - * area, another base save area to manage gcc nested functions, and a little - * temporary space to help manage the spilling of the register windows. 
- */ - -STRUCT_BEGIN -STRUCT_FIELD(long, 4, XT_STK_, exit) /* exit point for dispatch */ -STRUCT_FIELD(long, 4, XT_STK_, pc) /* return PC */ -STRUCT_FIELD(long, 4, XT_STK_, ps) /* return PS */ -STRUCT_FIELD(long, 4, XT_STK_, a0) -STRUCT_FIELD(long, 4, XT_STK_, a1) /* stack pointer before irq */ -STRUCT_FIELD(long, 4, XT_STK_, a2) -STRUCT_FIELD(long, 4, XT_STK_, a3) -STRUCT_FIELD(long, 4, XT_STK_, a4) -STRUCT_FIELD(long, 4, XT_STK_, a5) -STRUCT_FIELD(long, 4, XT_STK_, a6) -STRUCT_FIELD(long, 4, XT_STK_, a7) -STRUCT_FIELD(long, 4, XT_STK_, a8) -STRUCT_FIELD(long, 4, XT_STK_, a9) -STRUCT_FIELD(long, 4, XT_STK_, a10) -STRUCT_FIELD(long, 4, XT_STK_, a11) -STRUCT_FIELD(long, 4, XT_STK_, a12) -STRUCT_FIELD(long, 4, XT_STK_, a13) -STRUCT_FIELD(long, 4, XT_STK_, a14) -STRUCT_FIELD(long, 4, XT_STK_, a15) -STRUCT_FIELD(long, 4, XT_STK_, sar) -STRUCT_FIELD(long, 4, XT_STK_, exccause) -STRUCT_FIELD(long, 4, XT_STK_, excvaddr) -#if XCHAL_HAVE_LOOPS -STRUCT_FIELD(long, 4, XT_STK_, lbeg) -STRUCT_FIELD(long, 4, XT_STK_, lend) -STRUCT_FIELD(long, 4, XT_STK_, lcount) -#endif -#ifndef __XTENSA_CALL0_ABI__ -/* Temporary space for saving stuff during window spill */ -STRUCT_FIELD(long, 4, XT_STK_, tmp0) -STRUCT_FIELD(long, 4, XT_STK_, tmp1) -STRUCT_FIELD(long, 4, XT_STK_, tmp2) -#endif -#ifdef XT_USE_SWPRI -/* Storage for virtual priority mask */ -STRUCT_FIELD(long, 4, XT_STK_, vpri) -#endif -#ifdef XT_USE_OVLY -/* Storage for overlay state */ -STRUCT_FIELD(long, 4, XT_STK_, ovly) -#endif -STRUCT_END(XtExcFrame) - -#if defined(_ASMLANGUAGE) || defined(__ASSEMBLER__) -#define XT_STK_NEXT1 XtExcFrameSize -#else -#define XT_STK_NEXT1 sizeof(XtExcFrame) -#endif - -/* Allocate extra storage if needed */ -#if XCHAL_EXTRA_SA_SIZE != 0 - -#if XCHAL_EXTRA_SA_ALIGN <= 16 -#define XT_STK_EXTRA ALIGNUP(XCHAL_EXTRA_SA_ALIGN, XT_STK_NEXT1) -#else -/* If need more alignment than stack, add space for dynamic alignment */ -#define XT_STK_EXTRA (ALIGNUP(XCHAL_EXTRA_SA_ALIGN, XT_STK_NEXT1) \ - + XCHAL_EXTRA_SA_ALIGN) -#endif -#define XT_STK_NEXT2 (XT_STK_EXTRA + XCHAL_EXTRA_SA_SIZE) - -#else - -#define XT_STK_NEXT2 XT_STK_NEXT1 - -#endif - -/* - * This is the frame size. Add space for 4 registers (interruptee's base save - * area) and some space for gcc nested functions if any. - */ -#define XT_STK_FRMSZ (ALIGNUP(0x10, XT_STK_NEXT2) + 0x20) - - -/* - * SOLICITED STACK FRAME FOR A THREAD - * - * A stack frame of this structure is allocated whenever a thread enters the - * RTOS kernel intentionally (and synchronously) to submit to thread - * scheduling. It goes on the current thread's stack. - * - * The solicited frame only includes registers that are required to be - * preserved by the callee according to the compiler's ABI conventions, some - * space to save the return address for returning to the caller, and the - * caller's PS register. For Windowed ABI, this stack frame includes the - * caller's base save area. - * - * Note on XT_SOL_EXIT field: - * - * It is necessary to distinguish a solicited from an interrupt stack frame. - * This field corresponds to XT_STK_EXIT in the interrupt stack frame and is - * always at the same offset (0). It can be written with a code (usually 0) to - * distinguish a solicted frame from an interrupt frame. An RTOS port may opt - * to ignore this field if it has another way of distinguishing frames. 
- */ - -STRUCT_BEGIN -STRUCT_FIELD(long, 4, XT_SOL_, exit) -STRUCT_FIELD(long, 4, XT_SOL_, pc) -STRUCT_FIELD(long, 4, XT_SOL_, ps) -STRUCT_FIELD(long, 4, XT_SOL_, next) -#ifdef __XTENSA_CALL0_ABI__ -STRUCT_FIELD(long, 4, XT_SOL_, a12) /* should be on 16-byte alignment */ -STRUCT_FIELD(long, 4, XT_SOL_, a13) -STRUCT_FIELD(long, 4, XT_SOL_, a14) -STRUCT_FIELD(long, 4, XT_SOL_, a15) -#else -STRUCT_FIELD(long, 4, XT_SOL_, a0) /* should be on 16-byte alignment */ -STRUCT_FIELD(long, 4, XT_SOL_, a1) -STRUCT_FIELD(long, 4, XT_SOL_, a2) -STRUCT_FIELD(long, 4, XT_SOL_, a3) -#endif -STRUCT_END(XtSolFrame) - -/* Size of solicited stack frame */ -#define XT_SOL_FRMSZ ALIGNUP(0x10, XtSolFrameSize) - - -/* - * CO-PROCESSOR STATE SAVE AREA FOR A THREAD - * - * The RTOS must provide an area per thread to save the state of co-processors - * when that thread does not have control. Co-processors are context-switched - * lazily (on demand) only when a new thread uses a co-processor instruction, - * otherwise a thread retains ownership of the co-processor even when it loses - * control of the processor. An Xtensa co-processor exception is triggered when - * any co-processor instruction is executed by a thread that is not the owner, - * and the context switch of that co-processor is then performed by the handler. - * Ownership represents which thread's state is currently in the co-processor. - * - * Co-processors may not be used by interrupt or exception handlers. If a - * co-processor instruction is executed by an interrupt or exception handler, - * the co-processor exception handler will trigger a kernel panic and freeze. - * This restriction is introduced to reduce the overhead of saving and - * restoring co-processor state (which can be quite large) and in particular - * remove that overhead from interrupt handlers. - * - * The co-processor state save area may be in any convenient per-thread - * location such as in the thread control block or above the thread stack area. - * It need not be in the interrupt stack frame since interrupts don't use - * co-processors. - * - * Along with the save area for each co-processor, two bitmasks with flags per - * co-processor (laid out as in the CPENABLE reg) help manage context-switching - * co-processors as efficiently as possible: - * - * XT_CPENABLE - * - * The contents of a non-running thread's CPENABLE register. It represents the - * co-processors owned (and whose state is still needed) by the thread. When a - * thread is preempted, its CPENABLE is saved here. When a thread solicits a - * context-swtich, its CPENABLE is cleared - the compiler has saved the - * (caller-saved) co-proc state if it needs to. When a non-running thread - * loses ownership of a CP, its bit is cleared. When a thread runs, it's - * XT_CPENABLE is loaded into the CPENABLE reg. Avoids co-processor exceptions - * when no change of ownership is needed. - * - * XT_CPSTORED - * - * A bitmask with the same layout as CPENABLE, a bit per co-processor. - * Indicates whether the state of each co-processor is saved in the state save - * area. When a thread enters the kernel, only the state of co-procs still - * enabled in CPENABLE is saved. When the co-processor exception handler - * assigns ownership of a co-processor to a thread, it restores the saved state - * only if this bit is set, and clears this bit. - * - * XT_CP_CS_ST - * - * A bitmask with the same layout as CPENABLE, a bit per co-processor. - * Indicates whether callee-saved state is saved in the state save area. 
- * Callee-saved state is saved by itself on a solicited context switch, and - * restored when needed by the coprocessor exception handler. Unsolicited - * switches will cause the entire coprocessor to be saved when necessary. - * - * XT_CP_ASA - * - * Pointer to the aligned save area. Allows it to be aligned more than the - * overall save area (which might only be stack-aligned or TCB-aligned). - * Especially relevant for Xtensa cores configured with a very large data path - * that requires alignment greater than 16 bytes (ABI stack alignment). - */ - -#define XT_CP_DESCR_SIZE 12 - -#if XCHAL_CP_NUM > 0 - -/* Offsets of each coprocessor save area within the 'aligned save area': */ -#define XT_CP0_SA 0 -#define XT_CP1_SA ALIGNUP(XCHAL_CP1_SA_ALIGN, XT_CP0_SA + XCHAL_CP0_SA_SIZE) -#define XT_CP2_SA ALIGNUP(XCHAL_CP2_SA_ALIGN, XT_CP1_SA + XCHAL_CP1_SA_SIZE) -#define XT_CP3_SA ALIGNUP(XCHAL_CP3_SA_ALIGN, XT_CP2_SA + XCHAL_CP2_SA_SIZE) -#define XT_CP4_SA ALIGNUP(XCHAL_CP4_SA_ALIGN, XT_CP3_SA + XCHAL_CP3_SA_SIZE) -#define XT_CP5_SA ALIGNUP(XCHAL_CP5_SA_ALIGN, XT_CP4_SA + XCHAL_CP4_SA_SIZE) -#define XT_CP6_SA ALIGNUP(XCHAL_CP6_SA_ALIGN, XT_CP5_SA + XCHAL_CP5_SA_SIZE) -#define XT_CP7_SA ALIGNUP(XCHAL_CP7_SA_ALIGN, XT_CP6_SA + XCHAL_CP6_SA_SIZE) -#define XT_CP_SA_SIZE ALIGNUP(16, XT_CP7_SA + XCHAL_CP7_SA_SIZE) - -/* Offsets within the overall save area: */ - -/* (2 bytes) coprocessors active for this thread */ -#define XT_CPENABLE 0 - - /* (2 bytes) coprocessors saved for this thread */ -#define XT_CPSTORED 2 - -/* (2 bytes) coprocessor callee-saved regs stored for this thread */ -#define XT_CP_CS_ST 4 - -/* (4 bytes) ptr to aligned save area */ -#define XT_CP_ASA 8 - -/* Overall size allows for dynamic alignment: */ -#define XT_CP_SIZE ALIGNUP(XCHAL_TOTAL_SA_ALIGN, \ - XT_CP_DESCR_SIZE + XT_CP_SA_SIZE) -#else -#define XT_CP_SIZE 0 -#endif - - -/* - * MACROS TO HANDLE ABI SPECIFICS OF FUNCTION ENTRY AND RETURN - * - * Convenient where the frame size requirements are the same for both ABIs. - * ENTRY(sz), RET(sz) are for framed functions (have locals or make calls). - * ENTRY0, RET0 are for frameless functions (no locals, no calls). - * - * where size = size of stack frame in bytes (must be >0 and aligned to 16). - * For framed functions the frame is created and the return address saved at - * base of frame (Call0 ABI) or as determined by hardware (Windowed ABI). For - * frameless functions, there is no frame and return address remains in - * a0. - * - * Note: Because CPP macros expand to a single line, macros requiring - * multi-line expansions are implemented as assembler macros. - */ - -#ifdef __ASSEMBLER__ -#ifdef __XTENSA_CALL0_ABI__ -/* Call0 */ -#define ENTRY(sz) entry1 sz -.macro entry1 size=0x10 -addi sp, sp, -\size -s32i a0, sp, 0 -.endm -#define ENTRY0 -#define RET(sz) ret1 sz -.macro ret1 size=0x10 -l32i a0, sp, 0 -addi sp, sp, \size -ret -.endm -#define RET0 ret -#else -/* Windowed */ -#define ENTRY(sz) entry sp, sz -#define ENTRY0 entry sp, 0x10 -#define RET(sz) retw -#define RET0 retw -#endif /* __XTENSA_CALL0_ABI__ */ -#else /* __ASSEMBLER__ */ -#ifdef __cplusplus -} -#endif -#endif /* __ASSEMBLER__ */ - - -#endif /* ZEPHYR_ARCH_XTENSA_INCLUDE_XTENSA_CONTEXT_H_ */ diff --git a/include/arch/xtensa/xtensa_rtos.h b/include/arch/xtensa/xtensa_rtos.h deleted file mode 100644 index 015cc17a5f12cd..00000000000000 --- a/include/arch/xtensa/xtensa_rtos.h +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Copyright (c) 2016 Cadence Design Systems, Inc. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * RTOS-SPECIFIC INFORMATION FOR XTENSA RTOS ASSEMBLER SOURCES - * (FreeRTOS Port) - * - * This header is the primary glue between generic Xtensa RTOS support - * sources and a specific RTOS port for Xtensa. It contains definitions - * and macros for use primarily by Xtensa assembly coded source files. - * - * Macros in this header map callouts from generic Xtensa files to specific - * RTOS functions. It may also be included in C source files. - * - * Xtensa RTOS ports support all RTOS-compatible configurations of the Xtensa - * architecture, using the Xtensa hardware abstraction layer (HAL) to deal - * with configuration specifics. - * - * Should be included by all Xtensa generic and RTOS port-specific sources. - */ - -#ifndef ZEPHYR_ARCH_XTENSA_INCLUDE_XTENSA_RTOS_H_ -#define ZEPHYR_ARCH_XTENSA_INCLUDE_XTENSA_RTOS_H_ - -#ifdef __ASSEMBLER__ -#include -#else -#include -#endif - -#include -#include - - -/* - * Convert Zephyr definitions to XTENSA definitions. - */ - -#undef XT_SIMULATOR -#undef XT_BOARD -#ifdef CONFIG_SIMULATOR_XTENSA - #define XT_SIMULATOR 1 -#else - #define XT_BOARD 1 -#endif - -#undef XT_CLOCK_FREQ -#define XT_CLOCK_FREQ DT_PROP(DT_PATH(cpus, cpu_0), clock_frequency) - -#ifndef XT_TIMER_INDEX - #if defined configXT_TIMER_INDEX - /* Index of hardware timer to be used */ - #define XT_TIMER_INDEX configXT_TIMER_INDEX - #endif -#endif - -#ifndef XT_INTEXC_HOOKS - #if configXT_INTEXC_HOOKS - #define XT_INTEXC_HOOKS 1 /* Enables exception hooks */ - #endif -#endif - -#if (!XT_SIMULATOR) && (!XT_BOARD) - #error Either XT_SIMULATOR or XT_BOARD must be defined. -#endif - - -/* - * Name of RTOS (for messages). - */ -#define XT_RTOS_NAME Zephyr - -/* - * Define for enabling RTOS specific code. Enable only one of below lines. - */ -#define XT_RTOS_IS_ZEPHYR_OS 1 -#undef XT_RTOS_IS_FREE_RTOS - -/* - * Check some Xtensa configuration requirements and report error if not met. - * Error messages can be customize to the RTOS port. - */ - -#if !XCHAL_HAVE_XEA2 -#error "Zephyr/Xtensa requires XEA2 (exception architecture 2)." -#endif - -/* - * RTOS CALLOUT MACROS MAPPED TO RTOS PORT-SPECIFIC FUNCTIONS. - * - * Define callout macros used in generic Xtensa code to interact with the RTOS. - * The macros are simply the function names for use in calls from assembler - * code. - * Some of these functions may call back to generic functions in - * xtensa_context.h . - */ - -/* - * Inform RTOS of entry into an interrupt handler that will affect it. - * Allows RTOS to manage switch to any system stack and count nesting level. - * Called after minimal context has been saved, with interrupts disabled. - * RTOS port can call0 _xt_context_save to save the rest of the context. - * May only be called from assembly code by the 'call0' instruction. - */ -#define XT_RTOS_INT_ENTER _zxt_int_enter - -/* - * Inform RTOS of completion of an interrupt handler, and give control to - * RTOS to perform thread/task scheduling, switch back from any system stack - * and restore the context, and return to the exit dispatcher saved in the - * stack frame at XT_STK_EXIT. RTOS port can call0 _xt_context_restore - * to save the context saved in XT_RTOS_INT_ENTER via _xt_context_save, - * leaving only a minimal part of the context to be restored by the exit - * dispatcher. This function does not return to the place it was called from. - * May only be called from assembly code by the 'call0' instruction. 
- */ -#define XT_RTOS_INT_EXIT _zxt_int_exit - -/* - * Inform RTOS of the occurrence of a tick timer interrupt. - * If RTOS has no tick timer, leave XT_RTOS_TIMER_INT undefined. - * May be coded in or called from C or assembly, per ABI conventions. - * RTOS may optionally define XT_TICK_PER_SEC in its own way (eg. macro). - */ -#define XT_RTOS_TIMER_INT _zxt_timer_int - -#if CONFIG_TICKLESS_KERNEL -#define XT_TICK_PER_SEC 1000 -#else -#define XT_TICK_PER_SEC CONFIG_SYS_CLOCK_TICKS_PER_SEC -#endif /* CONFIG_TICKLESS_KERNEL */ - -/* - * Return in a15 the base address of the co-processor state save area for the - * thread that triggered a co-processor exception, or 0 if no thread was - * running. The state save area is structured as defined in xtensa_context.h - * and has size XT_CP_SIZE. Co-processor instructions should only be used in - * thread code, never in interrupt handlers or the RTOS kernel. May only be - * called from assembly code and by the 'call0' instruction. A result of 0 - * indicates an unrecoverable error. - * - * The implementation may use only a2-4, a15 (all other regs must be - * preserved). - */ -#define XT_RTOS_CP_STATE _zxt_task_coproc_state - - -/* - * HOOKS TO DYNAMICALLY INSTALL INTERRUPT AND EXCEPTION HANDLERS PER LEVEL. - * - * This Xtensa RTOS port provides hooks for dynamically installing exception - * and interrupt handlers to facilitate automated testing where each test case - * can install its own handler for user exceptions and each interrupt priority - * (level). This consists of an array of function pointers indexed by interrupt - * priority, with index 0 being the user exception handler hook. Each entry in - * the array is initially 0, and may be replaced by a function pointer of type - * XT_INTEXC_HOOK. A handler may be uninstalled by installing 0. - * - * The handler for low and medium priority obeys ABI conventions so may be - * coded in C. For the exception handler, the cause is the contents of the - * EXCCAUSE reg, and the result is -1 if handled, else the cause (still needs - * handling). For interrupt handlers, the cause is a mask of pending enabled - * interrupts at that level, and the result is the same mask with the bits for - * the handled interrupts cleared (those not cleared still need handling). This - * allows a test case to either pre-handle or override the default handling for - * the exception or interrupt level (see xtensa_vectors.S). - * - * High priority handlers (including NMI) must be coded in assembly, are always - * called by 'call0' regardless of ABI, must preserve all registers except a0, - * and must not use or modify the interrupted stack. The hook argument 'cause' - * is not passed and the result is ignored, so as not to burden the caller - * with saving and restoring a2 (it assumes only one interrupt per level - see - * the discussion in high priority interrupts in xtensa_vectors.S). The handler - * therefore should be coded to prototype 'void h(void)' even though it plugs - * into an array of handlers of prototype 'unsigned h(unsigned)'. - * - * To enable interrupt/exception hooks, compile the RTOS with - * '-DXT_INTEXC_HOOKS'. - */ -#define XT_INTEXC_HOOK_NUM (1 + XCHAL_NUM_INTLEVELS + XCHAL_HAVE_NMI) - -#ifndef __ASSEMBLER__ -typedef unsigned int (*XT_INTEXC_HOOK)(unsigned int cause); -extern volatile XT_INTEXC_HOOK _xt_intexc_hooks[XT_INTEXC_HOOK_NUM]; -#endif - - -/* - * CONVENIENCE INCLUSIONS. - * - * Ensures RTOS specific files need only include this one Xtensa-generic - * header. 
These headers are included last so they can use the RTOS - * definitions above. - */ - -#include "xtensa_context.h" - -#ifdef XT_RTOS_TIMER_INT -#include "xtensa_timer.h" -#endif - -#endif /* ZEPHYR_ARCH_XTENSA_INCLUDE_XTENSA_RTOS_H_ */ diff --git a/include/arch/xtensa/xtensa_timer.h b/include/arch/xtensa/xtensa_timer.h deleted file mode 100644 index a081136d8faf28..00000000000000 --- a/include/arch/xtensa/xtensa_timer.h +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright (c) 2016 Cadence Design Systems, Inc. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * XTENSA INFORMATION FOR RTOS TICK TIMER AND CLOCK FREQUENCY - * - * This header contains definitions and macros for use primarily by Xtensa - * RTOS assembly coded source files. It includes and uses the Xtensa hardware - * abstraction layer (HAL) to deal with config specifics. It may also be - * included in C source files. - * - * User may edit to modify timer selection and to specify clock frequency and - * tick duration to match timer interrupt to the real-time tick duration. - * - * If the RTOS has no timer interrupt, then there is no tick timer and the - * clock frequency is irrelevant, so all of these macros are left undefined - * and the Xtensa core configuration need not have a timer. - */ - -#ifndef ZEPHYR_ARCH_XTENSA_INCLUDE_XTENSA_TIMER_H_ -#define ZEPHYR_ARCH_XTENSA_INCLUDE_XTENSA_TIMER_H_ - -#ifdef __ASSEMBLER__ -#include -#endif - -#include -#include - -#include "xtensa_rtos.h" /* in case this wasn't included directly */ - -#define USE_INTERNAL_TIMER 1 -#define EXTERNAL_TIMER_IRQ -1 - -#if USE_INTERNAL_TIMER || (EXTERNAL_TIMER_IRQ < 0) -/* - * Select timer to use for periodic tick, and determine its interrupt number - * and priority. User may specify a timer by defining XT_TIMER_INDEX with -D, - * in which case its validity is checked (it must exist in this core and must - * not be on a high priority interrupt - an error will be reported in invalid). - * Otherwise select the first low or medium priority interrupt timer available. - */ -#if XCHAL_NUM_TIMERS == 0 -#error "This Xtensa configuration is unsupported, it has no timers." -#endif /* XCHAL_NUM_TIMERS */ - -#ifndef XT_TIMER_INDEX - #if XCHAL_TIMER3_INTERRUPT != XTHAL_TIMER_UNCONFIGURED - #if XCHAL_INT_LEVEL(XCHAL_TIMER3_INTERRUPT) <= XCHAL_EXCM_LEVEL - #undef XT_TIMER_INDEX - #define XT_TIMER_INDEX 3 - #endif - #endif - #if XCHAL_TIMER2_INTERRUPT != XTHAL_TIMER_UNCONFIGURED - #if XCHAL_INT_LEVEL(XCHAL_TIMER2_INTERRUPT) <= XCHAL_EXCM_LEVEL - #undef XT_TIMER_INDEX - #define XT_TIMER_INDEX 2 - #endif - #endif - #if XCHAL_TIMER1_INTERRUPT != XTHAL_TIMER_UNCONFIGURED - #if XCHAL_INT_LEVEL(XCHAL_TIMER1_INTERRUPT) <= XCHAL_EXCM_LEVEL - #undef XT_TIMER_INDEX - #define XT_TIMER_INDEX 1 - #endif - #endif - #if XCHAL_TIMER0_INTERRUPT != XTHAL_TIMER_UNCONFIGURED - #if XCHAL_INT_LEVEL(XCHAL_TIMER0_INTERRUPT) <= XCHAL_EXCM_LEVEL - #undef XT_TIMER_INDEX - #define XT_TIMER_INDEX 0 - #endif - #endif -#endif -#ifndef XT_TIMER_INDEX -#error "There is no suitable timer in this Xtensa configuration." -#endif - -#define XT_CCOMPARE ((CCOMPARE) + (XT_TIMER_INDEX)) -#define XT_TIMER_INTNUM XCHAL_TIMER_INTERRUPT(XT_TIMER_INDEX) -#if XT_TIMER_INTNUM == XTHAL_TIMER_UNCONFIGURED -#error "The timer selected by XT_TIMER_INDEX does not exist in this core." 
-#endif -#else /* Case of an external timer which is not emulated by internal timer */ -#define XT_TIMER_INTNUM EXTERNAL_TIMER_IRQ -#endif /* USE_INTERNAL_TIMER || (EXTERNAL_TIMER_IRQ < 0) */ - -#if USE_INTERNAL_TIMER -#define XT_TIMER_INTPRI XCHAL_INT_LEVEL(XT_TIMER_INTNUM) -#else -#define XT_TIMER_INTPRI EXTERNAL_TIMER_IRQ_PRIORITY -#endif /* USE_INTERNAL_TIMER */ - -#if XT_TIMER_INTPRI > XCHAL_EXCM_LEVEL -#error "The timer interrupt cannot be high priority (use medium or low)." -#endif - -#define XT_TIMER_INTEN (1 << (XT_TIMER_INTNUM)) - -/* - * Set processor clock frequency, used to determine clock divisor for timer - * tick. User should BE SURE TO ADJUST THIS for the Xtensa platform being - * used. If using a supported board via the board-independent API defined in - * xtbsp.h, this may be left undefined and frequency and tick divisor will be - * computed and cached during run-time initialization. - * - * NOTE ON SIMULATOR: Under the Xtensa instruction set simulator, the frequency - * can only be estimated because it depends on the speed of the host and the - * version of the simulator. Also because it runs much slower than hardware, - * it is not possible to achieve real-time performance for most applications - * under the simulator. A frequency too low does not allow enough time between - * timer interrupts, starving threads. To obtain a more convenient but - * non-real-time tick duration on the simulator, compile with xt-xcc option - * "-DXT_SIMULATOR". Adjust this frequency to taste (it's not real-time - * anyway!). - */ -#if defined(XT_SIMULATOR) && !defined(XT_CLOCK_FREQ) -#define XT_CLOCK_FREQ DT_PROP(DT_PATH(cpus, cpu_0), clock_frequency) -#endif - -#if !defined(XT_CLOCK_FREQ) && !defined(XT_BOARD) -#error "XT_CLOCK_FREQ must be defined for the target platform." -#endif - -/* - * Default number of timer "ticks" per second (default 100 for 10ms tick). - * RTOS may define this in its own way (if applicable) in xtensa_rtos.h. - * User may redefine this to an optimal value for the application, either by - * editing this here or in xtensa_rtos.h, or compiling with xt-xcc option - * "-DXT_TICK_PER_SEC=" where is a suitable number. - */ -#ifndef XT_TICK_PER_SEC -#if CONFIG_TICKLESS_KERNEL -#define XT_TICK_PER_SEC 1000 /* In tickless kernel 1TICK = 1msec */ -#else -#define XT_TICK_PER_SEC CONFIG_SYS_CLOCK_TICKS_PER_SEC -#endif /* CONFIG_TICKLESS_KERNEL */ -#endif /* XT_TICK_PER_SEC */ - -/* - * Derivation of clock divisor for timer tick and interrupt (one per tick). 
- */
-#ifdef XT_CLOCK_FREQ
-#define XT_TICK_DIVISOR (XT_CLOCK_FREQ / XT_TICK_PER_SEC)
-#endif
-
-#if USE_INTERNAL_TIMER || (EXTERNAL_TIMER_IRQ < 0)
-#ifndef __ASSEMBLER__
-extern unsigned int _xt_tick_divisor;
-extern void z_xt_tick_divisor_init(void);
-#endif
-
-#endif // Internal/External timer
-
-#endif /* ZEPHYR_ARCH_XTENSA_INCLUDE_XTENSA_TIMER_H_ */
diff --git a/soc/xtensa/intel_adsp/common/soc.c b/soc/xtensa/intel_adsp/common/soc.c
index f3c517514e0da1..64ad62f1dcc919 100644
--- a/soc/xtensa/intel_adsp/common/soc.c
+++ b/soc/xtensa/intel_adsp/common/soc.c
@@ -5,7 +5,6 @@
  */
 
 #include
-#include
 #include
 #include
 #include
diff --git a/soc/xtensa/intel_s1000/soc.c b/soc/xtensa/intel_s1000/soc.c
index 0883ddfad5ebfc..d1dba101b9ef0d 100644
--- a/soc/xtensa/intel_s1000/soc.c
+++ b/soc/xtensa/intel_s1000/soc.c
@@ -5,7 +5,6 @@
  */
 
 #include
-#include
 #include
 #include
 #include

From 088d1cfb734090bb5c2e2e6892849d907a8580e1 Mon Sep 17 00:00:00 2001
From: Andy Ross
Date: Sun, 14 Feb 2021 11:51:46 -0800
Subject: [PATCH 03/17] arch/xtensa: Inline atomics

The xtensa atomics layer was written with hand-coded assembly that had
to be called as functions.  That's needlessly slow, given that the low
level primitives are a two-instruction sequence.  Ideally the compiler
should see this as an inline to permit it to better optimize around
the needed barriers.

There was also a bug with the atomic_cas function, which had a loop
internally instead of returning the old value synchronously on a
failed swap.  That's benign right now because our existing spin lock
does nothing but retry it in a tight loop anyway, but it's incorrect
per spec and would have caused a contention hang with more elaborate
algorithms (for example a spinlock with backoff semantics).

Remove the old implementation and replace with a much smaller inline C
one based on just two assembly primitives.

This patch also contains a little bit of refactoring of the atomic
headers: each implementation scheme has been split out into a separate
header (atomic_builtin.h, atomic_c.h, and the new arch-specific one),
and the ATOMIC_OPERATIONS_CUSTOM kconfig has been renamed to
ATOMIC_OPERATIONS_ARCH to better capture what it means.
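For illustration, the workhorse here is the S32C1I conditional store
paired with a write to the SCOMPARE1 special register: a
single-attempt compare-and-swap built on it hands the old memory value
straight back to the caller, which is the behavior the old atomic_cas
got wrong.  A minimal sketch, assuming an S32C1I-capable core; the
name and exact shape are illustrative, not necessarily those used in
the new atomic_xtensa.h header:

    static inline int32_t xtensa_cas_sketch(volatile int32_t *addr,
					    int32_t expected, int32_t newval)
    {
	/* S32C1I stores newval only if *addr == SCOMPARE1, and in all
	 * cases returns the prior memory contents in the same register.
	 */
	__asm__ volatile("wsr %1, scompare1; s32c1i %0, %2, 0"
			 : "+r"(newval)
			 : "r"(expected), "r"(addr)
			 : "memory");

	return newval;	/* equals 'expected' iff the swap happened */
    }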
Signed-off-by: Andy Ross --- arch/Kconfig | 1 + arch/xtensa/core/CMakeLists.txt | 1 - arch/xtensa/core/atomic.S | 456 -------------------------- doc/reference/kernel/other/atomic.rst | 2 +- include/arch/xtensa/atomic_xtensa.h | 143 ++++++++ include/sys/atomic.h | 387 +--------------------- include/sys/atomic_builtin.h | 307 +++++++++++++++++ include/sys/atomic_c.h | 78 +++++ kernel/Kconfig | 2 +- 9 files changed, 549 insertions(+), 828 deletions(-) delete mode 100644 arch/xtensa/core/atomic.S create mode 100644 include/arch/xtensa/atomic_xtensa.h create mode 100644 include/sys/atomic_builtin.h create mode 100644 include/sys/atomic_c.h diff --git a/arch/Kconfig b/arch/Kconfig index 6270656b0b68f1..febe47fdf65c95 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -91,6 +91,7 @@ config XTENSA select HAS_DTS select USE_SWITCH select USE_SWITCH_SUPPORTED + select ATOMIC_OPERATIONS_ARCH help Xtensa architecture diff --git a/arch/xtensa/core/CMakeLists.txt b/arch/xtensa/core/CMakeLists.txt index 96fcc299fc8c87..f28cd18ffb4ddc 100644 --- a/arch/xtensa/core/CMakeLists.txt +++ b/arch/xtensa/core/CMakeLists.txt @@ -13,7 +13,6 @@ zephyr_library_sources( irq_manage.c ) -zephyr_library_sources_ifndef(CONFIG_ATOMIC_OPERATIONS_C atomic.S) zephyr_library_sources_ifdef(CONFIG_XTENSA_USE_CORE_CRT1 crt1.S) zephyr_library_sources_ifdef(CONFIG_IRQ_OFFLOAD irq_offload.c) zephyr_library_sources_ifdef(CONFIG_THREAD_LOCAL_STORAGE tls.c) diff --git a/arch/xtensa/core/atomic.S b/arch/xtensa/core/atomic.S deleted file mode 100644 index c5ab64714e5de3..00000000000000 --- a/arch/xtensa/core/atomic.S +++ /dev/null @@ -1,456 +0,0 @@ -/* - * Copyright (c) 2016 Cadence Design Systems, Inc. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * MACROS TO HANDLE ABI SPECIFICS OF FUNCTION ENTRY AND RETURN - * - * Convenient where the frame size requirements are the same for both ABIs. - * ENTRY(sz), RET(sz) are for framed functions (have locals or make calls). - * ENTRY0, RET0 are for frameless functions (no locals, no calls). - * - * where size = size of stack frame in bytes (must be >0 and aligned to 16). - * For framed functions the frame is created and the return address saved at - * base of frame (Call0 ABI) or as determined by hardware (Windowed ABI). For - * frameless functions, there is no frame and return address remains in - * a0. - * - * Note: Because CPP macros expand to a single line, macros requiring - * multi-line expansions are implemented as assembler macros. - */ - -#ifdef __XTENSA_CALL0_ABI__ -/* Call0 */ -#define ENTRY(sz) entry1 sz -.macro entry1 size=0x10 -addi sp, sp, -\size -s32i a0, sp, 0 -.endm -#define ENTRY0 -#define RET(sz) ret1 sz -.macro ret1 size=0x10 -l32i a0, sp, 0 -addi sp, sp, \size -ret -.endm -#define RET0 ret -#else -/* Windowed */ -#define ENTRY(sz) entry sp, sz -#define ENTRY0 entry sp, 0x10 -#define RET(sz) retw -#define RET0 retw -#endif /* __XTENSA_CALL0_ABI__ */ - -/** - * - * @brief Atomically clear a memory location - * - * This routine atomically clears the contents of and returns the old - * value that was in . - * - * This routine can be used from both task and interrupt level. 
- * - * @return Contents of before the atomic operation - * - * atomic_val_t atomic_clear - * ( - * atomic_t *target /@ memory location to clear @/ - * ) - */ - .global atomic_clear - .type atomic_clear,@function - .global atomic_ptr_clear - .type atomic_ptr_clear,@function - .align 4 -atomic_clear: -atomic_ptr_clear: - ENTRY(48) - movi a4, 0 -.L_LoopClear: - l32ai a3, a2, 0 - wsr a3, scompare1 - s32c1i a4, a2, 0 - bne a3, a4, .L_LoopClear - mov a2, a3 - RET(48) - -/** - * - * @brief Atomically set a memory location - * - * This routine atomically sets the contents of to and returns - * the old value that was in . - * - * This routine can be used from both task and interrupt level. - * - * @return Contents of before the atomic operation - * - * atomic_val_t atomic_set - * ( - * atomic_t *target, /@ memory location to set @/ - * atomic_val_t value /@ set with this value @/ - * ) - * - */ - .global atomic_set - .type atomic_set,@function - .global atomic_ptr_set - .type atomic_ptr_set,@function - .align 4 -atomic_set: -atomic_ptr_set: - ENTRY(48) -.L_LoopSet: - l32ai a4, a2, 0 - wsr a4, scompare1 - s32c1i a3, a2, 0 - bne a3, a4, .L_LoopSet - mov a2, a3 - RET(48) - -/** - * - * @brief Get the value of a shared memory atomically - * - * This routine atomically retrieves the value in *target - * - * long atomic_get - * ( - * atomic_t * target /@ address of atom to be retrieved @/ - * ) - * - * @return value read from address target. - * - */ - .global atomic_get - .type atomic_get,@function - .global atomic_ptr_get - .type atomic_ptr_get,@function - .align 4 -atomic_get: -atomic_ptr_get: - ENTRY(48) - l32ai a2, a2, 0 - RET(48) - -/** - * - * @brief Atomically increment a memory location - * - * This routine atomically increments the value in . The operation is - * done using unsigned integer arithmetic. Various CPU architectures may - * impose restrictions with regards to the alignment and cache attributes of - * the atomic_t type. - * - * This routine can be used from both task and interrupt level. - * - * @return Contents of before the atomic operation - * - * atomic_val_t atomic_inc - * ( - * atomic_t *target, /@ memory location to increment @/ - * ) - * - */ - - .global atomic_inc - .type atomic_inc,@function - .align 4 -atomic_inc: - ENTRY(48) -.L_LoopInc: - l32ai a3, a2, 0 - wsr a3, scompare1 - addi a4, a3, 1 - s32c1i a4, a2, 0 - bne a3, a4, .L_LoopInc - mov a2, a3 - RET(48) - -/** - * - * @brief Atomically add a value to a memory location - * - * This routine atomically adds the contents of and , placing - * the result in . The operation is done using signed integer - * arithmetic. Various CPU architectures may impose restrictions with regards - * to the alignment and cache attributes of the atomic_t type. - * - * This routine can be used from both task and interrupt level. - * - * @return Contents of before the atomic operation - * - * atomic_val_t atomic_add - * ( - * atomic_t *target, /@ memory location to add to @/ - * atomic_val_t value /@ value to add @/ - * ) - */ - .global atomic_add - .type atomic_add,@function - .align 4 -atomic_add: - ENTRY(48) -.L_LoopAdd: - l32ai a4, a2, 0 - wsr a4, scompare1 - add a5, a3, a4 - s32c1i a5, a2, 0 - bne a5, a4, .L_LoopAdd - mov a2, a5 - RET(48) - -/** - * - * @brief Atomically decrement a memory location - * - * This routine atomically decrements the value in . The operation is - * done using unsigned integer arithmetic. 
Various CPU architectures may impose - * restrictions with regards to the alignment and cache attributes of the - * atomic_t type. - * - * This routine can be used from both task and interrupt level. - * - * @return Contents of before the atomic operation - * - * atomic_val_t atomic_dec - * ( - * atomic_t *target, /@ memory location to decrement @/ - * ) - * - */ - - .global atomic_dec - .type atomic_dec,@function - .align 4 -atomic_dec: - ENTRY(48) -.L_LoopDec: - l32ai a3, a2, 0 - wsr a3, scompare1 - addi a4, a3, -1 - s32c1i a4, a2, 0 - bne a3, a4, .L_LoopDec - mov a2, a3 - RET(48) - -/** - * - * @brief Atomically subtract a value from a memory location - * - * This routine atomically subtracts from the contents of , - * placing the result in . The operation is done using signed integer - * arithmetic. Various CPU architectures may impose restrictions with regards to - * the alignment and cache attributes of the atomic_t type. - * - * This routine can be used from both task and interrupt level. - * - * @return Contents of before the atomic operation - * - * atomic_val_t atomic_sub - * ( - * atomic_t *target, /@ memory location to subtract from @/ - * atomic_val_t value /@ value to subtract @/ - * ) - * - */ - - .global atomic_sub - .type atomic_sub,@function - .align 4 -atomic_sub: - ENTRY(48) -.L_LoopSub: - l32ai a4, a2, 0 - wsr a4, scompare1 - sub a5, a4, a3 - s32c1i a5, a2, 0 - bne a5, a4, .L_LoopSub - mov a2, a5 - RET(48) - -/** - * - * @brief Atomically perform a bitwise NAND on a memory location - * - * This routine atomically performs a bitwise NAND operation of the contents of - * and , placing the result in . - * Various CPU architectures may impose restrictions with regards to the - * alignment and cache attributes of the atomic_t type. - * - * This routine can be used from both task and interrupt level. - * - * @return Contents of before the atomic operation - * - * atomic_val_t atomic_nand - * ( - * atomic_t *target, /@ memory location to NAND @/ - * atomic_val_t value /@ NAND with this value @/ - * ) - * - */ - - .global atomic_nand - .type atomic_nand,@function - .align 4 -atomic_nand: - ENTRY(48) -.L_LoopNand: - l32ai a4, a2, 0 - wsr a4, scompare1 - and a5, a3, a4 - neg a5, a5 - addi a5, a5, -1 - s32c1i a5, a2, 0 - bne a5, a4, .L_LoopNand - mov a2, a4 - RET(48) - -/** - * - * @brief Atomically perform a bitwise AND on a memory location - * - * This routine atomically performs a bitwise AND operation of the contents of - * and , placing the result in . - * Various CPU architectures may impose restrictions with regards to the - * alignment and cache attributes of the atomic_t type. - * - * This routine can be used from both task and interrupt level. - * - * @return Contents of before the atomic operation - * - * atomic_val_t atomic_and - * ( - * atomic_t *target, /@ memory location to AND @/ - * atomic_val_t value /@ AND with this value @/ - * ) - * - */ - - .global atomic_and - .type atomic_and,@function - .align 4 -atomic_and: - ENTRY(48) -.L_LoopAnd: - l32ai a4, a2, 0 - wsr a4, scompare1 - and a5, a3, a4 - s32c1i a5, a2, 0 - bne a5, a4, .L_LoopAnd - mov a2, a4 - RET(48) - -/** - * - * @brief Atomically perform a bitwise OR on memory location - * - * This routine atomically performs a bitwise OR operation of the contents of - * and , placing the result in . - * Various CPU architectures may impose restrictions with regards to the - * alignment and cache attributes of the atomic_t type. - * - * This routine can be used from both task and interrupt level. 
- * - * @return Contents of before the atomic operation - * - * atomic_val_t atomic_or - * ( - * atomic_t *target, /@ memory location to OR @/ - * atomic_val_t value /@ OR with this value @/ - * ) - * - */ - - .global atomic_or - .type atomic_or,@function - .align 4 -atomic_or: - ENTRY(48) -.L_LoopOr: - l32ai a4, a2, 0 - wsr a4, scompare1 - or a5, a3, a4 - s32c1i a5, a2, 0 - bne a4, a5, .L_LoopOr - mov a2, a4 - RET(48) - -/** - * - * @brief Atomically perform a bitwise XOR on a memory location - * - * This routine atomically performs a bitwise XOR operation of the contents of - * and , placing the result in . - * Various CPU architectures may impose restrictions with regards to the - * alignment and cache attributes of the atomic_t type. - * - * This routine can be used from both task and interrupt level. - * - * @return Contents of before the atomic operation - * - * atomic_val_t atomic_xor - * ( - * atomic_t *target, /@ memory location to XOR @/ - * atomic_val_t value /@ XOR with this value @/ - * ) - * - */ - - .global atomic_xor - .type atomic_xor,@function - .align 4 -atomic_xor: - ENTRY(48) -.L_LoopXor: - l32ai a4, a2, 0 - wsr a4, scompare1 - xor a5, a3, a4 - s32c1i a5, a2, 0 - bne a5, a4, .L_LoopXor - mov a2, a4 - RET(48) - -/** - * - * @brief Atomically compare-and-swap the contents of a memory location - * - * This routine performs an atomic compare-and-swap. testing that the contents - * of contains , and if it does, setting the value of - * to . Various CPU architectures may impose restrictions - * with regards to the alignment and cache attributes of the atomic_t type. - * - * This routine can be used from both task and interrupt level. - * - * @return 1 if the swap is actually executed, 0 otherwise. - * - * int atomic_cas - * ( - * atomic_t *target, /@ memory location to compare-and-swap @/ - * atomic_val_t oldValue, /@ compare to this value @/ - * atomic_val_t newValue, /@ swap with this value @/ - * ) - * - */ - .global atomic_cas - .type atomic_cas,@function - .global atomic_ptr_cas - .type atomic_ptr_cas,@function - .align 4 -atomic_cas: -atomic_ptr_cas: - ENTRY(48) - l32ai a5, a2, 0 - beq a5, a3, 2f -1: - movi a2, 0 - j 3f -2: - wsr a5, scompare1 - s32c1i a4, a2, 0 - bne a4, a5, 1b - movi a2, 1 -3: - RET(48) diff --git a/doc/reference/kernel/other/atomic.rst b/doc/reference/kernel/other/atomic.rst index 9acd492194a9b2..abeb9af975546d 100644 --- a/doc/reference/kernel/other/atomic.rst +++ b/doc/reference/kernel/other/atomic.rst @@ -107,7 +107,7 @@ Configuration Options Related configuration options: * :option:`CONFIG_ATOMIC_OPERATIONS_BUILTIN` -* :option:`CONFIG_ATOMIC_OPERATIONS_CUSTOM` +* :option:`CONFIG_ATOMIC_OPERATIONS_ARCH` * :option:`CONFIG_ATOMIC_OPERATIONS_C` API Reference diff --git a/include/arch/xtensa/atomic_xtensa.h b/include/arch/xtensa/atomic_xtensa.h new file mode 100644 index 00000000000000..c518f4df4ed492 --- /dev/null +++ b/include/arch/xtensa/atomic_xtensa.h @@ -0,0 +1,143 @@ +/** + * Copyright (c) 2021 Intel Corporation + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef ZEPHYR_INCLUDE_ATOMIC_XTENSA_H_ +#define ZEPHYR_INCLUDE_ATOMIC_XTENSA_H_ + +/* Included from */ + +/* Recent GCC versions actually do have working atomics support on + * Xtensa (and so should work with CONFIG_ATOMIC_OPERATIONS_BUILTIN), + * but existing versions of Xtensa's XCC do not. 
So we define an + * inline implementation here that is more or less identical + */ + +static ALWAYS_INLINE atomic_val_t atomic_get(const atomic_t *target) +{ + atomic_val_t ret; + + /* Actual Xtensa hardware seems to have only in-order + * pipelines, but the architecture does define a barrier load, + * so use it. There is a matching s32ri instruction, but + * nothing in the Zephyr API requires a barrier store (all the + * atomic write ops have exchange semantics. + */ + __asm__ volatile("l32ai %0, %1, 0" + : "=r"(ret) : "r"(target) : "memory"); + return ret; +} + +static ALWAYS_INLINE +atomic_val_t xtensa_cas(atomic_t *addr, atomic_val_t oldval, + atomic_val_t newval) +{ + __asm__ volatile("wsr %1, SCOMPARE1; s32c1i %0, %2, 0" + : "+r"(newval), "+r"(oldval) : "r"(addr) : "memory"); + + return newval; /* got swapped with the old memory by s32c1i */ +} + +static ALWAYS_INLINE +bool atomic_cas(atomic_t *target, atomic_val_t oldval, atomic_val_t newval) +{ + return oldval == xtensa_cas(target, oldval, newval); +} + +static ALWAYS_INLINE +bool atomic_ptr_cas(atomic_ptr_t *target, void *oldval, void *newval) +{ + return (atomic_val_t) oldval + == xtensa_cas((atomic_t *) target, (atomic_val_t) oldval, + (atomic_val_t) newval); +} + +/* Generates an atomic exchange sequence that swaps the value at + * address "target", whose old value is read to be "cur", with the + * specified expression. Evaluates to the old value which was + * atomically replaced. + */ + +#define Z__GEN_ATOMXCHG(expr) ({ \ + atomic_val_t res, cur; \ + do { \ + cur = *target; \ + res = xtensa_cas(target, cur, (expr)); \ + } while (res != cur); \ + res; }) + +static ALWAYS_INLINE +atomic_val_t atomic_set(atomic_t *target, atomic_val_t value) +{ + return Z__GEN_ATOMXCHG(value); +} + +static ALWAYS_INLINE +atomic_val_t atomic_add(atomic_t *target, atomic_val_t value) +{ + return Z__GEN_ATOMXCHG(cur + value); +} + +static ALWAYS_INLINE +atomic_val_t atomic_sub(atomic_t *target, atomic_val_t value) +{ + return Z__GEN_ATOMXCHG(cur - value); +} + +static ALWAYS_INLINE +atomic_val_t atomic_inc(atomic_t *target) +{ + return Z__GEN_ATOMXCHG(cur + 1); +} + +static ALWAYS_INLINE +atomic_val_t atomic_dec(atomic_t *target) +{ + return Z__GEN_ATOMXCHG(cur - 1); +} + +static ALWAYS_INLINE atomic_val_t atomic_or(atomic_t *target, + atomic_val_t value) +{ + return Z__GEN_ATOMXCHG(cur | value); +} + +static ALWAYS_INLINE atomic_val_t atomic_xor(atomic_t *target, + atomic_val_t value) +{ + return Z__GEN_ATOMXCHG(cur ^ value); +} + +static ALWAYS_INLINE atomic_val_t atomic_and(atomic_t *target, + atomic_val_t value) +{ + return Z__GEN_ATOMXCHG(cur & value); +} + +static ALWAYS_INLINE atomic_val_t atomic_nand(atomic_t *target, + atomic_val_t value) +{ + return Z__GEN_ATOMXCHG(~(cur & value)); +} + +static ALWAYS_INLINE void *atomic_ptr_get(const atomic_ptr_t *target) +{ + return (void *) atomic_get((atomic_t *)target); +} + +static ALWAYS_INLINE void *atomic_ptr_set(atomic_ptr_t *target, void *value) +{ + return (void *) atomic_set((atomic_t *) target, (atomic_val_t) value); +} + +static ALWAYS_INLINE atomic_val_t atomic_clear(atomic_t *target) +{ + return atomic_set(target, 0); +} + +static ALWAYS_INLINE void *atomic_ptr_clear(atomic_ptr_t *target) +{ + return (void *) atomic_set((atomic_t *) target, 0); +} + +#endif /* ZEPHYR_INCLUDE_ATOMIC_XTENSA_H_ */ diff --git a/include/sys/atomic.h b/include/sys/atomic.h index d6cf027bcc2ea3..893c18f8f10555 100644 --- a/include/sys/atomic.h +++ b/include/sys/atomic.h @@ -1,7 +1,6 @@ -/* atomic operations */ - 
/* * Copyright (c) 1997-2015, Wind River Systems, Inc. + * Copyright (c) 2021 Intel Corporation * * SPDX-License-Identifier: Apache-2.0 */ @@ -23,375 +22,29 @@ typedef int atomic_t; typedef atomic_t atomic_val_t; typedef void *atomic_ptr_t; -/** - * @defgroup atomic_apis Atomic Services APIs - * @ingroup kernel_apis - * @{ - */ - -/** - * @brief Atomic compare-and-set. - * - * This routine performs an atomic compare-and-set on @a target. If the current - * value of @a target equals @a old_value, @a target is set to @a new_value. - * If the current value of @a target does not equal @a old_value, @a target - * is left unchanged. - * - * @param target Address of atomic variable. - * @param old_value Original value to compare against. - * @param new_value New value to store. - * @return true if @a new_value is written, false otherwise. - */ -#ifdef CONFIG_ATOMIC_OPERATIONS_BUILTIN -static inline bool atomic_cas(atomic_t *target, atomic_val_t old_value, - atomic_val_t new_value) -{ - return __atomic_compare_exchange_n(target, &old_value, new_value, - 0, __ATOMIC_SEQ_CST, - __ATOMIC_SEQ_CST); -} -#elif defined(CONFIG_ATOMIC_OPERATIONS_C) -__syscall bool atomic_cas(atomic_t *target, atomic_val_t old_value, - atomic_val_t new_value); - -#else -extern bool atomic_cas(atomic_t *target, atomic_val_t old_value, - atomic_val_t new_value); -#endif - -/** - * @brief Atomic compare-and-set with pointer values - * - * This routine performs an atomic compare-and-set on @a target. If the current - * value of @a target equals @a old_value, @a target is set to @a new_value. - * If the current value of @a target does not equal @a old_value, @a target - * is left unchanged. - * - * @param target Address of atomic variable. - * @param old_value Original value to compare against. - * @param new_value New value to store. - * @return true if @a new_value is written, false otherwise. - */ -#ifdef CONFIG_ATOMIC_OPERATIONS_BUILTIN -static inline bool atomic_ptr_cas(atomic_ptr_t *target, void *old_value, - void *new_value) -{ - return __atomic_compare_exchange_n(target, &old_value, new_value, - 0, __ATOMIC_SEQ_CST, - __ATOMIC_SEQ_CST); -} -#elif defined(CONFIG_ATOMIC_OPERATIONS_C) -__syscall bool atomic_ptr_cas(atomic_ptr_t *target, void *old_value, - void *new_value); -#else -extern bool atomic_ptr_cas(atomic_ptr_t *target, void *old_value, - void *new_value); -#endif - -/** - * - * @brief Atomic addition. - * - * This routine performs an atomic addition on @a target. - * - * @param target Address of atomic variable. - * @param value Value to add. - * - * @return Previous value of @a target. - */ -#ifdef CONFIG_ATOMIC_OPERATIONS_BUILTIN -static inline atomic_val_t atomic_add(atomic_t *target, atomic_val_t value) -{ - return __atomic_fetch_add(target, value, __ATOMIC_SEQ_CST); -} -#elif defined(CONFIG_ATOMIC_OPERATIONS_C) -__syscall atomic_val_t atomic_add(atomic_t *target, atomic_val_t value); -#else -extern atomic_val_t atomic_add(atomic_t *target, atomic_val_t value); -#endif - -/** - * - * @brief Atomic subtraction. - * - * This routine performs an atomic subtraction on @a target. - * - * @param target Address of atomic variable. - * @param value Value to subtract. - * - * @return Previous value of @a target. 
- */ -#ifdef CONFIG_ATOMIC_OPERATIONS_BUILTIN -static inline atomic_val_t atomic_sub(atomic_t *target, atomic_val_t value) -{ - return __atomic_fetch_sub(target, value, __ATOMIC_SEQ_CST); -} -#elif defined(CONFIG_ATOMIC_OPERATIONS_C) -__syscall atomic_val_t atomic_sub(atomic_t *target, atomic_val_t value); -#else -extern atomic_val_t atomic_sub(atomic_t *target, atomic_val_t value); -#endif - -/** - * - * @brief Atomic increment. - * - * This routine performs an atomic increment by 1 on @a target. - * - * @param target Address of atomic variable. - * - * @return Previous value of @a target. - */ -#if defined(CONFIG_ATOMIC_OPERATIONS_BUILTIN) || defined (CONFIG_ATOMIC_OPERATIONS_C) -static inline atomic_val_t atomic_inc(atomic_t *target) -{ - return atomic_add(target, 1); -} -#else -extern atomic_val_t atomic_inc(atomic_t *target); -#endif - -/** - * - * @brief Atomic decrement. - * - * This routine performs an atomic decrement by 1 on @a target. - * - * @param target Address of atomic variable. - * - * @return Previous value of @a target. - */ -#if defined(CONFIG_ATOMIC_OPERATIONS_BUILTIN) || defined (CONFIG_ATOMIC_OPERATIONS_C) -static inline atomic_val_t atomic_dec(atomic_t *target) -{ - return atomic_sub(target, 1); -} -#else -extern atomic_val_t atomic_dec(atomic_t *target); -#endif - -/** - * - * @brief Atomic get. - * - * This routine performs an atomic read on @a target. - * - * @param target Address of atomic variable. - * - * @return Value of @a target. - */ -#ifdef CONFIG_ATOMIC_OPERATIONS_BUILTIN -static inline atomic_val_t atomic_get(const atomic_t *target) -{ - return __atomic_load_n(target, __ATOMIC_SEQ_CST); -} -#else -extern atomic_val_t atomic_get(const atomic_t *target); -#endif - -/** - * - * @brief Atomic get a pointer value - * - * This routine performs an atomic read on @a target. - * - * @param target Address of pointer variable. - * - * @return Value of @a target. - */ -#ifdef CONFIG_ATOMIC_OPERATIONS_BUILTIN -static inline void *atomic_ptr_get(const atomic_ptr_t *target) -{ - return __atomic_load_n(target, __ATOMIC_SEQ_CST); -} -#else -extern void *atomic_ptr_get(const atomic_ptr_t *target); -#endif - -/** - * - * @brief Atomic get-and-set. - * - * This routine atomically sets @a target to @a value and returns - * the previous value of @a target. - * - * @param target Address of atomic variable. - * @param value Value to write to @a target. - * - * @return Previous value of @a target. - */ -#ifdef CONFIG_ATOMIC_OPERATIONS_BUILTIN -static inline atomic_val_t atomic_set(atomic_t *target, atomic_val_t value) -{ - /* This builtin, as described by Intel, is not a traditional - * test-and-set operation, but rather an atomic exchange operation. It - * writes value into *ptr, and returns the previous contents of *ptr. - */ - return __atomic_exchange_n(target, value, __ATOMIC_SEQ_CST); -} -#elif defined(CONFIG_ATOMIC_OPERATIONS_C) -__syscall atomic_val_t atomic_set(atomic_t *target, atomic_val_t value); -#else -extern atomic_val_t atomic_set(atomic_t *target, atomic_val_t value); -#endif - -/** - * - * @brief Atomic get-and-set for pointer values - * - * This routine atomically sets @a target to @a value and returns - * the previous value of @a target. - * - * @param target Address of atomic variable. - * @param value Value to write to @a target. - * - * @return Previous value of @a target. 
- */ -#ifdef CONFIG_ATOMIC_OPERATIONS_BUILTIN -static inline void *atomic_ptr_set(atomic_ptr_t *target, void *value) -{ - return __atomic_exchange_n(target, value, __ATOMIC_SEQ_CST); -} -#elif defined(CONFIG_ATOMIC_OPERATIONS_C) -__syscall void *atomic_ptr_set(atomic_ptr_t *target, void *value); -#else -extern void *atomic_ptr_set(atomic_ptr_t *target, void *value); -#endif - -/** - * - * @brief Atomic clear. - * - * This routine atomically sets @a target to zero and returns its previous - * value. (Hence, it is equivalent to atomic_set(target, 0).) - * - * @param target Address of atomic variable. - * - * @return Previous value of @a target. - */ -#if defined(CONFIG_ATOMIC_OPERATIONS_BUILTIN) || defined (CONFIG_ATOMIC_OPERATIONS_C) -static inline atomic_val_t atomic_clear(atomic_t *target) -{ - return atomic_set(target, 0); -} -#else -extern atomic_val_t atomic_clear(atomic_t *target); -#endif - -/** - * - * @brief Atomic clear of a pointer value - * - * This routine atomically sets @a target to zero and returns its previous - * value. (Hence, it is equivalent to atomic_set(target, 0).) - * - * @param target Address of atomic variable. - * - * @return Previous value of @a target. - */ -#if defined(CONFIG_ATOMIC_OPERATIONS_BUILTIN) || \ - defined (CONFIG_ATOMIC_OPERATIONS_C) -static inline void *atomic_ptr_clear(atomic_ptr_t *target) -{ - return atomic_ptr_set(target, NULL); -} -#else -extern void *atomic_ptr_clear(atomic_ptr_t *target); -#endif - -/** - * - * @brief Atomic bitwise inclusive OR. - * - * This routine atomically sets @a target to the bitwise inclusive OR of - * @a target and @a value. - * - * @param target Address of atomic variable. - * @param value Value to OR. - * - * @return Previous value of @a target. - */ -#ifdef CONFIG_ATOMIC_OPERATIONS_BUILTIN -static inline atomic_val_t atomic_or(atomic_t *target, atomic_val_t value) -{ - return __atomic_fetch_or(target, value, __ATOMIC_SEQ_CST); -} -#elif defined(CONFIG_ATOMIC_OPERATIONS_C) -__syscall atomic_val_t atomic_or(atomic_t *target, atomic_val_t value); +/* Low-level primitives come in several styles: */ +#if defined(CONFIG_ATOMIC_OPERATIONS_BUILTIN) +/* Default. See this file for the Doxygen reference: */ +#include +#elif defined(CONFIG_ATOMIC_OPERATIONS_ARCH) +/* Some architectures need their own implementation */ +# ifdef CONFIG_XTENSA +/* Not all Xtensa toolchains support GCC-style atomic intrinsics */ +# include +# endif #else -extern atomic_val_t atomic_or(atomic_t *target, atomic_val_t value); +/* Generic-but-slow implementation based on kernel locking and syscalls */ +#include #endif -/** - * - * @brief Atomic bitwise exclusive OR (XOR). - * - * This routine atomically sets @a target to the bitwise exclusive OR (XOR) of - * @a target and @a value. - * - * @param target Address of atomic variable. - * @param value Value to XOR - * - * @return Previous value of @a target. - */ -#ifdef CONFIG_ATOMIC_OPERATIONS_BUILTIN -static inline atomic_val_t atomic_xor(atomic_t *target, atomic_val_t value) -{ - return __atomic_fetch_xor(target, value, __ATOMIC_SEQ_CST); -} -#elif defined(CONFIG_ATOMIC_OPERATIONS_C) -__syscall atomic_val_t atomic_xor(atomic_t *target, atomic_val_t value); -#else -extern atomic_val_t atomic_xor(atomic_t *target, atomic_val_t value); -#endif +/* Portable higher-level utilities: */ /** - * - * @brief Atomic bitwise AND. - * - * This routine atomically sets @a target to the bitwise AND of @a target - * and @a value. - * - * @param target Address of atomic variable. - * @param value Value to AND. 
- * - * @return Previous value of @a target. - */ -#ifdef CONFIG_ATOMIC_OPERATIONS_BUILTIN -static inline atomic_val_t atomic_and(atomic_t *target, atomic_val_t value) -{ - return __atomic_fetch_and(target, value, __ATOMIC_SEQ_CST); -} -#elif defined(CONFIG_ATOMIC_OPERATIONS_C) -__syscall atomic_val_t atomic_and(atomic_t *target, atomic_val_t value); -#else -extern atomic_val_t atomic_and(atomic_t *target, atomic_val_t value); -#endif - -/** - * - * @brief Atomic bitwise NAND. - * - * This routine atomically sets @a target to the bitwise NAND of @a target - * and @a value. (This operation is equivalent to target = ~(target & value).) - * - * @param target Address of atomic variable. - * @param value Value to NAND. - * - * @return Previous value of @a target. + * @defgroup atomic_apis Atomic Services APIs + * @ingroup kernel_apis + * @{ */ -#ifdef CONFIG_ATOMIC_OPERATIONS_BUILTIN -static inline atomic_val_t atomic_nand(atomic_t *target, atomic_val_t value) -{ - return __atomic_fetch_nand(target, value, __ATOMIC_SEQ_CST); -} -#elif defined(CONFIG_ATOMIC_OPERATIONS_C) -__syscall atomic_val_t atomic_nand(atomic_t *target, atomic_val_t value); -#else -extern atomic_val_t atomic_nand(atomic_t *target, atomic_val_t value); -#endif - /** * @brief Initialize an atomic variable. @@ -555,11 +208,7 @@ static inline void atomic_set_bit_to(atomic_t *target, int bit, bool val) */ #ifdef __cplusplus -} -#endif - -#ifdef CONFIG_ATOMIC_OPERATIONS_C -#include +} /* extern "C" */ #endif #endif /* ZEPHYR_INCLUDE_SYS_ATOMIC_H_ */ diff --git a/include/sys/atomic_builtin.h b/include/sys/atomic_builtin.h new file mode 100644 index 00000000000000..320a0ed75f2fe8 --- /dev/null +++ b/include/sys/atomic_builtin.h @@ -0,0 +1,307 @@ +/* atomic operations */ + +/* + * Copyright (c) 1997-2015, Wind River Systems, Inc. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef ZEPHYR_INCLUDE_SYS_ATOMIC_BUILTIN_H_ +#define ZEPHYR_INCLUDE_SYS_ATOMIC_BUILTIN_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Included from */ + +/** + * @addtogroup atomic_apis Atomic Services APIs + * @ingroup kernel_apis + * @{ + */ + +/** + * @brief Atomic compare-and-set. + * + * This routine performs an atomic compare-and-set on @a target. If the current + * value of @a target equals @a old_value, @a target is set to @a new_value. + * If the current value of @a target does not equal @a old_value, @a target + * is left unchanged. + * + * @param target Address of atomic variable. + * @param old_value Original value to compare against. + * @param new_value New value to store. + * @return true if @a new_value is written, false otherwise. + */ +static inline bool atomic_cas(atomic_t *target, atomic_val_t old_value, + atomic_val_t new_value) +{ + return __atomic_compare_exchange_n(target, &old_value, new_value, + 0, __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST); +} + +/** + * @brief Atomic compare-and-set with pointer values + * + * This routine performs an atomic compare-and-set on @a target. If the current + * value of @a target equals @a old_value, @a target is set to @a new_value. + * If the current value of @a target does not equal @a old_value, @a target + * is left unchanged. + * + * @param target Address of atomic variable. + * @param old_value Original value to compare against. + * @param new_value New value to store. + * @return true if @a new_value is written, false otherwise. 
+ */ +static inline bool atomic_ptr_cas(atomic_ptr_t *target, void *old_value, + void *new_value) +{ + return __atomic_compare_exchange_n(target, &old_value, new_value, + 0, __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST); +} + +/** + * + * @brief Atomic addition. + * + * This routine performs an atomic addition on @a target. + * + * @param target Address of atomic variable. + * @param value Value to add. + * + * @return Previous value of @a target. + */ +static inline atomic_val_t atomic_add(atomic_t *target, atomic_val_t value) +{ + return __atomic_fetch_add(target, value, __ATOMIC_SEQ_CST); +} + +/** + * + * @brief Atomic subtraction. + * + * This routine performs an atomic subtraction on @a target. + * + * @param target Address of atomic variable. + * @param value Value to subtract. + * + * @return Previous value of @a target. + */ +static inline atomic_val_t atomic_sub(atomic_t *target, atomic_val_t value) +{ + return __atomic_fetch_sub(target, value, __ATOMIC_SEQ_CST); +} + +/** + * + * @brief Atomic increment. + * + * This routine performs an atomic increment by 1 on @a target. + * + * @param target Address of atomic variable. + * + * @return Previous value of @a target. + */ +static inline atomic_val_t atomic_inc(atomic_t *target) +{ + return atomic_add(target, 1); +} + +/** + * + * @brief Atomic decrement. + * + * This routine performs an atomic decrement by 1 on @a target. + * + * @param target Address of atomic variable. + * + * @return Previous value of @a target. + */ +static inline atomic_val_t atomic_dec(atomic_t *target) +{ + return atomic_sub(target, 1); +} + +/** + * + * @brief Atomic get. + * + * This routine performs an atomic read on @a target. + * + * @param target Address of atomic variable. + * + * @return Value of @a target. + */ +static inline atomic_val_t atomic_get(const atomic_t *target) +{ + return __atomic_load_n(target, __ATOMIC_SEQ_CST); +} + +/** + * + * @brief Atomic get a pointer value + * + * This routine performs an atomic read on @a target. + * + * @param target Address of pointer variable. + * + * @return Value of @a target. + */ +static inline void *atomic_ptr_get(const atomic_ptr_t *target) +{ + return __atomic_load_n(target, __ATOMIC_SEQ_CST); +} + +/** + * + * @brief Atomic get-and-set. + * + * This routine atomically sets @a target to @a value and returns + * the previous value of @a target. + * + * @param target Address of atomic variable. + * @param value Value to write to @a target. + * + * @return Previous value of @a target. + */ +static inline atomic_val_t atomic_set(atomic_t *target, atomic_val_t value) +{ + /* This builtin, as described by Intel, is not a traditional + * test-and-set operation, but rather an atomic exchange operation. It + * writes value into *ptr, and returns the previous contents of *ptr. + */ + return __atomic_exchange_n(target, value, __ATOMIC_SEQ_CST); +} + +/** + * + * @brief Atomic get-and-set for pointer values + * + * This routine atomically sets @a target to @a value and returns + * the previous value of @a target. + * + * @param target Address of atomic variable. + * @param value Value to write to @a target. + * + * @return Previous value of @a target. + */ +static inline void *atomic_ptr_set(atomic_ptr_t *target, void *value) +{ + return __atomic_exchange_n(target, value, __ATOMIC_SEQ_CST); +} + +/** + * + * @brief Atomic clear. + * + * This routine atomically sets @a target to zero and returns its previous + * value. (Hence, it is equivalent to atomic_set(target, 0).) + * + * @param target Address of atomic variable. 
+ * + * @return Previous value of @a target. + */ +static inline atomic_val_t atomic_clear(atomic_t *target) +{ + return atomic_set(target, 0); +} + +/** + * + * @brief Atomic clear of a pointer value + * + * This routine atomically sets @a target to zero and returns its previous + * value. (Hence, it is equivalent to atomic_set(target, 0).) + * + * @param target Address of atomic variable. + * + * @return Previous value of @a target. + */ +static inline void *atomic_ptr_clear(atomic_ptr_t *target) +{ + return atomic_ptr_set(target, NULL); +} + +/** + * + * @brief Atomic bitwise inclusive OR. + * + * This routine atomically sets @a target to the bitwise inclusive OR of + * @a target and @a value. + * + * @param target Address of atomic variable. + * @param value Value to OR. + * + * @return Previous value of @a target. + */ +static inline atomic_val_t atomic_or(atomic_t *target, atomic_val_t value) +{ + return __atomic_fetch_or(target, value, __ATOMIC_SEQ_CST); +} + +/** + * + * @brief Atomic bitwise exclusive OR (XOR). + * + * This routine atomically sets @a target to the bitwise exclusive OR (XOR) of + * @a target and @a value. + * + * @param target Address of atomic variable. + * @param value Value to XOR + * + * @return Previous value of @a target. + */ +static inline atomic_val_t atomic_xor(atomic_t *target, atomic_val_t value) +{ + return __atomic_fetch_xor(target, value, __ATOMIC_SEQ_CST); +} + +/** + * + * @brief Atomic bitwise AND. + * + * This routine atomically sets @a target to the bitwise AND of @a target + * and @a value. + * + * @param target Address of atomic variable. + * @param value Value to AND. + * + * @return Previous value of @a target. + */ +static inline atomic_val_t atomic_and(atomic_t *target, atomic_val_t value) +{ + return __atomic_fetch_and(target, value, __ATOMIC_SEQ_CST); +} + +/** + * + * @brief Atomic bitwise NAND. + * + * This routine atomically sets @a target to the bitwise NAND of @a target + * and @a value. (This operation is equivalent to target = ~(target & value).) + * + * @param target Address of atomic variable. + * @param value Value to NAND. + * + * @return Previous value of @a target. + */ +static inline atomic_val_t atomic_nand(atomic_t *target, atomic_val_t value) +{ + return __atomic_fetch_nand(target, value, __ATOMIC_SEQ_CST); +} + +/** @} */ + + +#ifdef __cplusplus +} +#endif + +#ifdef CONFIG_ATOMIC_OPERATIONS_C +#include +#endif + +#endif /* ZEPHYR_INCLUDE_SYS_ATOMIC_BUILTIN_H_ */ diff --git a/include/sys/atomic_c.h b/include/sys/atomic_c.h new file mode 100644 index 00000000000000..5f0c6807ec6703 --- /dev/null +++ b/include/sys/atomic_c.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 1997-2015, Wind River Systems, Inc. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef ZEPHYR_INCLUDE_SYS_ATOMIC_C_H_ +#define ZEPHYR_INCLUDE_SYS_ATOMIC_C_H_ + +/* Included from */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Simple and correct (but very slow) implementation of atomic + * primitives that require nothing more than kernel interrupt locking. 
+ */ + +__syscall bool atomic_cas(atomic_t *target, atomic_val_t old_value, + atomic_val_t new_value); + +__syscall bool atomic_ptr_cas(atomic_ptr_t *target, void *old_value, + void *new_value); + +__syscall atomic_val_t atomic_add(atomic_t *target, atomic_val_t value); + +__syscall atomic_val_t atomic_sub(atomic_t *target, atomic_val_t value); + +static inline atomic_val_t atomic_inc(atomic_t *target) +{ + return atomic_add(target, 1); + +} + +static inline atomic_val_t atomic_dec(atomic_t *target) +{ + return atomic_sub(target, 1); + +} + +extern atomic_val_t atomic_get(const atomic_t *target); + +extern void *atomic_ptr_get(const atomic_ptr_t *target); + +__syscall atomic_val_t atomic_set(atomic_t *target, atomic_val_t value); + +__syscall void *atomic_ptr_set(atomic_ptr_t *target, void *value); + +static inline atomic_val_t atomic_clear(atomic_t *target) +{ + return atomic_set(target, 0); + +} + +static inline void *atomic_ptr_clear(atomic_ptr_t *target) +{ + return atomic_ptr_set(target, NULL); + +} + +__syscall atomic_val_t atomic_or(atomic_t *target, atomic_val_t value); + +__syscall atomic_val_t atomic_xor(atomic_t *target, atomic_val_t value); + +__syscall atomic_val_t atomic_and(atomic_t *target, atomic_val_t value); + +__syscall atomic_val_t atomic_nand(atomic_t *target, atomic_val_t value); + +#ifdef __cplusplus +} +#endif + +#ifdef CONFIG_ATOMIC_OPERATIONS_C +#include +#endif + +#endif /* ZEPHYR_INCLUDE_SYS_ATOMIC_C_H_ */ diff --git a/kernel/Kconfig b/kernel/Kconfig index dc0bae2ba58353..537b942dfd49ff 100644 --- a/kernel/Kconfig +++ b/kernel/Kconfig @@ -406,7 +406,7 @@ config ATOMIC_OPERATIONS_BUILTIN the preferred method. However, support for all arches in GCC is incomplete. -config ATOMIC_OPERATIONS_CUSTOM +config ATOMIC_OPERATIONS_ARCH bool help Use when there isn't support for compiler built-ins, but you have From 1a6d7b68205aa9308ba566e96921658e094ba820 Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Sun, 14 Feb 2021 16:31:22 -0800 Subject: [PATCH 04/17] arch/xtensa: Remove XTOS dependency in irq_lock() This whole file is written to assume XEA2, so there's no value to using an abstraction call here. Write to the RSIL instruction directly. Signed-off-by: Andy Ross --- include/arch/xtensa/irq.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/include/arch/xtensa/irq.h b/include/arch/xtensa/irq.h index fd114516973deb..6dd1bea1f18506 100644 --- a/include/arch/xtensa/irq.h +++ b/include/arch/xtensa/irq.h @@ -6,7 +6,8 @@ #ifndef ZEPHYR_INCLUDE_ARCH_XTENSA_XTENSA_IRQ_H_ #define ZEPHYR_INCLUDE_ARCH_XTENSA_XTENSA_IRQ_H_ -#include +#include +#include #define CONFIG_GEN_IRQ_START_VECTOR 0 @@ -105,13 +106,17 @@ static ALWAYS_INLINE void z_xtensa_irq_disable(uint32_t irq) static ALWAYS_INLINE unsigned int arch_irq_lock(void) { - unsigned int key = XTOS_SET_INTLEVEL(XCHAL_EXCM_LEVEL); + unsigned int key; + + __asm__ volatile("rsil %0, %1" + : "=r"(key) : "i"(XCHAL_EXCM_LEVEL) : "memory"); return key; } static ALWAYS_INLINE void arch_irq_unlock(unsigned int key) { - XTOS_RESTORE_INTLEVEL(key); + __asm__ volatile("wsr.ps %0; rsync" + :: "r"(key) : "memory"); } static ALWAYS_INLINE bool arch_irq_unlocked(unsigned int key) From ed7a7e43a14ef5f5d21d8666f314b660b616cc3d Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Mon, 15 Feb 2021 19:59:18 -0800 Subject: [PATCH 05/17] soc/intel_adsp: Elevate cached/uncached mapping to a SoC API The trace output layer was using this transformation already, make it an official API. 
There are other places doing similar logic that can benefit. Signed-off-by: Andy Ross --- .../intel_adsp/common/include/adsp/cache.h | 44 +++++++++++++++++++ soc/xtensa/intel_adsp/common/include/soc.h | 1 + soc/xtensa/intel_adsp/common/trace_out.c | 9 +--- 3 files changed, 47 insertions(+), 7 deletions(-) diff --git a/soc/xtensa/intel_adsp/common/include/adsp/cache.h b/soc/xtensa/intel_adsp/common/include/adsp/cache.h index 63075ab0b632c2..ea7c66f456b080 100644 --- a/soc/xtensa/intel_adsp/common/include/adsp/cache.h +++ b/soc/xtensa/intel_adsp/common/include/adsp/cache.h @@ -15,4 +15,48 @@ #define SOC_DCACHE_INVALIDATE(addr, size) \ xthal_dcache_region_invalidate((addr), (size)) +/** + * @brief Return uncached pointer to a RAM address + * + * The Intel ADSP architecture maps all addressable RAM (of all types) + * twice, in two different 512MB segments regions whose L1 cache + * settings can be controlled independently. So for any given + * pointer, it is possible to convert it to and from a cached version. + * + * This function takes a pointer to any addressible object (either in + * cacheable memory or not) and returns a pointer that can be used to + * refer to the same memory while bypassing the L1 data cache. Data + * in the L1 cache will not be inspected nor modified by the access. + * + * @see z_soc_cached_ptr() + * + * @param p A pointer to a valid C object + * @return A pointer to the same object bypassing the L1 dcache + */ +static inline void *z_soc_uncached_ptr(void *p) +{ + return ((void *)(((size_t)p) & ~0x20000000)); +} + +/** + * @brief Return cached pointer to a RAM address + * + * This function takes a pointer to any addressible object (either in + * cacheable memory or not) and returns a pointer that can be used to + * refer to the same memory through the L1 data cache. Data read + * through the resulting pointer will reflect locally cached values on + * the current CPU if they exist, and writes will go first into the + * cache and be written back later. 
+ * + * @see z_soc_uncached_ptr() + * + * @param p A pointer to a valid C object + * @return A pointer to the same object via the L1 dcache + + */ +static inline void *z_soc_cached_ptr(void *p) +{ + return ((void *)(((size_t)p) | 0x20000000)); +} + #endif diff --git a/soc/xtensa/intel_adsp/common/include/soc.h b/soc/xtensa/intel_adsp/common/include/soc.h index 4fc9b6c7df0d12..63dae0c6765b10 100644 --- a/soc/xtensa/intel_adsp/common/include/soc.h +++ b/soc/xtensa/intel_adsp/common/include/soc.h @@ -167,4 +167,5 @@ extern void z_soc_irq_enable(uint32_t irq); extern void z_soc_irq_disable(uint32_t irq); extern int z_soc_irq_is_enabled(unsigned int irq); + #endif /* __INC_SOC_H */ diff --git a/soc/xtensa/intel_adsp/common/trace_out.c b/soc/xtensa/intel_adsp/common/trace_out.c index 4edc18cacb61ef..9df11067c0934e 100644 --- a/soc/xtensa/intel_adsp/common/trace_out.c +++ b/soc/xtensa/intel_adsp/common/trace_out.c @@ -28,11 +28,6 @@ #define NSLOTS (SRAM_TRACE_SIZE / SLOT_SIZE) #define MSGSZ (SLOT_SIZE - sizeof(struct slot_hdr)) -/* Translates a SRAM pointer into an address of the same memory in the - * uncached region from 0x80000000-0x9fffffff - */ -#define UNCACHED_PTR(p) ((void *)(((int)p) & ~0x20000000)) - struct slot_hdr { uint16_t magic; uint16_t id; @@ -56,11 +51,11 @@ static __aligned(64) union { uint32_t cache_pad[16]; } data_rec; -#define data ((volatile struct metadata *)UNCACHED_PTR(&data_rec.meta)) +#define data ((volatile struct metadata *)z_soc_uncached_ptr(&data_rec.meta)) static inline struct slot *slot(int i) { - struct slot *slots = UNCACHED_PTR(SRAM_TRACE_BASE); + struct slot *slots = z_soc_uncached_ptr((void *)SRAM_TRACE_BASE); return &slots[i]; } From 81ed70a8eda767ffbdc88f0c4079653a3fdfed73 Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Mon, 15 Feb 2021 21:04:58 -0800 Subject: [PATCH 06/17] soc/intel_adsp: Clean up MP startup The multiprocessor entry code here had some bits that look to have been copied from esp32, including a clumsy stack switch that's needed there. But it wasn't actually switching the stack at all, which on this device is pointed at the top of HP-SRAM and can stay there until the second CPU swaps away into a real thread (this will need to change once we support >2 CPUS though). Signed-off-by: Andy Ross --- soc/xtensa/intel_adsp/common/soc_mp.c | 35 +-------------------------- 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/soc/xtensa/intel_adsp/common/soc_mp.c b/soc/xtensa/intel_adsp/common/soc_mp.c index f56c05f95d2897..d9143a5cf25c7b 100644 --- a/soc/xtensa/intel_adsp/common/soc_mp.c +++ b/soc/xtensa/intel_adsp/common/soc_mp.c @@ -74,9 +74,7 @@ struct cpustart_rec { static __aligned(XCHAL_DCACHE_LINESIZE) struct cpustart_rec start_rec; -static void *mp_top; - -static void mp_entry2(void) +void z_mp_entry(void) { volatile int ie; uint32_t idc_reg; @@ -127,35 +125,6 @@ static void mp_entry2(void) #endif } -/* Defines a locally callable "function" named mp_stack_switch(). The - * first argument (in register a2 post-ENTRY) is the new stack pointer - * to go into register a1. The second (a3) is the entry point. - * Because this never returns, a0 is used as a scratch register then - * set to zero for the called function (a null return value is the - * signal for "top of stack" to the debugger). 
- */ -void mp_stack_switch(void *stack, void *entry); -__asm__("\n" - ".align 4 \n" - "mp_stack_switch: \n\t" - - "entry a1, 16 \n\t" - - "movi a0, 0 \n\t" - - "jx a3 \n\t"); - -/* Carefully constructed to use no stack beyond compiler-generated ABI - * instructions. Stack pointer is pointing to __stack at this point. - */ -void z_mp_entry(void) -{ - *(uint32_t *)CONFIG_SRAM_BASE_ADDRESS = 0xDEADBEEF; - SOC_DCACHE_FLUSH((uint32_t *)CONFIG_SRAM_BASE_ADDRESS, 64); - - mp_stack_switch(mp_top, mp_entry2); -} - void arch_start_cpu(int cpu_num, k_thread_stack_t *stack, int sz, arch_cpustart_t fn, void *arg) { @@ -174,8 +143,6 @@ void arch_start_cpu(int cpu_num, k_thread_stack_t *stack, int sz, start_rec.vecbase = vecbase; start_rec.alive = 0; - mp_top = Z_THREAD_STACK_BUFFER(stack) + sz; - SOC_DCACHE_FLUSH(&start_rec, sizeof(start_rec)); #ifdef CONFIG_IPM_CAVS_IDC From 962039a1af07fe5b542aa1373022c12f4248178a Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Mon, 15 Feb 2021 21:34:30 -0800 Subject: [PATCH 07/17] soc/intel_adsp: Clean up cache handling in MP startup There's no need to muck with the cache directly as long as we're careful about addressing the shared start record through an uncached volatile pointer. Correct a theoretical bug with the initial cache invalidate on the second CPU which was actually doing a flush (and thus potentially pushing things the boot ROM wrote into RAM now owned by the OS). Optimize memory layout a bit when using KERNEL_COHERENCE; we don't need a full cache line for the start record there as it's already in uncached memory. Signed-off-by: Andy Ross --- soc/xtensa/intel_adsp/common/soc_mp.c | 41 +++++++++++++++++++-------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/soc/xtensa/intel_adsp/common/soc_mp.c b/soc/xtensa/intel_adsp/common/soc_mp.c index d9143a5cf25c7b..4b1d1e8fe1b502 100644 --- a/soc/xtensa/intel_adsp/common/soc_mp.c +++ b/soc/xtensa/intel_adsp/common/soc_mp.c @@ -66,13 +66,28 @@ struct cpustart_rec { uint32_t vecbase; uint32_t alive; - - /* padding to cache line */ - uint8_t padding[XCHAL_DCACHE_LINESIZE - 6 * 4]; }; -static __aligned(XCHAL_DCACHE_LINESIZE) -struct cpustart_rec start_rec; +#ifdef CONFIG_KERNEL_COHERENCE +/* Coherence guarantees that normal .data will be coherent and that it + * won't overlap any cached memory. + */ +static struct { + struct cpustart_rec cpustart; +} cpustart_mem; +#else +/* If .data RAM is by default incoherent, then the start record goes + * into its own dedicated cache line(s) + */ +static __aligned(XCHAL_DCACHE_LINESIZE) union { + struct cpustart_rec cpustart; + char pad[XCHAL_DCACHE_LINESIZE]; +} cpustart_mem; +#endif + +#define start_rec \ + (*((volatile struct cpustart_rec *) \ + z_soc_uncached_ptr(&cpustart_mem.cpustart))) void z_mp_entry(void) { @@ -80,7 +95,13 @@ void z_mp_entry(void) uint32_t idc_reg; /* We don't know what the boot ROM might have touched and we - * don't care. Make sure it's not in our local cache. + * don't care. Make sure it's not in our local cache to be + * flushed accidentally later. + * + * Note that technically this is dropping our own (cached) + * stack memory, which we don't have a guarantee the compiler + * isn't using yet. Manual inspection of generated code says + * we're safe, but really we need a better solution here. 
*/ xthal_dcache_all_writeback_inv(); @@ -109,7 +130,6 @@ void z_mp_entry(void) #endif /* CONFIG_IPM_CAVS_IDC */ start_rec.alive = 1; - SOC_DCACHE_FLUSH(&start_rec, sizeof(start_rec)); start_rec.fn(start_rec.arg); @@ -143,8 +163,6 @@ void arch_start_cpu(int cpu_num, k_thread_stack_t *stack, int sz, start_rec.vecbase = vecbase; start_rec.alive = 0; - SOC_DCACHE_FLUSH(&start_rec, sizeof(start_rec)); - #ifdef CONFIG_IPM_CAVS_IDC idc = device_get_binding(DT_LABEL(DT_INST(0, intel_cavs_idc))); #endif @@ -169,9 +187,8 @@ void arch_start_cpu(int cpu_num, k_thread_stack_t *stack, int sz, sys_clear_bit(DT_REG_ADDR(DT_NODELABEL(cavs0)) + 0x04 + CAVS_ICTL_INT_CPU_OFFSET(cpu_num), 8); - do { - SOC_DCACHE_INVALIDATE(&start_rec, sizeof(start_rec)); - } while (start_rec.alive == 0); + while (start_rec.alive == 0) { + } /* Clear done bit from responding the power up message */ idc_reg = idc_read(IPC_IDCIETC(cpu_num), 0) | IPC_IDCIETC_DONE; From 12b4bc435156e080be4d3b3ac949fba206ff679a Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Mon, 15 Feb 2021 22:19:51 -0800 Subject: [PATCH 08/17] soc/intel_adsp: Use the correct MP stack pointer The kernel passes the CPU's interrupt stack expected that it will start on that, so do it. Pass the initial stack pointer from the SOC layer in the variable "z_mp_stack_top" and set it in the assembly startup before calling z_mp_entry(). Signed-off-by: Andy Ross --- arch/xtensa/core/crt1.S | 7 ++++++- soc/xtensa/intel_adsp/common/soc_mp.c | 6 ++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/xtensa/core/crt1.S b/arch/xtensa/core/crt1.S index b012f29197d1ed..23ce498df9eb82 100644 --- a/arch/xtensa/core/crt1.S +++ b/arch/xtensa/core/crt1.S @@ -204,7 +204,12 @@ _start: rsr a3, PRID extui a3, a3, 0, 8 /* extract core ID */ beqz a3, 2f - CALL z_mp_entry + + /* Load our stack pointer set up for us by the SOC layer */ + movi a1, z_mp_stack_top + l32i a1, a1, 0 + + call4 z_mp_entry 2: #endif diff --git a/soc/xtensa/intel_adsp/common/soc_mp.c b/soc/xtensa/intel_adsp/common/soc_mp.c index 4b1d1e8fe1b502..6984a4fa775147 100644 --- a/soc/xtensa/intel_adsp/common/soc_mp.c +++ b/soc/xtensa/intel_adsp/common/soc_mp.c @@ -61,13 +61,14 @@ struct cpustart_rec { uint32_t cpu; arch_cpustart_t fn; - char *stack_top; void *arg; uint32_t vecbase; uint32_t alive; }; +char *z_mp_stack_top; + #ifdef CONFIG_KERNEL_COHERENCE /* Coherence guarantees that normal .data will be coherent and that it * won't overlap any cached memory. @@ -158,11 +159,12 @@ void arch_start_cpu(int cpu_num, k_thread_stack_t *stack, int sz, start_rec.cpu = cpu_num; start_rec.fn = fn; - start_rec.stack_top = Z_THREAD_STACK_BUFFER(stack) + sz; start_rec.arg = arg; start_rec.vecbase = vecbase; start_rec.alive = 0; + z_mp_stack_top = Z_THREAD_STACK_BUFFER(stack) + sz; + #ifdef CONFIG_IPM_CAVS_IDC idc = device_get_binding(DT_LABEL(DT_INST(0, intel_cavs_idc))); #endif From efa05d1e42da430c2ee5c93f949cffd840c52f37 Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Tue, 16 Feb 2021 10:09:25 -0800 Subject: [PATCH 09/17] soc/intel_adsp: Put initial stack into the CPU0 interrupt stack Zephyr's normal architecture is to do all initialization in the interrupt stacks. The CAVS code was traditionally written to start the stack at the end of HP-SRAM, where it has no protection against overlap with other uses (e.g. MP startup used the same region for stacks and saw cache collisions, and the SOF heap lives in this area too). Put it where Zephyr expects and we'll have fewer surprises. 
Signed-off-by: Andy Ross --- soc/xtensa/intel_adsp/cavs_v15/linker.ld | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/soc/xtensa/intel_adsp/cavs_v15/linker.ld b/soc/xtensa/intel_adsp/cavs_v15/linker.ld index dd27ba428c3a46..3ff29df6584974 100644 --- a/soc/xtensa/intel_adsp/cavs_v15/linker.ld +++ b/soc/xtensa/intel_adsp/cavs_v15/linker.ld @@ -499,10 +499,9 @@ SECTIONS /* Re-adjust to the upper mapping for the final symbols below */ . = SEGSTART_CACHED; - /* stack */ - _end = ALIGN(8); - PROVIDE(end = ALIGN(8)); - __stack = L2_SRAM_BASE + L2_SRAM_SIZE; + /* Initial/boot stack lives in the CPU0 interrupt stack */ + __stack = z_interrupt_stacks + CONFIG_ISR_STACK_SIZE; + /* dma buffers */ .lpbuf (NOLOAD): ALIGN(4) { From 811aa969902f812989303739f59f3beb4625bf85 Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Tue, 16 Feb 2021 08:07:04 -0800 Subject: [PATCH 10/17] arch/xtensa: soc/intel_adsp: Rework MP code entry Instead of passing the crt1 _start function as the entry code for auxiliary CPUs, use a tiny assembly stub instead which can avoid the runtime testing needed to skip the work in _start. All the crt1 code was doing was clearing BSS (which must not happen on a second CPU) and setting the stack pointer (which is wrong on the second CPU). This allows us to clean out the SMP code in crt1. Signed-off-by: Andy Ross --- arch/xtensa/core/crt1.S | 25 ----------------------- soc/xtensa/intel_adsp/common/soc_mp.c | 29 ++++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 28 deletions(-) diff --git a/arch/xtensa/core/crt1.S b/arch/xtensa/core/crt1.S index 23ce498df9eb82..a147e9ed14d5d7 100644 --- a/arch/xtensa/core/crt1.S +++ b/arch/xtensa/core/crt1.S @@ -148,13 +148,6 @@ _start: movi a0, 0 # endif -# if CONFIG_MP_NUM_CPUS > 1 - /* Only clear BSS when running on core 0 */ - rsr a3, PRID - extui a3, a3, 0, 8 /* extract core ID */ - bnez a3, .L3zte -# endif - /* * Clear the BSS (uninitialized data) segments. * This code supports multiple zeroed sections (*.bss). @@ -196,24 +189,6 @@ _start: #endif /* !XCHAL_HAVE_BOOTLOADER */ -#if CONFIG_MP_NUM_CPUS > 1 - /* - * z_cstart() is only for CPU #0. - * Other CPUs have different entry point. - */ - rsr a3, PRID - extui a3, a3, 0, 8 /* extract core ID */ - beqz a3, 2f - - /* Load our stack pointer set up for us by the SOC layer */ - movi a1, z_mp_stack_top - l32i a1, a1, 0 - - call4 z_mp_entry - -2: -#endif - /* Enter C domain, never returns from here */ CALL z_cstart diff --git a/soc/xtensa/intel_adsp/common/soc_mp.c b/soc/xtensa/intel_adsp/common/soc_mp.c index 6984a4fa775147..6dac17b7d9dfe0 100644 --- a/soc/xtensa/intel_adsp/common/soc_mp.c +++ b/soc/xtensa/intel_adsp/common/soc_mp.c @@ -55,8 +55,6 @@ LOG_MODULE_REGISTER(soc_mp, CONFIG_SOC_LOG_LEVEL); static const struct device *idc; #endif -extern void __start(void); - struct cpustart_rec { uint32_t cpu; @@ -90,6 +88,29 @@ static __aligned(XCHAL_DCACHE_LINESIZE) union { (*((volatile struct cpustart_rec *) \ z_soc_uncached_ptr(&cpustart_mem.cpustart))) +/* Tiny assembly stub for calling z_mp_entry() on the auxiliary CPUs. + * Mask interrupts, clear the register window state and set the stack + * pointer. This represents the minimum work required to run C code + * safely. + * + * Note that alignment is absolutely required: the IDC protocol passes + * only the upper 30 bits of the address to the second CPU. 
+ */ +void z_soc_mp_asm_entry(void); +__asm__(".align 4 \n\t" + ".global z_soc_mp_asm_entry \n\t" + "z_soc_mp_asm_entry: \n\t" + " rsil a0, 5 \n\t" /* 5 == XCHAL_EXCM_LEVEL */ + " movi a0, 0 \n\t" + " wsr a0, WINDOWBASE \n\t" + " movi a0, 1 \n\t" + " wsr a0, WINDOWSTART \n\t" + " rsync \n\t" + " movi a1, z_mp_stack_top \n\t" + " l32i a1, a1, 0 \n\t" + " call4 z_mp_entry \n\t"); +BUILD_ASSERT(XCHAL_EXCM_LEVEL == 5); + void z_mp_entry(void) { volatile int ie; @@ -177,7 +198,9 @@ void arch_start_cpu(int cpu_num, k_thread_stack_t *stack, int sz, CAVS_ICTL_INT_CPU_OFFSET(cpu_num), 8); /* Send power up message to the other core */ - idc_write(IPC_IDCIETC(cpu_num), 0, IDC_MSG_POWER_UP_EXT(RAM_BASE)); + uint32_t ietc = IDC_MSG_POWER_UP_EXT((long) z_soc_mp_asm_entry); + + idc_write(IPC_IDCIETC(cpu_num), 0, ietc); idc_write(IPC_IDCITC(cpu_num), 0, IDC_MSG_POWER_UP | IPC_IDCITC_BUSY); /* Disable IDC interrupt on other core so IPI won't cause From 5a47aa0f55654e02de4b146d8247f63ee67798ac Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Wed, 17 Feb 2021 07:02:26 -0800 Subject: [PATCH 11/17] arch/xtensa: Add an arch-internal README on register windows Back when I started work on this stuff, I had a set of notes on register windows that slowly evolved into something that looks like formal documentation. There really isn't any overview-style documentation of this stuff on the public internet, so it couldn't hurt to commit it here for posterity. Signed-off-by: Andy Ross --- arch/xtensa/core/README-WINDOWS.rst | 108 ++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 arch/xtensa/core/README-WINDOWS.rst diff --git a/arch/xtensa/core/README-WINDOWS.rst b/arch/xtensa/core/README-WINDOWS.rst new file mode 100644 index 00000000000000..eaf9156e19566f --- /dev/null +++ b/arch/xtensa/core/README-WINDOWS.rst @@ -0,0 +1,108 @@ +# How Xtensa register windows work + +There is a paucity of introductory material on this subject, and +Zephyr plays some tricks here that require understanding the base +layer. + +## Hardware + +When register windows are configured in the CPU, there are either 32 +or 64 "real" registers in hardware, with 16 visible at one time. +Registers are grouped and rotated in units of 4, so there are 8 or 16 +such "quads" (my term, not Tensilica's) in hardware of which 4 are +visible as A0-A15. + +The first quad (A0-A3) is pointed to by a special register called +WINDOWBASE. The register file is cyclic, so for example if NREGS==64 +and WINDOWBASE is 15, quads 15, 0, 1, and 2 will be visible as +(respectively) A0-A3, A4-A7, A8-A11, and A12-A15. + +There is a ROTW instruction that can be used to manually rotate the +window by a immediate number of quads that are added to WINDOWBASE. +Positive rotations "move" high registers into low registers +(i.e. after "ROTW 1" the register that used to be called A4 is now +A0). + +There are CALL4/CALL8/CALL12 instructions to effect rotated calls +which rotate registers upward (i.e. "hiding" low registers from the +callee) by 1, 2 or 3 quads. These do not rotate the window +themselves. Instead they place the rotation amount in two places +(yes, two; see below): the 2-bit CALLINC field of the PS register, and +the top two bits of the return address placed in A0. + +There is an ENTRY instruction that does the rotation. It adds CALLINC +to WINDOWBASE, at the same time copying the old (now hidden) stack +pointer in A1 into the "new" A1 in the rotated frame, subtracting an +immediate offset from it to make space for the new frame. 
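To make that encoding concrete, here is a tiny C sketch (a review aside, not part of the README file being added); the helper names are made up, and reconstructing a full return PC assumes the caller's code lies in the same 1GB region as a supplied anchor address, which the text above does not state.

#include <stdint.h>

/* CALL4/CALL8/CALL12 overwrite the top two bits of the return address
 * in A0 with the window increment (1, 2 or 3 quads), leaving only the
 * low 30 bits of the return PC.
 */
static inline unsigned int example_window_inc(uint32_t spilled_a0)
{
	return spilled_a0 >> 30;
}

static inline uint32_t example_return_pc(uint32_t spilled_a0,
					  uint32_t anchor_pc)
{
	/* Assumption: anchor_pc is any address in the same 1GB region
	 * as the caller, e.g. the callee's own PC.
	 */
	return (anchor_pc & 0xc0000000u) | (spilled_a0 & 0x3fffffffu);
}

This is roughly the arithmetic a debugger or unwinder performs when walking frames whose A0 values have been spilled to the stack.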
+ +There is a RETW instruction that undoes the rotation. It reads the +top two bits from the return address in A0 and subtracts that value +from WINDOWBASE before returning. This is why the CALLINC bits went +in two places. They have to be stored on the stack across potentially +many calls, so they need to be GPR data that lives in registers and +can be spilled. But ENTRY isn't specified to assume a particular +return value format and is used immediately, so it makes more sense +for it to use processor state instead. + +Note that we still don't know how to detect when the register file has +wrapped around and needs to be spilled or filled. To do this there is +a WINDOWSTART register used to detect which register quads are in use. +The name "start" is somewhat confusing, this is not a pointer. +WINDOWSTART stores a bitmask with one bit per hardware quad (so it's 8 +or 16 bits wide). The bit in windowstart corresponding to WINDOWBASE +will be set by the ENTRY instruction, and remain set after rotations +until cleared by a function return (by RETW, see below). Other bits +stay zero. So there is one set bit in WINDOWSTART corresponding to +each call frame that is live in hardware registers, and it will be +followed by 0, 1 or 2 zero bits that tell you how "big" (how many +quads of registers) that frame is. + +So the CPU executing RETW checks to make sure that the register quad +being brought into A0-A3 (i.e. the new WINDOWBASE) has a set bit +indicating it's valid. If it does not, the registers must have been +spilled and the CPU traps to an exception handler to fill them. + +Likewise, the processor can tell if a high register is "owned" by +another call by seeing if there is a one in WINDOWSTART between that +register's quad and WINDOWBASE. If there is, the CPU traps to a spill +handler to spill one frame. Note that a frame might be only four +registers, but it's possible to hit registers 12 out from WINDOWBASE, +so it's actually possible to trap again when the instruction restarts +to spill a second quad, and even a third time at maximum. + +Finally: note that hardware checks the two bits of WINDOWSTART after +the frame bit to detect how many quads are represented by the one +frame. So there are six separate exception handlers to spill/fill +1/2/3 quads of registers. + +## Software & ABI + +The advantage of the scheme above is that it allows the registers to +be spilled naturally into the stack by using the stack pointers +embedded in the register file. But the hardware design assumes and to +some extent enforces a fairly complicated stack layout to make that +work: + +The spill area for a single frame's A0-A3 registers is not in its own +stack frame. It lies in the 16 bytes below its CALLEE's stack +pointer. This is so that the callee (and exception handlers invoked +on its behalf) can see its caller's potentially-spilled stack pointer +register (A1) on the stack and be able to walk back up on return. +Other architectures do this too by e.g. pushing the incoming stack +pointer onto the stack as a standard "frame pointer" defined in the +platform ABI. Xtensa wraps this together with the natural spill area +for register windows. + +By convention spill regions always store the lowest numbered register +in the lowest address. + +The spill area for a frame's A4-A11 registers may or may not exist +depending on whether the call was made with CALL8/CALL12. It is legal +to write a function using only A0-A3 and CALL4 calls and ignore higher +registers. 
But if those 0-2 register quads are in use, they appear at +the top of the stack frame, immediately below the parent call's A0-A3 +spill area. + +There is no spill area for A12-A15. Those registers are always +caller-save. When using CALLn, you always need to overlap 4 registers +to provide arguments and take a return value. From 882c24d1b0dbeece096a53a54d42af861d5cd24c Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Sun, 14 Feb 2021 16:09:43 -0800 Subject: [PATCH 12/17] arch/xtensa: Add non-HAL caching primitives The Xtensa L1 cache layer has straightforward semantics accessible via single-instructions that operate on cache lines via physical addresses. These are very amenable to inlining. Unfortunately the Xtensa HAL layer requires function calls to do this, leading to significant code waste at the calling site, an extra frame on the stack and needless runtime instructions for situations where the call is over a constant region that could elide the loop. This is made even worse because the HAL library is not built with -ffunction-sections, so pulling in even one of these tiny cache functions has the effect of importing a 1500-byte object file into the link! Add our own tiny cache layer to include/arch/xtensa/cache.h and use that instead. Signed-off-by: Andy Ross --- arch/xtensa/core/xtensa-asm2.c | 2 +- arch/xtensa/include/kernel_arch_func.h | 8 +- include/arch/xtensa/cache.h | 81 +++++++++++++++++++ .../common/bootloader/boot_loader.c | 5 +- .../intel_adsp/common/include/adsp/cache.h | 6 +- soc/xtensa/intel_adsp/common/soc_mp.c | 3 +- soc/xtensa/intel_s1000/soc.h | 6 +- soc/xtensa/intel_s1000/soc_mp.c | 1 + .../intel_s1000_crb/cache/src/cache_test.c | 4 +- .../intel_s1000_crb/main/src/dma_test.c | 8 +- 10 files changed, 105 insertions(+), 19 deletions(-) create mode 100644 include/arch/xtensa/cache.h diff --git a/arch/xtensa/core/xtensa-asm2.c b/arch/xtensa/core/xtensa-asm2.c index 6c40dde067bca5..46636b53fc3e3e 100644 --- a/arch/xtensa/core/xtensa-asm2.c +++ b/arch/xtensa/core/xtensa-asm2.c @@ -61,7 +61,7 @@ void *xtensa_init_stack(struct k_thread *thread, int *stack_top, ret = &bsa[-9]; #ifdef CONFIG_KERNEL_COHERENCE - xthal_dcache_region_writeback(ret, (char *)stack_top - (char *)ret); + z_xtensa_cache_flush(ret, (char *)stack_top - (char *)ret); #endif return ret; } diff --git a/arch/xtensa/include/kernel_arch_func.h b/arch/xtensa/include/kernel_arch_func.h index 53c6661c3e5586..607d78ecd9c08d 100644 --- a/arch/xtensa/include/kernel_arch_func.h +++ b/arch/xtensa/include/kernel_arch_func.h @@ -13,6 +13,7 @@ #ifndef _ASMLANGUAGE #include #include +#include #ifdef __cplusplus extern "C" { @@ -31,7 +32,7 @@ static ALWAYS_INLINE void arch_kernel_init(void) /* Make sure we don't have live data for unexpected cached * regions due to boot firmware */ - xthal_dcache_all_writeback_inv(); + z_xtensa_cache_flush_inv_all(); #endif cpu0->nested = 0; @@ -82,7 +83,7 @@ static inline void arch_cohere_stacks(struct k_thread *old_thread, size_t nsz = new_thread->stack_info.size; size_t nsp = (size_t) new_thread->switch_handle; - xthal_dcache_region_invalidate((void *)nsp, (nstack + nsz) - nsp); + z_xtensa_cache_inv((void *)nsp, (nstack + nsz) - nsp); /* FIXME: dummy initializion threads don't have stack info set * up and explode the logic above. Find a way to get this @@ -98,8 +99,7 @@ static inline void arch_cohere_stacks(struct k_thread *old_thread, * calculate the boundary for it. 
*/ if (old_switch_handle != NULL) { - xthal_dcache_region_writeback((void *)osp, - (ostack + osz) - osp); + z_xtensa_cache_flush((void *)osp, (ostack + osz) - osp); } else { /* FIXME: hardcoding EXCSAVE3 is bad, should be * configurable a-la XTENSA_KERNEL_CPU_PTR_SR. diff --git a/include/arch/xtensa/cache.h b/include/arch/xtensa/cache.h new file mode 100644 index 00000000000000..ba89079c273bd2 --- /dev/null +++ b/include/arch/xtensa/cache.h @@ -0,0 +1,81 @@ +/* + * Copyright 2021 Intel Corporation + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef ZEPHYR_INCLUDE_ARCH_XTENSA_CACHE_H_ +#define ZEPHYR_INCLUDE_ARCH_XTENSA_CACHE_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define Z_DCACHE_MAX (XCHAL_DCACHE_SIZE / XCHAL_DCACHE_WAYS) + +#if XCHAL_DCACHE_SIZE +#define Z_IS_POW2(x) (((x) != 0) && (((x) & ((x)-1)) == 0)) +BUILD_ASSERT(Z_IS_POW2(XCHAL_DCACHE_LINESIZE)); +BUILD_ASSERT(Z_IS_POW2(Z_DCACHE_MAX)); +#endif + +static inline void z_xtensa_cache_flush(void *addr, size_t bytes) +{ +#if XCHAL_DCACHE_SIZE + size_t step = XCHAL_DCACHE_LINESIZE; + size_t first = ROUND_DOWN(addr, step); + size_t last = ROUND_UP(((long)addr) + bytes, step); + + for (size_t line = first; bytes && line < last; line += step) { + __asm__ volatile("dhwb %0, 0" :: "r"(line)); + } +#endif +} + +static inline void z_xtensa_cache_flush_inv(void *addr, size_t bytes) +{ +#if XCHAL_DCACHE_SIZE + size_t step = XCHAL_DCACHE_LINESIZE; + size_t first = ROUND_DOWN(addr, step); + size_t last = ROUND_UP(((long)addr) + bytes, step); + + for (size_t line = first; bytes && line < last; line += step) { + __asm__ volatile("dhwbi %0, 0" :: "r"(line)); + } +#endif +} + +static inline void z_xtensa_cache_inv(void *addr, size_t bytes) +{ +#if XCHAL_DCACHE_SIZE + size_t step = XCHAL_DCACHE_LINESIZE; + size_t first = ROUND_DOWN(addr, step); + size_t last = ROUND_UP(((long)addr) + bytes, step); + + for (size_t line = first; bytes && line < last; line += step) { + __asm__ volatile("dhi %0, 0" :: "r"(line)); + } +#endif +} + +static inline void z_xtensa_cache_inv_all(void) +{ + z_xtensa_cache_inv(NULL, Z_DCACHE_MAX); +} + +static inline void z_xtensa_cache_flush_all(void) +{ + z_xtensa_cache_flush(NULL, Z_DCACHE_MAX); +} + +static inline void z_xtensa_cache_flush_inv_all(void) +{ + z_xtensa_cache_flush_inv(NULL, Z_DCACHE_MAX); +} + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* ZEPHYR_INCLUDE_ARCH_XTENSA_CACHE_H_ */ diff --git a/soc/xtensa/intel_adsp/common/bootloader/boot_loader.c b/soc/xtensa/intel_adsp/common/bootloader/boot_loader.c index 13919c093062c9..f6e63c70bd3d6e 100644 --- a/soc/xtensa/intel_adsp/common/bootloader/boot_loader.c +++ b/soc/xtensa/intel_adsp/common/bootloader/boot_loader.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "manifest.h" #if CONFIG_SOC_INTEL_S1000 @@ -70,7 +71,7 @@ static inline void bmemcpy(void *dest, void *src, size_t bytes) for (i = 0; i < (bytes >> 2); i++) d[i] = s[i]; - SOC_DCACHE_FLUSH(dest, bytes); + z_xtensa_cache_flush(dest, bytes); } /* bzero used by bootloader */ @@ -82,7 +83,7 @@ static inline void bbzero(void *dest, size_t bytes) for (i = 0; i < (bytes >> 2); i++) d[i] = 0; - SOC_DCACHE_FLUSH(dest, bytes); + z_xtensa_cache_flush(dest, bytes); } static void parse_module(struct sof_man_fw_header *hdr, diff --git a/soc/xtensa/intel_adsp/common/include/adsp/cache.h b/soc/xtensa/intel_adsp/common/include/adsp/cache.h index ea7c66f456b080..6758de557800f4 100644 --- a/soc/xtensa/intel_adsp/common/include/adsp/cache.h +++ 
b/soc/xtensa/intel_adsp/common/include/adsp/cache.h @@ -7,13 +7,13 @@ #ifndef __COMMON_ADSP_CACHE_H__ #define __COMMON_ADSP_CACHE_H__ -#include +#include /* macros for data cache operations */ #define SOC_DCACHE_FLUSH(addr, size) \ - xthal_dcache_region_writeback((addr), (size)) + z_xtensa_cache_flush((addr), (size)) #define SOC_DCACHE_INVALIDATE(addr, size) \ - xthal_dcache_region_invalidate((addr), (size)) + z_xtensa_cache_inv((addr), (size)) /** * @brief Return uncached pointer to a RAM address diff --git a/soc/xtensa/intel_adsp/common/soc_mp.c b/soc/xtensa/intel_adsp/common/soc_mp.c index 6dac17b7d9dfe0..710b1821e2846a 100644 --- a/soc/xtensa/intel_adsp/common/soc_mp.c +++ b/soc/xtensa/intel_adsp/common/soc_mp.c @@ -18,6 +18,7 @@ LOG_MODULE_REGISTER(soc_mp, CONFIG_SOC_LOG_LEVEL); #include +#include #include #include @@ -125,7 +126,7 @@ void z_mp_entry(void) * isn't using yet. Manual inspection of generated code says * we're safe, but really we need a better solution here. */ - xthal_dcache_all_writeback_inv(); + z_xtensa_cache_flush_inv_all(); /* Copy over VECBASE from the main CPU for an initial value * (will need to revisit this if we ever allow a user API to diff --git a/soc/xtensa/intel_s1000/soc.h b/soc/xtensa/intel_s1000/soc.h index 51651b28d58735..97ee1494fdd9d9 100644 --- a/soc/xtensa/intel_s1000/soc.h +++ b/soc/xtensa/intel_s1000/soc.h @@ -6,6 +6,8 @@ #ifndef __INC_SOC_H #define __INC_SOC_H +#include + /* macros related to interrupt handling */ #define XTENSA_IRQ_NUM_SHIFT 0 #define CAVS_IRQ_NUM_SHIFT 8 @@ -224,9 +226,9 @@ struct soc_global_regs { /* macros for data cache operations */ #define SOC_DCACHE_FLUSH(addr, size) \ - xthal_dcache_region_writeback((addr), (size)) + z_xtensa_cache_flush((addr), (size)) #define SOC_DCACHE_INVALIDATE(addr, size) \ - xthal_dcache_region_invalidate((addr), (size)) + z_xtensa_cache_inv((addr), (size)) extern void z_soc_irq_enable(uint32_t irq); extern void z_soc_irq_disable(uint32_t irq); diff --git a/soc/xtensa/intel_s1000/soc_mp.c b/soc/xtensa/intel_s1000/soc_mp.c index b48ba7de09a005..876e5a28021743 100644 --- a/soc/xtensa/intel_s1000/soc_mp.c +++ b/soc/xtensa/intel_s1000/soc_mp.c @@ -10,6 +10,7 @@ #include #include #include +#include #include LOG_MODULE_REGISTER(soc_mp, CONFIG_SOC_LOG_LEVEL); diff --git a/tests/boards/intel_s1000_crb/cache/src/cache_test.c b/tests/boards/intel_s1000_crb/cache/src/cache_test.c index d249f9badb1604..0f3d46b151600e 100644 --- a/tests/boards/intel_s1000_crb/cache/src/cache_test.c +++ b/tests/boards/intel_s1000_crb/cache/src/cache_test.c @@ -51,7 +51,7 @@ static void cache_flush_test(void) } LOG_INF("Flushing cache to commit contents to main memory ..."); - xthal_dcache_region_writeback(cached_buffer->flush, + z_xtensa_cache_flush(cached_buffer->flush, CACHE_TEST_BUFFER_SIZE); LOG_INF("Comparing contents of cached memory vs main memory ..."); @@ -80,7 +80,7 @@ static void cache_invalidation_test(void) } LOG_INF("Invalidating cache to read contents from main memory ..."); - xthal_dcache_region_invalidate(cached_buffer->invalidate, + z_xtensa_cache_inv(cached_buffer->invalidate, CACHE_TEST_BUFFER_SIZE); LOG_INF("Comparing contents of cached memory vs main memory ..."); diff --git a/tests/boards/intel_s1000_crb/main/src/dma_test.c b/tests/boards/intel_s1000_crb/main/src/dma_test.c index ea742bef6c2063..029b4ed97933cc 100644 --- a/tests/boards/intel_s1000_crb/main/src/dma_test.c +++ b/tests/boards/intel_s1000_crb/main/src/dma_test.c @@ -190,10 +190,10 @@ static int test_task(uint32_t chan_id, uint32_t blen, 
uint32_t block_count) printk("*** timed out waiting for dma to complete ***\n"); } - xthal_dcache_region_invalidate(rx_data, RX_BUFF_SIZE); - xthal_dcache_region_invalidate(rx_data2, RX_BUFF_SIZE); - xthal_dcache_region_invalidate(rx_data3, RX_BUFF_SIZE); - xthal_dcache_region_invalidate(rx_data4, RX_BUFF_SIZE); + z_xtensa_cache_inv(rx_data, RX_BUFF_SIZE); + z_xtensa_cache_inv(rx_data2, RX_BUFF_SIZE); + z_xtensa_cache_inv(rx_data3, RX_BUFF_SIZE); + z_xtensa_cache_inv(rx_data4, RX_BUFF_SIZE); /* Intentionally break has been omitted (fall-through) */ switch (block_count) { From 713057151e20889d216c36d0708f89bbd6b5b422 Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Mon, 22 Feb 2021 14:31:11 -0800 Subject: [PATCH 13/17] arch/xtensa: Invalidate bottom of outbound stacks Both new thread creation and context switch had the same mistake in cache management: the bottom of the stack (the "unused" region between the lower memory bound and the live stack pointer) needs to be invalidated before we switch, because otherwise any dirty lines we might have left over can get flushed out on top of the same thread on another CPU that is putting live data there. Signed-off-by: Andy Ross --- arch/xtensa/core/xtensa-asm2.c | 8 ++-- arch/xtensa/include/kernel_arch_func.h | 57 +++++++++++++++++++++----- 2 files changed, 52 insertions(+), 13 deletions(-) diff --git a/arch/xtensa/core/xtensa-asm2.c b/arch/xtensa/core/xtensa-asm2.c index 46636b53fc3e3e..3e292b6a90427e 100644 --- a/arch/xtensa/core/xtensa-asm2.c +++ b/arch/xtensa/core/xtensa-asm2.c @@ -60,9 +60,6 @@ void *xtensa_init_stack(struct k_thread *thread, int *stack_top, bsa[-9] = bsa; ret = &bsa[-9]; -#ifdef CONFIG_KERNEL_COHERENCE - z_xtensa_cache_flush(ret, (char *)stack_top - (char *)ret); -#endif return ret; } @@ -73,6 +70,11 @@ void arch_new_thread(struct k_thread *thread, k_thread_stack_t *stack, thread->switch_handle = xtensa_init_stack(thread, (int *)stack_ptr, entry, p1, p2, p3); +#ifdef CONFIG_KERNEL_COHERENCE + __ASSERT((((size_t)stack) % XCHAL_DCACHE_LINESIZE) == 0, ""); + __ASSERT((((size_t)stack_ptr) % XCHAL_DCACHE_LINESIZE) == 0, ""); + z_xtensa_cache_flush_inv(stack, (char *)stack_ptr - (char *)stack); +#endif } void z_irq_spurious(const void *arg) diff --git a/arch/xtensa/include/kernel_arch_func.h b/arch/xtensa/include/kernel_arch_func.h index 607d78ecd9c08d..daf48e22b5243f 100644 --- a/arch/xtensa/include/kernel_arch_func.h +++ b/arch/xtensa/include/kernel_arch_func.h @@ -71,9 +71,9 @@ static inline bool arch_mem_coherent(void *ptr) #endif #ifdef CONFIG_KERNEL_COHERENCE -static inline void arch_cohere_stacks(struct k_thread *old_thread, - void *old_switch_handle, - struct k_thread *new_thread) +static ALWAYS_INLINE void arch_cohere_stacks(struct k_thread *old_thread, + void *old_switch_handle, + struct k_thread *new_thread) { size_t ostack = old_thread->stack_info.start; size_t osz = old_thread->stack_info.size; @@ -83,24 +83,61 @@ static inline void arch_cohere_stacks(struct k_thread *old_thread, size_t nsz = new_thread->stack_info.size; size_t nsp = (size_t) new_thread->switch_handle; + /* The "live" area (the region between the switch handle, + * which is the stack pointer, and the top of the stack + * memory) of the inbound stack needs to be invalidated: it + * may contain data that was modified on another CPU since the + * last time this CPU ran the thread, and our cache may be + * stale. + * + * The corresponding "dead area" of the inbound stack can be + * ignored. 
We may have cached data in that region, but by
+ * definition any unused stack memory will always be written
+ * before being read (well, unless the code has an
+ * uninitialized data error) so our stale cache will be
+ * automatically overwritten as needed.
+ */
 z_xtensa_cache_inv((void *)nsp, (nstack + nsz) - nsp);

- /* FIXME: dummy initializion threads don't have stack info set
- * up and explode the logic above. Find a way to get this
- * test out of the hot paths!
+ /* Dummy threads appear at system initialization, but don't
+ * have stack_info data and will never be saved. Ignore.
 */
 if (old_thread->base.thread_state & _THREAD_DUMMY) {
 return;
 }

- /* In interrupt context, we have a valid frame already from
- * the interrupt entry code, but for arch_switch() that hasn't
- * happened yet. It will do the flush itself, we just have to
- * calculate the boundary for it.
+ /* For the outbound thread, we obviously want to flush any data
+ * in the live area (for the benefit of whichever CPU runs
+ * this thread next). But we ALSO have to invalidate the dead
+ * region of the stack. Those lines may have DIRTY data in
+ * our own cache, and we cannot be allowed to write them back
+ * later on top of the stack's legitimate owner!
+ *
+ * This work comes in two flavors. In interrupts, the
+ * outgoing context has already been saved for us, so we can
+ * do the flush right here. In direct context switches, we
+ * are still using the stack, so we do the invalidate of the
+ * bottom here, (and flush the line containing SP to handle
+ * the overlap). The remaining flush of the live region
+ * happens in the assembly code once the context is pushed, up
+ * to the stack top stashed in a special register.
 */
 if (old_switch_handle != NULL) {
 z_xtensa_cache_flush((void *)osp, (ostack + osz) - osp);
+ z_xtensa_cache_inv((void *)ostack, osp - ostack);
 } else {
+ /* When in a switch, our current stack is the outbound
+ * stack. Flush the single line containing the stack
+ * bottom (which is live data) before invalidating
+ * everything below that. Remember that the 16 bytes
+ * below our SP are the calling function's spill area
+ * and may be live too.
+ */
+ __asm__ volatile("mov %0, a1" : "=r"(osp));
+ osp -= 16;
+ z_xtensa_cache_flush((void *)osp, 1);
+ z_xtensa_cache_inv((void *)ostack, osp - ostack);
+
 /* FIXME: hardcoding EXCSAVE3 is bad, should be
 * configurable a-la XTENSA_KERNEL_CPU_PTR_SR.
 */

From 1aab761d58ffa96dc0a490b83f23a97e91811093 Mon Sep 17 00:00:00 2001
From: Andy Ross
Date: Thu, 25 Feb 2021 13:51:13 -0800
Subject: [PATCH 14/17] tests/fifo_api: Move k_fifo off stack

Putting spinlocks (or things containing them) onto the stack is a
KERNEL_COHERENCE violation. This doesn't need to be there so just
make it static.
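A minimal sketch of the rule being applied here (illustrative only; it assumes CONFIG_KERNEL_COHERENCE=y and that the arch_mem_coherent() predicate from kernel_arch_func.h is reachable from the calling code):

```c
#include <kernel.h>
#include <sys/__assert.h>

/* Kernel objects that embed spinlocks (k_fifo, k_queue, k_sem, ...)
 * must live in coherent memory.  Thread stacks are cached per-CPU, so
 * a FIFO declared as a local variable can be seen inconsistently by
 * the other core.
 */
static struct k_fifo shared_fifo;	/* static data lives in coherent RAM */

void fifo_user(void)
{
	/* struct k_fifo local_fifo;	<- the pattern being removed below */

	__ASSERT(arch_mem_coherent(&shared_fifo),
		 "kernel object must be in coherent memory");
	k_fifo_init(&shared_fifo);
}
```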
Signed-off-by: Andy Ross --- tests/kernel/fifo/fifo_api/src/test_fifo_fail.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/kernel/fifo/fifo_api/src/test_fifo_fail.c b/tests/kernel/fifo/fifo_api/src/test_fifo_fail.c index 584ad6239244ea..92913098d0aff8 100644 --- a/tests/kernel/fifo/fifo_api/src/test_fifo_fail.c +++ b/tests/kernel/fifo/fifo_api/src/test_fifo_fail.c @@ -21,7 +21,7 @@ */ void test_fifo_get_fail(void *p1, void *p2, void *p3) { - struct k_fifo fifo; + static struct k_fifo fifo; k_fifo_init(&fifo); /**TESTPOINT: fifo get returns NULL*/ From 1e3b4aff40f19a0682db75e1beb8d0ce49c7c9c8 Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Mon, 1 Mar 2021 11:51:28 -0800 Subject: [PATCH 15/17] arch/xtensa: Remember to spill windows in arch_cohere_stacks() When we reach this code in interrupt context, our upper GPRs contain a cross-stack call that may still include some registers from the interrupted thread. Those need to go out to memory before we can do our cache coherence dance here. Signed-off-by: Andy Ross --- arch/xtensa/core/xtensa-asm2-util.S | 11 +++++------ arch/xtensa/include/kernel_arch_func.h | 9 +++++++++ arch/xtensa/include/xtensa-asm2-s.h | 5 +++++ 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/arch/xtensa/core/xtensa-asm2-util.S b/arch/xtensa/core/xtensa-asm2-util.S index a6e50cc2d0c549..d46ea5ee3a75b9 100644 --- a/arch/xtensa/core/xtensa-asm2-util.S +++ b/arch/xtensa/core/xtensa-asm2-util.S @@ -10,12 +10,11 @@ /* * xtensa_spill_reg_windows * - * Globally visible symbol to do register spills. Useful for unit - * testing, or maybe as part of a debug/watchdog/error handler. Not a - * C function, call this via CALL0 (so you probably have to save off - * A0, but no other registers need to be spilled). On return, all - * registers not part of the current function will be spilled to - * memory. + * Spill all register windows. Not a C function, enter this via CALL0 + * (so you have to save off A0, but no other registers need to be + * spilled). On return, all registers not part of the current + * function will be spilled to memory. The WINDOWSTART SR will have a + * single 1 bit corresponding to the current frame at WINDOWBASE. 
*/ .global xtensa_spill_reg_windows .align 4 diff --git a/arch/xtensa/include/kernel_arch_func.h b/arch/xtensa/include/kernel_arch_func.h index daf48e22b5243f..b69f951adce128 100644 --- a/arch/xtensa/include/kernel_arch_func.h +++ b/arch/xtensa/include/kernel_arch_func.h @@ -83,6 +83,15 @@ static ALWAYS_INLINE void arch_cohere_stacks(struct k_thread *old_thread, size_t nsz = new_thread->stack_info.size; size_t nsp = (size_t) new_thread->switch_handle; + if (old_switch_handle != NULL) { + int32_t a0save; + + __asm__ volatile("mov %0, a0;" + "call0 xtensa_spill_reg_windows;" + "mov a0, %0" + : "=r"(a0save)); + } + /* The "live" area (the region between the switch handle, * which is the stack pointer, and the top of the stack * memory) of the inbound stack needs to be invalidated: it diff --git a/arch/xtensa/include/xtensa-asm2-s.h b/arch/xtensa/include/xtensa-asm2-s.h index cbfe81a0970a3a..3b98b366a92c86 100644 --- a/arch/xtensa/include/xtensa-asm2-s.h +++ b/arch/xtensa/include/xtensa-asm2-s.h @@ -324,7 +324,12 @@ _do_call_\@: beq a6, a1, _restore_\@ l32i a1, a1, 0 addi a1, a1, BASE_SAVE_AREA_SIZE +#ifndef CONFIG_KERNEL_COHERENCE + /* When using coherence, the registers of the interrupted + * context got spilled upstream in arch_cohere_stacks() + */ SPILL_ALL_WINDOWS +#endif mov a1, a6 _restore_\@: From 3ead8f15e4022295f12ab9368391ed43d63336ad Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Mon, 1 Mar 2021 16:55:37 -0800 Subject: [PATCH 16/17] tests/queue: tests/lifo_usage: Address ADSP/coherence issues These tests would pass pointers to data on their own stacks to other threads, which is forbidden when CONFIG_KERNEL_COHERENCE (because stack memory isn't cache-coherent). Make the variables static. Also, queue had two sleeps of 2 ticks (having been written in an era where that meant "20-30ms"), and on a device with a 50 kHz tick rate that's not very much time at all. It would sometimes fail spuriously because the spawned threads didn't consume the queue entries in time. How about 10ms of real time instead? 
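The arithmetic behind that change, as a sketch (the 100 Hz figure for the legacy tick rate is an assumption used only for illustration):

```c
/* A sleep expressed in ticks scales with the tick rate; K_MSEC() does
 * not.  100 Hz stands in for the legacy default, 50000 Hz for the ADSP.
 */
#define SLEEP_US(ticks, ticks_per_sec) ((ticks) * 1000000 / (ticks_per_sec))

/* SLEEP_US(2, 100)   == 20000 us -- the old "20-30ms" expectation      */
/* SLEEP_US(2, 50000) ==    40 us -- not enough time for the spawned
 *                                   threads to run, hence K_MSEC(10)   */
```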
Signed-off-by: Andy Ross --- tests/kernel/lifo/lifo_usage/src/main.c | 8 ++++---- tests/kernel/queue/src/test_queue_contexts.c | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/kernel/lifo/lifo_usage/src/main.c b/tests/kernel/lifo/lifo_usage/src/main.c index a035b2d38a9d60..4b34fa6590d9a5 100644 --- a/tests/kernel/lifo/lifo_usage/src/main.c +++ b/tests/kernel/lifo/lifo_usage/src/main.c @@ -310,7 +310,7 @@ static void test_timeout_non_empty_lifo(void) static void test_timeout_lifo_thread(void) { void *packet, *scratch_packet; - struct reply_packet reply_packet; + static volatile struct reply_packet reply_packet; uint32_t start_time, timeout; /* @@ -338,7 +338,7 @@ static void test_timeout_lifo_thread(void) */ tid[0] = k_thread_create(&ttdata[0], ttstack[0], TSTACK_SIZE, test_thread_timeout_reply_values, - &reply_packet, NULL, NULL, + (void *)&reply_packet, NULL, NULL, LIFO_THREAD_PRIO, K_INHERIT_PERMS, K_NO_WAIT); k_yield(); @@ -357,7 +357,7 @@ static void test_timeout_lifo_thread(void) tid[0] = k_thread_create(&ttdata[0], ttstack[0], TSTACK_SIZE, test_thread_timeout_reply_values, - &reply_packet, NULL, NULL, + (void *)&reply_packet, NULL, NULL, LIFO_THREAD_PRIO, K_INHERIT_PERMS, K_NO_WAIT); k_yield(); @@ -377,7 +377,7 @@ static void test_timeout_lifo_thread(void) tid[0] = k_thread_create(&ttdata[0], ttstack[0], TSTACK_SIZE, test_thread_timeout_reply_values_wfe, - &reply_packet, NULL, NULL, + (void *)&reply_packet, NULL, NULL, LIFO_THREAD_PRIO, K_INHERIT_PERMS, K_NO_WAIT); packet = k_lifo_get(&timeout_order_lifo, K_FOREVER); diff --git a/tests/kernel/queue/src/test_queue_contexts.c b/tests/kernel/queue/src/test_queue_contexts.c index 66d32d20da69a3..bdb558d32d2954 100644 --- a/tests/kernel/queue/src/test_queue_contexts.c +++ b/tests/kernel/queue/src/test_queue_contexts.c @@ -334,22 +334,22 @@ static void queue_poll_race_consume(void *p1, void *p2, void *p3) void test_queue_poll_race(void) { int prio = k_thread_priority_get(k_current_get()); - int mid_count = 0, low_count = 0; + static volatile int mid_count, low_count; k_queue_init(&queue); k_thread_create(&tdata, tstack, STACK_SIZE, queue_poll_race_consume, - &queue, &mid_count, NULL, + &queue, (void *)&mid_count, NULL, prio + 1, 0, K_NO_WAIT); k_thread_create(&tdata1, tstack1, STACK_SIZE, queue_poll_race_consume, - &queue, &low_count, NULL, + &queue, (void *)&low_count, NULL, prio + 2, 0, K_NO_WAIT); /* Let them initialize and block */ - k_sleep(K_TICKS(2)); + k_sleep(K_MSEC(10)); /* Insert two items. This will wake up both threads, but the * higher priority thread (tdata1) might (if CONFIG_POLL) @@ -362,7 +362,7 @@ void test_queue_poll_race(void) zassert_true(low_count == 0, NULL); zassert_true(mid_count == 0, NULL); - k_sleep(K_TICKS(2)); + k_sleep(K_MSEC(10)); zassert_true(low_count + mid_count == 2, NULL); From c780c206f2e8970bfc1440c8d9ac4b1af551ee50 Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Mon, 1 Mar 2021 17:29:08 -0800 Subject: [PATCH 17/17] tests/p4wq: Fix impossible sleep interval The code here was written to "get out of the way just long enough for the trivial context switch and callback to execute". But on a machine with 50 kHz ticks, that's not reliably enough time and this was failing spuriously. Which would have been a reasonably forgivable mistake to make had I not written this code with this very machine in mind... 
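For comparison, a tick-rate-independent way to write this kind of wait is to poll with a deadline rather than guess a sleep length. That is not what the patch does (it simply lengthens the sleep); the helper below is purely illustrative:

```c
#include <stdbool.h>
#include <kernel.h>

/* Sketch: wait for a completion flag with a bounded deadline instead
 * of a fixed sleep, so the test doesn't depend on the tick rate.
 */
static bool wait_for_flag(volatile bool *flag, int32_t timeout_ms)
{
	int64_t deadline = k_uptime_get() + timeout_ms;

	while (!*flag) {
		if (k_uptime_get() > deadline) {
			return false;
		}
		k_msleep(1);	/* give the work item's thread time to run */
	}
	return true;
}
```

The test below could then assert wait_for_flag(&has_run, 100) instead of sleeping for a fixed interval.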
Signed-off-by: Andy Ross --- tests/lib/p4workq/src/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lib/p4workq/src/main.c b/tests/lib/p4workq/src/main.c index 020eb520f06c25..4754450e0ed327 100644 --- a/tests/lib/p4workq/src/main.c +++ b/tests/lib/p4workq/src/main.c @@ -260,7 +260,7 @@ static void test_p4wq_simple(void) k_p4wq_submit(&wq, &simple_item); zassert_false(has_run, "ran too early"); - k_sleep(K_TICKS(1)); + k_msleep(10); zassert_true(has_run, "low-priority item didn't run"); /* Higher priority, should preempt us */