diff --git a/aclint.c b/aclint.c
index 2be7d46..9cac4b6 100644
--- a/aclint.c
+++ b/aclint.c
@@ -6,10 +6,13 @@
 /* ACLINT MTIMER */
 void aclint_mtimer_update_interrupts(hart_t *hart, mtimer_state_t *mtimer)
 {
-    if (semu_timer_get(&mtimer->mtime) >= mtimer->mtimecmp[hart->mhartid])
+    if (semu_timer_get(&mtimer->mtime) >= mtimer->mtimecmp[hart->mhartid]) {
         hart->sip |= RV_INT_STI_BIT; /* Set Supervisor Timer Interrupt */
-    else
+        /* Clear WFI flag when interrupt is injected - wakes the hart */
+        hart->in_wfi = false;
+    } else {
         hart->sip &= ~RV_INT_STI_BIT; /* Clear Supervisor Timer Interrupt */
+    }
 }
 
 static bool aclint_mtimer_reg_read(mtimer_state_t *mtimer,
@@ -106,10 +109,13 @@ void aclint_mtimer_write(hart_t *hart,
 /* ACLINT MSWI */
 void aclint_mswi_update_interrupts(hart_t *hart, mswi_state_t *mswi)
 {
-    if (mswi->msip[hart->mhartid])
+    if (mswi->msip[hart->mhartid]) {
         hart->sip |= RV_INT_SSI_BIT; /* Set Machine Software Interrupt */
-    else
+        /* Clear WFI flag when interrupt is injected */
+        hart->in_wfi = false;
+    } else {
         hart->sip &= ~RV_INT_SSI_BIT; /* Clear Machine Software Interrupt */
+    }
 }
 
 static bool aclint_mswi_reg_read(mswi_state_t *mswi,
@@ -165,10 +171,13 @@ void aclint_mswi_write(hart_t *hart,
 /* ACLINT SSWI */
 void aclint_sswi_update_interrupts(hart_t *hart, sswi_state_t *sswi)
 {
-    if (sswi->ssip[hart->mhartid])
+    if (sswi->ssip[hart->mhartid]) {
         hart->sip |= RV_INT_SSI_BIT; /* Set Supervisor Software Interrupt */
-    else
+        /* Clear WFI flag when interrupt is injected */
+        hart->in_wfi = false;
+    } else {
         hart->sip &= ~RV_INT_SSI_BIT; /* Clear Supervisor Software Interrupt */
+    }
 }
 
 static bool aclint_sswi_reg_read(__attribute__((unused)) sswi_state_t *sswi,
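All three ACLINT paths above repeat the same inject-and-wake pattern: raise the pending bit in sip, and clear in_wfi only on injection so the scheduler stops counting the hart as idle. A minimal sketch of how the pattern could be factored into a shared helper (the helper itself is hypothetical and not part of this patch; the types and bit names are the ones used above):

/* Hypothetical helper: level-triggered injection with WFI wakeup. */
static inline void hart_set_pending(hart_t *hart, uint32_t bit, bool pending)
{
    if (pending) {
        hart->sip |= bit;     /* raise the pending bit */
        hart->in_wfi = false; /* wake the hart if it is parked in WFI */
    } else {
        hart->sip &= ~bit;    /* level-triggered: drop the bit again */
    }
}

With it, aclint_mtimer_update_interrupts() would reduce to a single call:
hart_set_pending(hart, RV_INT_STI_BIT, semu_timer_get(&mtimer->mtime) >= mtimer->mtimecmp[hart->mhartid]);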
diff --git a/coro.c b/coro.c
index 261cfcd..968e431 100644
--- a/coro.c
+++ b/coro.c
@@ -607,5 +607,9 @@ bool coro_is_suspended(uint32_t slot_id)
 
 uint32_t coro_current_hart_id(void)
 {
+    /* Return sentinel value if coroutine subsystem not initialized */
+    if (!coro_state.initialized)
+        return UINT32_MAX;
+
     return coro_state.current_hart;
 }
diff --git a/device.h b/device.h
index 963911e..88baea8 100644
--- a/device.h
+++ b/device.h
@@ -60,6 +60,9 @@ typedef struct {
     /* I/O handling */
     int in_fd, out_fd;
    bool in_ready;
+    /* Coroutine support for input waiting (SMP mode) */
+    uint32_t waiting_hart_id; /**< Hart ID waiting for input */
+    bool has_waiting_hart;    /**< true if a hart is yielding for input */
 } u8250_state_t;
 
 void u8250_update_interrupts(u8250_state_t *uart);
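The UINT32_MAX sentinel gives device code a cheap way to ask "am I running inside a hart coroutine?" before touching the new u8250_state_t fields. A caller-side sketch, assuming only the two functions shown in this patch (the wrapper name is illustrative):

/* Illustrative guard: yield only when called from a hart coroutine. */
static bool yield_if_in_coroutine(void)
{
    uint32_t hart_id = coro_current_hart_id();
    if (hart_id == UINT32_MAX)
        return false; /* single-hart mode or scheduler context: never yield */
    coro_yield(); /* suspend; the event loop resumes us later */
    return true;
}

This is exactly the shape u8250_wait_for_input() takes in uart.c below, plus the waiting_hart_id/has_waiting_hart bookkeeping so the event loop knows that stdin readiness should wake a specific hart.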
diff --git a/main.c b/main.c
index 0b608fb..7bbc522 100644
--- a/main.c
+++ b/main.c
@@ -768,6 +768,8 @@ static int semu_init(emu_state_t *emu, int argc, char **argv)
 
     /* Set up peripherals */
     emu->uart.in_fd = 0, emu->uart.out_fd = 1;
+    emu->uart.waiting_hart_id = UINT32_MAX;
+    emu->uart.has_waiting_hart = false;
     capture_keyboard_input(); /* set up uart */
 #if SEMU_HAS(VIRTIONET)
     /* Always set ram pointer, even if netdev is not configured.
@@ -809,7 +811,7 @@ static int semu_init(emu_state_t *emu, int argc, char **argv)
     emu->peripheral_update_ctr = 0;
     emu->debug = debug;
 
-    /* Initialize coroutine system for SMP mode (n_hart > 1) */
+    /* Initialize coroutine system for multi-hart mode (n_hart > 1) */
     if (vm->n_hart > 1) {
         uint32_t total_slots = vm->n_hart;
 #if SEMU_HAS(VIRTIONET)
@@ -853,17 +855,29 @@ static int semu_init(emu_state_t *emu, int argc, char **argv)
  */
 static void wfi_handler(hart_t *hart)
 {
-    vm_t *vm = hart->vm;
-    /* Only yield in SMP mode (n_hart > 1) */
-    if (vm->n_hart > 1) {
-        /* Per RISC-V spec: WFI returns immediately if interrupt is pending.
-         * Only yield to scheduler if no interrupt is currently pending.
+    /* Per RISC-V spec: WFI returns immediately if an interrupt is pending.
+     * We check whether any enabled interrupt is pending (sip & sie != 0).
+     */
+    bool interrupt_pending = (hart->sip & hart->sie) != 0;
+
+    if (!interrupt_pending) {
+        emu_state_t *emu = PRIV(hart);
+        vm_t *vm = &emu->vm;
+
+        /* Only use coroutine yielding in multi-hart mode, where the coroutine
+         * scheduler loop is active. In single-hart mode, WFI is a no-op since
+         * there is no scheduler to resume execution after a yield.
          */
-        if (!(hart->sip & hart->sie)) {
-            hart->in_wfi = true;  /* Mark as waiting for interrupt */
-            coro_yield();         /* Suspend until scheduler resumes us */
-            hart->in_wfi = false; /* Resumed - no longer waiting */
+        if (vm->n_hart > 1) {
+            hart->in_wfi = true; /* Mark as waiting for interrupt */
+            coro_yield();        /* Suspend until scheduler resumes us */
+            /* NOTE: Do NOT clear in_wfi here, to avoid a race condition.
+             * The scheduler needs to see this flag to detect the idle state;
+             * the flag is cleared when an interrupt is actually injected.
+             */
         }
+    } else {
+        hart->in_wfi = false; /* Clear if interrupt already pending */
     }
 }
 
@@ -1150,87 +1164,150 @@ static int semu_run(emu_state_t *emu)
             poll_capacity = needed;
         }
 
+        /* Determine poll timeout based on hart states BEFORE setting up
+         * poll fds. This check must happen before coro_resume_hart()
+         * modifies flags.
+         *
+         * - If no harts are STARTED, block indefinitely (wait for IPI)
+         * - If all STARTED harts are idle (WFI or UART waiting), block
+         * - Otherwise, use non-blocking poll (timeout=0)
+         */
+        int poll_timeout = 0;
+        uint32_t started_harts = 0;
+        uint32_t idle_harts = 0;
+        for (uint32_t i = 0; i < vm->n_hart; i++) {
+            if (vm->hart[i]->hsm_status == SBI_HSM_STATE_STARTED) {
+                started_harts++;
+                /* Count hart as idle if it's in WFI or waiting for UART */
+                if (vm->hart[i]->in_wfi ||
+                    (emu->uart.has_waiting_hart &&
+                     emu->uart.waiting_hart_id == i)) {
+                    idle_harts++;
+                }
+            }
+        }
+
         /* Collect file descriptors for poll() */
         size_t pfd_count = 0;
         int timer_index = -1;
 
-        /* Add periodic timer fd (1ms interval for guest timer emulation) */
+        /* Add periodic timer fd (1ms interval for guest timer emulation).
+         * Only add the timer when ALL harts are active (none idle) to allow
+         * poll() to sleep when any harts are in WFI. When harts are idle,
+         * timer updates can be deferred until they wake up.
+         *
+         * During SMP boot (started_harts < vm->n_hart), always include the
+         * timer to ensure secondary harts can complete initialization. Only
+         * apply the conditional exclusion after all harts have started.
+         *
+         * For single-hart configurations (n_hart == 1), disable the
+         * optimization entirely to avoid boot issues, as the first hart
+         * starts immediately.
+         */
+        bool all_harts_started = (started_harts >= vm->n_hart);
+        bool harts_active =
+            (vm->n_hart == 1) || !all_harts_started || (idle_harts == 0);
 #ifdef __APPLE__
         /* macOS: use kqueue with EVFILT_TIMER */
-        if (kq >= 0 && pfd_count < poll_capacity) {
+        if (kq >= 0 && pfd_count < poll_capacity && harts_active) {
             pfds[pfd_count] = (struct pollfd){kq, POLLIN, 0};
             timer_index = (int) pfd_count;
             pfd_count++;
         }
 #else
         /* Linux: use timerfd */
-        if (wfi_timer_fd >= 0 && pfd_count < poll_capacity) {
+        if (wfi_timer_fd >= 0 && pfd_count < poll_capacity &&
+            harts_active) {
             pfds[pfd_count] = (struct pollfd){wfi_timer_fd, POLLIN, 0};
             timer_index = (int) pfd_count;
             pfd_count++;
         }
 #endif
 
-        /* Add UART input fd (stdin for keyboard input) */
-        if (emu->uart.in_fd >= 0 && pfd_count < poll_capacity) {
+        /* Add UART input fd (stdin for keyboard input).
+         * Only add the UART fd when:
+         * 1. Single-hart configuration (n_hart == 1), OR
+         * 2. Not all harts started (!all_harts_started), OR
+         * 3. All harts are active (idle_harts == 0), OR
+         * 4. A hart is actively waiting for UART input
+         *
+         * This keeps the UART fd (which is always "readable" on a TTY) from
+         * defeating the poll() sleep when harts are idle. Trade-off: user
+         * input (Ctrl+A x) may be delayed by up to poll_timeout (10ms)
+         * when harts are idle, which is acceptable for an emulator.
+         */
+        bool need_uart = (vm->n_hart == 1) || !all_harts_started ||
+                         (idle_harts == 0) || emu->uart.has_waiting_hart;
+        if (emu->uart.in_fd >= 0 && pfd_count < poll_capacity &&
+            need_uart) {
             pfds[pfd_count] = (struct pollfd){emu->uart.in_fd, POLLIN, 0};
             pfd_count++;
         }
 
-        /* Determine poll timeout based on hart WFI states:
-         * - If no harts are STARTED, block indefinitely (wait for IPI)
-         * - If all STARTED harts are in WFI, block indefinitely
-         * - Otherwise, use non-blocking poll (timeout=0)
+        /* Set poll timeout based on current idle state (adaptive timeout).
+         * Three-tier strategy:
+         * 1. Blocking (-1): all harts idle + have fds → wait for events
+         * 2. Short sleep (10ms): some harts idle → reduce CPU usage
+         * 3. Non-blocking (0): all harts active → maximum responsiveness
+         *
+         * SAFETY: Never use a blocking timeout when pfd_count == 0, as
+         * poll(0, -1) would hang indefinitely. Always use the 10ms
+         * timeout as a fallback.
         */
-        int poll_timeout = 0;
-        uint32_t started_harts = 0;
-        uint32_t wfi_harts = 0;
-        for (uint32_t i = 0; i < vm->n_hart; i++) {
-            if (vm->hart[i]->hsm_status == SBI_HSM_STATE_STARTED) {
-                started_harts++;
-                if (vm->hart[i]->in_wfi)
-                    wfi_harts++;
-            }
-        }
-        /* Block if no harts running or all running harts are waiting */
         if (pfd_count > 0 &&
-            (started_harts == 0 || wfi_harts == started_harts))
+            (started_harts == 0 || idle_harts == started_harts)) {
+            /* All harts idle + have fds: block until event */
             poll_timeout = -1;
+        } else if (idle_harts > 0) {
+            /* Some/all harts idle (or all idle but no fds): 10ms sleep */
+            poll_timeout = 10;
+        } else {
+            /* All harts active: non-blocking */
+            poll_timeout = 0;
+        }
 
         /* Execute poll() to wait for I/O events.
-         * - timeout=0: non-blocking poll when harts are running
-         * - timeout=-1: blocking poll when all harts in WFI (idle state)
+         * - timeout=0: non-blocking poll when harts are active
+         * - timeout=10: short sleep when some harts idle
+         * - timeout=-1: blocking poll when all harts idle (WFI or UART
+         *   wait)
+         *
+         * When pfd_count == 0, poll() acts as a pure sleep mechanism.
        */
-        if (pfd_count > 0) {
-            int nevents = poll(pfds, pfd_count, poll_timeout);
-            if (nevents > 0) {
-                /* Consume timer expiration events to prevent fd staying
-                 * readable
-                 */
-                if (timer_index >= 0 &&
-                    (pfds[timer_index].revents & POLLIN)) {
+        int nevents = poll(pfds, pfd_count, poll_timeout);
+
+        if (pfd_count > 0 && nevents > 0) {
+            /* Consume timer expiration events to prevent fd staying
+             * readable
+             */
+            if (timer_index >= 0 && (pfds[timer_index].revents & POLLIN)) {
 #ifdef __APPLE__
-                    /* drain kqueue events with non-blocking kevent */
-                    struct kevent events[32];
-                    struct timespec timeout_zero = {0, 0};
-                    kevent(kq, NULL, 0, events, 32, &timeout_zero);
+                /* drain kqueue events with non-blocking kevent */
+                struct kevent events[32];
+                struct timespec timeout_zero = {0, 0};
+                kevent(kq, NULL, 0, events, 32, &timeout_zero);
 #else
-                    /* Linux: read timerfd to consume expiration count */
-                    uint64_t expirations;
-                    ssize_t ret_read = read(wfi_timer_fd, &expirations,
-                                            sizeof(expirations));
-                    (void) ret_read;
+                /* Linux: read timerfd to consume expiration count */
+                uint64_t expirations;
+                ssize_t ret_read =
+                    read(wfi_timer_fd, &expirations, sizeof(expirations));
+                (void) ret_read;
 #endif
-                }
-            } else if (nevents < 0 && errno != EINTR) {
-                perror("poll");
             }
+        } else if (nevents < 0 && errno != EINTR) {
+            perror("poll");
         }
 
         /* Resume all hart coroutines (round-robin scheduling).
          * Each hart executes a batch of instructions, then yields back.
-         * Harts in WFI will clear their in_wfi flag when resuming from
-         * coro_yield() in wfi_handler().
+         * Harts in WFI will have their in_wfi flag cleared by interrupt
+         * handlers (ACLINT, PLIC, UART) when interrupts are injected.
+         *
+         * Note: We must always resume harts after poll() returns, even if
+         * all harts appear idle. The in_wfi flag is only cleared when
+         * interrupt sources inject interrupts, so skipping the resume would
+         * cause a deadlock where harts remain stuck waiting even after
+         * events arrive.
        */
         for (uint32_t i = 0; i < vm->n_hart; i++) {
             coro_resume_hart(i);
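The timeout selection above is a pure function of three inputs, which makes the three-tier policy easy to inspect in isolation. A condensed sketch (the function name is hypothetical; the constants mirror the diff):

/* Poll timeout policy:
 * -1 : block; fds exist and no started hart is runnable (or none started)
 * 10 : short sleep; some harts idle (also the pfd_count == 0 fallback)
 *  0 : non-blocking; every started hart is runnable
 */
static int pick_poll_timeout(size_t pfd_count,
                             uint32_t started_harts,
                             uint32_t idle_harts)
{
    if (pfd_count > 0 && (started_harts == 0 || idle_harts == started_harts))
        return -1;
    if (idle_harts > 0)
        return 10;
    return 0;
}

Note the ordering: the pfd_count > 0 guard comes first, so the all-idle/no-fds case falls through to the 10ms tier instead of hanging forever in poll(0, -1).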
diff --git a/plic.c b/plic.c
index bbf18c9..e7c9133 100644
--- a/plic.c
+++ b/plic.c
@@ -11,10 +11,13 @@ void plic_update_interrupts(vm_t *vm, plic_state_t *plic)
     plic->masked |= plic->active;
     /* Send interrupt to target */
     for (uint32_t i = 0; i < vm->n_hart; i++) {
-        if (plic->ip & plic->ie[i])
+        if (plic->ip & plic->ie[i]) {
             vm->hart[i]->sip |= RV_INT_SEI_BIT;
-        else
+            /* Clear WFI flag when external interrupt is injected */
+            vm->hart[i]->in_wfi = false;
+        } else {
             vm->hart[i]->sip &= ~RV_INT_SEI_BIT;
+        }
     }
 }
 
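The plic.c change applies the same wake-at-injection rule as aclint.c, fanned out across every hart's enable mask. Why the injector, rather than the resumed hart, clears in_wfi is easiest to see in a toy trace (a standalone sketch with simplified stand-in types, not semu's real structures):

#include <stdbool.h>
#include <stdint.h>

typedef struct {
    uint32_t sip; /* pending bits, as in hart_t */
    bool in_wfi;  /* idle flag read by the scheduler */
} toy_hart_t;

#define TOY_SEI_BIT (1u << 9)

/* Same shape as plic_update_interrupts() for one hart. If the hart were
 * responsible for clearing in_wfi itself after coro_yield(), there would be
 * a window where sip already holds an enabled pending bit while in_wfi still
 * reads true, and the scheduler could wrongly block in poll(..., -1).
 */
static void toy_update(toy_hart_t *hart, uint32_t ip, uint32_t ie)
{
    if (ip & ie) {
        hart->sip |= TOY_SEI_BIT;
        hart->in_wfi = false; /* the wake rides along with the injection */
    } else {
        hart->sip &= ~TOY_SEI_BIT;
    }
}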
diff --git a/uart.c b/uart.c
index 3c997bb..685e048 100644
--- a/uart.c
+++ b/uart.c
@@ -6,6 +6,7 @@
 #include <string.h>
 #include <unistd.h>
 
+#include "coro.h"
 #include "device.h"
 #include "riscv.h"
 #include "riscv_private.h"
@@ -86,12 +87,45 @@ static void u8250_handle_out(u8250_state_t *uart, uint8_t value)
         fprintf(stderr, "failed to write UART output: %s\n", strerror(errno));
 }
 
+/* Wait for UART input using a coroutine yield (SMP mode only).
+ * This function allows a hart to yield when no UART input is available,
+ * preventing CPU spinning while waiting for stdin. The hart is resumed
+ * by the event loop when stdin becomes readable.
+ */
+static void u8250_wait_for_input(u8250_state_t *uart)
+{
+    /* Only yield in SMP mode - single-core mode doesn't use coroutines */
+    uint32_t hart_id = coro_current_hart_id();
+    if (hart_id == UINT32_MAX)
+        return; /* Not in a coroutine, skip yielding */
+
+    /* Mark this hart as waiting for UART input */
+    uart->waiting_hart_id = hart_id;
+    uart->has_waiting_hart = true;
+
+    /* Yield until stdin has data available. The event loop will resume this
+     * hart when poll() detects POLLIN on the stdin fd.
+     */
+    coro_yield();
+
+    /* Resumed - clear waiting state */
+    uart->has_waiting_hart = false;
+    uart->waiting_hart_id = UINT32_MAX;
+}
+
 static uint8_t u8250_handle_in(u8250_state_t *uart)
 {
     uint8_t value = 0;
     u8250_check_ready(uart);
-    if (!uart->in_ready)
-        return value;
+
+    /* If no data is available, yield and wait for stdin to become readable */
+    if (!uart->in_ready) {
+        u8250_wait_for_input(uart);
+        /* After resume, re-check whether data is now available */
+        u8250_check_ready(uart);
+        if (!uart->in_ready)
+            return value; /* Spurious wakeup - still no data */
+    }
 
     if (read(uart->in_fd, &value, 1) < 0)
         fprintf(stderr, "failed to read UART input: %s\n", strerror(errno));
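u8250_handle_in() tolerates spurious wakeups by re-checking readiness once and returning 0 if stdin is still empty, which preserves the old non-blocking contract. A variant that loops until data arrives would look like the sketch below (hypothetical, not what the patch does; note the explicit coroutine-context guard, since u8250_wait_for_input() returns immediately outside a coroutine and a bare loop would spin in single-hart mode):

static uint8_t u8250_handle_in_blocking(u8250_state_t *uart)
{
    uint8_t value = 0;
    for (;;) {
        u8250_check_ready(uart);
        if (uart->in_ready)
            break;
        if (coro_current_hart_id() == UINT32_MAX)
            return value; /* not in a coroutine: keep the non-blocking path */
        u8250_wait_for_input(uart); /* yield until stdin is readable */
    }
    if (read(uart->in_fd, &value, 1) < 0)
        fprintf(stderr, "failed to read UART input: %s\n", strerror(errno));
    return value;
}

The patch's single re-check is the safer default for a device model: it never parks a hart on stdin longer than one scheduling round.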