diff --git a/SPIKE.hash b/SPIKE.hash
index ccafc0db..3137a068 100644
--- a/SPIKE.hash
+++ b/SPIKE.hash
@@ -1 +1 @@
-9b0082a416a4f1967fda434c7129953fad77b2af
+bc3222e351cdd645b6fd2605fd9611e3bc0d9cae
diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests
index d68fe69c..bd9dbe0b 160000
--- a/software/gemmini-rocc-tests
+++ b/software/gemmini-rocc-tests
@@ -1 +1 @@
-Subproject commit d68fe69ce930dd18bf62ad28ab3015ef5087177d
+Subproject commit bd9dbe0b0dcde33b5445711ed27c6840167c10bf
diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala
index 2b7c9a82..d9a0f93a 100644
--- a/src/main/scala/gemmini/Controller.scala
+++ b/src/main/scala/gemmini/Controller.scala
@@ -52,8 +52,6 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
   implicit val edge = outer.node.edges.out.head
   val tlb = Module(new FrontendTLB(2, tlb_size, dma_maxbytes))
   (tlb.io.clients zip outer.spad.module.io.tlb).foreach(t => t._1 <> t._2)
-  tlb.io.exp.flush_skip := false.B
-  tlb.io.exp.flush_retry := false.B
 
   io.ptw.head <> tlb.io.ptw
   /*io.ptw.head.req <> tlb.io.ptw.req
@@ -63,7 +61,7 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
   tlb.io.ptw.pmp := io.ptw.head.pmp
   tlb.io.ptw.customCSRs := io.ptw.head.customCSRs*/
 
-  spad.module.io.flush := tlb.io.exp.flush()
+  spad.module.io.flush := tlb.io.exp.flush
 
   /*
   //=========================================================================
@@ -107,7 +105,30 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
   // Incoming commands and ROB
   val rob = Module(new ROB(outer.config, new RoCCCommand))
 
-  val raw_cmd = Queue(io.cmd)
+  val raw_cmd_q = Module(new Queue(new RoCCCommand, 2))
+  val fence_stall = io.cmd.bits.inst.funct === FENCE_CMD && io.busy
+  raw_cmd_q.io.enq.valid := io.cmd.valid && io.resp.ready && !fence_stall
+  raw_cmd_q.io.enq.bits  := io.cmd.bits
+
+  io.resp.valid     := io.cmd.valid && raw_cmd_q.io.enq.ready && !fence_stall
+  io.resp.bits.rd   := io.cmd.bits.inst.rd
+  io.resp.bits.data := 0.U
+
+  io.cmd.ready := io.resp.ready && raw_cmd_q.io.enq.ready && !fence_stall
+
+  // While the TLB has a pending exception, don't enqueue new instructions; instead, accept each incoming command and pass the faulting vaddr back in its response data (written to RD)
+  when (tlb.io.exp.interrupt) {
+    io.cmd.ready := true.B
+    raw_cmd_q.io.enq.valid := false.B
+    io.resp.valid := io.cmd.valid 
+    io.resp.bits.data := tlb.io.exp.vaddr
+  }
+
+  tlb.io.exp.flush := io.cmd.fire() && io.cmd.bits.inst.funct === FLUSH_CMD
+
+
+
+  val raw_cmd = raw_cmd_q.io.deq
 
   // TODO replace 4,12,2 with parameters based on ROB size
   val (conv_cmd, loop_conv_unroller_busy) = LoopConv(raw_cmd, rob.io.ld_utilization, rob.io.st_utilization, rob.io.ex_utilization,
@@ -300,8 +321,8 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
   rob_completed_arb.io.out.ready := true.B
 
   // Wire up global RoCC signals
-  io.busy := raw_cmd.valid || loop_conv_unroller_busy || loop_matmul_unroller_busy || rob.io.busy || spad.module.io.busy || unrolled_cmd.valid || loop_cmd.valid || conv_cmd.valid
-  io.interrupt := tlb.io.exp.interrupt
+  io.busy := (raw_cmd.valid || loop_conv_unroller_busy || loop_matmul_unroller_busy || rob.io.busy || spad.module.io.busy || unrolled_cmd.valid || loop_cmd.valid || conv_cmd.valid) && !tlb.io.exp.interrupt
+  io.interrupt := false.B
 
   rob.io.solitary_preload := ex_controller.io.solitary_preload
 
@@ -354,32 +375,15 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
     val risc_funct = unrolled_cmd.bits.inst.funct
 
     val is_flush = risc_funct === FLUSH_CMD
+    val is_fence = risc_funct === FENCE_CMD
     /*
     val is_load = (funct === LOAD_CMD) || (funct === CONFIG_CMD && config_cmd_type === CONFIG_LOAD)
     val is_store = (funct === STORE_CMD) || (funct === CONFIG_CMD && config_cmd_type === CONFIG_STORE)
     val is_ex = (funct === COMPUTE_AND_FLIP_CMD || funct === COMPUTE_AND_STAY_CMD || funct === PRELOAD_CMD) ||
     (funct === CONFIG_CMD && config_cmd_type === CONFIG_EX)
     */
-
-    when (is_flush) {
-      // val skip = compressed_cmd.bits.rs1(0)
-      val skip = unrolled_cmd.bits.rs1(0)
-      tlb.io.exp.flush_skip := skip
-      tlb.io.exp.flush_retry := !skip
-
-      // compressed_cmd.ready := true.B // TODO should we wait for an acknowledgement from the TLB?
-      unrolled_cmd.ready := true.B // TODO should we wait for an acknowledgement from the TLB?
-    }
-
-    .otherwise {
-      rob.io.alloc.valid := true.B
-
-      when(rob.io.alloc.fire()) {
-        // compressed_cmd.ready := true.B
-        unrolled_cmd.ready := true.B
-      }
-    }
-
+    unrolled_cmd.ready := is_fence || is_flush || rob.io.alloc.ready
+    rob.io.alloc.valid := !is_flush && !is_fence
   }
 
   /*
diff --git a/src/main/scala/gemmini/FrontendTLB.scala b/src/main/scala/gemmini/FrontendTLB.scala
index 73416816..819d9f57 100644
--- a/src/main/scala/gemmini/FrontendTLB.scala
+++ b/src/main/scala/gemmini/FrontendTLB.scala
@@ -17,11 +17,12 @@ class DecoupledTLBReq(val lgMaxSize: Int)(implicit p: Parameters) extends CoreBu
 }
 
 class TLBExceptionIO extends Bundle {
+  // interrupt is asserted after a TLB fault; while it is high, loads and stores are stalled until a gemmini_flush command is received
   val interrupt = Output(Bool())
-  val flush_retry = Input(Bool())
-  val flush_skip = Input(Bool())
+  // vaddr of the faulting access, with its LSB replaced by a flag: 1 if the fault was on a store, 0 if on a load
+  val vaddr = Output(UInt(64.W))
 
-  def flush(dummy: Int = 0): Bool = flush_retry || flush_skip
+  val flush = Input(Bool())
 }
 
 // TODO can we make TLB hits only take one cycle?
@@ -30,7 +31,7 @@ class DecoupledTLB(entries: Int, maxSize: Int)(implicit edge: TLEdgeOut, p: Para
 
   val lgMaxSize = log2Ceil(maxSize)
   val io = new Bundle {
-    val req = Flipped(Valid(new DecoupledTLBReq(lgMaxSize)))
+    val req = Flipped(Decoupled(new DecoupledTLBReq(lgMaxSize)))
     val resp = new TLBResp
     val ptw = new TLBPTWIO
 
@@ -38,7 +39,11 @@ class DecoupledTLB(entries: Int, maxSize: Int)(implicit edge: TLEdgeOut, p: Para
   }
 
   val interrupt = RegInit(false.B)
+  val interrupt_vaddr = Reg(UInt(64.W))
   io.exp.interrupt := interrupt
+  io.exp.vaddr := interrupt_vaddr
+
+  io.req.ready := !interrupt
 
   val tlb = Module(new TLB(false, lgMaxSize, TLBConfig(nSets=1, nWays=entries)))
   tlb.io.req.valid := io.req.valid
@@ -46,7 +51,7 @@ class DecoupledTLB(entries: Int, maxSize: Int)(implicit edge: TLEdgeOut, p: Para
   io.resp := tlb.io.resp
   tlb.io.kill := false.B
 
-  tlb.io.sfence.valid := io.exp.flush()
+  tlb.io.sfence.valid := io.exp.flush
   tlb.io.sfence.bits.rs1 := false.B
   tlb.io.sfence.bits.rs2 := false.B
   tlb.io.sfence.bits.addr := DontCare
@@ -54,13 +59,16 @@ class DecoupledTLB(entries: Int, maxSize: Int)(implicit edge: TLEdgeOut, p: Para
 
   io.ptw <> tlb.io.ptw
   tlb.io.ptw.status := io.req.bits.status
-  val exception = io.req.valid && Mux(io.req.bits.tlb_req.cmd === M_XRD, tlb.io.resp.pf.ld || tlb.io.resp.ae.ld, tlb.io.resp.pf.st || tlb.io.resp.ae.st)
-  when (exception) { interrupt := true.B }
+  val xcpt_ld = io.req.valid && (io.req.bits.tlb_req.cmd === M_XRD) && (tlb.io.resp.pf.ld || tlb.io.resp.ae.ld)
+  val xcpt_st = io.req.valid && (io.req.bits.tlb_req.cmd === M_XWR) && (tlb.io.resp.pf.st || tlb.io.resp.ae.st)
+  when (!interrupt && (xcpt_ld || xcpt_st)) {
+    interrupt := true.B
+    interrupt_vaddr := Cat(tlb.io.req.bits.vaddr >> 1, xcpt_st)
+
+  }
   when (interrupt && tlb.io.sfence.fire()) {
     interrupt := false.B
   }
-
-  assert(!io.exp.flush_retry || !io.exp.flush_skip, "TLB: flushing with both retry and skip at same time")
 }
 
 class FrontendTLBIO(implicit p: Parameters) extends CoreBundle {
@@ -81,9 +89,7 @@ class FrontendTLB(nClients: Int, entries: Int, maxSize: Int)
   val lgMaxSize = log2Ceil(coreDataBytes)
   val tlbArb = Module(new RRArbiter(new DecoupledTLBReq(lgMaxSize), nClients))
   val tlb = Module(new DecoupledTLB(entries, maxSize))
-  tlb.io.req.valid := tlbArb.io.out.valid
-  tlb.io.req.bits := tlbArb.io.out.bits
-  tlbArb.io.out.ready := true.B
+  tlb.io.req <> tlbArb.io.out
 
   io.ptw <> tlb.io.ptw
   io.exp <> tlb.io.exp
@@ -101,7 +107,7 @@ class FrontendTLB(nClients: Int, entries: Int, maxSize: Int)
       last_translated_vpn := req.bits.tlb_req.vaddr
       last_translated_ppn := tlb.io.resp.paddr
     }
-    when (io.exp.flush()) {
+    when (io.exp.flush) {
       last_translated_valid := false.B
     }
 
diff --git a/src/main/scala/gemmini/GemminiISA.scala b/src/main/scala/gemmini/GemminiISA.scala
index b49087c7..2bf48dad 100644
--- a/src/main/scala/gemmini/GemminiISA.scala
+++ b/src/main/scala/gemmini/GemminiISA.scala
@@ -31,6 +31,8 @@ object GemminiISA {
   val LOOP_CONV_WS_CONFIG_5 = 20.U // *weights | *output
   val LOOP_CONV_WS_CONFIG_6 = 21.U // *bias, *input
 
+  val FENCE_CMD = 127.U
+
   // rs1[2:0] values
   val CONFIG_EX = 0.U
   val CONFIG_LOAD = 1.U