Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/dev' into prefetch
Browse files Browse the repository at this point in the history
  • Loading branch information
jerryz123 committed Mar 11, 2021
2 parents 55c28b0 + a7b0d8c commit ea466f2
Show file tree
Hide file tree
Showing 17 changed files with 417 additions and 237 deletions.
2 changes: 1 addition & 1 deletion SPIKE.hash
Original file line number Diff line number Diff line change
@@ -1 +1 @@
86265d02e8abea3b367114393d6b0661fd35b156
9b0082a416a4f1967fda434c7129953fad77b2af
9 changes: 8 additions & 1 deletion src/main/scala/gemmini/Configs.scala
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ object GemminiConfigs {
dataflow = Dataflow.BOTH,
acc_capacity = CapacityInKilobytes(64),
mem_pipeline = 4,
hasIm2col = true, //declare im2col block
hasIm2col = false, //declare im2col block
dma_maxbytes = 64, // TODO get this from cacheblockbytes
dma_buswidth = 128, // TODO get this from SystemBusKey
aligned_to = 1,
Expand Down Expand Up @@ -177,6 +177,11 @@ object GemminiConfigs {
use_dedicated_tl_port = false,
pe_latency = 0,

ex_read_from_spad = true,
ex_read_from_acc = true,
ex_write_to_spad = true,
ex_write_to_acc = true,

tlb_size = 4,
use_tlb_register_filter = true,
max_in_flight_reqs = 16,
Expand All @@ -186,6 +191,8 @@ object GemminiConfigs {
val largeChipConfig = defaultConfig.copy(sp_capacity=CapacityInKilobytes(128), acc_capacity=CapacityInKilobytes(64), dataflow=Dataflow.WS,
meshRows=32, meshColumns=32
)

val highPerfConfig = defaultConfig.copy(dataflow=Dataflow.WS, acc_read_full_width = false, ex_read_from_acc = false, ex_write_to_spad = false, max_in_flight_reqs = 64)
}

/**
Expand Down
20 changes: 20 additions & 0 deletions src/main/scala/gemmini/ConfigsFP.scala
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@ object GemminiFPConfigs {
acc_read_small_width = true,

pe_latency = 1,

ex_read_from_spad = true,
ex_read_from_acc = true,
ex_write_to_spad = true,
ex_write_to_acc = true,
)

//FP32 Single Precision Configuration
Expand Down Expand Up @@ -130,6 +135,21 @@ class GemminiBF16DefaultConfig extends Config((site, here, up) => {
case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
})

/**
 * Config fragment that instantiates a Gemmini accelerator derived from
 * `GemminiFPConfigs.BF16DefaultConfig`, with `ex_read_from_acc` and
 * `ex_write_to_spad` both overridden to `false`.
 *
 * It overrides two keys:
 *  - `BuildRoCC`: a single-element `Seq` whose builder function creates the Gemmini `LazyModule`.
 *  - `SystemBusKey`: the upstream value copied with `beatBytes = 16`.
 */
class GemminiBF16DefaultHighPerfConfig extends Config((site, here, up) => {
case BuildRoCC => Seq(
(p: Parameters) => {
// Re-expose the builder's Parameters as an implicit so the constructors below can pick it up.
implicit val q = p
// Implicit ValName made available to LazyModule(...) below.
// NOTE(review): the resulting name presumably derives from this val's identifier - confirm before renaming.
implicit val v = implicitly[ValName]
// Same as the BF16 default configuration except for the two ex_* flags overridden here.
val gemmini = LazyModule(new Gemmini(GemminiFPConfigs.BF16DefaultConfig.copy(
ex_read_from_acc = false,
ex_write_to_spad = false,
)))
gemmini
}
)
case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
})

//===========BFLOAT16 Default Config 8x8=========
class GemminiBF16Default8Config extends Config((site, here, up) => {
case BuildRoCC => Seq(
Expand Down
98 changes: 9 additions & 89 deletions src/main/scala/gemmini/Controller.scala
Original file line number Diff line number Diff line change
Expand Up @@ -20,80 +20,6 @@ class GemminiCmd(rob_entries: Int)(implicit p: Parameters) extends Bundle {
override def cloneType: this.type = new GemminiCmd(rob_entries).asInstanceOf[this.type]
}


/**
 * Bundle describing an address into Gemmini's local memories: either the scratchpad
 * or the accumulator, selected by `is_acc_addr`.
 *
 * The fields are declared in the order `is_acc_addr`, `accumulate`, `read_full_acc_row`,
 * `garbage`, `garbage_bit`, `data`. When `localAddrBits - maxAddrBits >= 4` their widths
 * sum to `localAddrBits` (3 flag bits + padding + 1 garbage bit + `maxAddrBits`).
 * Otherwise both padding fields are zero-width.
 *
 * @param sp_banks         number of scratchpad banks
 * @param sp_bank_entries  number of rows per scratchpad bank
 * @param acc_banks        number of accumulator banks
 * @param acc_bank_entries number of rows per accumulator bank
 */
class LocalAddr(sp_banks: Int, sp_bank_entries: Int, acc_banks: Int, acc_bank_entries: Int) extends Bundle {
// Target total width of the bundle, used to size the padding fields below.
private val localAddrBits = 32 // TODO magic number

// Bits needed to index every row of the scratchpad / accumulator across all banks.
private val spAddrBits = log2Ceil(sp_banks * sp_bank_entries)
private val accAddrBits = log2Ceil(acc_banks * acc_bank_entries)
// `data` is wide enough to hold whichever of the two address kinds is larger.
private val maxAddrBits = spAddrBits max accAddrBits

private val spBankBits = log2Up(sp_banks)
private val spBankRowBits = log2Up(sp_bank_entries)

private val accBankBits = log2Up(acc_banks)
private val accBankRowBits = log2Up(acc_bank_entries)

// Hardware fields. Their declaration order determines the packed layout seen by asTypeOf/asUInt,
// so do not reorder them.
val is_acc_addr = Bool()
val accumulate = Bool()
val read_full_acc_row = Bool()
// Padding that brings the bundle up to localAddrBits; zero-width when there is no room left.
val garbage = UInt(((localAddrBits - maxAddrBits - 4) max 0).W)
// Extra bit checked by is_garbage(); only present when at least 4 bits remain beyond `data`.
val garbage_bit = if (localAddrBits - maxAddrBits >= 4) UInt(1.W) else UInt(0.W)
val data = UInt(maxAddrBits.W)

// Bank/row accessors. The `dummy` parameter is not used by any of the bodies.
// The bank index is the part of the address above the per-bank row bits; it is a constant 0
// when no address bits remain above the row bits.
def sp_bank(dummy: Int = 0) = if (spAddrBits == spBankRowBits) 0.U else data(spAddrBits - 1, spBankRowBits)
def sp_row(dummy: Int = 0) = data(spBankRowBits - 1, 0)
def acc_bank(dummy: Int = 0) = if (accAddrBits == accBankRowBits) 0.U else data(accAddrBits - 1, accBankRowBits)
def acc_row(dummy: Int = 0) = data(accBankRowBits - 1, 0)

// The low bits of `data` that form a complete scratchpad / accumulator address.
def full_sp_addr(dummy: Int = 0) = data(spAddrBits - 1, 0)
def full_acc_addr(dummy: Int = 0) = data(accAddrBits - 1, 0)

// Equality compares only the memory selector and `data`; accumulate, read_full_acc_row
// and the padding fields are ignored.
def is_same_address(other: LocalAddr): Bool = is_acc_addr === other.is_acc_addr && data === other.data
// Overload for a raw UInt, which is reinterpreted as a LocalAddr of this shape first.
def is_same_address(other: UInt): Bool = is_same_address(other.asTypeOf(this))
// True for the sentinel pattern written by make_this_garbage(): all three flags set, `data`
// all ones, and garbage_bit set whenever that field has non-zero width.
def is_garbage(dummy: Int = 0) = is_acc_addr && accumulate && read_full_acc_row && data.andR() &&
(if (garbage_bit.getWidth > 0) garbage_bit.asBool() else true.B)

// Returns a copy of this address with `data` advanced by `other`; every other field is
// carried over unchanged. No carry-out is reported (see add_with_overflow for that).
def +(other: UInt) = {
require(isPow2(sp_bank_entries)) // TODO remove this requirement
require(isPow2(acc_bank_entries)) // TODO remove this requirement

val result = WireInit(this)
result.data := data + other
result
}

// Ordering comparisons. Both evaluate to false when the two addresses point at different
// memories (is_acc_addr differs); otherwise they compare the full address of the selected memory.
def <=(other: LocalAddr) =
is_acc_addr === other.is_acc_addr &&
Mux(is_acc_addr, full_acc_addr() <= other.full_acc_addr(), full_sp_addr() <= other.full_sp_addr())

def >(other: LocalAddr) =
is_acc_addr === other.is_acc_addr &&
Mux(is_acc_addr, full_acc_addr() > other.full_acc_addr(), full_sp_addr() > other.full_sp_addr())

// Like `+`, but also returns bit `data.getWidth` of the width-expanding sum as a second element.
// NOTE(review): that bit is the carry-out only if `other` is no wider than `data` - confirm at call sites.
def add_with_overflow(other: UInt): Tuple2[LocalAddr, Bool] = {
require(isPow2(sp_bank_entries)) // TODO remove this requirement
require(isPow2(acc_bank_entries)) // TODO remove this requirement

// +& keeps the extra result bit instead of truncating it.
val sum = data +& other

val result = WireInit(this)
result.data := sum(data.getWidth-1, 0)

(result, sum(data.getWidth))
}

// Drives this bundle to the sentinel pattern recognised by is_garbage().
// The `garbage` padding field is not driven here.
def make_this_garbage(dummy: Int = 0): Unit = {
is_acc_addr := true.B
accumulate := true.B
read_full_acc_row := true.B
garbage_bit := 1.U
data := ~(0.U(maxAddrBits.W))
}

// Builds a fresh LocalAddr with the same constructor parameters.
override def cloneType: LocalAddr.this.type = new LocalAddr(sp_banks, sp_bank_entries, acc_banks, acc_bank_entries).asInstanceOf[this.type]
}

class Gemmini[T <: Data : Arithmetic, U <: Data, V <: Data](val config: GemminiArrayConfig[T, U, V])
(implicit p: Parameters)
extends LazyRoCC (
Expand Down Expand Up @@ -182,25 +108,19 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
val rob = Module(new ROB(outer.config, new RoCCCommand))

val raw_cmd = Queue(io.cmd)
val max_lds = rob_entries * 1 / 4
val max_exs = rob_entries * 3 / 4
val max_sts = rob_entries * 1 / 8

// TODO replace 4,12,2 with parameters based on ROB size
val loop_conv_unroller_busy = false.B
/*val (unrolled_cmd_after_conv, loop_conv_unroller_busy) = LoopConv(raw_cmd, rob.io.ld_utilization, rob.io.st_utilization, rob.io.ex_utilization,
meshRows*tileRows, coreMaxAddrBits, rob_entries, 4, 12, 2, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries,
val (conv_cmd, loop_conv_unroller_busy) = LoopConv(raw_cmd, rob.io.ld_utilization, rob.io.st_utilization, rob.io.ex_utilization,
meshRows*tileRows, coreMaxAddrBits, rob_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries,
inputType.getWidth, accType.getWidth, dma_maxbytes)
unrolled_cmd_after_conv.ready := false.B*/

// val (compressed_cmd, compressor_busy) = InstCompressor(unrolled_cmd)
// compressed_cmd.ready := false.B

// val (unrolled_cmd, loop_matmul_unroller_busy) = LoopMatmul(unrolled_cmd_after_conv, rob.io.ld_utilization, rob.io.st_utilization, rob.io.ex_utilization,

val max_lds = rob_entries * 1 / 4
val max_exs = rob_entries * 3 / 4
val max_sts = rob_entries * 1 / 8
val (loop_cmd, loop_matmul_unroller_busy, prefetch) = LoopMatmul(raw_cmd, rob.io.ld_utilization, rob.io.st_utilization, rob.io.ex_utilization,
val (loop_cmd, loop_matmul_unroller_busy, prefetch) = LoopMatmul(conv_cmd, rob.io.ld_utilization, rob.io.st_utilization, rob.io.ex_utilization,
meshRows*tileRows, coreMaxAddrBits, rob_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries,
inputType.getWidth, accType.getWidth, dma_maxbytes)

val unrolled_cmd = Queue(loop_cmd)
unrolled_cmd.ready := false.B

Expand Down Expand Up @@ -378,12 +298,12 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
rob_completed_arb.io.out.ready := true.B

// Wire up global RoCC signals
io.busy := raw_cmd.valid || loop_conv_unroller_busy || loop_matmul_unroller_busy || rob.io.busy || spad.module.io.busy || unrolled_cmd.valid || loop_cmd.valid
io.busy := raw_cmd.valid || loop_conv_unroller_busy || loop_matmul_unroller_busy || rob.io.busy || spad.module.io.busy || unrolled_cmd.valid || loop_cmd.valid || conv_cmd.valid
io.interrupt := tlb.io.exp.interrupt

rob.io.solitary_preload := ex_controller.io.solitary_preload

assert(!io.interrupt, "Interrupt handlers have not been written yet")
// assert(!io.interrupt, "Interrupt handlers have not been written yet")

// Cycle counters
val ld_cycles = RegInit(0.U(34.W))
Expand Down
42 changes: 28 additions & 14 deletions src/main/scala/gemmini/DMA.scala
Original file line number Diff line number Diff line change
Expand Up @@ -313,10 +313,11 @@ class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConf
}
}

class StreamWriteRequest(val dataWidth: Int)(implicit p: Parameters) extends CoreBundle {
class StreamWriteRequest(val dataWidth: Int, val maxBytes: Int)(implicit p: Parameters) extends CoreBundle {
val vaddr = UInt(coreMaxAddrBits.W)
val data = UInt(dataWidth.W)
val len = UInt(log2Up(dataWidth/8+1).W) // The number of bytes to write
val len = UInt(log2Up((dataWidth/8 max maxBytes)+1).W) // The number of bytes to write
val block = UInt(8.W) // TODO magic number
val status = new MStatus

// Pooling variables
Expand All @@ -338,11 +339,13 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
val beatBytes = beatBits / 8
val lgBeatBytes = log2Ceil(beatBytes)
val maxBeatsPerReq = maxBytes / beatBytes
val inputTypeRowBytes = block_cols * inputType.getWidth / 8
val maxBlocks = maxBytes / inputTypeRowBytes

require(beatBytes > 0)

val io = IO(new Bundle {
val req = Flipped(Decoupled(new StreamWriteRequest(dataWidth)))
val req = Flipped(Decoupled(new StreamWriteRequest(dataWidth, maxBytes)))
val tlb = new FrontendTLBIO
val busy = Output(Bool())
val flush = Input(Bool())
Expand All @@ -351,9 +354,14 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
val (s_idle :: s_writing_new_block :: s_writing_beats :: Nil) = Enum(3)
val state = RegInit(s_idle)

val req = Reg(new StreamWriteRequest(dataWidth))
val req = Reg(new StreamWriteRequest(dataWidth, maxBytes))

val bytesSent = Reg(UInt(log2Ceil(dataBytes+1).W)) // TODO this only needs to count up to (dataBytes/aligned_to), right?
// TODO use the same register to hold data_blocks and data_single_block, so that this Mux here is not necessary
val data_blocks = Reg(Vec(maxBlocks, UInt((inputTypeRowBytes * 8).W)))
val data_single_block = Reg(UInt(dataWidth.W)) // For data that's just one-block-wide
val data = Mux(req.block === 0.U, data_single_block, data_blocks.asUInt())

val bytesSent = Reg(UInt(log2Ceil((dataBytes max maxBytes)+1).W)) // TODO this only needs to count up to (dataBytes/aligned_to), right?
val bytesLeft = req.len - bytesSent

val xactBusy = RegInit(0.U(nXacts.W))
Expand Down Expand Up @@ -453,14 +461,14 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
fromSource = RegEnableThru(xactId, state === s_writing_new_block),
toAddress = 0.U,
lgSize = lg_write_size,
data = (req.data >> (bytesSent * 8.U)).asUInt()
data = (data >> (bytesSent * 8.U)).asUInt()
)._2

val putPartial = edge.Put(
fromSource = RegEnableThru(xactId, state === s_writing_new_block),
toAddress = 0.U,
lgSize = lg_write_size,
data = ((req.data >> (bytesSent * 8.U)) << (write_shift * 8.U)).asUInt(),
data = ((data >> (bytesSent * 8.U)) << (write_shift * 8.U)).asUInt(),
mask = write_mask.asUInt()
)._2

Expand Down Expand Up @@ -490,7 +498,7 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
val tlb_q = Module(new Queue(new TLBundleAWithInfo, 1, pipe=true))
tlb_q.io.enq <> tlb_arb.io.out

io.tlb.req.valid := tlb_q.io.deq.valid
io.tlb.req.valid := tlb_q.io.deq.fire()
io.tlb.req.bits.tlb_req.vaddr := tlb_q.io.deq.bits.vaddr
io.tlb.req.bits.tlb_req.passthrough := false.B
io.tlb.req.bits.tlb_req.size := 0.U // send_size
Expand All @@ -504,15 +512,15 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
shadow_retry_a.io.enq.valid := tlb_q.io.deq.valid
shadow_retry_a.io.enq.bits := tlb_q.io.deq.bits
}
translate_q.io.deq.ready := true.B
translate_q.io.deq.ready := tl.a.ready || io.tlb.resp.miss

retry_a.valid := translate_q.io.deq.valid && (io.tlb.resp.miss || !tl.a.ready)
retry_a.valid := translate_q.io.deq.valid && io.tlb.resp.miss
retry_a.bits := translate_q.io.deq.bits
assert(retry_a.ready)
assert(!(retry_a.valid && !retry_a.ready))

tl.a.valid := translate_q.io.deq.valid && !io.tlb.resp.miss
tl.a.bits := translate_q.io.deq.bits.tl_a
tl.a.bits.address := io.tlb.resp.paddr
tl.a.bits.address := RegEnableThru(io.tlb.resp.paddr, RegNext(io.tlb.req.fire()))

tl.d.ready := xactBusy.orR()

Expand Down Expand Up @@ -557,17 +565,23 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
val pooled = {
val cols = dataWidth / inputType.getWidth
val v1 = io.req.bits.data.asTypeOf(Vec(cols, inputType))
val v2 = req.data.asTypeOf(Vec(cols, inputType))
val v2 = data_single_block.asTypeOf(Vec(cols, inputType))
val m = v1.zip(v2)
VecInit(m.zipWithIndex.map{case ((x, y), i) => if (i < block_cols) maxOf(x, y) else y}).asUInt()
}

req := io.req.bits
req.data := Mux(io.req.bits.pool_en, pooled, io.req.bits.data)
req.len := io.req.bits.block * inputTypeRowBytes.U + io.req.bits.len

data_single_block := Mux(io.req.bits.pool_en, pooled, io.req.bits.data)
data_blocks(io.req.bits.block) := io.req.bits.data

bytesSent := 0.U

state := Mux(io.req.bits.store_en, s_writing_new_block, s_idle)

assert(io.req.bits.len <= (block_cols * inputType.getWidth / 8).U || io.req.bits.block === 0.U, "DMA can't write multiple blocks to main memory when writing full accumulator output")
assert(!io.req.bits.pool_en || io.req.bits.block === 0.U, "Can't pool with block-mvout")
}
}
}
5 changes: 5 additions & 0 deletions src/main/scala/gemmini/DSEConfigs.scala
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ object DSEBaseConfig {
use_dedicated_tl_port = false,
pe_latency = 0,

ex_read_from_spad = true,
ex_read_from_acc = true,
ex_write_to_spad = true,
ex_write_to_acc = true,

tlb_size = 4,
use_tlb_register_filter = true,
max_in_flight_reqs = 16,
Expand Down
Loading

0 comments on commit ea466f2

Please sign in to comment.