// cpuv2.v
/* robin, a SoC design for the IceBreaker board.
*
* cpuv2.v : a risc cpu
*
* Copyright 2019,2020 Michel Anders
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// COUNTER gates the free-running `counter` register inside cpuv2 and the
// MARK instruction (opcode 11) that copies it into a register.
// Remove this define to leave both out of the design.
`define COUNTER
// cpuv2: a small multi-cycle RISC cpu with sixteen 32 bit registers, 16 bit
// instructions and a byte wide memory interface (longs are big endian).
//
// Ports
//   clk            clock, everything is updated on the rising edge
//   mem_data_out   byte read from memory (memory -> cpu)
//   mem_data_in    byte to write to memory (cpu -> memory)
//   mem_raddr      read address
//   mem_waddr      write address
//   mem_write      high when mem_data_in should be written to mem_waddr
//   mem_ready      not referenced anywhere inside this module
//   start_address  value loaded into the program counter on reset
//   reset          synchronous reset, restarts at FETCH1
//   halt           request to enter the HALT state
//   halted         set once the HALT state has been executed, cleared by reset
//
// Register conventions visible in this module
//   r[0]   forced to 0 during every FETCH2
//   r[1]   forced to 1 during every FETCH2
//   r[13]  flags and alu control: [31] always-on bit, [30] negative, [29] zero,
//          [4] selects the multicycle divider, [3:0] alu operation
//   r[14]  stack pointer (used by PUSH and POP)
//   r[15]  program counter
//
// NOTE(review): while `halt` is held high the state case statement is bypassed,
// so `halted` is only raised in a cycle where neither `halt` nor a halt
// instruction is pending. Confirm this is what the surrounding SoC expects.
module cpuv2(clk, mem_data_out, mem_data_in, mem_raddr, mem_waddr, mem_write, mem_ready, start_address, reset, halt, halted);
    parameter addr_width = 9;

    input                       clk;
    input      [7:0]            mem_data_out;   // from memory to cpu
    output reg [7:0]            mem_data_in;    // from cpu to memory
    output reg [addr_width-1:0] mem_raddr;
    output reg [addr_width-1:0] mem_waddr;
    output reg                  mem_write;
    input                       mem_ready;
    input      [addr_width-1:0] start_address;
    input                       reset;
    input                       halt;
    output reg                  halted;

    // general registers
    reg [31:0] r[0:15];
    reg [31:0] temp;            // assembles multi byte reads, hi byte first

    // special registers
    reg [15:0] instruction;
`ifdef COUNTER
    // NOTE(review): counter has no reset or initial value in this module
    reg [31:0] counter;
`endif

    // instruction fields
    // (declared ahead of the alu/divider wiring below, which refers to R1 and R0)
    wire [3:0] cmd       = instruction[15:12];  // main opcode
    wire [3:0] R2        = instruction[11: 8];  // destination register
    wire [3:0] R1        = instruction[ 7: 4];  // source register 1
    wire [3:0] R0        = instruction[ 3: 0];  // source register 0
    wire [7:0] immediate = instruction[7:0];
    wire haltinstruction = &instruction;        // all ones
    wire [31:0] r1_offset = r[R1] + {{26{R0[3]}},R0,2'b00}; // sign extended offset * 4

    // decoded signals
    // (declared ahead of the divider instance, which is driven by div_go)
    reg pop;
    reg div, div_go;
    reg loadb3, loadb2, loadb1, loadb0;
    reg branch;
    reg storb3, storb2, storb1, storb0;
    reg loadli;

    // alu (combinatorial)
    wire [31:0] alu_a = r[R1];
    wire [31:0] alu_b = r[R0];
    wire [3:0]  alu_op = r[13][3:0];
    wire        multicycle = r[13][4];
    wire [31:0] alu_c;
    wire        alu_is_zero;
    wire        alu_is_negative;

    alu alu0(
        .a(alu_a),
        .b(alu_b),
        .op(alu_op),
        .c(alu_c),
        .is_zero(alu_is_zero),
        .is_negative(alu_is_negative)
    );

    // divider (multicycle)
    wire [31:0] div_a = r[R1];
    wire [31:0] div_b = r[R0];
    wire [31:0] div_c;
    wire        div_divs = alu_op[0];
    wire        remainder = alu_op[1];
    wire        div_is_zero;
    wire        div_is_negative;
    wire        div_is_available;

    divider div0(
        .clk(clk),
        .reset(reset),
        .a(div_a),
        .b(div_b),
        .go(div_go),
        .divs(div_divs),
        .remainder(remainder),
        .c(div_c),
        .is_zero(div_is_zero),
        .is_negative(div_is_negative),
        .available(div_is_available)
    );

    // state machine
    reg [3:0] state;
    localparam FETCH1 = 0;
    localparam FETCH2 = 1;
    localparam FETCH3 = 2;
    localparam FETCH4 = 3;
    localparam DECODE = 4;
    localparam EXEC1  = 5;
    localparam EXEC2  = 6;
    localparam EXEC3  = 7;
    localparam EXEC4  = 8;
    localparam EXEC5  = 9;
    localparam HALT   = 15;

    // instruction pointer
    wire [addr_width-1:0] ip = r[15][addr_width-1:0]; // the addressable bits of the program counter
    wire [31:0] ip1 = {{(32-addr_width){1'b0}},ip+1}; // incremented program counter, just for the addressable bits
    wire [31:0] ip2 = {{(32-addr_width){1'b0}},ip+2}; // pc + 2, just for the addressable bits

    // branch logic
    // The 16 bit branch offset sits in the upper half of temp and is sign
    // extended to 32 bits. The concatenation has to be a full 32 bits wide:
    // a 31 bit one would be zero extended on assignment, which leaves bit 31
    // clear for negative offsets.
    wire [31:0] relative = {{16{temp[31]}},temp[31:16]};
    wire [31:0] branchtarget = r[15] + relative;
    // instruction[10:8] selects flag bits from r[13][31:29], instruction[11] is the value they must have
    wire takebranch = ((r[13][31:29] & instruction[10:8]) == ({3{instruction[11]}} & instruction[10:8]));

    // addressing
    wire [31:0] sumr1r0 = r[R1] + r[R0];
    wire [addr_width-1:0] sumr1r0_addr = sumr1r0[addr_width-1:0];

    // opcodes
    localparam CMD_MOVE   =  0; // r2 <- r1 + r0
    localparam CMD_POP    =  1; // r2 <- [r14] ; r14 += 4
    localparam CMD_ALU    =  2; // r2 <- r1 alu_op r0
    localparam CMD_MOVER  =  3; // r2 <- r1 + n*4
    localparam CMD_LOADB  =  4; // r2 <- [r1 + r0] (byte)
    localparam CMD_SETBRA =  5; // r1 <- cond == true ; pc += cond == true ? offset : 0
    localparam CMD_LOADL  =  6; // r2 <- [r1 + r0] (long)
    localparam CMD_LOADIL =  7; // r2 <- [pc] ; pc += 4 (long)
    localparam CMD_STORB  =  8; // r2 -> [r1 + r0] (byte)
    localparam CMD_PUSH   =  9; // r14 -= 4 ; r2 -> [r14]
    localparam CMD_STORL  = 10; // r2 -> [r1 + r0] (long)
`ifdef COUNTER
    localparam CMD_MARK   = 11; // r2 <- counter
`endif
    localparam CMD_LOADI  = 12; // r2 <- byte
    localparam CMD_JUMP   = 14; // r2 <- pc + 2 ; pc <- r1 + r0
    localparam CMD_HALT   = 15; // halt

`ifdef COUNTER
    // free running cycle counter, read by the MARK instruction
    always @(posedge clk) begin
        counter <= counter + 1;
    end
`endif

    // Main state machine. Within one state later non-blocking assignments
    // override earlier ones, so the order of the statements below matters.
    always @(posedge clk) begin
        mem_write <= 0; // default: no write unless a state asks for one below
        if(reset) begin
            r[15] <= start_address;
            halted <= 0;
            state <= FETCH1;
            instruction <= 0;
        end else
        if(halt | haltinstruction) begin
            state <= HALT;
            instruction <= 0; // this will clear haltinstruction
        end else
        case(state)
            FETCH1 : begin
                    mem_raddr <= ip; // address of instruction hi byte
                    state <= FETCH2;
                end
            FETCH2 : begin
                    r[0] <= 0; // make sure fixed registers stay fixed
                    r[1] <= 1;
                    r[13][31] <= 1; // force the always on bit
                    state <= FETCH3; // there need to be two clock cycles between loading the mem_raddr and reading mem_data_out
                    r[15] <= ip1; // but then we can update and read every new clock cycle
                    mem_raddr <= ip1; // address of instruction lo byte
                end
            FETCH3 : begin // for efficiency reasons part of the decoding starts here already
                    pop <= 0;
                    instruction[15:8] <= mem_data_out; // instruction hi byte
                    state <= FETCH4;
                    if(mem_data_out[7:4] == CMD_POP) begin // to keep the read pipeline filled we either
                        mem_raddr <= r[14]; // start reading from whatever the stack pointer is
                        pop <= 1; // then this will point to b3 (hi byte, big endian)
                    end else begin // or we keep reading past the instruction
                        mem_raddr <= mem_raddr + 1; // in case this will turn out to be a long immediate
                    end // then this will point to b3 (hi byte, big endian)
                end
            FETCH4 : begin
                    instruction[7:0] <= mem_data_out; // instruction lo byte
                    mem_raddr <= mem_raddr + 1; // this will point to b2
                    r[15] <= ip1; // in the decode state ip will point to next instruction
                    div_go <= 0; // make sure the divider is reset
                    state <= DECODE;
                    div <= 0; // all decoding signals reset (except for pop)
                    loadb3 <= 0;
                    loadb2 <= 0;
                    loadb1 <= 0;
                    loadb0 <= 0;
                    branch <= 0;
                    storb3 <= 0;
                    storb2 <= 0;
                    storb1 <= 0;
                    storb0 <= 0;
                    loadli <= 0;
                end
            DECODE : begin
                    state <= EXEC1; // default next state, single cycle instructions override it
                    mem_raddr <= mem_raddr + 1; // this will point to b1
                    case(cmd)
                        CMD_MOVE: begin
                                state <= FETCH1;
                                r[R2] <= sumr1r0;
                                if( ~ &R2 ) begin // if R2 is not the PC we can take a 1 cycle shortcut
                                    mem_raddr <= ip;
                                    state <= FETCH2;
                                end
                            end
                        CMD_ALU: begin
                                if(multicycle) begin
                                    div_go <= 1; // start the divider module if we have a divider operation
                                    div <= 1;
                                end else begin
                                    r[R2] <= alu_c;
                                    r[13][29] <= alu_is_zero;
                                    r[13][30] <= alu_is_negative;
                                    state <= FETCH1;
                                    if( ~ &R2 ) begin // if R2 is not the PC we can take a 1 cycle shortcut
                                        mem_raddr <= ip;
                                        state <= FETCH2;
                                    end
                                end
                            end
                        CMD_POP: begin // empty instruction, pop was already set during fetch
                                pop <= 1;
                            end
                        CMD_MOVER: begin
                                r[R2] <= r1_offset;
                                state <= FETCH1;
                                if( ~ &R2 ) begin // if R2 is not the PC we can take a 1 cycle shortcut
                                    mem_raddr <= ip;
                                    state <= FETCH2;
                                end
                            end
                        CMD_LOADB: begin
                                loadb3 <= 1;
                                mem_raddr <= sumr1r0_addr;
                            end
                        CMD_LOADL: begin
                                loadb3 <= 1;
                                loadb2 <= 1;
                                loadb1 <= 1;
                                loadb0 <= 1;
                                mem_raddr <= sumr1r0_addr;
                            end
                        CMD_LOADIL: begin
                                loadli <= 1;
                                r[15] <= ip2; // we increment the pc in two steps to save on LUTs needed for adder
                            end
                        CMD_STORB: begin
                                storb3 <= 1;
                                mem_waddr <= sumr1r0_addr;
                                mem_data_in <= r[R2][7:0];
                                mem_write <= 1;
                            end
                        CMD_PUSH: begin
                                r[14] <= r[14] - 4;
                                mem_waddr <= r[14] - 4; // write starts at the new (decremented) stack pointer
                                mem_data_in <= r[R2][31:24]; // hi byte first (big endian)
                                mem_write <= 1;
                                storb3 <= 1;
                                storb2 <= 1;
                                storb1 <= 1;
                                storb0 <= 1;
                            end
                        CMD_STORL: begin
                                storb3 <= 1;
                                storb2 <= 1;
                                storb1 <= 1;
                                storb0 <= 1;
                                mem_waddr <= sumr1r0_addr;
                                mem_data_in <= r[R2][31:24]; // hi byte first (big endian)
                                mem_write <= 1;
                            end
                        CMD_LOADI: begin
                                r[R2][7:0] <= immediate; // only the low byte of the destination is written
                                state <= FETCH1;
                                if( ~ &R2 ) begin // if R2 is not the PC we can take a 1 cycle shortcut
                                    mem_raddr <= ip;
                                    state <= FETCH2;
                                end
                            end
                        CMD_SETBRA: begin
                                branch <= 1;
                                r[15] <= ip2; // skip the two offset bytes that follow the instruction
                                r[R1] <= takebranch; // R1 because R2 decodes the condition
                                if( ~takebranch ) state <= FETCH1;
                            end
                        CMD_JUMP: begin
                                r[R2] <= r[15]; // link: r[15] already points to the next instruction here
                                r[15] <= sumr1r0;
                                state <= FETCH1;
                            end
`ifdef COUNTER
                        CMD_MARK: begin
                                r[R2] <= counter;
                            end
`endif
                        default: state <= FETCH1;
                    endcase
                    temp[31:24] <= mem_data_out; // read b3 for POP or LOADLI
                end
            EXEC1 : begin
                    temp[23:16] <= mem_data_out; // read b2 for POP or LOADLI
                    mem_raddr <= mem_raddr + 1; // this will point to b0 or b2
                    div_go <= 0; // flag down the divider module again so that it is not reset forever
                    state <= EXEC2;
                    if (div) begin // a divider operation (multiple cycles)
                        if(div_is_available) begin // keep cycling until done
                            r[R2] <= div_c;
                            r[13][29] <= div_is_zero;
                            r[13][30] <= div_is_negative;
                            state <= FETCH1;
                        end else begin
                            state <= EXEC1;
                        end
                    end
                    if(storb3 & ~storb2) begin // single byte store: keep write asserted one more cycle, then done
                        mem_write <= 1;
                        state <= FETCH1;
                    end
                    if(storb2) begin
                        mem_waddr <= mem_waddr + 1;
                        mem_data_in <= r[R2][23:16];
                        mem_write <= 1;
                    end
                    if(loadli) r[15] <= ip2; // we increment the pc in two steps to save on LUTs needed for adder
                end
            EXEC2 : begin
                    mem_raddr <= mem_raddr + 1; // the address for b1
                    state <= FETCH1;
                    if(loadb3) begin
                        temp[31:24] <= mem_data_out;
                        state <= EXEC3;
                    end
                    if(branch & takebranch) r[15] <= branchtarget; // branchtarget refers to upper half of temp so should be ready
                    if(loadli | pop) begin
                        temp[15:8] <= mem_data_out; // b1
                        state <= EXEC3;
                    end
                    if(storb1) begin
                        mem_waddr <= mem_waddr + 1;
                        mem_data_in <= r[R2][15:8];
                        mem_write <= 1;
                        state <= EXEC3;
                    end
                    if(pop) r[14] <= r[14] + 4; // no need to set state because has been done by loadli | pop
                end
            EXEC3 : begin
                    mem_raddr <= mem_raddr + 1; // the address for b0
                    state <= FETCH1;
                    if(loadb2) begin
                        temp[23:16] <= mem_data_out;
                        state <= EXEC4;
                    end
                    if(loadb3 & ~loadb2) r[R2][7:0] <= temp[31:24]; // a single byte load, not a long
                    if(loadb3 & ~loadb2 & ~ &R2 ) begin // if R2 is not the PC we can take a 1 cycle shortcut
                        mem_raddr <= ip;
                        state <= FETCH2;
                    end
                    if(loadli | pop) begin
                        mem_raddr <= ip;
                        state <= FETCH2;
                        r[R2][31:8] <= temp[31:8];
                        r[R2][7:0] <= mem_data_out;
                    end
                    if(storb0) begin
                        mem_waddr <= mem_waddr + 1;
                        mem_data_in <= r[R2][7:0];
                        mem_write <= 1;
                        state <= EXEC4;
                    end
                end
            EXEC4 : begin
                    state <= FETCH1;
                    if(loadb1) begin
                        temp[15:8] <= mem_data_out;
                        state <= EXEC5;
                    end
                    if(storb0) begin // keep write asserted for the last byte of a long store
                        mem_write <= 1;
                    end
                end
            EXEC5 : begin
                    mem_raddr <= ip;
                    state <= FETCH2;
                    // no need to test for signals only LOADL (loadb0) can end up in state == EXEC5
                    r[R2][31:8] <= temp[31:8];
                    r[R2][7:0] <= mem_data_out;
                end
            HALT : begin
                    halted <= 1;
                    state <= HALT;
                end
        endcase
    end
endmodule