From c4d15915f30dc5a1ad97588eb52a3ce718fc4d66 Mon Sep 17 00:00:00 2001 From: Ayke van Laethem Date: Thu, 14 Apr 2022 16:19:07 +0200 Subject: [PATCH 1/2] ws2812: write inline assembly using C instead of Go This is better for various reasons: * I don't like the inline assembly implementation in TinyGo. It kinda works, but it's hard to use correctly. * The Go version had a subtle bug: the i and value variables weren't marked as modified. The compiler produced correct code by chance. In GCC-style inline assembly, it's possible to correctly mark it as an input+output operand. * LLVM 14 has some changes to inline assembly that change how memory operands can be created. Instead of supporting that, I'd like to get rid of memory operands entirely (and possibly inline assembly in general). I did some light testing: the ARM assembly changed a bit but not in any way that should have a practical effect, and the RISC-V assembly didn't change at all. --- Makefile | 2 + ws2812/gen-ws2812.go | 105 +- ws2812/ws2812-asm_cortexm.go | 2033 +++++++++++++-------------- ws2812/ws2812-asm_tinygoriscv.go | 2198 +++++++++++++++--------------- 4 files changed, 2205 insertions(+), 2133 deletions(-) diff --git a/Makefile b/Makefile index 59cbce97c..2074ec40d 100644 --- a/Makefile +++ b/Makefile @@ -159,6 +159,8 @@ smoke-test: @md5sum ./build/test.hex tinygo build -size short -o ./build/test.hex -target=circuitplay-express ./examples/ws2812 @md5sum ./build/test.hex + tinygo build -size short -o ./build/test.bin -target=m5stamp-c3 ./examples/ws2812 + @md5sum ./build/test.bin tinygo build -size short -o ./build/test.hex -target=feather-nrf52840 ./examples/is31fl3731/main.go @md5sum ./build/test.hex ifneq ($(AVR), 0) diff --git a/ws2812/gen-ws2812.go b/ws2812/gen-ws2812.go index b00f72e8a..2948ebe60 100644 --- a/ws2812/gen-ws2812.go +++ b/ws2812/gen-ws2812.go @@ -60,21 +60,21 @@ var architectures = map[string]architectureImpl{ minBaseCyclesT1H: 1 + 1 + 2, // shift + branch (taken) + store 
maxBaseCyclesT1H: 1 + 3 + 2, // shift + branch (taken) + store minBaseCyclesTLD: 1 + 2 + 2, // subtraction + branch x2 + store (in next cycle) - valueTemplate: "uint32(c) << 24", + valueTemplate: "(uint32_t)c << 24", template: ` 1: @ send_bit - str {maskSet}, {portSet} @ [2] T0H and T0L start here + str %[maskSet], %[portSet] @ [2] T0H and T0L start here @DELAY1 - lsls {value}, #1 @ [1] - bcs.n 2f @ [1/3] skip_store - str {maskClear}, {portClear} @ [2] T0H -> T0L transition + lsls %[value], #1 @ [1] + bcs.n 2f @ [1/3] skip_store + str %[maskClear], %[portClear] @ [2] T0H -> T0L transition 2: @ skip_store @DELAY2 - str {maskClear}, {portClear} @ [2] T1H -> T1L transition + str %[maskClear], %[portClear] @ [2] T1H -> T1L transition @DELAY3 - subs {i}, #1 @ [1] - beq.n 3f @ [1/3] end - b 1b @ [1/3] send_bit + subs %[i], #1 @ [1] + beq.n 3f @ [1/3] end + b 1b @ [1/3] send_bit 3: @ end `, }, @@ -90,25 +90,25 @@ var architectures = map[string]architectureImpl{ minBaseCyclesT1H: 1 + 1 + 1, // shift + branch (taken) + store maxBaseCyclesT1H: 1 + 3 + 1, // shift + branch (taken) + store minBaseCyclesTLD: 1 + 1 + 1, // subtraction + branch + store (in next cycle) - valueTemplate: "uint32(c) << 23", + valueTemplate: "(uint32_t)c << 23", template: ` 1: // send_bit - sw {maskSet}, {portSet} // [1] T0H and T0L start here + sw %[maskSet], %[portSet] // [1] T0H and T0L start here @DELAY1 - slli {value}, {value}, 1 // [1] shift value left by 1 - bltz {value}, 2f // [1/3] skip_store - sw {maskClear}, {portClear} // [1] T0H -> T0L transition + slli %[value], %[value], 1 // [1] shift value left by 1 + bltz %[value], 2f // [1/3] skip_store + sw %[maskClear], %[portClear] // [1] T0H -> T0L transition 2: // skip_store @DELAY2 - sw {maskClear}, {portClear} // [1] T1H -> T1L transition + sw %[maskClear], %[portClear] // [1] T1H -> T1L transition @DELAY3 - addi {i}, {i}, -1 // [1] - bnez {i}, 1b // [1/3] send_bit + addi %[i], %[i], -1 // [1] + bnez %[i], 1b // [1/3] send_bit `, }, } -func 
writeImplementation(f *os.File, arch string, megahertz int) error { +func writeCAssembly(f *os.File, arch string, megahertz int) error { cycleTimeNS := 1 / float64(megahertz) // These timings are taken from the table "Updated simplified timing // constraints for NeoPixel strings" at: @@ -209,30 +209,51 @@ func writeImplementation(f *os.File, arch string, megahertz int) error { // ignore I/O errors. buf := &bytes.Buffer{} fmt.Fprintf(buf, "\n") - fmt.Fprintf(buf, "func (d Device) writeByte%d(c byte) {\n", megahertz) - fmt.Fprintf(buf, " portSet, maskSet := d.Pin.PortMaskSet()\n") - fmt.Fprintf(buf, " portClear, maskClear := d.Pin.PortMaskClear()\n") - fmt.Fprintf(buf, "\n") + fmt.Fprintf(buf, "__attribute__((always_inline))\nvoid ws2812_writeByte%d(char c, uint32_t *portSet, uint32_t *portClear, uint32_t maskSet, uint32_t maskClear) {\n", megahertz) fmt.Fprintf(buf, " // Timings:\n") fmt.Fprintf(buf, " // T0H: %2d - %2d cycles or %.1fns - %.1fns\n", actualMinCyclesT0H, actualMaxCyclesT0H, actualMinNanosecondsT0H, actualMaxNanosecondsT0H) fmt.Fprintf(buf, " // T1H: %2d - %2d cycles or %.1fns - %.1fns\n", actualMinCyclesT1H, actualMaxCyclesT1H, actualMinNanosecondsT1H, actualMaxNanosecondsT1H) fmt.Fprintf(buf, " // TLD: %2d - cycles or %.1fns -\n", actualMinCyclesTLD, actualMinNanosecondsTLD) - fmt.Fprintf(buf, " mask := interrupt.Disable()\n") - fmt.Fprintf(buf, " value := %s\n", archImpl.valueTemplate) + fmt.Fprintf(buf, " uint32_t value = %s;\n", archImpl.valueTemplate) asm := archImpl.template + asm = strings.TrimSpace(asm) asm = strings.ReplaceAll(asm, " @DELAY1\n", strings.Repeat(" nop\n", delay1)) asm = strings.ReplaceAll(asm, " @DELAY2\n", strings.Repeat(" nop\n", delay2)) asm = strings.ReplaceAll(asm, " @DELAY3\n", strings.Repeat(" nop\n", delay3)) asm = strings.ReplaceAll(asm, "\n", "\n\t") - fmt.Fprintf(buf, " device.AsmFull(`%s`, map[string]interface{}{", asm) + fmt.Fprintf(buf, " char i = 8;\n") + fmt.Fprintf(buf, " __asm__ __volatile__(\n") + for _, line 
:= range strings.Split(asm, "\n") { + fmt.Fprintf(buf, "\t\t%#v\n", line+"\n") + } + // Note: [value] and [i] must be input+output operands because they modify + // the value. + fmt.Fprintf(buf, ` : [value]"+r"(value), + [i]"+r"(i) + : [maskSet]"r"(maskSet), + [portSet]"m"(*portSet), + [maskClear]"r"(maskClear), + [portClear]"m"(*portClear)); +} +`) + + // Now write the buffer contents (with the assembly function) to a file. + _, err := f.Write(buf.Bytes()) + return err +} + +func writeGoWrapper(f *os.File, arch string, megahertz int) error { + // Create the Go function in a buffer. Using a buffer here to be able to + // ignore I/O errors. + buf := &bytes.Buffer{} + fmt.Fprintf(buf, "\n") + fmt.Fprintf(buf, "func (d Device) writeByte%d(c byte) {\n", megahertz) + fmt.Fprintf(buf, " portSet, maskSet := d.Pin.PortMaskSet()\n") + fmt.Fprintf(buf, " portClear, maskClear := d.Pin.PortMaskClear()\n") + fmt.Fprintf(buf, "\n") + fmt.Fprintf(buf, " mask := interrupt.Disable()\n") + fmt.Fprintf(buf, " C.ws2812_writeByte%d(C.char(c), portSet, portClear, maskSet, maskClear)\n", megahertz) buf.WriteString(` - "value": value, - "i": 8, - "maskSet": maskSet, - "portSet": portSet, - "maskClear": maskClear, - "portClear": portClear, - }) interrupt.Restore(mask) } `) @@ -271,16 +292,26 @@ package ws2812 // Warning: autogenerated file. Instead of modifying this file, change // gen-ws2812.go and run "go generate". 
-import ( - "device" - "runtime/interrupt" -) +import "runtime/interrupt" + +/* +#include <stdint.h> `) for _, megahertz := range clockFrequencies { - err := writeImplementation(f, *arch, megahertz) + err := writeCAssembly(f, *arch, megahertz) if err != nil { fmt.Fprintf(os.Stderr, "could not generate WS2812 assembly code for %s and %dMHz: %s\n", *arch, megahertz, err) os.Exit(1) } } + f.WriteString(`*/ +import "C" +`) + for _, megahertz := range clockFrequencies { + err := writeGoWrapper(f, *arch, megahertz) + if err != nil { + fmt.Fprintf(os.Stderr, "could not generate Go wrapper: %s\n", err) + os.Exit(1) + } + } } diff --git a/ws2812/ws2812-asm_cortexm.go b/ws2812/ws2812-asm_cortexm.go index 8bef27ecc..0cf37394b 100644 --- a/ws2812/ws2812-asm_cortexm.go +++ b/ws2812/ws2812-asm_cortexm.go @@ -6,68 +6,1035 @@ package ws2812 // Warning: autogenerated file. Instead of modifying this file, change // gen-ws2812.go and run "go generate". -import ( - "device" - "runtime/interrupt" -) +import "runtime/interrupt" -func (d Device) writeByte16(c byte) { - portSet, maskSet := d.Pin.PortMaskSet() - portClear, maskClear := d.Pin.PortMaskClear() +/* +#include <stdint.h> +__attribute__((always_inline)) +void ws2812_writeByte16(char c, uint32_t *portSet, uint32_t *portClear, uint32_t maskSet, uint32_t maskClear) { // Timings: // T0H: 6 - 8 cycles or 375.0ns - 500.0ns // T1H: 17 - 19 cycles or 1062.5ns - 1187.5ns // TLD: 19 - cycles or 1187.5ns - + uint32_t value = (uint32_t)c << 24; + char i = 8; + __asm__ __volatile__( + "1: @ send_bit\n" + "\t str %[maskSet], %[portSet] @ [2] T0H and T0L start here\n" + "\t nop\n" + "\t nop\n" + "\t lsls %[value], #1 @ [1]\n" + "\t bcs.n 2f @ [1/3] skip_store\n" + "\t str %[maskClear], %[portClear] @ [2] T0H -> T0L transition\n" + "\t2: @ skip_store\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t str %[maskClear], %[portClear] @ [2] T1H -> T1L transition\n" + "\t 
nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t subs %[i], #1 @ [1]\n" + "\t beq.n 3f @ [1/3] end\n" + "\t b 1b @ [1/3] send_bit\n" + "\t3: @ end\n" + : [value]"+r"(value), + [i]"+r"(i) + : [maskSet]"r"(maskSet), + [portSet]"m"(*portSet), + [maskClear]"r"(maskClear), + [portClear]"m"(*portClear)); +} + +__attribute__((always_inline)) +void ws2812_writeByte48(char c, uint32_t *portSet, uint32_t *portClear, uint32_t maskSet, uint32_t maskClear) { + // Timings: + // T0H: 17 - 19 cycles or 354.2ns - 395.8ns + // T1H: 51 - 53 cycles or 1062.5ns - 1104.2ns + // TLD: 56 - cycles or 1166.7ns - + uint32_t value = (uint32_t)c << 24; + char i = 8; + __asm__ __volatile__( + "1: @ send_bit\n" + "\t str %[maskSet], %[portSet] @ [2] T0H and T0L start here\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t lsls %[value], #1 @ [1]\n" + "\t bcs.n 2f @ [1/3] skip_store\n" + "\t str %[maskClear], %[portClear] @ [2] T0H -> T0L transition\n" + "\t2: @ skip_store\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t str %[maskClear], %[portClear] @ [2] T1H -> T1L transition\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t 
nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t subs %[i], #1 @ [1]\n" + "\t beq.n 3f @ [1/3] end\n" + "\t b 1b @ [1/3] send_bit\n" + "\t3: @ end\n" + : [value]"+r"(value), + [i]"+r"(i) + : [maskSet]"r"(maskSet), + [portSet]"m"(*portSet), + [maskClear]"r"(maskClear), + [portClear]"m"(*portClear)); +} + +__attribute__((always_inline)) +void ws2812_writeByte64(char c, uint32_t *portSet, uint32_t *portClear, uint32_t maskSet, uint32_t maskClear) { + // Timings: + // T0H: 23 - 25 cycles or 359.4ns - 390.6ns + // T1H: 68 - 70 cycles or 1062.5ns - 1093.8ns + // TLD: 74 - cycles or 1156.2ns - + uint32_t value = (uint32_t)c << 24; + char i = 8; + __asm__ __volatile__( + "1: @ send_bit\n" + "\t str %[maskSet], %[portSet] @ [2] T0H and T0L start here\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t lsls %[value], #1 @ [1]\n" + "\t bcs.n 2f @ [1/3] skip_store\n" + "\t str %[maskClear], %[portClear] @ [2] T0H -> T0L transition\n" + "\t2: @ skip_store\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" 
+ "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t str %[maskClear], %[portClear] @ [2] T1H -> T1L transition\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t subs %[i], #1 @ [1]\n" + "\t beq.n 3f @ [1/3] end\n" + "\t b 1b @ [1/3] send_bit\n" + "\t3: @ end\n" + : [value]"+r"(value), + [i]"+r"(i) + : [maskSet]"r"(maskSet), + [portSet]"m"(*portSet), + [maskClear]"r"(maskClear), + [portClear]"m"(*portClear)); +} + +__attribute__((always_inline)) +void ws2812_writeByte120(char c, uint32_t *portSet, uint32_t *portClear, uint32_t maskSet, uint32_t maskClear) { + // Timings: + // T0H: 42 - 44 cycles or 350.0ns - 366.7ns + // T1H: 126 - 128 cycles or 1050.0ns - 1066.7ns + // TLD: 138 - cycles or 1150.0ns - + uint32_t value = (uint32_t)c << 24; + char i = 8; + __asm__ __volatile__( + "1: @ send_bit\n" + "\t str %[maskSet], %[portSet] @ [2] T0H and T0L start here\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t 
nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t lsls %[value], #1 @ [1]\n" + "\t bcs.n 2f @ [1/3] skip_store\n" + "\t str %[maskClear], %[portClear] @ [2] T0H -> T0L transition\n" + "\t2: @ skip_store\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t str %[maskClear], %[portClear] @ [2] T1H -> T1L transition\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t 
nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t subs %[i], #1 @ [1]\n" + "\t beq.n 3f @ [1/3] end\n" + "\t b 1b @ [1/3] send_bit\n" + "\t3: @ end\n" + : [value]"+r"(value), + [i]"+r"(i) + : [maskSet]"r"(maskSet), + [portSet]"m"(*portSet), + [maskClear]"r"(maskClear), + [portClear]"m"(*portClear)); +} + +__attribute__((always_inline)) +void ws2812_writeByte168(char c, uint32_t *portSet, uint32_t *portClear, uint32_t maskSet, uint32_t maskClear) { + // Timings: + // T0H: 59 - 61 cycles or 351.2ns - 363.1ns + // T1H: 177 - 179 cycles or 1053.6ns - 1065.5ns + // TLD: 194 - cycles or 1154.8ns - + uint32_t value = (uint32_t)c << 24; + char i = 8; + __asm__ __volatile__( + "1: @ send_bit\n" + "\t str 
%[maskSet], %[portSet] @ [2] T0H and T0L start here\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t lsls %[value], #1 @ [1]\n" + "\t bcs.n 2f @ [1/3] skip_store\n" + "\t str %[maskClear], %[portClear] @ [2] T0H -> T0L transition\n" + "\t2: @ skip_store\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + 
"\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t str %[maskClear], %[portClear] @ [2] T1H -> T1L transition\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t 
nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t subs %[i], #1 @ [1]\n" + "\t beq.n 3f @ [1/3] end\n" + "\t b 1b @ [1/3] send_bit\n" + "\t3: @ end\n" + : [value]"+r"(value), + [i]"+r"(i) + : [maskSet]"r"(maskSet), + [portSet]"m"(*portSet), + [maskClear]"r"(maskClear), + [portClear]"m"(*portClear)); +} +*/ +import "C" + +func (d Device) writeByte16(c byte) { + portSet, maskSet := d.Pin.PortMaskSet() + portClear, maskClear := d.Pin.PortMaskClear() + mask := interrupt.Disable() - value := uint32(c) << 24 - device.AsmFull(` - 1: @ send_bit - str {maskSet}, {portSet} @ [2] T0H and T0L start here - nop - nop - lsls {value}, #1 @ [1] - bcs.n 2f @ [1/3] skip_store - str {maskClear}, {portClear} @ [2] T0H -> T0L transition - 2: @ skip_store - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - str {maskClear}, {portClear} @ [2] T1H -> T1L transition - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - subs {i}, #1 @ [1] - beq.n 3f @ [1/3] end - b 1b @ [1/3] send_bit - 3: @ end - `, 
map[string]interface{}{ - "value": value, - "i": 8, - "maskSet": maskSet, - "portSet": portSet, - "maskClear": maskClear, - "portClear": portClear, - }) + C.ws2812_writeByte16(C.char(c), portSet, portClear, maskSet, maskClear) + interrupt.Restore(mask) } @@ -75,130 +1042,9 @@ func (d Device) writeByte48(c byte) { portSet, maskSet := d.Pin.PortMaskSet() portClear, maskClear := d.Pin.PortMaskClear() - // Timings: - // T0H: 17 - 19 cycles or 354.2ns - 395.8ns - // T1H: 51 - 53 cycles or 1062.5ns - 1104.2ns - // TLD: 56 - cycles or 1166.7ns - mask := interrupt.Disable() - value := uint32(c) << 24 - device.AsmFull(` - 1: @ send_bit - str {maskSet}, {portSet} @ [2] T0H and T0L start here - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - lsls {value}, #1 @ [1] - bcs.n 2f @ [1/3] skip_store - str {maskClear}, {portClear} @ [2] T0H -> T0L transition - 2: @ skip_store - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - str {maskClear}, {portClear} @ [2] T1H -> T1L transition - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - subs {i}, #1 @ [1] - beq.n 3f @ [1/3] end - b 1b @ [1/3] send_bit - 3: @ end - `, map[string]interface{}{ - "value": value, - "i": 8, - "maskSet": maskSet, - "portSet": portSet, - "maskClear": maskClear, - "portClear": portClear, - }) + C.ws2812_writeByte48(C.char(c), portSet, portClear, maskSet, maskClear) + interrupt.Restore(mask) } @@ -206,165 +1052,9 @@ func (d Device) writeByte64(c byte) { portSet, maskSet := d.Pin.PortMaskSet() portClear, maskClear := d.Pin.PortMaskClear() - // Timings: - // T0H: 23 - 25 cycles or 
359.4ns - 390.6ns - // T1H: 68 - 70 cycles or 1062.5ns - 1093.8ns - // TLD: 74 - cycles or 1156.2ns - mask := interrupt.Disable() - value := uint32(c) << 24 - device.AsmFull(` - 1: @ send_bit - str {maskSet}, {portSet} @ [2] T0H and T0L start here - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - lsls {value}, #1 @ [1] - bcs.n 2f @ [1/3] skip_store - str {maskClear}, {portClear} @ [2] T0H -> T0L transition - 2: @ skip_store - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - str {maskClear}, {portClear} @ [2] T1H -> T1L transition - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - subs {i}, #1 @ [1] - beq.n 3f @ [1/3] end - b 1b @ [1/3] send_bit - 3: @ end - `, map[string]interface{}{ - "value": value, - "i": 8, - "maskSet": maskSet, - "portSet": portSet, - "maskClear": maskClear, - "portClear": portClear, - }) + C.ws2812_writeByte64(C.char(c), portSet, portClear, maskSet, maskClear) + interrupt.Restore(mask) } @@ -372,287 +1062,9 @@ func (d Device) writeByte120(c byte) { portSet, maskSet := d.Pin.PortMaskSet() portClear, maskClear := d.Pin.PortMaskClear() - // Timings: - // T0H: 42 - 44 cycles or 350.0ns - 366.7ns - // T1H: 126 - 128 cycles or 1050.0ns - 1066.7ns - // TLD: 138 - cycles or 1150.0ns - mask := interrupt.Disable() - value := uint32(c) << 24 - device.AsmFull(` - 1: @ send_bit - str {maskSet}, {portSet} @ [2] T0H 
and T0L start here - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - lsls {value}, #1 @ [1] - bcs.n 2f @ [1/3] skip_store - str {maskClear}, {portClear} @ [2] T0H -> T0L transition - 2: @ skip_store - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - str {maskClear}, {portClear} @ [2] T1H -> T1L transition - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - subs {i}, #1 @ [1] - beq.n 3f @ [1/3] end - b 1b @ [1/3] send_bit - 3: @ end - `, map[string]interface{}{ - "value": value, - "i": 8, - "maskSet": maskSet, - "portSet": portSet, - "maskClear": maskClear, - "portClear": portClear, - }) + 
C.ws2812_writeByte120(C.char(c), portSet, portClear, maskSet, maskClear) + interrupt.Restore(mask) } @@ -660,393 +1072,8 @@ func (d Device) writeByte168(c byte) { portSet, maskSet := d.Pin.PortMaskSet() portClear, maskClear := d.Pin.PortMaskClear() - // Timings: - // T0H: 59 - 61 cycles or 351.2ns - 363.1ns - // T1H: 177 - 179 cycles or 1053.6ns - 1065.5ns - // TLD: 194 - cycles or 1154.8ns - mask := interrupt.Disable() - value := uint32(c) << 24 - device.AsmFull(` - 1: @ send_bit - str {maskSet}, {portSet} @ [2] T0H and T0L start here - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - lsls {value}, #1 @ [1] - bcs.n 2f @ [1/3] skip_store - str {maskClear}, {portClear} @ [2] T0H -> T0L transition - 2: @ skip_store - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - str {maskClear}, {portClear} @ [2] T1H -> T1L transition - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop 
- nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - subs {i}, #1 @ [1] - beq.n 3f @ [1/3] end - b 1b @ [1/3] send_bit - 3: @ end - `, map[string]interface{}{ - "value": value, - "i": 8, - "maskSet": maskSet, - "portSet": portSet, - "maskClear": maskClear, - "portClear": portClear, - }) + C.ws2812_writeByte168(C.char(c), portSet, portClear, maskSet, maskClear) + interrupt.Restore(mask) } diff --git a/ws2812/ws2812-asm_tinygoriscv.go b/ws2812/ws2812-asm_tinygoriscv.go index bcd67bbbc..75a1a9924 100644 --- a/ws2812/ws2812-asm_tinygoriscv.go +++ b/ws2812/ws2812-asm_tinygoriscv.go @@ -6,385 +6,1116 @@ package ws2812 // Warning: autogenerated file. Instead of modifying this file, change // gen-ws2812.go and run "go generate". 
-import ( - "device" - "runtime/interrupt" -) +import "runtime/interrupt" -func (d Device) writeByte160(c byte) { - portSet, maskSet := d.Pin.PortMaskSet() - portClear, maskClear := d.Pin.PortMaskClear() +/* +#include +__attribute__((always_inline)) +void ws2812_writeByte160(char c, uint32_t *portSet, uint32_t *portClear, uint32_t maskSet, uint32_t maskClear) { // Timings: // T0H: 56 - 58 cycles or 350.0ns - 362.5ns // T1H: 168 - 170 cycles or 1050.0ns - 1062.5ns // TLD: 184 - cycles or 1150.0ns - + uint32_t value = (uint32_t)c << 23; + char i = 8; + __asm__ __volatile__( + "1: // send_bit\n" + "\t sw %[maskSet], %[portSet] // [1] T0H and T0L start here\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t slli %[value], %[value], 1 // [1] shift value left by 1\n" + "\t bltz %[value], 2f // [1/3] skip_store\n" + "\t sw %[maskClear], %[portClear] // [1] T0H -> T0L transition\n" + "\t2: // skip_store\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t 
nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t sw %[maskClear], %[portClear] // [1] T1H -> T1L transition\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" 
+ "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t addi %[i], %[i], -1 // [1]\n" + "\t bnez %[i], 1b // [1/3] send_bit\n" + : [value]"+r"(value), + [i]"+r"(i) + : [maskSet]"r"(maskSet), + [portSet]"m"(*portSet), + [maskClear]"r"(maskClear), + [portClear]"m"(*portClear)); +} + +__attribute__((always_inline)) +void ws2812_writeByte320(char c, uint32_t *portSet, uint32_t *portClear, uint32_t maskSet, uint32_t maskClear) { + // Timings: + // T0H: 112 - 114 cycles or 350.0ns - 356.2ns + // T1H: 336 - 338 cycles or 1050.0ns - 1056.2ns + // TLD: 368 - cycles or 1150.0ns - + uint32_t value = 
(uint32_t)c << 23; + char i = 8; + __asm__ __volatile__( + "1: // send_bit\n" + "\t sw %[maskSet], %[portSet] // [1] T0H and T0L start here\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t slli %[value], %[value], 1 // [1] shift value left by 1\n" + "\t bltz %[value], 2f // [1/3] skip_store\n" + "\t sw %[maskClear], %[portClear] // [1] T0H -> T0L transition\n" + "\t2: // skip_store\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" 
+ "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t 
nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t sw %[maskClear], %[portClear] // [1] T1H -> T1L transition\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" 
+ "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t 
nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t addi %[i], %[i], -1 // [1]\n" + "\t bnez %[i], 1b // [1/3] send_bit\n" + : [value]"+r"(value), + [i]"+r"(i) + : [maskSet]"r"(maskSet), + [portSet]"m"(*portSet), + [maskClear]"r"(maskClear), + [portClear]"m"(*portClear)); +} +*/ +import "C" + +func (d Device) writeByte160(c byte) { + portSet, maskSet := d.Pin.PortMaskSet() + portClear, maskClear := d.Pin.PortMaskClear() + mask := interrupt.Disable() - value := uint32(c) << 23 - device.AsmFull(` - 1: // send_bit - sw {maskSet}, {portSet} 
// [1] T0H and T0L start here - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - slli {value}, {value}, 1 // [1] shift value left by 1 - bltz {value}, 2f // [1/3] skip_store - sw {maskClear}, {portClear} // [1] T0H -> T0L transition - 2: // skip_store - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - sw {maskClear}, {portClear} // [1] T1H -> T1L transition - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - 
nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - addi {i}, {i}, -1 // [1] - bnez {i}, 1b // [1/3] send_bit - `, map[string]interface{}{ - "value": value, - "i": 8, - "maskSet": maskSet, - "portSet": portSet, - "maskClear": maskClear, - "portClear": portClear, - }) + C.ws2812_writeByte160(C.char(c), portSet, portClear, maskSet, maskClear) + interrupt.Restore(mask) } @@ -392,727 +1123,8 @@ func (d Device) writeByte320(c byte) { portSet, maskSet := d.Pin.PortMaskSet() portClear, maskClear := d.Pin.PortMaskClear() - // Timings: - // T0H: 112 - 114 cycles or 350.0ns - 356.2ns - // T1H: 336 - 338 cycles or 1050.0ns - 1056.2ns - // TLD: 368 - cycles or 1150.0ns - mask := interrupt.Disable() - value := uint32(c) << 23 - device.AsmFull(` - 1: // send_bit - sw {maskSet}, {portSet} // [1] T0H and T0L start here - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - slli {value}, {value}, 1 // [1] shift value left by 1 - bltz {value}, 2f // [1/3] skip_store - sw {maskClear}, {portClear} // [1] T0H -> T0L transition - 2: // skip_store - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - 
nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - sw {maskClear}, {portClear} // [1] T1H -> T1L transition - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - 
nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - addi {i}, {i}, -1 // [1] - bnez {i}, 1b // [1/3] send_bit - `, map[string]interface{}{ - "value": value, - "i": 8, - "maskSet": maskSet, - "portSet": portSet, - "maskClear": maskClear, - "portClear": portClear, - }) + C.ws2812_writeByte320(C.char(c), portSet, portClear, maskSet, maskClear) + interrupt.Restore(mask) } From 18fabe3c81792735771ce0275d49269a39e09b81 Mon Sep 17 00:00:00 2001 From: Ayke van Laethem Date: Thu, 14 Apr 2022 19:34:15 +0200 Subject: [PATCH 
2/2] ws2812: convert AVR assembly to C inline assembly See https://github.com/tinygo-org/drivers/pull/401 for details. I haven't converted it to autogenerated assembly because AVR is different from many other architectures (8-bit, among others) and it didn't seem worth the effort as many chips run at 16MHz anyway. I ran the two AVR smoke tests for the ws2812 driver and the resulting binary is exactly the same. --- ws2812/ws2812_avr.go | 72 ++++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 32 deletions(-) diff --git a/ws2812/ws2812_avr.go b/ws2812/ws2812_avr.go index 481c10b16..57857bb7e 100644 --- a/ws2812/ws2812_avr.go +++ b/ws2812/ws2812_avr.go @@ -6,11 +6,49 @@ package ws2812 // This file implements the WS2812 protocol for AVR microcontrollers. import ( - "device/avr" "machine" "runtime/interrupt" + "unsafe" ) +/* +#include <stdint.h> + +__attribute__((always_inline)) +void ws2812_writeByte16(char c, uint8_t *port, uint8_t maskSet, uint8_t maskClear) { + // See: + // https://wp.josh.com/2014/05/13/ws2812-neopixels-are-not-so-finicky-once-you-get-to-know-them/ + // T0H: 4 cycles or 250ns + // T0L: 14 cycles or 875ns -> together 18 cycles or 1125ns + // T1H: 9 cycles or 562ns + // T1L: 8 cycles or 500ns -> together 17 cycles or 1062ns + char i = 8; + __asm__ __volatile__( + "1:\n" + "\t st %[port], %[maskSet] ; [2] set output high\n" + "\t lsl %[value] ; [1] shift off the next bit, store it in C\n" + "\t brcs 2f ; [1/2] branch if this bit is high (long pulse)\n" + "\t st %[port], %[maskClear] ; [2] set output low (short pulse)\n" + "\t2:\n" + "\t nop ; [4] wait before changing the output again\n" + "\t nop\n" + "\t nop\n" + "\t nop\n" + "\t st %[port], %[maskClear] ; [2] set output low (end of pulse)\n" + "\t nop ; [3]\n" + "\t nop\n" + "\t nop\n" + "\t subi %[i], 1 ; [1] subtract one (for the loop)\n" + "\t brne 1b ; [1/2] send the next bit, if not at the end of the loop\n" + : [value]"+r"(c), + [i]"+r"(i) + : [maskSet]"r"(maskSet), +
[maskClear]"r"(maskClear), + [port]"m"(*port)); +} +*/ +import "C" + // Send a single byte using the WS2812 protocol. func (d Device) WriteByte(c byte) error { // On AVR, the port is always the same for setting and clearing a register @@ -23,37 +61,7 @@ func (d Device) WriteByte(c byte) error { switch machine.CPUFrequency() { case 16e6: // 16MHz - // See: - // https://wp.josh.com/2014/05/13/ws2812-neopixels-are-not-so-finicky-once-you-get-to-know-them/ - // T0H: 4 cycles or 250ns - // T0L: 14 cycles or 875ns -> together 18 cycles or 1125ns - // T1H: 9 cycles or 562ns - // T1L: 8 cycles or 500ns -> together 17 cycles or 1062ns - avr.AsmFull(` - send_bit: - st {portSet}, {maskSet} ; [2] set output high - lsl {value} ; [1] shift off the next bit, store it in C - brcs skip_store ; [1/2] branch if this bit is high (long pulse) - st {portClear}, {maskClear} ; [2] set output low (short pulse) - skip_store: - nop ; [4] wait before changing the output again - nop - nop - nop - st {portClear}, {maskClear} ; [2] set output low (end of pulse) - nop ; [3] - nop - nop - subi {i}, 1 ; [1] subtract one (for the loop) - brne send_bit ; [1/2] send the next bit, if not at the end of the loop - `, map[string]interface{}{ - "value": c, - "i": byte(8), - "maskSet": maskSet, - "portSet": port, - "maskClear": maskClear, - "portClear": port, - }) + C.ws2812_writeByte16(C.char(c), (*uint8)(unsafe.Pointer(port)), maskSet, maskClear) interrupt.Restore(mask) return nil default: