internal/engine/compiler/impl_vec_amd64.go

package compiler

import (
	"errors"

	"github.com/wasilibs/wazerox/internal/asm"
	"github.com/wasilibs/wazerox/internal/asm/amd64"
	"github.com/wasilibs/wazerox/internal/wazeroir"
)

// compileV128Const implements compiler.compileV128Const for amd64 architecture.
func (c *amd64Compiler) compileV128Const(o *wazeroir.UnionOperation) error {
	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
		return err
	}

	lo, hi := o.U1, o.U2

	result, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}

	// We cannot directly load the value from memory to float regs,
	// so we move it to int reg temporarily.
	tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return err
	}

	// Move the lower 64-bits.
	if lo == 0 {
		c.assembler.CompileRegisterToRegister(amd64.XORQ, tmpReg, tmpReg)
	} else {
		c.assembler.CompileConstToRegister(amd64.MOVQ, int64(lo), tmpReg)
	}
	c.assembler.CompileRegisterToRegister(amd64.MOVQ, tmpReg, result)

	if lo != 0 && hi == 0 {
		c.assembler.CompileRegisterToRegister(amd64.XORQ, tmpReg, tmpReg)
	} else if hi != 0 {
		c.assembler.CompileConstToRegister(amd64.MOVQ, int64(hi), tmpReg)
	}
	// Move the higher 64-bits with PINSRQ at the second element of 64x2 vector.
	c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, tmpReg, result, 1)

	c.pushVectorRuntimeValueLocationOnRegister(result)
	return nil
}

// compileV128Add implements compiler.compileV128Add for amd64 architecture.
func (c *amd64Compiler) compileV128Add(o *wazeroir.UnionOperation) error {
	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}
	var inst asm.Instruction
	shape := o.B1
	switch shape {
	case wazeroir.ShapeI8x16:
		inst = amd64.PADDB
	case wazeroir.ShapeI16x8:
		inst = amd64.PADDW
	case wazeroir.ShapeI32x4:
		inst = amd64.PADDD
	case wazeroir.ShapeI64x2:
		inst = amd64.PADDQ
	case wazeroir.ShapeF32x4:
		inst = amd64.ADDPS
	case wazeroir.ShapeF64x2:
		inst = amd64.ADDPD
	}
	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)

	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
	c.locationStack.markRegisterUnused(x2.register)
	return nil
}

// compileV128Sub implements compiler.compileV128Sub for amd64 architecture.
func (c *amd64Compiler) compileV128Sub(o *wazeroir.UnionOperation) error {
	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}
	var inst asm.Instruction
	shape := o.B1
	switch shape {
	case wazeroir.ShapeI8x16:
		inst = amd64.PSUBB
	case wazeroir.ShapeI16x8:
		inst = amd64.PSUBW
	case wazeroir.ShapeI32x4:
		inst = amd64.PSUBD
	case wazeroir.ShapeI64x2:
		inst = amd64.PSUBQ
	case wazeroir.ShapeF32x4:
		inst = amd64.SUBPS
	case wazeroir.ShapeF64x2:
		inst = amd64.SUBPD
	}
	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)

	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
	c.locationStack.markRegisterUnused(x2.register)
	return nil
}

// compileV128Load implements compiler.compileV128Load for amd64 architecture.
func (c *amd64Compiler) compileV128Load(o *wazeroir.UnionOperation) error {
	result, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}

	offset := uint32(o.U2)
	loadType := wazeroir.V128LoadType(o.B1)

	switch loadType {
	case wazeroir.V128LoadType128:
		err = c.compileV128LoadImpl(amd64.MOVDQU, offset, 16, result)
	case wazeroir.V128LoadType8x8s:
		err = c.compileV128LoadImpl(amd64.PMOVSXBW, offset, 8, result)
	case wazeroir.V128LoadType8x8u:
		err = c.compileV128LoadImpl(amd64.PMOVZXBW, offset, 8, result)
	case wazeroir.V128LoadType16x4s:
		err = c.compileV128LoadImpl(amd64.PMOVSXWD, offset, 8, result)
	case wazeroir.V128LoadType16x4u:
		err = c.compileV128LoadImpl(amd64.PMOVZXWD, offset, 8, result)
	case wazeroir.V128LoadType32x2s:
		err = c.compileV128LoadImpl(amd64.PMOVSXDQ, offset, 8, result)
	case wazeroir.V128LoadType32x2u:
		err = c.compileV128LoadImpl(amd64.PMOVZXDQ, offset, 8, result)
	case wazeroir.V128LoadType8Splat:
		reg, err := c.compileMemoryAccessCeilSetup(offset, 1)
		if err != nil {
			return err
		}
		c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVBQZX, amd64ReservedRegisterForMemory, -1,
			reg, 1, reg)
		// pinsrb   $0, reg, result
		// pxor	    tmpVReg, tmpVReg
		// pshufb   tmpVReg, result
		c.locationStack.markRegisterUsed(result)
		tmpVReg, err := c.allocateRegister(registerTypeVector)
		if err != nil {
			return err
		}
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRB, reg, result, 0)
		c.assembler.CompileRegisterToRegister(amd64.PXOR, tmpVReg, tmpVReg)
		c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmpVReg, result)
	case wazeroir.V128LoadType16Splat:
		reg, err := c.compileMemoryAccessCeilSetup(offset, 2)
		if err != nil {
			return err
		}
		c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVWQZX, amd64ReservedRegisterForMemory, -2,
			reg, 1, reg)
		// pinsrw $0, reg, result
		// pinsrw $1, reg, result
		// pshufd $0, result, result (result = result[0,0,0,0])
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, reg, result, 0)
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, reg, result, 1)
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0)
	case wazeroir.V128LoadType32Splat:
		reg, err := c.compileMemoryAccessCeilSetup(offset, 4)
		if err != nil {
			return err
		}
		c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVLQZX, amd64ReservedRegisterForMemory, -4,
			reg, 1, reg)
		// pinsrd $0, reg, result
		// pshufd $0, result, result (result = result[0,0,0,0])
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRD, reg, result, 0)
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0)
	case wazeroir.V128LoadType64Splat:
		reg, err := c.compileMemoryAccessCeilSetup(offset, 8)
		if err != nil {
			return err
		}
		c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVQ, amd64ReservedRegisterForMemory, -8,
			reg, 1, reg)
		// pinsrq $0, reg, result
		// pinsrq $1, reg, result
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, reg, result, 0)
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, reg, result, 1)
	case wazeroir.V128LoadType32zero:
		err = c.compileV128LoadImpl(amd64.MOVL, offset, 4, result)
	case wazeroir.V128LoadType64zero:
		err = c.compileV128LoadImpl(amd64.MOVQ, offset, 8, result)
	}

	if err != nil {
		return err
	}

	c.pushVectorRuntimeValueLocationOnRegister(result)
	return nil
}

func (c *amd64Compiler) compileV128LoadImpl(inst asm.Instruction, offset uint32, targetSizeInBytes int64, dst asm.Register) error {
	offsetReg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
	if err != nil {
		return err
	}
	c.assembler.CompileMemoryWithIndexToRegister(inst, amd64ReservedRegisterForMemory, -targetSizeInBytes,
		offsetReg, 1, dst)
	return nil
}

// compileV128LoadLane implements compiler.compileV128LoadLane for amd64.
func (c *amd64Compiler) compileV128LoadLane(o *wazeroir.UnionOperation) error {
	targetVector := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(targetVector); err != nil {
		return err
	}

	laneSize, laneIndex := o.B1, o.B2
	offset := uint32(o.U2)

	var insertInst asm.Instruction
	switch laneSize {
	case 8:
		insertInst = amd64.PINSRB
	case 16:
		insertInst = amd64.PINSRW
	case 32:
		insertInst = amd64.PINSRD
	case 64:
		insertInst = amd64.PINSRQ
	}

	targetSizeInBytes := int64(laneSize / 8)
	offsetReg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
	if err != nil {
		return err
	}
	c.assembler.CompileMemoryWithIndexAndArgToRegister(insertInst, amd64ReservedRegisterForMemory, -targetSizeInBytes,
		offsetReg, 1, targetVector.register, laneIndex)

	c.pushVectorRuntimeValueLocationOnRegister(targetVector.register)
	return nil
}

// compileV128Store implements compiler.compileV128Store for amd64.
func (c *amd64Compiler) compileV128Store(o *wazeroir.UnionOperation) error {
	val := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(val); err != nil {
		return err
	}

	const targetSizeInBytes = 16
	offset := uint32(o.U2)
	offsetReg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
	if err != nil {
		return err
	}

	c.assembler.CompileRegisterToMemoryWithIndex(amd64.MOVDQU, val.register,
		amd64ReservedRegisterForMemory, -targetSizeInBytes, offsetReg, 1)

	c.locationStack.markRegisterUnused(val.register, offsetReg)
	return nil
}

// compileV128StoreLane implements compiler.compileV128StoreLane for amd64.
func (c *amd64Compiler) compileV128StoreLane(o *wazeroir.UnionOperation) error {
	var storeInst asm.Instruction
	laneSize := o.B1
	laneIndex := o.B2
	offset := uint32(o.U2)
	switch laneSize {
	case 8:
		storeInst = amd64.PEXTRB
	case 16:
		storeInst = amd64.PEXTRW
	case 32:
		storeInst = amd64.PEXTRD
	case 64:
		storeInst = amd64.PEXTRQ
	}

	val := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(val); err != nil {
		return err
	}

	targetSizeInBytes := int64(laneSize / 8)
	offsetReg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
	if err != nil {
		return err
	}

	c.assembler.CompileRegisterToMemoryWithIndexAndArg(storeInst, val.register,
		amd64ReservedRegisterForMemory, -targetSizeInBytes, offsetReg, 1, laneIndex)

	c.locationStack.markRegisterUnused(val.register, offsetReg)
	return nil
}

// compileV128ExtractLane implements compiler.compileV128ExtractLane for amd64.
func (c *amd64Compiler) compileV128ExtractLane(o *wazeroir.UnionOperation) error {
	v := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(v); err != nil {
		return err
	}
	vreg := v.register
	shape := o.B1
	laneIndex := o.B2
	signed := o.B3
	switch shape {
	case wazeroir.ShapeI8x16:
		result, err := c.allocateRegister(registerTypeGeneralPurpose)
		if err != nil {
			return err
		}
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRB, vreg, result, laneIndex)
		if signed {
			c.assembler.CompileRegisterToRegister(amd64.MOVBLSX, result, result)
		} else {
			c.assembler.CompileRegisterToRegister(amd64.MOVBLZX, result, result)
		}
		c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
		c.locationStack.markRegisterUnused(vreg)
	case wazeroir.ShapeI16x8:
		result, err := c.allocateRegister(registerTypeGeneralPurpose)
		if err != nil {
			return err
		}
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRW, vreg, result, laneIndex)
		if signed {
			c.assembler.CompileRegisterToRegister(amd64.MOVWLSX, result, result)
		} else {
			c.assembler.CompileRegisterToRegister(amd64.MOVWLZX, result, result)
		}
		c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
		c.locationStack.markRegisterUnused(vreg)
	case wazeroir.ShapeI32x4:
		result, err := c.allocateRegister(registerTypeGeneralPurpose)
		if err != nil {
			return err
		}
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRD, vreg, result, laneIndex)
		c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
		c.locationStack.markRegisterUnused(vreg)
	case wazeroir.ShapeI64x2:
		result, err := c.allocateRegister(registerTypeGeneralPurpose)
		if err != nil {
			return err
		}
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRQ, vreg, result, laneIndex)
		c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI64)
		c.locationStack.markRegisterUnused(vreg)
	case wazeroir.ShapeF32x4:
		if laneIndex != 0 {
			c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, vreg, vreg, laneIndex)
		}
		c.pushRuntimeValueLocationOnRegister(vreg, runtimeValueTypeF32)
	case wazeroir.ShapeF64x2:
		if laneIndex != 0 {
			// This case we can assume LaneIndex == 1.
			// We have to modify the val.register as, for example:
			//    0b11 0b10 0b01 0b00
			//     |    |    |    |
			//   [x3,  x2,  x1,  x0] -> [x0,  x0,  x3,  x2]
			// where val.register = [x3, x2, x1, x0] and each xN = 32bits.
			// Then, we interpret the register as float64, therefore, the float64 value is obtained as [x3, x2].
			arg := byte(0b00_00_11_10)
			c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, vreg, vreg, arg)
		}
		c.pushRuntimeValueLocationOnRegister(vreg, runtimeValueTypeF64)
	}

	return nil
}

// compileV128ReplaceLane implements compiler.compileV128ReplaceLane for amd64.
func (c *amd64Compiler) compileV128ReplaceLane(o *wazeroir.UnionOperation) error {
	origin := c.locationStack.pop()
	if err := c.compileEnsureOnRegister(origin); err != nil {
		return err
	}

	vector := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(vector); err != nil {
		return err
	}

	shape := o.B1
	laneIndex := o.B2
	switch shape {
	case wazeroir.ShapeI8x16:
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRB, origin.register, vector.register, laneIndex)
	case wazeroir.ShapeI16x8:
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, origin.register, vector.register, laneIndex)
	case wazeroir.ShapeI32x4:
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRD, origin.register, vector.register, laneIndex)
	case wazeroir.ShapeI64x2:
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, origin.register, vector.register, laneIndex)
	case wazeroir.ShapeF32x4:
		c.assembler.CompileRegisterToRegisterWithArg(amd64.INSERTPS, origin.register, vector.register,
			// In INSERTPS instruction, the destination index is encoded at 4 and 5 bits of the argument.
			// See https://www.felixcloutier.com/x86/insertps
			laneIndex<<4,
		)
	case wazeroir.ShapeF64x2:
		if laneIndex == 0 {
			c.assembler.CompileRegisterToRegister(amd64.MOVSD, origin.register, vector.register)
		} else {
			c.assembler.CompileRegisterToRegister(amd64.MOVLHPS, origin.register, vector.register)
		}
	}

	c.pushVectorRuntimeValueLocationOnRegister(vector.register)
	c.locationStack.markRegisterUnused(origin.register)
	return nil
}

// compileV128Splat implements compiler.compileV128Splat for amd64.
func (c *amd64Compiler) compileV128Splat(o *wazeroir.UnionOperation) (err error) {
	origin := c.locationStack.pop()
	if err = c.compileEnsureOnRegister(origin); err != nil {
		return
	}

	var result asm.Register
	shape := o.B1
	switch shape {
	case wazeroir.ShapeI8x16:
		result, err = c.allocateRegister(registerTypeVector)
		if err != nil {
			return err
		}
		c.locationStack.markRegisterUsed(result)

		tmp, err := c.allocateRegister(registerTypeVector)
		if err != nil {
			return err
		}
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRB, origin.register, result, 0)
		c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp)
		c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmp, result)
	case wazeroir.ShapeI16x8:
		result, err = c.allocateRegister(registerTypeVector)
		if err != nil {
			return err
		}
		c.locationStack.markRegisterUsed(result)
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, origin.register, result, 0)
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, origin.register, result, 1)
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0)
	case wazeroir.ShapeI32x4:
		result, err = c.allocateRegister(registerTypeVector)
		if err != nil {
			return err
		}
		c.locationStack.markRegisterUsed(result)
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRD, origin.register, result, 0)
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0)
	case wazeroir.ShapeI64x2:
		result, err = c.allocateRegister(registerTypeVector)
		if err != nil {
			return err
		}
		c.locationStack.markRegisterUsed(result)
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, origin.register, result, 0)
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, origin.register, result, 1)
	case wazeroir.ShapeF32x4:
		result = origin.register
		c.assembler.CompileRegisterToRegisterWithArg(amd64.INSERTPS, origin.register, result, 0)
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0)
	case wazeroir.ShapeF64x2:
		result = origin.register
		c.assembler.CompileRegisterToRegister(amd64.MOVQ, origin.register, result)
		c.assembler.CompileRegisterToRegister(amd64.MOVLHPS, origin.register, result)
	}

	c.locationStack.markRegisterUnused(origin.register)
	c.pushVectorRuntimeValueLocationOnRegister(result)
	return nil
}

// compileV128Shuffle implements compiler.compileV128Shuffle for amd64.
func (c *amd64Compiler) compileV128Shuffle(o *wazeroir.UnionOperation) error {
	w := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(w); err != nil {
		return err
	}

	v := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(v); err != nil {
		return err
	}

	wr, vr := w.register, v.register

	tmp, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}

	consts := [32]byte{}
	lanes := o.Us
	for i, unsignedLane := range lanes {
		lane := byte(unsignedLane)
		if lane < 16 {
			consts[i+16] = 0x80
			consts[i] = lane
		} else {
			consts[i+16] = lane - 16
			consts[i] = 0x80
		}
	}

	err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(consts[:16]), tmp)
	if err != nil {
		return err
	}
	c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmp, vr)
	err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(consts[16:]), tmp)
	if err != nil {
		return err
	}
	c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmp, wr)
	c.assembler.CompileRegisterToRegister(amd64.ORPS, vr, wr)

	c.pushVectorRuntimeValueLocationOnRegister(wr)
	c.locationStack.markRegisterUnused(vr)
	return nil
}

var swizzleConst = [16]byte{
	0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
	0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
}

// compileV128Swizzle implements compiler.compileV128Swizzle for amd64.
func (c *amd64Compiler) compileV128Swizzle(*wazeroir.UnionOperation) error {
	index := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(index); err != nil {
		return err
	}

	base := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(base); err != nil {
		return err
	}

	idxReg, baseReg := index.register, base.register

	tmp, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}

	err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(swizzleConst[:]), tmp)
	if err != nil {
		return err
	}

	c.assembler.CompileRegisterToRegister(amd64.PADDUSB, tmp, idxReg)
	c.assembler.CompileRegisterToRegister(amd64.PSHUFB, idxReg, baseReg)

	c.pushVectorRuntimeValueLocationOnRegister(baseReg)
	c.locationStack.markRegisterUnused(idxReg)
	return nil
}

// compileV128AnyTrue implements compiler.compileV128AnyTrue for amd64.
func (c *amd64Compiler) compileV128AnyTrue(*wazeroir.UnionOperation) error {
	v := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(v); err != nil {
		return err
	}
	vreg := v.register

	c.assembler.CompileRegisterToRegister(amd64.PTEST, vreg, vreg)

	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(amd64.ConditionalRegisterStateNE)
	c.locationStack.markRegisterUnused(vreg)
	return nil
}

// compileV128AllTrue implements compiler.compileV128AllTrue for amd64.
func (c *amd64Compiler) compileV128AllTrue(o *wazeroir.UnionOperation) error {
	v := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(v); err != nil {
		return err
	}

	tmp, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}

	var cmpInst asm.Instruction
	shape := o.B1
	switch shape {
	case wazeroir.ShapeI8x16:
		cmpInst = amd64.PCMPEQB
	case wazeroir.ShapeI16x8:
		cmpInst = amd64.PCMPEQW
	case wazeroir.ShapeI32x4:
		cmpInst = amd64.PCMPEQD
	case wazeroir.ShapeI64x2:
		cmpInst = amd64.PCMPEQQ
	}

	c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp)
	c.assembler.CompileRegisterToRegister(cmpInst, v.register, tmp)
	c.assembler.CompileRegisterToRegister(amd64.PTEST, tmp, tmp)
	c.locationStack.markRegisterUnused(v.register, tmp)
	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(amd64.ConditionalRegisterStateE)
	return nil
}

// compileV128BitMask implements compiler.compileV128BitMask for amd64.
func (c *amd64Compiler) compileV128BitMask(o *wazeroir.UnionOperation) error {
	v := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(v); err != nil {
		return err
	}

	result, err := c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return err
	}

	shape := o.B1
	switch shape {
	case wazeroir.ShapeI8x16:
		c.assembler.CompileRegisterToRegister(amd64.PMOVMSKB, v.register, result)
	case wazeroir.ShapeI16x8:
		// When we have:
		// 	R1 = [R1(w1), R1(w2), R1(w3), R1(w4), R1(w5), R1(w6), R1(w7), R1(v8)]
		// 	R2 = [R2(w1), R2(w2), R2(w3), R2(v4), R2(w5), R2(w6), R2(w7), R2(v8)]
		//	where RX(wn) is n-th signed word (16-bit) of RX register,
		//
		// "PACKSSWB R1, R2" produces
		//  R1 = [
		// 		byte_sat(R1(w1)), byte_sat(R1(w2)), byte_sat(R1(w3)), byte_sat(R1(w4)),
		// 		byte_sat(R1(w5)), byte_sat(R1(w6)), byte_sat(R1(w7)), byte_sat(R1(w8)),
		// 		byte_sat(R2(w1)), byte_sat(R2(w2)), byte_sat(R2(w3)), byte_sat(R2(w4)),
		// 		byte_sat(R2(w5)), byte_sat(R2(w6)), byte_sat(R2(w7)), byte_sat(R2(w8)),
		//  ]
		//  where R1 is the destination register, and
		// 	byte_sat(w) = int8(w) if w fits as signed 8-bit,
		//                0x80 if w is less than 0x80
		//                0x7F if w is greater than 0x7f
		//
		// See https://www.felixcloutier.com/x86/packsswb:packssdw for detail.
		//
		// Therefore, v.register ends up having i-th and (i+8)-th bit set if i-th lane is negative (for i in 0..8).
		c.assembler.CompileRegisterToRegister(amd64.PACKSSWB, v.register, v.register)
		c.assembler.CompileRegisterToRegister(amd64.PMOVMSKB, v.register, result)
		// Clear the higher bits than 8.
		c.assembler.CompileConstToRegister(amd64.SHRQ, 8, result)
	case wazeroir.ShapeI32x4:
		c.assembler.CompileRegisterToRegister(amd64.MOVMSKPS, v.register, result)
	case wazeroir.ShapeI64x2:
		c.assembler.CompileRegisterToRegister(amd64.MOVMSKPD, v.register, result)
	}

	c.locationStack.markRegisterUnused(v.register)
	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
	return nil
}

// compileV128And implements compiler.compileV128And for amd64.
func (c *amd64Compiler) compileV128And(*wazeroir.UnionOperation) error {
	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	c.assembler.CompileRegisterToRegister(amd64.PAND, x2.register, x1.register)

	c.locationStack.markRegisterUnused(x2.register)
	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
	return nil
}

// compileV128Not implements compiler.compileV128Not for amd64.
func (c *amd64Compiler) compileV128Not(*wazeroir.UnionOperation) error {
	v := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(v); err != nil {
		return err
	}

	tmp, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}

	// Set all bits on tmp register.
	c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp)
	// Then XOR with tmp to reverse all bits on v.register.
	c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, v.register)
	c.pushVectorRuntimeValueLocationOnRegister(v.register)
	return nil
}

// compileV128Or implements compiler.compileV128Or for amd64.
func (c *amd64Compiler) compileV128Or(*wazeroir.UnionOperation) error {
	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	c.assembler.CompileRegisterToRegister(amd64.POR, x2.register, x1.register)

	c.locationStack.markRegisterUnused(x2.register)
	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
	return nil
}

// compileV128Xor implements compiler.compileV128Xor for amd64.
func (c *amd64Compiler) compileV128Xor(*wazeroir.UnionOperation) error {
	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	c.assembler.CompileRegisterToRegister(amd64.PXOR, x2.register, x1.register)

	c.locationStack.markRegisterUnused(x2.register)
	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
	return nil
}

// compileV128Bitselect implements compiler.compileV128Bitselect for amd64.
func (c *amd64Compiler) compileV128Bitselect(*wazeroir.UnionOperation) error {
	selector := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(selector); err != nil {
		return err
	}

	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	// The following logic is equivalent to v128.or(v128.and(v1, selector), v128.and(v2, v128.not(selector)))
	// See https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#bitwise-select
	c.assembler.CompileRegisterToRegister(amd64.PAND, selector.register, x1.register)
	c.assembler.CompileRegisterToRegister(amd64.PANDN, x2.register, selector.register)
	c.assembler.CompileRegisterToRegister(amd64.POR, selector.register, x1.register)

	c.locationStack.markRegisterUnused(x2.register, selector.register)
	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
	return nil
}

// compileV128AndNot implements compiler.compileV128AndNot for amd64.
func (c *amd64Compiler) compileV128AndNot(*wazeroir.UnionOperation) error {
	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	c.assembler.CompileRegisterToRegister(amd64.PANDN, x1.register, x2.register)

	c.locationStack.markRegisterUnused(x1.register)
	c.pushVectorRuntimeValueLocationOnRegister(x2.register)
	return nil
}

// compileV128Shr implements compiler.compileV128Shr for amd64.
func (c *amd64Compiler) compileV128Shr(o *wazeroir.UnionOperation) error {
	// https://stackoverflow.com/questions/35002937/sse-simd-shift-with-one-byte-element-size-granularity
	shape := o.B1
	signed := o.B3
	if shape == wazeroir.ShapeI8x16 {
		return c.compileV128ShrI8x16Impl(signed)
	} else if shape == wazeroir.ShapeI64x2 && signed {
		return c.compileV128ShrI64x2SignedImpl()
	} else {
		return c.compileV128ShrImpl(o)
	}
}

// compileV128ShrImpl implements shift right instructions except for i8x16 (logical/arithmetic) and i64x2 (arithmetic).
func (c *amd64Compiler) compileV128ShrImpl(o *wazeroir.UnionOperation) error {
	s := c.locationStack.pop()
	if err := c.compileEnsureOnRegister(s); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	vecTmp, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}

	var moduleConst int64
	var shift asm.Instruction
	shape := o.B1
	signed := o.B3
	switch shape {
	case wazeroir.ShapeI16x8:
		moduleConst = 0xf // modulo 16.
		if signed {
			shift = amd64.PSRAW
		} else {
			shift = amd64.PSRLW
		}
	case wazeroir.ShapeI32x4:
		moduleConst = 0x1f // modulo 32.
		if signed {
			shift = amd64.PSRAD
		} else {
			shift = amd64.PSRLD
		}
	case wazeroir.ShapeI64x2:
		moduleConst = 0x3f // modulo 64.
		shift = amd64.PSRLQ
	}

	gpShiftAmount := s.register
	c.assembler.CompileConstToRegister(amd64.ANDQ, moduleConst, gpShiftAmount)
	c.assembler.CompileRegisterToRegister(amd64.MOVL, gpShiftAmount, vecTmp)
	c.assembler.CompileRegisterToRegister(shift, vecTmp, x1.register)

	c.locationStack.markRegisterUnused(gpShiftAmount)
	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
	return nil
}

// compileV128ShrI64x2SignedImpl implements compiler.compileV128Shr for i64x2 signed (arithmetic) shift.
// PSRAQ instruction requires AVX, so we emulate it without AVX instructions. https://www.felixcloutier.com/x86/psraw:psrad:psraq
func (c *amd64Compiler) compileV128ShrI64x2SignedImpl() error {
	const shiftCountRegister = amd64.RegCX

	s := c.locationStack.pop()
	if s.register != shiftCountRegister {
		// If another value lives on the CX register, we release it to the stack.
		c.onValueReleaseRegisterToStack(shiftCountRegister)
		if s.onStack() {
			s.setRegister(shiftCountRegister)
			c.compileLoadValueOnStackToRegister(s)
		} else if s.onConditionalRegister() {
			c.compileMoveConditionalToGeneralPurposeRegister(s, shiftCountRegister)
		} else { // already on register.
			old := s.register
			c.assembler.CompileRegisterToRegister(amd64.MOVL, old, shiftCountRegister)
			s.setRegister(shiftCountRegister)
			c.locationStack.markRegisterUnused(old)
		}
	}

	c.locationStack.markRegisterUsed(shiftCountRegister)
	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	// Extract each lane into tmp, execute SHR on tmp, and write it back to the lane.
	c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRQ, x1.register, tmp, 0)
	c.assembler.CompileRegisterToRegister(amd64.SARQ, shiftCountRegister, tmp)
	c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, tmp, x1.register, 0)
	c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRQ, x1.register, tmp, 1)
	c.assembler.CompileRegisterToRegister(amd64.SARQ, shiftCountRegister, tmp)
	c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, tmp, x1.register, 1)

	c.locationStack.markRegisterUnused(shiftCountRegister)
	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
	return nil
}

// i8x16LogicalSHRMaskTable is necessary for emulating non-existent packed bytes logical right shifts on amd64.
// The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits.
var i8x16LogicalSHRMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes.
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift
	0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // for 1 shift
	0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, // for 2 shift
	0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, // for 3 shift
	0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, // for 4 shift
	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, // for 5 shift
	0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, // for 6 shift
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // for 7 shift
}

// compileV128ShrI64x2SignedImpl implements compiler.compileV128Shr for i8x16 signed logical/arithmetic shifts.
// amd64 doesn't have packed byte shifts, so we need this special casing.
// See https://stackoverflow.com/questions/35002937/sse-simd-shift-with-one-byte-element-size-granularity
func (c *amd64Compiler) compileV128ShrI8x16Impl(signed bool) error {
	s := c.locationStack.pop()
	if err := c.compileEnsureOnRegister(s); err != nil {
		return err
	}

	v := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(v); err != nil {
		return err
	}

	vecTmp, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}

	gpShiftAmount := s.register
	c.assembler.CompileConstToRegister(amd64.ANDQ, 0x7, gpShiftAmount) // mod 8.

	if signed {
		c.locationStack.markRegisterUsed(vecTmp)
		vecTmp2, err := c.allocateRegister(registerTypeVector)
		if err != nil {
			return err
		}

		vreg := v.register

		// Copy the value from v.register to vecTmp.
		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vreg, vecTmp)

		// Assuming that we have
		//  vreg   = [b1, ..., b16]
		//  vecTmp = [b1, ..., b16]
		// at this point, then we use PUNPCKLBW and PUNPCKHBW to produce:
		//  vreg   = [b1, b1, b2, b2, ..., b8, b8]
		//  vecTmp = [b9, b9, b10, b10, ..., b16, b16]
		c.assembler.CompileRegisterToRegister(amd64.PUNPCKLBW, vreg, vreg)
		c.assembler.CompileRegisterToRegister(amd64.PUNPCKHBW, vecTmp, vecTmp)

		// Adding 8 to the shift amount, and then move the amount to vecTmp2.
		c.assembler.CompileConstToRegister(amd64.ADDQ, 0x8, gpShiftAmount)
		c.assembler.CompileRegisterToRegister(amd64.MOVL, gpShiftAmount, vecTmp2)

		// Perform the word packed arithmetic right shifts on vreg and vecTmp.
		// This changes these two registers as:
		//  vreg   = [xxx, b1 >> s, xxx, b2 >> s, ..., xxx, b8 >> s]
		//  vecTmp = [xxx, b9 >> s, xxx, b10 >> s, ..., xxx, b16 >> s]
		// where xxx is 1 or 0 depending on each byte's sign, and ">>" is the arithmetic shift on a byte.
		c.assembler.CompileRegisterToRegister(amd64.PSRAW, vecTmp2, vreg)
		c.assembler.CompileRegisterToRegister(amd64.PSRAW, vecTmp2, vecTmp)

		// Finally, we can get the result by packing these two word vectors.
		c.assembler.CompileRegisterToRegister(amd64.PACKSSWB, vecTmp, vreg)

		c.locationStack.markRegisterUnused(gpShiftAmount, vecTmp)
		c.pushVectorRuntimeValueLocationOnRegister(vreg)
	} else {
		c.assembler.CompileRegisterToRegister(amd64.MOVL, gpShiftAmount, vecTmp)
		// amd64 doesn't have packed byte shifts, so we packed word shift here, and then mark-out
		// the unnecessary bits below.
		c.assembler.CompileRegisterToRegister(amd64.PSRLW, vecTmp, v.register)

		gpTmp, err := c.allocateRegister(registerTypeGeneralPurpose)
		if err != nil {
			return err
		}

		// Read the initial address of the mask table into gpTmp register.
		err = c.assembler.CompileStaticConstToRegister(amd64.LEAQ, asm.NewStaticConst(i8x16LogicalSHRMaskTable[:]), gpTmp)
		if err != nil {
			return err
		}

		// We have to get the mask according to the shift amount, so we first have to do
		// gpShiftAmount << 4 = gpShiftAmount*16 to get the initial offset of the mask (16 is the size of each mask in bytes).
		c.assembler.CompileConstToRegister(amd64.SHLQ, 4, gpShiftAmount)

		// Now ready to read the content of the mask into the vecTmp.
		c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVDQU,
			gpTmp, 0, gpShiftAmount, 1,
			vecTmp,
		)

		// Finally, clear out the unnecessary
		c.assembler.CompileRegisterToRegister(amd64.PAND, vecTmp, v.register)

		c.locationStack.markRegisterUnused(gpShiftAmount)
		c.pushVectorRuntimeValueLocationOnRegister(v.register)
	}
	return nil
}

// i8x16SHLMaskTable is necessary for emulating non-existent packed bytes left shifts on amd64.
// The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits.
var i8x16SHLMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes.
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift
	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, // for 1 shift
	0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, // for 2 shift
	0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, // for 3 shift
	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // for 4 shift
	0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, // for 5 shift
	0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, // for 6 shift
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // for 7 shift
}

// compileV128Shl implements compiler.compileV128Shl for amd64.
func (c *amd64Compiler) compileV128Shl(o *wazeroir.UnionOperation) error {
	s := c.locationStack.pop()
	if err := c.compileEnsureOnRegister(s); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	vecTmp, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}

	var modulo int64
	var shift asm.Instruction
	shape := o.B1
	switch shape {
	case wazeroir.ShapeI8x16:
		modulo = 0x7 // modulo 8.
		// x86 doesn't have packed bytes shift, so we use PSLLW and mask-out the redundant bits.
		// See https://stackoverflow.com/questions/35002937/sse-simd-shift-with-one-byte-element-size-granularity
		shift = amd64.PSLLW
	case wazeroir.ShapeI16x8:
		modulo = 0xf // modulo 16.
		shift = amd64.PSLLW
	case wazeroir.ShapeI32x4:
		modulo = 0x1f // modulo 32.
		shift = amd64.PSLLD
	case wazeroir.ShapeI64x2:
		modulo = 0x3f // modulo 64.
		shift = amd64.PSLLQ
	}

	gpShiftAmount := s.register
	c.assembler.CompileConstToRegister(amd64.ANDQ, modulo, gpShiftAmount)
	c.assembler.CompileRegisterToRegister(amd64.MOVL, gpShiftAmount, vecTmp)
	c.assembler.CompileRegisterToRegister(shift, vecTmp, x1.register)

	if shape == wazeroir.ShapeI8x16 {
		gpTmp, err := c.allocateRegister(registerTypeGeneralPurpose)
		if err != nil {
			return err
		}

		// Read the initial address of the mask table into gpTmp register.
		err = c.assembler.CompileStaticConstToRegister(amd64.LEAQ, asm.NewStaticConst(i8x16SHLMaskTable[:]), gpTmp)
		if err != nil {
			return err
		}

		// We have to get the mask according to the shift amount, so we first have to do
		// gpShiftAmount << 4 = gpShiftAmount*16 to get the initial offset of the mask (16 is the size of each mask in bytes).
		c.assembler.CompileConstToRegister(amd64.SHLQ, 4, gpShiftAmount)

		// Now ready to read the content of the mask into the vecTmp.
		c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVDQU,
			gpTmp, 0, gpShiftAmount, 1,
			vecTmp,
		)

		// Finally, clear out the unnecessary
		c.assembler.CompileRegisterToRegister(amd64.PAND, vecTmp, x1.register)
	}

	c.locationStack.markRegisterUnused(gpShiftAmount)
	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
	return nil
}

// compileV128Cmp implements compiler.compileV128Cmp for amd64.
func (c *amd64Compiler) compileV128Cmp(o *wazeroir.UnionOperation) error {
	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	const (
		// See https://www.felixcloutier.com/x86/cmppd and https://www.felixcloutier.com/x86/cmpps
		floatEqualArg           = 0
		floatLessThanArg        = 1
		floatLessThanOrEqualArg = 2
		floatNotEqualARg        = 4
	)

	x1Reg, x2Reg, result := x1.register, x2.register, asm.NilRegister
	v128CmpType := o.B1
	switch v128CmpType {
	case wazeroir.V128CmpTypeF32x4Eq:
		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x2Reg, x1Reg, floatEqualArg)
		result = x1Reg
	case wazeroir.V128CmpTypeF32x4Ne:
		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x2Reg, x1Reg, floatNotEqualARg)
		result = x1Reg
	case wazeroir.V128CmpTypeF32x4Lt:
		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x2Reg, x1Reg, floatLessThanArg)
		result = x1Reg
	case wazeroir.V128CmpTypeF32x4Gt:
		// Without AVX, there's no float Gt instruction, so we swap the register and use Lt instead.
		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x1Reg, x2Reg, floatLessThanArg)
		result = x2Reg
	case wazeroir.V128CmpTypeF32x4Le:
		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x2Reg, x1Reg, floatLessThanOrEqualArg)
		result = x1Reg
	case wazeroir.V128CmpTypeF32x4Ge:
		// Without AVX, there's no float Ge instruction, so we swap the register and use Le instead.
		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x1Reg, x2Reg, floatLessThanOrEqualArg)
		result = x2Reg
	case wazeroir.V128CmpTypeF64x2Eq:
		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x2Reg, x1Reg, floatEqualArg)
		result = x1Reg
	case wazeroir.V128CmpTypeF64x2Ne:
		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x2Reg, x1Reg, floatNotEqualARg)
		result = x1Reg
	case wazeroir.V128CmpTypeF64x2Lt:
		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x2Reg, x1Reg, floatLessThanArg)
		result = x1Reg
	case wazeroir.V128CmpTypeF64x2Gt:
		// Without AVX, there's no float Gt instruction, so we swap the register and use Lt instead.
		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x1Reg, x2Reg, floatLessThanArg)
		result = x2Reg
	case wazeroir.V128CmpTypeF64x2Le:
		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x2Reg, x1Reg, floatLessThanOrEqualArg)
		result = x1Reg
	case wazeroir.V128CmpTypeF64x2Ge:
		// Without AVX, there's no float Ge instruction, so we swap the register and use Le instead.
		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x1Reg, x2Reg, floatLessThanOrEqualArg)
		result = x2Reg
	case wazeroir.V128CmpTypeI8x16Eq:
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, x2Reg, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI8x16Ne:
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, x2Reg, x1Reg)
		// Set all bits on x2Reg register.
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
		// Swap the bits on x1Reg register.
		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI8x16LtS:
		c.assembler.CompileRegisterToRegister(amd64.PCMPGTB, x1Reg, x2Reg)
		result = x2Reg
	case wazeroir.V128CmpTypeI8x16LtU, wazeroir.V128CmpTypeI8x16GtU:
		// Take the unsigned min/max values on each byte on x1 and x2 onto x1Reg.
		if v128CmpType == wazeroir.V128CmpTypeI8x16LtU {
			c.assembler.CompileRegisterToRegister(amd64.PMINUB, x2Reg, x1Reg)
		} else {
			c.assembler.CompileRegisterToRegister(amd64.PMAXUB, x2Reg, x1Reg)
		}
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, x2Reg, x1Reg)
		// Set all bits on x2Reg register.
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
		// Swap the bits on x2Reg register.
		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI8x16GtS:
		c.assembler.CompileRegisterToRegister(amd64.PCMPGTB, x2Reg, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI8x16LeS, wazeroir.V128CmpTypeI8x16LeU:
		tmp, err := c.allocateRegister(registerTypeVector)
		if err != nil {
			return err
		}
		// Copy the value on the src to tmp.
		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp)
		if v128CmpType == wazeroir.V128CmpTypeI8x16LeS {
			c.assembler.CompileRegisterToRegister(amd64.PMINSB, x2Reg, tmp)
		} else {
			c.assembler.CompileRegisterToRegister(amd64.PMINUB, x2Reg, tmp)
		}
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, tmp, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI8x16GeS, wazeroir.V128CmpTypeI8x16GeU:
		tmp, err := c.allocateRegister(registerTypeVector)
		if err != nil {
			return err
		}
		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp)
		if v128CmpType == wazeroir.V128CmpTypeI8x16GeS {
			c.assembler.CompileRegisterToRegister(amd64.PMAXSB, x2Reg, tmp)
		} else {
			c.assembler.CompileRegisterToRegister(amd64.PMAXUB, x2Reg, tmp)
		}
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, tmp, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI16x8Eq:
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, x2Reg, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI16x8Ne:
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, x2Reg, x1Reg)
		// Set all bits on x2Reg register.
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
		// Swap the bits on x1Reg register.
		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI16x8LtS:
		c.assembler.CompileRegisterToRegister(amd64.PCMPGTW, x1Reg, x2Reg)
		result = x2Reg
	case wazeroir.V128CmpTypeI16x8LtU, wazeroir.V128CmpTypeI16x8GtU:
		// Take the unsigned min/max values on each byte on x1 and x2 onto x1Reg.
		if v128CmpType == wazeroir.V128CmpTypeI16x8LtU {
			c.assembler.CompileRegisterToRegister(amd64.PMINUW, x2Reg, x1Reg)
		} else {
			c.assembler.CompileRegisterToRegister(amd64.PMAXUW, x2Reg, x1Reg)
		}
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, x2Reg, x1Reg)
		// Set all bits on x2Reg register.
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
		// Swap the bits on x2Reg register.
		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI16x8GtS:
		c.assembler.CompileRegisterToRegister(amd64.PCMPGTW, x2Reg, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI16x8LeS, wazeroir.V128CmpTypeI16x8LeU:
		tmp, err := c.allocateRegister(registerTypeVector)
		if err != nil {
			return err
		}
		// Copy the value on the src to tmp.
		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp)
		if v128CmpType == wazeroir.V128CmpTypeI16x8LeS {
			c.assembler.CompileRegisterToRegister(amd64.PMINSW, x2Reg, tmp)
		} else {
			c.assembler.CompileRegisterToRegister(amd64.PMINUW, x2Reg, tmp)
		}
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, tmp, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI16x8GeS, wazeroir.V128CmpTypeI16x8GeU:
		tmp, err := c.allocateRegister(registerTypeVector)
		if err != nil {
			return err
		}
		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp)
		if v128CmpType == wazeroir.V128CmpTypeI16x8GeS {
			c.assembler.CompileRegisterToRegister(amd64.PMAXSW, x2Reg, tmp)
		} else {
			c.assembler.CompileRegisterToRegister(amd64.PMAXUW, x2Reg, tmp)
		}
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, tmp, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI32x4Eq:
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI32x4Ne:
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x1Reg)
		// Set all bits on x2Reg register.
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
		// Swap the bits on x1Reg register.
		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI32x4LtS:
		c.assembler.CompileRegisterToRegister(amd64.PCMPGTD, x1Reg, x2Reg)
		result = x2Reg
	case wazeroir.V128CmpTypeI32x4LtU, wazeroir.V128CmpTypeI32x4GtU:
		// Take the unsigned min/max values on each byte on x1 and x2 onto x1Reg.
		if v128CmpType == wazeroir.V128CmpTypeI32x4LtU {
			c.assembler.CompileRegisterToRegister(amd64.PMINUD, x2Reg, x1Reg)
		} else {
			c.assembler.CompileRegisterToRegister(amd64.PMAXUD, x2Reg, x1Reg)
		}
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x1Reg)
		// Set all bits on x2Reg register.
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
		// Swap the bits on x2Reg register.
		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI32x4GtS:
		c.assembler.CompileRegisterToRegister(amd64.PCMPGTD, x2Reg, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI32x4LeS, wazeroir.V128CmpTypeI32x4LeU:
		tmp, err := c.allocateRegister(registerTypeVector)
		if err != nil {
			return err
		}
		// Copy the value on the src to tmp.
		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp)
		if v128CmpType == wazeroir.V128CmpTypeI32x4LeS {
			c.assembler.CompileRegisterToRegister(amd64.PMINSD, x2Reg, tmp)
		} else {
			c.assembler.CompileRegisterToRegister(amd64.PMINUD, x2Reg, tmp)
		}
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI32x4GeS, wazeroir.V128CmpTypeI32x4GeU:
		tmp, err := c.allocateRegister(registerTypeVector)
		if err != nil {
			return err
		}
		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp)
		if v128CmpType == wazeroir.V128CmpTypeI32x4GeS {
			c.assembler.CompileRegisterToRegister(amd64.PMAXSD, x2Reg, tmp)
		} else {
			c.assembler.CompileRegisterToRegister(amd64.PMAXUD, x2Reg, tmp)
		}
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI64x2Eq:
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQQ, x2Reg, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI64x2Ne:
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQQ, x2Reg, x1Reg)
		// Set all bits on x2Reg register.
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
		// Swap the bits on x1Reg register.
		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI64x2LtS:
		c.assembler.CompileRegisterToRegister(amd64.PCMPGTQ, x1Reg, x2Reg)
		result = x2Reg
	case wazeroir.V128CmpTypeI64x2GtS:
		c.assembler.CompileRegisterToRegister(amd64.PCMPGTQ, x2Reg, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI64x2LeS:
		c.assembler.CompileRegisterToRegister(amd64.PCMPGTQ, x2Reg, x1Reg)
		// Set all bits on x2Reg register.
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
		// Swap the bits on x1Reg register.
		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
		result = x1Reg
	case wazeroir.V128CmpTypeI64x2GeS:
		c.assembler.CompileRegisterToRegister(amd64.PCMPGTQ, x1Reg, x2Reg)
		// Set all bits on x1Reg register.
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x1Reg, x1Reg)
		// Swap the bits on x2Reg register.
		c.assembler.CompileRegisterToRegister(amd64.PXOR, x1Reg, x2Reg)
		result = x2Reg
	}

	c.locationStack.markRegisterUnused(x1Reg, x2Reg)
	c.pushVectorRuntimeValueLocationOnRegister(result)
	return nil
}

// compileV128AddSat implements compiler.compileV128AddSat for amd64.
func (c *amd64Compiler) compileV128AddSat(o *wazeroir.UnionOperation) error {
	var inst asm.Instruction
	shape := o.B1
	signed := o.B3
	switch shape {
	case wazeroir.ShapeI8x16:
		if signed {
			inst = amd64.PADDSB
		} else {
			inst = amd64.PADDUSB
		}
	case wazeroir.ShapeI16x8:
		if signed {
			inst = amd64.PADDSW
		} else {
			inst = amd64.PADDUSW
		}
	}

	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)

	c.locationStack.markRegisterUnused(x2.register)
	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
	return nil
}

// compileV128SubSat implements compiler.compileV128SubSat for amd64.
func (c *amd64Compiler) compileV128SubSat(o *wazeroir.UnionOperation) error {
	var inst asm.Instruction
	shape := o.B1
	signed := o.B3
	switch shape {
	case wazeroir.ShapeI8x16:
		if signed {
			inst = amd64.PSUBSB
		} else {
			inst = amd64.PSUBUSB
		}
	case wazeroir.ShapeI16x8:
		if signed {
			inst = amd64.PSUBSW
		} else {
			inst = amd64.PSUBUSW
		}
	}

	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)

	c.locationStack.markRegisterUnused(x2.register)
	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
	return nil
}

// compileV128Mul implements compiler.compileV128Mul for amd64.
func (c *amd64Compiler) compileV128Mul(o *wazeroir.UnionOperation) error {
	var inst asm.Instruction
	shape := o.B1
	switch shape {
	case wazeroir.ShapeI16x8:
		inst = amd64.PMULLW
	case wazeroir.ShapeI32x4:
		inst = amd64.PMULLD
	case wazeroir.ShapeI64x2:
		return c.compileV128MulI64x2()
	case wazeroir.ShapeF32x4:
		inst = amd64.MULPS
	case wazeroir.ShapeF64x2:
		inst = amd64.MULPD
	}

	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)

	c.locationStack.markRegisterUnused(x2.register)
	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
	return nil
}

// compileV128MulI64x2 implements V128Mul for i64x2.
func (c *amd64Compiler) compileV128MulI64x2() error {
	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	x1r, x2r := x1.register, x2.register

	tmp1, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}

	c.locationStack.markRegisterUsed(tmp1)

	tmp2, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}

	// Assuming that we have
	//	x1r = [p1, p2] = [p1_lo, p1_hi, p2_lo, p2_high]
	//  x2r = [q1, q2] = [q1_lo, q1_hi, q2_lo, q2_high]
	// where pN and qN are 64-bit (quad word) lane, and pN_lo, pN_hi, qN_lo and qN_hi are 32-bit (double word) lane.

	// Copy x1's value into tmp1.
	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp1)
	// And do the logical right shift by 32-bit on tmp1, which makes tmp1 = [0, p1_high, 0, p2_high]
	c.assembler.CompileConstToRegister(amd64.PSRLQ, 32, tmp1)

	// Execute "pmuludq x2r,tmp1", which makes tmp1 = [p1_high*q1_lo, p2_high*q2_lo] where each lane is 64-bit.
	c.assembler.CompileRegisterToRegister(amd64.PMULUDQ, x2r, tmp1)

	// Copy x2's value into tmp2.
	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x2r, tmp2)
	// And do the logical right shift by 32-bit on tmp2, which makes tmp2 = [0, q1_high, 0, q2_high]
	c.assembler.CompileConstToRegister(amd64.PSRLQ, 32, tmp2)

	// Execute "pmuludq x1r,tmp2", which makes tmp2 = [p1_lo*q1_high, p2_lo*q2_high] where each lane is 64-bit.
	c.assembler.CompileRegisterToRegister(amd64.PMULUDQ, x1r, tmp2)

	// Adds tmp1 and tmp2 and do the logical left shift by 32-bit,
	// which makes tmp1 = [(p1_lo*q1_high+p1_high*q1_lo)<<32, (p2_lo*q2_high+p2_high*q2_lo)<<32]
	c.assembler.CompileRegisterToRegister(amd64.PADDQ, tmp2, tmp1)
	c.assembler.CompileConstToRegister(amd64.PSLLQ, 32, tmp1)

	// Execute "pmuludq x2r,x1r", which makes x1r = [p1_lo*q1_lo, p2_lo*q2_lo] where each lane is 64-bit.
	c.assembler.CompileRegisterToRegister(amd64.PMULUDQ, x2r, x1r)

	// Finally, we get the result by adding x1r and tmp1,
	// which makes x1r = [(p1_lo*q1_high+p1_high*q1_lo)<<32+p1_lo*q1_lo, (p2_lo*q2_high+p2_high*q2_lo)<<32+p2_lo*q2_lo]
	c.assembler.CompileRegisterToRegister(amd64.PADDQ, tmp1, x1r)

	c.locationStack.markRegisterUnused(x2r, tmp1)
	c.pushVectorRuntimeValueLocationOnRegister(x1r)
	return nil
}

// compileV128Div implements compiler.compileV128Div for amd64.
func (c *amd64Compiler) compileV128Div(o *wazeroir.UnionOperation) error {
	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	var inst asm.Instruction
	shape := o.B1
	switch shape {
	case wazeroir.ShapeF32x4:
		inst = amd64.DIVPS
	case wazeroir.ShapeF64x2:
		inst = amd64.DIVPD
	}

	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)

	c.locationStack.markRegisterUnused(x2.register)
	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
	return nil
}

// compileV128Neg implements compiler.compileV128Neg for amd64.
func (c *amd64Compiler) compileV128Neg(o *wazeroir.UnionOperation) error {
	shape := o.B1
	if shape <= wazeroir.ShapeI64x2 {
		return c.compileV128NegInt(shape)
	} else {
		return c.compileV128NegFloat(shape)
	}
}

// compileV128NegInt implements compiler.compileV128Neg for integer lanes.
func (c *amd64Compiler) compileV128NegInt(s wazeroir.Shape) error {
	v := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(v); err != nil {
		return err
	}

	result, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}

	var subInst asm.Instruction
	switch s {
	case wazeroir.ShapeI8x16:
		subInst = amd64.PSUBB
	case wazeroir.ShapeI16x8:
		subInst = amd64.PSUBW
	case wazeroir.ShapeI32x4:
		subInst = amd64.PSUBD
	case wazeroir.ShapeI64x2:
		subInst = amd64.PSUBQ
	}

	c.assembler.CompileRegisterToRegister(amd64.PXOR, result, result)
	c.assembler.CompileRegisterToRegister(subInst, v.register, result)

	c.locationStack.markRegisterUnused(v.register)
	c.pushVectorRuntimeValueLocationOnRegister(result)
	return nil
}

// compileV128NegInt implements compiler.compileV128Neg for float lanes.
func (c *amd64Compiler) compileV128NegFloat(s wazeroir.Shape) error {
	v := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(v); err != nil {
		return err
	}

	tmp, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}

	var leftShiftInst, xorInst asm.Instruction
	var leftShiftAmount asm.ConstantValue
	if s == wazeroir.ShapeF32x4 {
		leftShiftInst, leftShiftAmount, xorInst = amd64.PSLLD, 31, amd64.XORPS
	} else {
		leftShiftInst, leftShiftAmount, xorInst = amd64.PSLLQ, 63, amd64.XORPD
	}

	// Clear all bits on tmp.
	c.assembler.CompileRegisterToRegister(amd64.XORPS, tmp, tmp)
	// Set all bits on tmp by CMPPD with arg=0 (== pseudo CMPEQPD instruction).
	// See https://www.felixcloutier.com/x86/cmpps
	//
	// Note: if we do not clear all the bits ^ with XORPS, this might end up not setting ones on some lane
	// if the lane is NaN.
	c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, tmp, tmp, 0x8)
	// Do the left shift on each lane to set only the most significant bit in each.
	c.assembler.CompileConstToRegister(leftShiftInst, leftShiftAmount, tmp)
	// Get the negated result by XOR on each lane with tmp.
	c.assembler.CompileRegisterToRegister(xorInst, tmp, v.register)

	c.pushVectorRuntimeValueLocationOnRegister(v.register)
	return nil
}

// compileV128Sqrt implements compiler.compileV128Sqrt for amd64.
func (c *amd64Compiler) compileV128Sqrt(o *wazeroir.UnionOperation) error {
	v := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(v); err != nil {
		return err
	}

	var inst asm.Instruction
	shape := o.B1
	switch shape {
	case wazeroir.ShapeF64x2:
		inst = amd64.SQRTPD
	case wazeroir.ShapeF32x4:
		inst = amd64.SQRTPS
	}

	c.assembler.CompileRegisterToRegister(inst, v.register, v.register)
	c.pushVectorRuntimeValueLocationOnRegister(v.register)
	return nil
}

// compileV128Abs implements compiler.compileV128Abs for amd64.
func (c *amd64Compiler) compileV128Abs(o *wazeroir.UnionOperation) error {
	shape := o.B1
	if shape == wazeroir.ShapeI64x2 {
		return c.compileV128AbsI64x2()
	}

	v := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(v); err != nil {
		return err
	}

	result := v.register
	switch shape {
	case wazeroir.ShapeI8x16:
		c.assembler.CompileRegisterToRegister(amd64.PABSB, result, result)
	case wazeroir.ShapeI16x8:
		c.assembler.CompileRegisterToRegister(amd64.PABSW, result, result)
	case wazeroir.ShapeI32x4:
		c.assembler.CompileRegisterToRegister(amd64.PABSD, result, result)
	case wazeroir.ShapeF32x4:
		tmp, err := c.allocateRegister(registerTypeVector)
		if err != nil {
			return err
		}
		// Set all bits on tmp.
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp)
		// Shift right packed single floats by 1 to clear the sign bits.
		c.assembler.CompileConstToRegister(amd64.PSRLD, 1, tmp)
		// Clear the sign bit of vr.
		c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmp, result)
	case wazeroir.ShapeF64x2:
		tmp, err := c.allocateRegister(registerTypeVector)
		if err != nil {
			return err
		}
		// Set all bits on tmp.
		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp)
		// Shift right packed single floats by 1 to clear the sign bits.
		c.assembler.CompileConstToRegister(amd64.PSRLQ, 1, tmp)
		// Clear the sign bit of vr.
		c.assembler.CompileRegisterToRegister(amd64.ANDPD, tmp, result)
	}

	c.pushVectorRuntimeValueLocationOnRegister(result)
	return nil
}

// compileV128AbsI64x2 implements compileV128Abs for i64x2 lanes.
func (c *amd64Compiler) compileV128AbsI64x2() error {
	// See https://www.felixcloutier.com/x86/blendvpd
	const blendMaskReg = amd64.RegX0
	c.onValueReleaseRegisterToStack(blendMaskReg)
	c.locationStack.markRegisterUsed(blendMaskReg)

	v := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(v); err != nil {
		return err
	}
	vr := v.register

	if vr == blendMaskReg {
		return errors.New("BUG: X0 must not be used")
	}

	tmp, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}
	c.locationStack.markRegisterUsed(tmp)

	// Copy the value to tmp.
	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp)

	// Clear all bits on blendMaskReg.
	c.assembler.CompileRegisterToRegister(amd64.PXOR, blendMaskReg, blendMaskReg)
	// Subtract vr from blendMaskReg.
	c.assembler.CompileRegisterToRegister(amd64.PSUBQ, vr, blendMaskReg)
	// Copy the subtracted value ^^ back into vr.
	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, blendMaskReg, vr)

	c.assembler.CompileRegisterToRegister(amd64.BLENDVPD, tmp, vr)

	c.locationStack.markRegisterUnused(blendMaskReg, tmp)
	c.pushVectorRuntimeValueLocationOnRegister(vr)
	return nil
}

var (
	popcntMask = [16]byte{
		0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
		0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
	}
	// popcntTable holds each index's Popcnt, for example popcntTable[5] holds popcnt(0x05).
	popcntTable = [16]byte{
		0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03,
		0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04,
	}
)

// compileV128Popcnt implements compiler.compileV128Popcnt for amd64.
func (c *amd64Compiler) compileV128Popcnt(operation *wazeroir.UnionOperation) error {
	v := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(v); err != nil {
		return err
	}
	vr := v.register

	tmp1, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}

	c.locationStack.markRegisterUsed(tmp1)

	tmp2, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}

	c.locationStack.markRegisterUsed(tmp2)

	tmp3, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}

	// Read the popcntMask into tmp1, and we have
	//  tmp1 = [0xf, ..., 0xf]
	if err := c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(popcntMask[:]), tmp1); err != nil {
		return err
	}

	// Copy the original value into tmp2.
	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp2)

	// Given that we have:
	//  v = [b1, ..., b16] where bn = hn:ln and hn and ln are higher and lower 4-bits of bn.
	//
	// Take PAND on tmp1 and tmp2, and we have
	//  tmp2 = [l1, ..., l16].
	c.assembler.CompileRegisterToRegister(amd64.PAND, tmp1, tmp2)

	// Do logical (packed word) right shift by 4 on vr and PAND with vr and tmp1, meaning that we have
	//  vr = [h1, ...., h16].
	c.assembler.CompileConstToRegister(amd64.PSRLW, 4, vr)
	c.assembler.CompileRegisterToRegister(amd64.PAND, tmp1, vr)

	// Read the popcntTable into tmp1, and we have
	//  tmp1 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04]
	if err := c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(popcntTable[:]), tmp1); err != nil {
		return err
	}

	// Copy the tmp1 into tmp3, and we have
	//  tmp3 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04]
	c.assembler.CompileRegisterToRegister(amd64.MOVDQU, tmp1, tmp3)

	//  tmp3 = [popcnt(l1), ..., popcnt(l16)].
	c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmp2, tmp3)

	//  tmp1 = [popcnt(h1), ..., popcnt(h16)].
	c.assembler.CompileRegisterToRegister(amd64.PSHUFB, vr, tmp1)

	// vr = tmp1 = [popcnt(h1), ..., popcnt(h16)].
	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, tmp1, vr)

	// vr += tmp3 = [popcnt(h1)+popcnt(l1), ..., popcnt(h16)+popcnt(l16)] = [popcnt(b1), ..., popcnt(b16)].
	c.assembler.CompileRegisterToRegister(amd64.PADDB, tmp3, vr)

	c.locationStack.markRegisterUnused(tmp1, tmp2)
	c.pushVectorRuntimeValueLocationOnRegister(vr)
	return nil
}

// compileV128Min implements compiler.compileV128Min for amd64.
func (c *amd64Compiler) compileV128Min(o *wazeroir.UnionOperation) error {
	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	shape := o.B1
	if shape >= wazeroir.ShapeF32x4 {
		return c.compileV128FloatMinImpl(shape == wazeroir.ShapeF32x4, x1.register, x2.register)
	}

	signed := o.B3
	var inst asm.Instruction
	switch shape {
	case wazeroir.ShapeI8x16:
		if signed {
			inst = amd64.PMINSB
		} else {
			inst = amd64.PMINUB
		}
	case wazeroir.ShapeI16x8:
		if signed {
			inst = amd64.PMINSW
		} else {
			inst = amd64.PMINUW
		}
	case wazeroir.ShapeI32x4:
		if signed {
			inst = amd64.PMINSD
		} else {
			inst = amd64.PMINUD
		}
	}

	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)

	c.locationStack.markRegisterUnused(x2.register)
	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
	return nil
}

// compileV128FloatMinImpl implements compiler.compileV128Min for float lanes.
func (c *amd64Compiler) compileV128FloatMinImpl(is32bit bool, x1r, x2r asm.Register) error {
	tmp, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}

	var min, cmp, andn, or, srl /* shit right logical */ asm.Instruction
	var shiftNumToInverseNaN asm.ConstantValue
	if is32bit {
		min, cmp, andn, or, srl, shiftNumToInverseNaN = amd64.MINPS, amd64.CMPPS, amd64.ANDNPS, amd64.ORPS, amd64.PSRLD, 0xa
	} else {
		min, cmp, andn, or, srl, shiftNumToInverseNaN = amd64.MINPD, amd64.CMPPD, amd64.ANDNPD, amd64.ORPD, amd64.PSRLQ, 0xd
	}

	// Let v1 and v2 be the operand values on x1r and x2r at this point.

	// Copy the value into tmp: tmp=v1
	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp)
	// tmp=min(v1, v2)
	c.assembler.CompileRegisterToRegister(min, x2r, tmp)
	// x2r=min(v2, v1)
	c.assembler.CompileRegisterToRegister(min, x1r, x2r)
	// x1r=min(v2, v1)
	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x2r, x1r)

	// x2r = -0          if (v1 == -0 || x2 == -0) && v1 != NaN && v2 !=NaN
	//       NaN         if v1 == NaN || v2 == NaN
	//       min(v1, v2) otherwise
	c.assembler.CompileRegisterToRegister(or, tmp, x2r)
	// x1r = 0^ (set all bits) if v1 == NaN || v2 == NaN
	//       0 otherwise
	c.assembler.CompileRegisterToRegisterWithArg(cmp, tmp, x1r, 3)
	// x2r = -0          if (v1 == -0 || x2 == -0) && v1 != NaN && v2 !=NaN
	//       ^0          if v1 == NaN || v2 == NaN
	//       min(v1, v2) otherwise
	c.assembler.CompileRegisterToRegister(or, x1r, x2r)
	// x1r = set all bits on the mantissa bits
	//       0 otherwise
	c.assembler.CompileConstToRegister(srl, shiftNumToInverseNaN, x1r)
	// x1r = x2r and !x1r
	//     = -0                                                   if (v1 == -0 || x2 == -0) && v1 != NaN && v2 !=NaN
	//       set all bits on exponential and sign bit (== NaN)    if v1 == NaN || v2 == NaN
	//       min(v1, v2)                                          otherwise
	c.assembler.CompileRegisterToRegister(andn, x2r, x1r)

	c.locationStack.markRegisterUnused(x2r)
	c.pushVectorRuntimeValueLocationOnRegister(x1r)
	return nil
}

// compileV128Max implements compiler.compileV128Max for amd64.
func (c *amd64Compiler) compileV128Max(o *wazeroir.UnionOperation) error {
	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	shape := o.B1
	if shape >= wazeroir.ShapeF32x4 {
		return c.compileV128FloatMaxImpl(shape == wazeroir.ShapeF32x4, x1.register, x2.register)
	}

	signed := o.B3
	var inst asm.Instruction
	switch shape {
	case wazeroir.ShapeI8x16:
		if signed {
			inst = amd64.PMAXSB
		} else {
			inst = amd64.PMAXUB
		}
	case wazeroir.ShapeI16x8:
		if signed {
			inst = amd64.PMAXSW
		} else {
			inst = amd64.PMAXUW
		}
	case wazeroir.ShapeI32x4:
		if signed {
			inst = amd64.PMAXSD
		} else {
			inst = amd64.PMAXUD
		}
	}

	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)

	c.locationStack.markRegisterUnused(x2.register)
	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
	return nil
}

// compileV128FloatMaxImpl implements compiler.compileV128Max for float lanes.
func (c *amd64Compiler) compileV128FloatMaxImpl(is32bit bool, x1r, x2r asm.Register) error {
	tmp, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}

	var max, cmp, andn, or, xor, sub, srl /* shit right logical */ asm.Instruction
	var shiftNumToInverseNaN asm.ConstantValue
	if is32bit {
		max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = amd64.MAXPS, amd64.CMPPS, amd64.ANDNPS, amd64.ORPS, amd64.XORPS, amd64.SUBPS, amd64.PSRLD, 0xa
	} else {
		max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = amd64.MAXPD, amd64.CMPPD, amd64.ANDNPD, amd64.ORPD, amd64.XORPD, amd64.SUBPD, amd64.PSRLQ, 0xd
	}

	// Let v1 and v2 be the operand values on x1r and x2r at this point.

	// Copy the value into tmp: tmp=v2
	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x2r, tmp)
	// tmp=max(v2, v1)
	c.assembler.CompileRegisterToRegister(max, x1r, tmp)
	// x1r=max(v1, v2)
	c.assembler.CompileRegisterToRegister(max, x2r, x1r)
	// x2r=max(v1, v2)
	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, x2r)

	// x2r = -0      if (v1 == -0 && v2 == 0) || (v1 == 0 && v2 == -0)
	//       0       if (v1 == 0 && v2 ==  0)
	//       -0       if (v1 == -0 && v2 == -0)
	//       v1^v2   if v1 == NaN || v2 == NaN
	//       0       otherwise
	c.assembler.CompileRegisterToRegister(xor, tmp, x2r)
	// x1r = -0           if (v1 == -0 && v2 == 0) || (v1 == 0 && v2 == -0)
	//       0            if (v1 == 0 && v2 ==  0)
	//       -0           if (v1 == -0 && v2 == -0)
	//       NaN          if v1 == NaN || v2 == NaN
	//       max(v1, v2)  otherwise
	c.assembler.CompileRegisterToRegister(or, x2r, x1r)
	// Copy x1r into tmp.
	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp)
	// tmp = 0            if (v1 == -0 && v2 == 0) || (v1 == 0 && v2 == -0) || (v1 == 0 && v2 ==  0)
	//       -0           if (v1 == -0 && v2 == -0)
	//       NaN          if v1 == NaN || v2 == NaN
	//       max(v1, v2)  otherwise
	//
	// Note: -0 - (-0) = 0 (!= -0) in floating point operation.
	c.assembler.CompileRegisterToRegister(sub, x2r, tmp)
	// x1r = 0^ if v1 == NaN || v2 == NaN
	c.assembler.CompileRegisterToRegisterWithArg(cmp, x1r, x1r, 3)
	// x1r = set all bits on the mantissa bits
	//       0 otherwise
	c.assembler.CompileConstToRegister(srl, shiftNumToInverseNaN, x1r)
	c.assembler.CompileRegisterToRegister(andn, tmp, x1r)

	c.locationStack.markRegisterUnused(x2r)
	c.pushVectorRuntimeValueLocationOnRegister(x1r)
	return nil
}

// compileV128AvgrU implements compiler.compileV128AvgrU for amd64.
func (c *amd64Compiler) compileV128AvgrU(o *wazeroir.UnionOperation) error {
	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	var inst asm.Instruction
	shape := o.B1
	switch shape {
	case wazeroir.ShapeI8x16:
		inst = amd64.PAVGB
	case wazeroir.ShapeI16x8:
		inst = amd64.PAVGW
	}

	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)

	c.locationStack.markRegisterUnused(x2.register)
	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
	return nil
}

// compileV128Pmin implements compiler.compileV128Pmin for amd64.
func (c *amd64Compiler) compileV128Pmin(o *wazeroir.UnionOperation) error {
	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	var min asm.Instruction
	if o.B1 == wazeroir.ShapeF32x4 {
		min = amd64.MINPS
	} else {
		min = amd64.MINPD
	}

	x1r, v2r := x1.register, x2.register

	c.assembler.CompileRegisterToRegister(min, x1r, v2r)

	c.locationStack.markRegisterUnused(x1r)
	c.pushVectorRuntimeValueLocationOnRegister(v2r)
	return nil
}

// compileV128Pmax implements compiler.compileV128Pmax for amd64.
func (c *amd64Compiler) compileV128Pmax(o *wazeroir.UnionOperation) error {
	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	var min asm.Instruction
	if o.B1 == wazeroir.ShapeF32x4 {
		min = amd64.MAXPS
	} else {
		min = amd64.MAXPD
	}

	x1r, v2r := x1.register, x2.register

	c.assembler.CompileRegisterToRegister(min, x1r, v2r)

	c.locationStack.markRegisterUnused(x1r)
	c.pushVectorRuntimeValueLocationOnRegister(v2r)
	return nil
}

// compileV128Ceil implements compiler.compileV128Ceil for amd64.
func (c *amd64Compiler) compileV128Ceil(o *wazeroir.UnionOperation) error {
	// See https://www.felixcloutier.com/x86/roundpd
	const roundModeCeil = 0x2
	return c.compileV128RoundImpl(o.B1 == wazeroir.ShapeF32x4, roundModeCeil)
}

// compileV128Floor implements compiler.compileV128Floor for amd64.
func (c *amd64Compiler) compileV128Floor(o *wazeroir.UnionOperation) error {
	// See https://www.felixcloutier.com/x86/roundpd
	const roundModeFloor = 0x1
	return c.compileV128RoundImpl(o.B1 == wazeroir.ShapeF32x4, roundModeFloor)
}

// compileV128Trunc implements compiler.compileV128Trunc for amd64.
func (c *amd64Compiler) compileV128Trunc(o *wazeroir.UnionOperation) error {
	// See https://www.felixcloutier.com/x86/roundpd
	const roundModeTrunc = 0x3
	return c.compileV128RoundImpl(o.B1 == wazeroir.ShapeF32x4, roundModeTrunc)
}

// compileV128Nearest implements compiler.compileV128Nearest for amd64.
func (c *amd64Compiler) compileV128Nearest(o *wazeroir.UnionOperation) error {
	// See https://www.felixcloutier.com/x86/roundpd
	const roundModeNearest = 0x0
	return c.compileV128RoundImpl(o.B1 == wazeroir.ShapeF32x4, roundModeNearest)
}

// compileV128RoundImpl implements compileV128Nearest compileV128Trunc compileV128Floor and compileV128Ceil
// with ROUNDPS (32-bit lane) and ROUNDPD (64-bit lane).
func (c *amd64Compiler) compileV128RoundImpl(is32bit bool, mode byte) error {
	v := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(v); err != nil {
		return err
	}
	vr := v.register

	var round asm.Instruction
	if is32bit {
		round = amd64.ROUNDPS
	} else {
		round = amd64.ROUNDPD
	}

	c.assembler.CompileRegisterToRegisterWithArg(round, vr, vr, mode)
	c.pushVectorRuntimeValueLocationOnRegister(vr)
	return nil
}

// compileV128Extend implements compiler.compileV128Extend for amd64.
func (c *amd64Compiler) compileV128Extend(o *wazeroir.UnionOperation) error {
	v := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(v); err != nil {
		return err
	}
	vr := v.register

	originShape := o.B1
	signed := o.B2 == 1
	useLow := o.B3
	if !useLow {
		// We have to shift the higher 64-bits into the lower ones before the actual extending instruction.
		// Shifting right by 0x8 * 8 = 64bits and concatenate itself.
		// See https://www.felixcloutier.com/x86/palignr
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PALIGNR, v.register, v.register, 0x8)
	}

	var extend asm.Instruction
	switch originShape {
	case wazeroir.ShapeI8x16:
		if signed {
			extend = amd64.PMOVSXBW
		} else {
			extend = amd64.PMOVZXBW
		}
	case wazeroir.ShapeI16x8:
		if signed {
			extend = amd64.PMOVSXWD
		} else {
			extend = amd64.PMOVZXWD
		}
	case wazeroir.ShapeI32x4:
		if signed {
			extend = amd64.PMOVSXDQ
		} else {
			extend = amd64.PMOVZXDQ
		}
	}

	c.assembler.CompileRegisterToRegister(extend, vr, vr)
	c.pushVectorRuntimeValueLocationOnRegister(vr)
	return nil
}

// compileV128ExtMul implements compiler.compileV128ExtMul for amd64.
func (c *amd64Compiler) compileV128ExtMul(o *wazeroir.UnionOperation) error {
	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	x1r, x2r := x1.register, x2.register

	originShape := o.B1
	signed := o.B2 == 1
	useLow := o.B3
	switch originShape {
	case wazeroir.ShapeI8x16:
		if !useLow {
			// We have to shift the higher 64-bits into the lower ones before the actual extending instruction.
			// Shifting right by 0x8 * 8 = 64bits and concatenate itself.
			// See https://www.felixcloutier.com/x86/palignr
			c.assembler.CompileRegisterToRegisterWithArg(amd64.PALIGNR, x1r, x1r, 0x8)
			c.assembler.CompileRegisterToRegisterWithArg(amd64.PALIGNR, x2r, x2r, 0x8)
		}

		var ext asm.Instruction
		if signed {
			ext = amd64.PMOVSXBW
		} else {
			ext = amd64.PMOVZXBW
		}

		// Signed or Zero extend lower half packed bytes to packed words.
		c.assembler.CompileRegisterToRegister(ext, x1r, x1r)
		c.assembler.CompileRegisterToRegister(ext, x2r, x2r)

		c.assembler.CompileRegisterToRegister(amd64.PMULLW, x2r, x1r)
	case wazeroir.ShapeI16x8:
		tmp, err := c.allocateRegister(registerTypeVector)
		if err != nil {
			return err
		}

		// Copy the value on x1r to tmp.
		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp)

		// Multiply the values and store the lower 16-bits into x1r.
		c.assembler.CompileRegisterToRegister(amd64.PMULLW, x2r, x1r)
		if signed {
			// Signed multiply the values and store the higher 16-bits into tmp.
			c.assembler.CompileRegisterToRegister(amd64.PMULHW, x2r, tmp)
		} else {
			// Unsigned multiply the values and store the higher 16-bits into tmp.
			c.assembler.CompileRegisterToRegister(amd64.PMULHUW, x2r, tmp)
		}

		// Unpack lower or higher half of vectors (tmp and x1r) and concatenate them.
		if useLow {
			c.assembler.CompileRegisterToRegister(amd64.PUNPCKLWD, tmp, x1r)
		} else {
			c.assembler.CompileRegisterToRegister(amd64.PUNPCKHWD, tmp, x1r)
		}
	case wazeroir.ShapeI32x4:
		var shuffleOrder byte
		// Given that the original state of the register is as [v1, v2, v3, v4] where vN = a word,
		if useLow {
			// This makes the register as [v1, v1, v2, v2]
			shuffleOrder = 0b01010000
		} else {
			// This makes the register as [v3, v3, v4, v4]
			shuffleOrder = 0b11111010
		}
		// See https://www.felixcloutier.com/x86/pshufd
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, x1r, x1r, shuffleOrder)
		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, x2r, x2r, shuffleOrder)

		var mul asm.Instruction
		if signed {
			mul = amd64.PMULDQ
		} else {
			mul = amd64.PMULUDQ
		}
		c.assembler.CompileRegisterToRegister(mul, x2r, x1r)
	}

	c.locationStack.markRegisterUnused(x2r)
	c.pushVectorRuntimeValueLocationOnRegister(x1r)
	return nil
}

var q15mulrSatSMask = [16]byte{
	0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
	0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
}

// compileV128Q15mulrSatS implements compiler.compileV128Q15mulrSatS for amd64.
func (c *amd64Compiler) compileV128Q15mulrSatS(*wazeroir.UnionOperation) error {
	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	tmp, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}

	x1r, x2r := x1.register, x2.register

	// See https://github.com/WebAssembly/simd/pull/365 for the following logic.
	if err := c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(q15mulrSatSMask[:]), tmp); err != nil {
		return err
	}

	c.assembler.CompileRegisterToRegister(amd64.PMULHRSW, x2r, x1r)
	c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, x1r, tmp)
	c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, x1r)

	c.locationStack.markRegisterUnused(x2r)
	c.pushVectorRuntimeValueLocationOnRegister(x1r)
	return nil
}

var (
	allOnesI8x16 = [16]byte{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1}
	allOnesI16x8 = [16]byte{0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0}

	extAddPairwiseI16x8uMask = [16 * 2]byte{
		0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
		0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00,
	}
)

// compileV128ExtAddPairwise implements compiler.compileV128ExtAddPairwise for amd64.
func (c *amd64Compiler) compileV128ExtAddPairwise(o *wazeroir.UnionOperation) error {
	v := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(v); err != nil {
		return err
	}
	vr := v.register

	originShape := o.B1
	signed := o.B3
	switch originShape {
	case wazeroir.ShapeI8x16:
		allOnesReg, err := c.allocateRegister(registerTypeVector)
		if err != nil {
			return err
		}

		if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU,
			asm.NewStaticConst(allOnesI8x16[:]), allOnesReg); err != nil {
			return err
		}

		var result asm.Register
		// See https://www.felixcloutier.com/x86/pmaddubsw for detail.
		if signed {
			// Interpret vr's value as signed byte and multiply with one and add pairwise, which results in pairwise
			// signed extadd.
			c.assembler.CompileRegisterToRegister(amd64.PMADDUBSW, vr, allOnesReg)
			result = allOnesReg
		} else {
			// Interpreter tmp (all ones) as signed byte meaning that all the multiply-add is unsigned.
			c.assembler.CompileRegisterToRegister(amd64.PMADDUBSW, allOnesReg, vr)
			result = vr
		}

		if result != vr {
			c.locationStack.markRegisterUnused(vr)
		}
		c.pushVectorRuntimeValueLocationOnRegister(result)
	case wazeroir.ShapeI16x8:
		tmp, err := c.allocateRegister(registerTypeVector)
		if err != nil {
			return err
		}

		if signed {
			// See https://www.felixcloutier.com/x86/pmaddwd
			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU,
				asm.NewStaticConst(allOnesI16x8[:]), tmp); err != nil {
				return err
			}

			c.assembler.CompileRegisterToRegister(amd64.PMADDWD, tmp, vr)
			c.pushVectorRuntimeValueLocationOnRegister(vr)
		} else {

			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU,
				asm.NewStaticConst(extAddPairwiseI16x8uMask[:16]), tmp); err != nil {
				return err
			}

			// Flip the sign bits on vr.
			//
			// Assuming that vr = [w1, ..., w8], now we have,
			// 	vr[i] = int8(-w1) for i = 0...8
			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, vr)

			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU,
				asm.NewStaticConst(allOnesI16x8[:]), tmp); err != nil {
				return err
			}

			// For i = 0,..4 (as this results in i32x4 lanes), now we have
			// vr[i] = int32(-wn + -w(n+1)) = int32(-(wn + w(n+1)))
			c.assembler.CompileRegisterToRegister(amd64.PMADDWD, tmp, vr)

			// tmp[i] = [0, 0, 1, 0] = int32(math.MaxInt16+1)
			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU,
				asm.NewStaticConst(extAddPairwiseI16x8uMask[16:]), tmp); err != nil {
				return err
			}

			// vr[i] = int32(-(wn + w(n+1))) + int32(math.MaxInt16+1) = int32((wn + w(n+1))) = uint32(wn + w(n+1)).
			c.assembler.CompileRegisterToRegister(amd64.PADDD, tmp, vr)
			c.pushVectorRuntimeValueLocationOnRegister(vr)
		}
	}
	return nil
}

// compileV128FloatPromote implements compiler.compileV128FloatPromote for amd64.
func (c *amd64Compiler) compileV128FloatPromote(*wazeroir.UnionOperation) error {
	v := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(v); err != nil {
		return err
	}
	vr := v.register

	c.assembler.CompileRegisterToRegister(amd64.CVTPS2PD, vr, vr)
	c.pushVectorRuntimeValueLocationOnRegister(vr)
	return nil
}

// compileV128FloatDemote implements compiler.compileV128FloatDemote for amd64.
func (c *amd64Compiler) compileV128FloatDemote(*wazeroir.UnionOperation) error {
	v := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(v); err != nil {
		return err
	}
	vr := v.register

	c.assembler.CompileRegisterToRegister(amd64.CVTPD2PS, vr, vr)
	c.pushVectorRuntimeValueLocationOnRegister(vr)
	return nil
}

// compileV128Dot implements compiler.compileV128Dot for amd64.
func (c *amd64Compiler) compileV128Dot(*wazeroir.UnionOperation) error {
	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	c.assembler.CompileRegisterToRegister(amd64.PMADDWD, x2.register, x1.register)

	c.locationStack.markRegisterUnused(x2.register)
	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
	return nil
}

var fConvertFromIMask = [16]byte{
	0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
}

// compileV128FConvertFromI implements compiler.compileV128FConvertFromI for amd64.
func (c *amd64Compiler) compileV128FConvertFromI(o *wazeroir.UnionOperation) error {
	v := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(v); err != nil {
		return err
	}
	vr := v.register

	destinationShape := o.B1
	signed := o.B3

	switch destinationShape {
	case wazeroir.ShapeF32x4:
		if signed {
			c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, vr, vr)
		} else {
			tmp, err := c.allocateRegister(registerTypeVector)
			if err != nil {
				return err
			}

			// Copy the value into tmp.
			c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp)

			// Clear the higher 16-bits of tmp.
			c.assembler.CompileConstToRegister(amd64.PSLLD, 0xa, tmp)
			c.assembler.CompileConstToRegister(amd64.PSRLD, 0xa, tmp)

			// Subtract the higher 16-bits from vr == clear the lower 16-bits of vr.
			c.assembler.CompileRegisterToRegister(amd64.PSUBD, tmp, vr)

			// Convert the lower 16-bits in tmp.
			c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, tmp, tmp)

			// Left shift by one and convert vr, meaning that halved conversion result of higher 16-bits in vr.
			c.assembler.CompileConstToRegister(amd64.PSRLD, 1, vr)
			c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, vr, vr)

			// Double the converted halved higher 16bits.
			c.assembler.CompileRegisterToRegister(amd64.ADDPS, vr, vr)

			// Get the conversion result by add tmp (holding lower 16-bit conversion) into vr.
			c.assembler.CompileRegisterToRegister(amd64.ADDPS, tmp, vr)
		}
	case wazeroir.ShapeF64x2:
		if signed {
			c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PD, vr, vr)
		} else {
			tmp, err := c.allocateRegister(registerTypeVector)
			if err != nil {
				return err
			}

			// tmp = [0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]
			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(fConvertFromIMask[:16]), tmp); err != nil {
				return err
			}

			// Given that we have vr = [d1, d2, d3, d4], this results in
			//	vr = [d1, [0x00, 0x00, 0x30, 0x43], d2, [0x00, 0x00, 0x30, 0x43]]
			//     = [float64(uint32(d1)) + 0x1.0p52, float64(uint32(d2)) + 0x1.0p52]
			//     ^See https://stackoverflow.com/questions/13269523/can-all-32-bit-ints-be-exactly-represented-as-a-double
			c.assembler.CompileRegisterToRegister(amd64.UNPCKLPS, tmp, vr)

			// tmp = [float64(0x1.0p52), float64(0x1.0p52)]
			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU,
				asm.NewStaticConst(twop52[:]), tmp); err != nil {
				return err
			}

			// Now, we get the result as
			// 	vr = [float64(uint32(d1)), float64(uint32(d2))]
			// because the following equality always satisfies:
			//  float64(0x1.0p52 + float64(uint32(x))) - float64(0x1.0p52 + float64(uint32(y))) = float64(uint32(x)) - float64(uint32(y))
			c.assembler.CompileRegisterToRegister(amd64.SUBPD, tmp, vr)
		}
	}

	c.pushVectorRuntimeValueLocationOnRegister(vr)
	return nil
}

// compileV128Narrow implements compiler.compileV128Narrow for amd64.
func (c *amd64Compiler) compileV128Narrow(o *wazeroir.UnionOperation) error {
	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	var narrow asm.Instruction
	originShape := o.B1
	signed := o.B3
	switch originShape {
	case wazeroir.ShapeI16x8:
		if signed {
			narrow = amd64.PACKSSWB
		} else {
			narrow = amd64.PACKUSWB
		}
	case wazeroir.ShapeI32x4:
		if signed {
			narrow = amd64.PACKSSDW
		} else {
			narrow = amd64.PACKUSDW
		}
	}
	c.assembler.CompileRegisterToRegister(narrow, x2.register, x1.register)

	c.locationStack.markRegisterUnused(x2.register)
	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
	return nil
}

var (
	// i32sMaxOnF64x2 holds math.MaxInt32(=2147483647.0) on two f64 lanes.
	i32sMaxOnF64x2 = [16]byte{
		0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
		0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
	}

	// i32sMaxOnF64x2 holds math.MaxUint32(=4294967295.0) on two f64 lanes.
	i32uMaxOnF64x2 = [16]byte{
		0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
		0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
	}

	// twop52 holds two float64(0x1.0p52) on two f64 lanes. 0x1.0p52 is special in the sense that
	// with this exponent, the mantissa represents a corresponding uint32 number, and arithmetics,
	// like addition or subtraction, the resulted floating point holds exactly the same
	// bit representations in 32-bit integer on its mantissa.
	//
	// Note: the name twop52 is common across various compiler ecosystem.
	// 	E.g. https://github.com/llvm/llvm-project/blob/92ab024f81e5b64e258b7c3baaf213c7c26fcf40/compiler-rt/lib/builtins/floatdidf.c#L28
	// 	E.g. https://opensource.apple.com/source/clang/clang-425.0.24/src/projects/compiler-rt/lib/floatdidf.c.auto.html
	twop52 = [16]byte{
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
	}
)

// compileV128ITruncSatFromF implements compiler.compileV128ITruncSatFromF for amd64.
func (c *amd64Compiler) compileV128ITruncSatFromF(o *wazeroir.UnionOperation) error {
	v := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(v); err != nil {
		return err
	}
	vr := v.register

	tmp, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}

	c.locationStack.markRegisterUsed(tmp)

	originShape := o.B1
	signed := o.B3
	switch originShape {
	case wazeroir.ShapeF32x4:
		if signed {
			// Copy the value into tmp.
			c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp)

			// Assuming we have vr = [v1, v2, v3, v4].
			//
			// Set all bits if lane is not NaN on tmp.
			// tmp[i] = 0xffffffff  if vi != NaN
			//        = 0           if vi == NaN
			c.assembler.CompileRegisterToRegister(amd64.CMPEQPS, tmp, tmp)

			// Clear NaN lanes on vr, meaning that
			// 	vr[i] = vi  if vi != NaN
			//	        0   if vi == NaN
			c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmp, vr)

			// tmp[i] = ^vi         if vi != NaN
			//        = 0xffffffff  if vi == NaN
			// which means that tmp[i] & 0x80000000 != 0 if and only if vi is negative.
			c.assembler.CompileRegisterToRegister(amd64.PXOR, vr, tmp)

			// vr[i] = int32(vi)   if vi != NaN and vr is not overflowing.
			//       = 0x80000000  if vi != NaN and vr is overflowing (See https://www.felixcloutier.com/x86/cvttps2dq)
			//       = 0           if vi == NaN
			c.assembler.CompileRegisterToRegister(amd64.CVTTPS2DQ, vr, vr)

			// Below, we have to convert 0x80000000 into 0x7FFFFFFF for positive overflowing lane.
			//
			// tmp[i] = 0x80000000                         if vi is positive
			//        = any satisfying any&0x80000000 = 0  if vi is negative or zero.
			c.assembler.CompileRegisterToRegister(amd64.PAND, vr, tmp)

			// Arithmetic right shifting tmp by 31, meaning that we have
			// tmp[i] = 0xffffffff if vi is positive, 0 otherwise.
			c.assembler.CompileConstToRegister(amd64.PSRAD, 0x1f, tmp)

			// Flipping 0x80000000 if vi is positive, otherwise keep intact.
			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, vr)
		} else {
			tmp2, err := c.allocateRegister(registerTypeVector)
			if err != nil {
				return err
			}

			// See https://github.com/bytecodealliance/wasmtime/pull/2440
			// Note: even v8 doesn't seem to have support for this i32x4.tranc_sat_f32x4_u.
			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp)
			c.assembler.CompileRegisterToRegister(amd64.MAXPS, tmp, vr)
			c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp)
			c.assembler.CompileConstToRegister(amd64.PSRLD, 0x1, tmp)
			c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, tmp, tmp)
			c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp2)
			c.assembler.CompileRegisterToRegister(amd64.CVTTPS2DQ, vr, vr)
			c.assembler.CompileRegisterToRegister(amd64.SUBPS, tmp, tmp2)
			c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, tmp2, tmp, 0x2) // == CMPLEPS
			c.assembler.CompileRegisterToRegister(amd64.CVTTPS2DQ, tmp2, tmp2)
			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp2)
			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp)
			c.assembler.CompileRegisterToRegister(amd64.PMAXSD, tmp, tmp2)
			c.assembler.CompileRegisterToRegister(amd64.PADDD, tmp2, vr)
		}
	case wazeroir.ShapeF64x2:
		tmp2, err := c.allocateRegister(registerTypeVector)
		if err != nil {
			return err
		}

		if signed {
			// Copy the value into tmp.
			c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp)

			// Set all bits for non-NaN lanes, zeros otherwise.
			// I.e. tmp[i] = 0xffffffff_ffffffff if vi != NaN, 0 otherwise.
			c.assembler.CompileRegisterToRegister(amd64.CMPEQPD, tmp, tmp)

			// Load the 2147483647 into tmp2's each lane.
			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVUPD, asm.NewStaticConst(i32sMaxOnF64x2[:]), tmp2); err != nil {
				return err
			}

			// tmp[i] = 2147483647 if vi != NaN, 0 otherwise.
			c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmp2, tmp)

			// MINPD returns the source register's value as-is, so we have
			//  vr[i] = vi   if vi != NaN
			//        = 0    if vi == NaN
			c.assembler.CompileRegisterToRegister(amd64.MINPD, tmp, vr)

			c.assembler.CompileRegisterToRegister(amd64.CVTTPD2DQ, vr, vr)
		} else {
			// Clears all bits on tmp.
			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp)

			//  vr[i] = vi   if vi != NaN && vi > 0
			//        = 0    if vi == NaN || vi <= 0
			c.assembler.CompileRegisterToRegister(amd64.MAXPD, tmp, vr)

			// tmp2[i] = float64(math.MaxUint32) = math.MaxUint32
			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVUPD, asm.NewStaticConst(i32uMaxOnF64x2[:]), tmp2); err != nil {
				return err
			}

			// vr[i] = vi   if vi != NaN && vi > 0 && vi <= math.MaxUint32
			//       = 0    otherwise
			c.assembler.CompileRegisterToRegister(amd64.MINPD, tmp2, vr)

			// Round the floating points into integer.
			c.assembler.CompileRegisterToRegisterWithArg(amd64.ROUNDPD, vr, vr, 0x3)

			// tmp2[i] = float64(0x1.0p52)
			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVUPD, asm.NewStaticConst(twop52[:]), tmp2); err != nil {
				return err
			}

			// vr[i] = float64(0x1.0p52) + float64(uint32(vi)) if vi != NaN && vi > 0 && vi <= math.MaxUint32
			//       = 0                                       otherwise
			//
			// This means that vr[i] holds exactly the same bit of uint32(vi) in its lower 32-bits.
			c.assembler.CompileRegisterToRegister(amd64.ADDPD, tmp2, vr)

			// At this point, we have
			// 	vr  = [uint32(v0), float64(0x1.0p52), uint32(v1), float64(0x1.0p52)]
			//  tmp = [0, 0, 0, 0]
			// as 32x4 lanes. Therefore, SHUFPS with 0b00_00_10_00 results in
			//	vr = [vr[00], vr[10], tmp[00], tmp[00]] = [vr[00], vr[10], 0, 0]
			// meaning that for i = 0 and 1, we have
			//  vr[i] = uint32(vi) if vi != NaN && vi > 0 && vi <= math.MaxUint32
			//        = 0          otherwise.
			c.assembler.CompileRegisterToRegisterWithArg(amd64.SHUFPS, tmp, vr, 0b00_00_10_00)
		}
	}

	c.locationStack.markRegisterUnused(tmp)
	c.pushVectorRuntimeValueLocationOnRegister(vr)
	return nil
}