Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto updates from Herbert Xu:

 - Fixed algorithm construction hang when self-test fails.
 - Added SHA variants to talitos AEAD list.
 - New driver for Exynos random number generator.
 - Performance enhancements for arc4.
 - Added hwrng support to caam.
 - Added ahash support to caam.
 - Fixed bad kfree in aesni-intel.
 - Allow aesni-intel in FIPS mode.
 - Added atmel driver with support for AES/3DES/SHA.
 - Bug fixes for mv_cesa.
 - CRC hardware driver for BF60x family processors.

* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (66 commits)
  crypto: twofish-avx - remove useless instruction
  crypto: testmgr - add aead cbc aes hmac sha1,256,512 test vectors
  crypto: talitos - add sha224, sha384 and sha512 to existing AEAD algorithms
  crypto: talitos - export the talitos_submit function
  crypto: talitos - move talitos structures to header file
  crypto: atmel - add new tests to tcrypt
  crypto: atmel - add Atmel SHA1/SHA256 driver
  crypto: atmel - add Atmel DES/TDES driver
  crypto: atmel - add Atmel AES driver
  ARM: AT91SAM9G45: add crypto peripherals
  crypto: testmgr - allow aesni-intel and ghash_clmulni-intel in fips mode
  hwrng: exynos - Add support for Exynos random number generator
  crypto: aesni-intel - fix wrong kfree pointer
  crypto: caam - ERA retrieval and printing for SEC device
  crypto: caam - Using alloc_coherent for caam job rings
  crypto: algapi - Fix hang on crypto allocation
  crypto: arc4 - now arc needs blockcipher support
  crypto: caam - one tasklet per job ring
  crypto: caam - consolidate memory barriers from job ring en/dequeue
  crypto: caam - only query h/w in job ring dequeue path
  ...
  • Loading branch information...
commit 44a6b8442190cf213081060b610dae2e822f802b 2 parents 945c40c + a434788
@torvalds authored
Showing with 14,289 additions and 1,842 deletions.
  1. +12 −1 arch/arm/mach-at91/at91sam9g45.c
  2. +128 −0 arch/arm/mach-at91/at91sam9g45_devices.c
  3. +2 −0  arch/arm/mach-at91/include/mach/at91sam9g45.h
  4. +0 −1  arch/powerpc/Makefile
  5. +0 −18 arch/s390/crypto/crypto_des.h
  6. +7 −7 arch/x86/crypto/Makefile
  7. +149 −0 arch/x86/crypto/ablk_helper.c
  8. +1 −1  arch/x86/crypto/aes_glue.c
  9. +13 −97 arch/x86/crypto/aesni-intel_glue.c
  10. +86 −269 arch/x86/crypto/camellia_glue.c
  11. +307 −0 arch/x86/crypto/glue_helper.c
  12. +704 −0 arch/x86/crypto/serpent-avx-x86_64-asm_64.S
  13. +636 −0 arch/x86/crypto/serpent_avx_glue.c
  14. +100 −413 arch/x86/crypto/serpent_sse2_glue.c
  15. +1 −1  arch/x86/crypto/sha1_ssse3_asm.S
  16. +3 −3 arch/x86/crypto/sha1_ssse3_glue.c
  17. +300 −0 arch/x86/crypto/twofish-avx-x86_64-asm_64.S
  18. +624 −0 arch/x86/crypto/twofish_avx_glue.c
  19. +109 −300 arch/x86/crypto/twofish_glue_3way.c
  20. +31 −0 arch/x86/include/asm/crypto/ablk_helper.h
  21. 0  arch/x86/include/asm/{ → crypto}/aes.h
  22. +115 −0 arch/x86/include/asm/crypto/glue_helper.h
  23. +32 −0 arch/x86/include/asm/crypto/serpent-avx.h
  24. +2 −2 arch/x86/include/asm/{serpent.h → crypto/serpent-sse2.h}
  25. +46 −0 arch/x86/include/asm/crypto/twofish.h
  26. +66 −1 crypto/Kconfig
  27. +0 −17 crypto/algapi.c
  28. +9 −8 crypto/algboss.c
  29. +90 −25 crypto/arc4.c
  30. +0 −1  crypto/internal.h
  31. +81 −5 crypto/tcrypt.c
  32. +203 −0 crypto/testmgr.c
  33. +1,820 −32 crypto/testmgr.h
  34. +12 −0 drivers/char/hw_random/Kconfig
  35. +1 −0  drivers/char/hw_random/Makefile
  36. +182 −0 drivers/char/hw_random/exynos-rng.c
  37. +13 −8 drivers/char/hw_random/mxc-rnga.c
  38. +55 −1 drivers/crypto/Kconfig
  39. +6 −1 drivers/crypto/Makefile
  40. +62 −0 drivers/crypto/atmel-aes-regs.h
  41. +1,206 −0 drivers/crypto/atmel-aes.c
  42. +46 −0 drivers/crypto/atmel-sha-regs.h
  43. +1,112 −0 drivers/crypto/atmel-sha.c
  44. +89 −0 drivers/crypto/atmel-tdes-regs.h
  45. +1,215 −0 drivers/crypto/atmel-tdes.c
  46. +780 −0 drivers/crypto/bfin_crc.c
  47. +29 −1 drivers/crypto/caam/Kconfig
  48. +3 −1 drivers/crypto/caam/Makefile
  49. +196 −376 drivers/crypto/caam/caamalg.c
  50. +1,878 −0 drivers/crypto/caam/caamhash.c
  51. +309 −0 drivers/crypto/caam/caamrng.c
  52. +2 −0  drivers/crypto/caam/compat.h
  53. +173 −6 drivers/crypto/caam/ctrl.c
  54. +13 −0 drivers/crypto/caam/ctrl.h
  55. +15 −16 drivers/crypto/caam/desc.h
  56. +51 −6 drivers/crypto/caam/desc_constr.h
  57. +34 −10 drivers/crypto/caam/error.c
  58. +3 −3 drivers/crypto/caam/intern.h
  59. +39 −76 drivers/crypto/caam/jr.c
  60. +122 −0 drivers/crypto/caam/key_gen.c
  61. +17 −0 drivers/crypto/caam/key_gen.h
  62. +401 −0 drivers/crypto/caam/pdb.h
  63. +36 −2 drivers/crypto/caam/regs.h
  64. +156 −0 drivers/crypto/caam/sg_sw_sec4.h
  65. +45 −16 drivers/crypto/mv_cesa.c
  66. +166 −117 drivers/crypto/talitos.c
  67. +123 −0 drivers/crypto/talitos.h
  68. +22 −0 include/linux/platform_data/atmel-aes.h
View
13 arch/arm/mach-at91/at91sam9g45.c
@@ -183,6 +183,13 @@ static struct clk adc_op_clk = {
.rate_hz = 13200000,
};
+/* AES/TDES/SHA clock - Only for sam9m11/sam9g56 */
+static struct clk aestdessha_clk = {
+ .name = "aestdessha_clk",
+ .pmc_mask = 1 << AT91SAM9G45_ID_AESTDESSHA,
+ .type = CLK_TYPE_PERIPHERAL,
+};
+
static struct clk *periph_clocks[] __initdata = {
&pioA_clk,
&pioB_clk,
@@ -212,6 +219,7 @@ static struct clk *periph_clocks[] __initdata = {
&udphs_clk,
&mmc1_clk,
&adc_op_clk,
+ &aestdessha_clk,
// irq0
};
@@ -232,6 +240,9 @@ static struct clk_lookup periph_clocks_lookups[] = {
CLKDEV_CON_DEV_ID("pclk", "ssc.0", &ssc0_clk),
CLKDEV_CON_DEV_ID("pclk", "ssc.1", &ssc1_clk),
CLKDEV_CON_DEV_ID(NULL, "atmel-trng", &trng_clk),
+ CLKDEV_CON_DEV_ID(NULL, "atmel_sha", &aestdessha_clk),
+ CLKDEV_CON_DEV_ID(NULL, "atmel_tdes", &aestdessha_clk),
+ CLKDEV_CON_DEV_ID(NULL, "atmel_aes", &aestdessha_clk),
/* more usart lookup table for DT entries */
CLKDEV_CON_DEV_ID("usart", "ffffee00.serial", &mck),
CLKDEV_CON_DEV_ID("usart", "fff8c000.serial", &usart0_clk),
@@ -388,7 +399,7 @@ static unsigned int at91sam9g45_default_irq_priority[NR_AIC_IRQS] __initdata = {
3, /* Ethernet */
0, /* Image Sensor Interface */
2, /* USB Device High speed port */
- 0,
+ 0, /* AESTDESSHA Crypto HW Accelerators */
0, /* Multimedia Card Interface 1 */
0,
0, /* Advanced Interrupt Controller (IRQ0) */
View
128 arch/arm/mach-at91/at91sam9g45_devices.c
@@ -18,6 +18,7 @@
#include <linux/platform_device.h>
#include <linux/i2c-gpio.h>
#include <linux/atmel-mci.h>
+#include <linux/platform_data/atmel-aes.h>
#include <linux/platform_data/at91_adc.h>
@@ -1830,6 +1831,130 @@ void __init at91_register_uart(unsigned id, unsigned portnr, unsigned pins) {}
void __init at91_add_device_serial(void) {}
#endif
+/* --------------------------------------------------------------------
+ * SHA1/SHA256
+ * -------------------------------------------------------------------- */
+
+#if defined(CONFIG_CRYPTO_DEV_ATMEL_SHA) || defined(CONFIG_CRYPTO_DEV_ATMEL_SHA_MODULE)
+static struct resource sha_resources[] = {
+ {
+ .start = AT91SAM9G45_BASE_SHA,
+ .end = AT91SAM9G45_BASE_SHA + SZ_16K - 1,
+ .flags = IORESOURCE_MEM,
+ },
+ [1] = {
+ .start = AT91SAM9G45_ID_AESTDESSHA,
+ .end = AT91SAM9G45_ID_AESTDESSHA,
+ .flags = IORESOURCE_IRQ,
+ },
+};
+
+static struct platform_device at91sam9g45_sha_device = {
+ .name = "atmel_sha",
+ .id = -1,
+ .resource = sha_resources,
+ .num_resources = ARRAY_SIZE(sha_resources),
+};
+
+static void __init at91_add_device_sha(void)
+{
+ platform_device_register(&at91sam9g45_sha_device);
+}
+#else
+static void __init at91_add_device_sha(void) {}
+#endif
+
+/* --------------------------------------------------------------------
+ * DES/TDES
+ * -------------------------------------------------------------------- */
+
+#if defined(CONFIG_CRYPTO_DEV_ATMEL_TDES) || defined(CONFIG_CRYPTO_DEV_ATMEL_TDES_MODULE)
+static struct resource tdes_resources[] = {
+ [0] = {
+ .start = AT91SAM9G45_BASE_TDES,
+ .end = AT91SAM9G45_BASE_TDES + SZ_16K - 1,
+ .flags = IORESOURCE_MEM,
+ },
+ [1] = {
+ .start = AT91SAM9G45_ID_AESTDESSHA,
+ .end = AT91SAM9G45_ID_AESTDESSHA,
+ .flags = IORESOURCE_IRQ,
+ },
+};
+
+static struct platform_device at91sam9g45_tdes_device = {
+ .name = "atmel_tdes",
+ .id = -1,
+ .resource = tdes_resources,
+ .num_resources = ARRAY_SIZE(tdes_resources),
+};
+
+static void __init at91_add_device_tdes(void)
+{
+ platform_device_register(&at91sam9g45_tdes_device);
+}
+#else
+static void __init at91_add_device_tdes(void) {}
+#endif
+
+/* --------------------------------------------------------------------
+ * AES
+ * -------------------------------------------------------------------- */
+
+#if defined(CONFIG_CRYPTO_DEV_ATMEL_AES) || defined(CONFIG_CRYPTO_DEV_ATMEL_AES_MODULE)
+static struct aes_platform_data aes_data;
+static u64 aes_dmamask = DMA_BIT_MASK(32);
+
+static struct resource aes_resources[] = {
+ [0] = {
+ .start = AT91SAM9G45_BASE_AES,
+ .end = AT91SAM9G45_BASE_AES + SZ_16K - 1,
+ .flags = IORESOURCE_MEM,
+ },
+ [1] = {
+ .start = AT91SAM9G45_ID_AESTDESSHA,
+ .end = AT91SAM9G45_ID_AESTDESSHA,
+ .flags = IORESOURCE_IRQ,
+ },
+};
+
+static struct platform_device at91sam9g45_aes_device = {
+ .name = "atmel_aes",
+ .id = -1,
+ .dev = {
+ .dma_mask = &aes_dmamask,
+ .coherent_dma_mask = DMA_BIT_MASK(32),
+ .platform_data = &aes_data,
+ },
+ .resource = aes_resources,
+ .num_resources = ARRAY_SIZE(aes_resources),
+};
+
+static void __init at91_add_device_aes(void)
+{
+ struct at_dma_slave *atslave;
+ struct aes_dma_data *alt_atslave;
+
+ alt_atslave = kzalloc(sizeof(struct aes_dma_data), GFP_KERNEL);
+
+ /* DMA TX slave channel configuration */
+ atslave = &alt_atslave->txdata;
+ atslave->dma_dev = &at_hdmac_device.dev;
+ atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE | ATC_SRC_H2SEL_HW |
+ ATC_SRC_PER(AT_DMA_ID_AES_RX);
+
+ /* DMA RX slave channel configuration */
+ atslave = &alt_atslave->rxdata;
+ atslave->dma_dev = &at_hdmac_device.dev;
+ atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE | ATC_DST_H2SEL_HW |
+ ATC_DST_PER(AT_DMA_ID_AES_TX);
+
+ aes_data.dma_slave = alt_atslave;
+ platform_device_register(&at91sam9g45_aes_device);
+}
+#else
+static void __init at91_add_device_aes(void) {}
+#endif
/* -------------------------------------------------------------------- */
/*
@@ -1847,6 +1972,9 @@ static int __init at91_add_standard_devices(void)
at91_add_device_trng();
at91_add_device_watchdog();
at91_add_device_tc();
+ at91_add_device_sha();
+ at91_add_device_tdes();
+ at91_add_device_aes();
return 0;
}
View
2  arch/arm/mach-at91/include/mach/at91sam9g45.h
@@ -136,6 +136,8 @@
#define AT_DMA_ID_SSC1_RX 8
#define AT_DMA_ID_AC97_TX 9
#define AT_DMA_ID_AC97_RX 10
+#define AT_DMA_ID_AES_TX 11
+#define AT_DMA_ID_AES_RX 12
#define AT_DMA_ID_MCI1 13
#endif
View
1  arch/powerpc/Makefile
@@ -149,7 +149,6 @@ core-$(CONFIG_KVM) += arch/powerpc/kvm/
core-$(CONFIG_PERF_EVENTS) += arch/powerpc/perf/
drivers-$(CONFIG_OPROFILE) += arch/powerpc/oprofile/
-drivers-$(CONFIG_CRYPTO_DEV_NX) += drivers/crypto/nx/
# Default to zImage, override when needed
all: zImage
View
18 arch/s390/crypto/crypto_des.h
@@ -1,18 +0,0 @@
-/*
- * Cryptographic API.
- *
- * Function for checking keys for the DES and Tripple DES Encryption
- * algorithms.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- */
-#ifndef __CRYPTO_DES_H__
-#define __CRYPTO_DES_H__
-
-extern int crypto_des_check_key(const u8*, unsigned int, u32*);
-
-#endif /*__CRYPTO_DES_H__*/
View
14 arch/x86/crypto/Makefile
@@ -2,6 +2,9 @@
# Arch-specific CryptoAPI modules.
#
+obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o
+obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
+
obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
@@ -12,8 +15,10 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
+obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
+obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
@@ -30,16 +35,11 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
+twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o
salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
+serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
-
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
-
-# enable AVX support only when $(AS) can actually assemble the instructions
-ifeq ($(call as-instr,vpxor %xmm0$(comma)%xmm1$(comma)%xmm2,yes,no),yes)
-AFLAGS_sha1_ssse3_asm.o += -DSHA1_ENABLE_AVX_SUPPORT
-CFLAGS_sha1_ssse3_glue.o += -DSHA1_ENABLE_AVX_SUPPORT
-endif
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
View
149 arch/x86/crypto/ablk_helper.c
@@ -0,0 +1,149 @@
+/*
+ * Shared async block cipher helpers
+ *
+ * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on aesni-intel_glue.c by:
+ * Copyright (C) 2008, Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <crypto/algapi.h>
+#include <crypto/cryptd.h>
+#include <asm/i387.h>
+#include <asm/crypto/ablk_helper.h>
+
+int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+ struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
+ int err;
+
+ crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+ crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
+ & CRYPTO_TFM_REQ_MASK);
+ err = crypto_ablkcipher_setkey(child, key, key_len);
+ crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
+ & CRYPTO_TFM_RES_MASK);
+ return err;
+}
+EXPORT_SYMBOL_GPL(ablk_set_key);
+
+int __ablk_encrypt(struct ablkcipher_request *req)
+{
+ struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+ struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+ struct blkcipher_desc desc;
+
+ desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
+ desc.info = req->info;
+ desc.flags = 0;
+
+ return crypto_blkcipher_crt(desc.tfm)->encrypt(
+ &desc, req->dst, req->src, req->nbytes);
+}
+EXPORT_SYMBOL_GPL(__ablk_encrypt);
+
+int ablk_encrypt(struct ablkcipher_request *req)
+{
+ struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+ struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+
+ if (!irq_fpu_usable()) {
+ struct ablkcipher_request *cryptd_req =
+ ablkcipher_request_ctx(req);
+
+ memcpy(cryptd_req, req, sizeof(*req));
+ ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+
+ return crypto_ablkcipher_encrypt(cryptd_req);
+ } else {
+ return __ablk_encrypt(req);
+ }
+}
+EXPORT_SYMBOL_GPL(ablk_encrypt);
+
+int ablk_decrypt(struct ablkcipher_request *req)
+{
+ struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+ struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+
+ if (!irq_fpu_usable()) {
+ struct ablkcipher_request *cryptd_req =
+ ablkcipher_request_ctx(req);
+
+ memcpy(cryptd_req, req, sizeof(*req));
+ ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+
+ return crypto_ablkcipher_decrypt(cryptd_req);
+ } else {
+ struct blkcipher_desc desc;
+
+ desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
+ desc.info = req->info;
+ desc.flags = 0;
+
+ return crypto_blkcipher_crt(desc.tfm)->decrypt(
+ &desc, req->dst, req->src, req->nbytes);
+ }
+}
+EXPORT_SYMBOL_GPL(ablk_decrypt);
+
+void ablk_exit(struct crypto_tfm *tfm)
+{
+ struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ cryptd_free_ablkcipher(ctx->cryptd_tfm);
+}
+EXPORT_SYMBOL_GPL(ablk_exit);
+
+int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name)
+{
+ struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm);
+ struct cryptd_ablkcipher *cryptd_tfm;
+
+ cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+
+ ctx->cryptd_tfm = cryptd_tfm;
+ tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
+ crypto_ablkcipher_reqsize(&cryptd_tfm->base);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ablk_init_common);
+
+int ablk_init(struct crypto_tfm *tfm)
+{
+ char drv_name[CRYPTO_MAX_ALG_NAME];
+
+ snprintf(drv_name, sizeof(drv_name), "__driver-%s",
+ crypto_tfm_alg_driver_name(tfm));
+
+ return ablk_init_common(tfm, drv_name);
+}
+EXPORT_SYMBOL_GPL(ablk_init);
+
+MODULE_LICENSE("GPL");
View
2  arch/x86/crypto/aes_glue.c
@@ -5,7 +5,7 @@
#include <linux/module.h>
#include <crypto/aes.h>
-#include <asm/aes.h>
+#include <asm/crypto/aes.h>
asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
View
110 arch/x86/crypto/aesni-intel_glue.c
@@ -30,7 +30,8 @@
#include <crypto/ctr.h>
#include <asm/cpu_device_id.h>
#include <asm/i387.h>
-#include <asm/aes.h>
+#include <asm/crypto/aes.h>
+#include <asm/crypto/ablk_helper.h>
#include <crypto/scatterwalk.h>
#include <crypto/internal/aead.h>
#include <linux/workqueue.h>
@@ -52,10 +53,6 @@
#define HAS_XTS
#endif
-struct async_aes_ctx {
- struct cryptd_ablkcipher *cryptd_tfm;
-};
-
/* This data is stored at the end of the crypto_tfm struct.
* It's a type of per "session" data storage location.
* This needs to be 16 byte aligned.
@@ -377,87 +374,6 @@ static int ctr_crypt(struct blkcipher_desc *desc,
}
#endif
-static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
- unsigned int key_len)
-{
- struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
- struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
- int err;
-
- crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
- crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
- & CRYPTO_TFM_REQ_MASK);
- err = crypto_ablkcipher_setkey(child, key, key_len);
- crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
- & CRYPTO_TFM_RES_MASK);
- return err;
-}
-
-static int ablk_encrypt(struct ablkcipher_request *req)
-{
- struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
- struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
-
- if (!irq_fpu_usable()) {
- struct ablkcipher_request *cryptd_req =
- ablkcipher_request_ctx(req);
- memcpy(cryptd_req, req, sizeof(*req));
- ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
- return crypto_ablkcipher_encrypt(cryptd_req);
- } else {
- struct blkcipher_desc desc;
- desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
- desc.info = req->info;
- desc.flags = 0;
- return crypto_blkcipher_crt(desc.tfm)->encrypt(
- &desc, req->dst, req->src, req->nbytes);
- }
-}
-
-static int ablk_decrypt(struct ablkcipher_request *req)
-{
- struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
- struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
-
- if (!irq_fpu_usable()) {
- struct ablkcipher_request *cryptd_req =
- ablkcipher_request_ctx(req);
- memcpy(cryptd_req, req, sizeof(*req));
- ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
- return crypto_ablkcipher_decrypt(cryptd_req);
- } else {
- struct blkcipher_desc desc;
- desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
- desc.info = req->info;
- desc.flags = 0;
- return crypto_blkcipher_crt(desc.tfm)->decrypt(
- &desc, req->dst, req->src, req->nbytes);
- }
-}
-
-static void ablk_exit(struct crypto_tfm *tfm)
-{
- struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm);
-
- cryptd_free_ablkcipher(ctx->cryptd_tfm);
-}
-
-static int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name)
-{
- struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm);
- struct cryptd_ablkcipher *cryptd_tfm;
-
- cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0);
- if (IS_ERR(cryptd_tfm))
- return PTR_ERR(cryptd_tfm);
-
- ctx->cryptd_tfm = cryptd_tfm;
- tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
- crypto_ablkcipher_reqsize(&cryptd_tfm->base);
-
- return 0;
-}
-
static int ablk_ecb_init(struct crypto_tfm *tfm)
{
return ablk_init_common(tfm, "__driver-ecb-aes-aesni");
@@ -613,7 +529,7 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
struct aesni_rfc4106_gcm_ctx *child_ctx =
aesni_rfc4106_gcm_ctx_get(cryptd_child);
- u8 *new_key_mem = NULL;
+ u8 *new_key_align, *new_key_mem = NULL;
if (key_len < 4) {
crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
@@ -637,9 +553,9 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
if (!new_key_mem)
return -ENOMEM;
- new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
- memcpy(new_key_mem, key, key_len);
- key = new_key_mem;
+ new_key_align = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
+ memcpy(new_key_align, key, key_len);
+ key = new_key_align;
}
if (!irq_fpu_usable())
@@ -968,7 +884,7 @@ static struct crypto_alg aesni_algs[] = { {
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct async_aes_ctx),
+ .cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
@@ -989,7 +905,7 @@ static struct crypto_alg aesni_algs[] = { {
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct async_aes_ctx),
+ .cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
@@ -1033,7 +949,7 @@ static struct crypto_alg aesni_algs[] = { {
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct async_aes_ctx),
+ .cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
@@ -1098,7 +1014,7 @@ static struct crypto_alg aesni_algs[] = { {
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct async_aes_ctx),
+ .cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
@@ -1126,7 +1042,7 @@ static struct crypto_alg aesni_algs[] = { {
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct async_aes_ctx),
+ .cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
@@ -1150,7 +1066,7 @@ static struct crypto_alg aesni_algs[] = { {
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct async_aes_ctx),
+ .cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
@@ -1174,7 +1090,7 @@ static struct crypto_alg aesni_algs[] = { {
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct async_aes_ctx),
+ .cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
View
355 arch/x86/crypto/camellia_glue.c
@@ -5,10 +5,6 @@
*
* Camellia parts based on code by:
* Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation)
- * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
- * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
- * CTR part based on code (crypto/ctr.c) by:
- * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -34,9 +30,9 @@
#include <linux/module.h>
#include <linux/types.h>
#include <crypto/algapi.h>
-#include <crypto/b128ops.h>
#include <crypto/lrw.h>
#include <crypto/xts.h>
+#include <asm/crypto/glue_helper.h>
#define CAMELLIA_MIN_KEY_SIZE 16
#define CAMELLIA_MAX_KEY_SIZE 32
@@ -1312,307 +1308,128 @@ static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
&tfm->crt_flags);
}
-static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
- void (*fn)(struct camellia_ctx *, u8 *, const u8 *),
- void (*fn_2way)(struct camellia_ctx *, u8 *, const u8 *))
+static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
{
- struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
- unsigned int bsize = CAMELLIA_BLOCK_SIZE;
- unsigned int nbytes;
- int err;
-
- err = blkcipher_walk_virt(desc, walk);
-
- while ((nbytes = walk->nbytes)) {
- u8 *wsrc = walk->src.virt.addr;
- u8 *wdst = walk->dst.virt.addr;
-
- /* Process two block batch */
- if (nbytes >= bsize * 2) {
- do {
- fn_2way(ctx, wdst, wsrc);
-
- wsrc += bsize * 2;
- wdst += bsize * 2;
- nbytes -= bsize * 2;
- } while (nbytes >= bsize * 2);
-
- if (nbytes < bsize)
- goto done;
- }
-
- /* Handle leftovers */
- do {
- fn(ctx, wdst, wsrc);
-
- wsrc += bsize;
- wdst += bsize;
- nbytes -= bsize;
- } while (nbytes >= bsize);
-
-done:
- err = blkcipher_walk_done(desc, walk, nbytes);
- }
-
- return err;
-}
-
-static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
- struct scatterlist *src, unsigned int nbytes)
-{
- struct blkcipher_walk walk;
-
- blkcipher_walk_init(&walk, dst, src, nbytes);
- return ecb_crypt(desc, &walk, camellia_enc_blk, camellia_enc_blk_2way);
-}
+ u128 iv = *src;
-static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
- struct scatterlist *src, unsigned int nbytes)
-{
- struct blkcipher_walk walk;
-
- blkcipher_walk_init(&walk, dst, src, nbytes);
- return ecb_crypt(desc, &walk, camellia_dec_blk, camellia_dec_blk_2way);
-}
+ camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src);
-static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
- struct blkcipher_walk *walk)
-{
- struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
- unsigned int bsize = CAMELLIA_BLOCK_SIZE;
- unsigned int nbytes = walk->nbytes;
- u128 *src = (u128 *)walk->src.virt.addr;
- u128 *dst = (u128 *)walk->dst.virt.addr;
- u128 *iv = (u128 *)walk->iv;
-
- do {
- u128_xor(dst, src, iv);
- camellia_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
- iv = dst;
-
- src += 1;
- dst += 1;
- nbytes -= bsize;
- } while (nbytes >= bsize);
-
- u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
- return nbytes;
+ u128_xor(&dst[1], &dst[1], &iv);
}
-static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
- struct scatterlist *src, unsigned int nbytes)
+static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
{
- struct blkcipher_walk walk;
- int err;
+ be128 ctrblk;
- blkcipher_walk_init(&walk, dst, src, nbytes);
- err = blkcipher_walk_virt(desc, &walk);
+ if (dst != src)
+ *dst = *src;
- while ((nbytes = walk.nbytes)) {
- nbytes = __cbc_encrypt(desc, &walk);
- err = blkcipher_walk_done(desc, &walk, nbytes);
- }
+ u128_to_be128(&ctrblk, iv);
+ u128_inc(iv);
- return err;
+ camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
}
-static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
- struct blkcipher_walk *walk)
+static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
+ u128 *iv)
{
- struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
- unsigned int bsize = CAMELLIA_BLOCK_SIZE;
- unsigned int nbytes = walk->nbytes;
- u128 *src = (u128 *)walk->src.virt.addr;
- u128 *dst = (u128 *)walk->dst.virt.addr;
- u128 ivs[2 - 1];
- u128 last_iv;
+ be128 ctrblks[2];
- /* Start of the last block. */
- src += nbytes / bsize - 1;
- dst += nbytes / bsize - 1;
-
- last_iv = *src;
-
- /* Process two block batch */
- if (nbytes >= bsize * 2) {
- do {
- nbytes -= bsize * (2 - 1);
- src -= 2 - 1;
- dst -= 2 - 1;
-
- ivs[0] = src[0];
-
- camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src);
-
- u128_xor(dst + 1, dst + 1, ivs + 0);
-
- nbytes -= bsize;
- if (nbytes < bsize)
- goto done;
-
- u128_xor(dst, dst, src - 1);
- src -= 1;
- dst -= 1;
- } while (nbytes >= bsize * 2);
-
- if (nbytes < bsize)
- goto done;
+ if (dst != src) {
+ dst[0] = src[0];
+ dst[1] = src[1];
}
- /* Handle leftovers */
- for (;;) {
- camellia_dec_blk(ctx, (u8 *)dst, (u8 *)src);
-
- nbytes -= bsize;
- if (nbytes < bsize)
- break;
+ u128_to_be128(&ctrblks[0], iv);
+ u128_inc(iv);
+ u128_to_be128(&ctrblks[1], iv);
+ u128_inc(iv);
- u128_xor(dst, dst, src - 1);
- src -= 1;
- dst -= 1;
- }
-
-done:
- u128_xor(dst, dst, (u128 *)walk->iv);
- *(u128 *)walk->iv = last_iv;
-
- return nbytes;
+ camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);
}
-static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
- struct scatterlist *src, unsigned int nbytes)
-{
- struct blkcipher_walk walk;
- int err;
-
- blkcipher_walk_init(&walk, dst, src, nbytes);
- err = blkcipher_walk_virt(desc, &walk);
+static const struct common_glue_ctx camellia_enc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = -1,
+
+ .funcs = { {
+ .num_blocks = 2,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
+ } }
+};
- while ((nbytes = walk.nbytes)) {
- nbytes = __cbc_decrypt(desc, &walk);
- err = blkcipher_walk_done(desc, &walk, nbytes);
- }
+static const struct common_glue_ctx camellia_ctr = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = -1,
+
+ .funcs = { {
+ .num_blocks = 2,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
+ } }
+};
- return err;
-}
+static const struct common_glue_ctx camellia_dec = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = -1,
+
+ .funcs = { {
+ .num_blocks = 2,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
+ } }
+};
-static inline void u128_to_be128(be128 *dst, const u128 *src)
-{
- dst->a = cpu_to_be64(src->a);
- dst->b = cpu_to_be64(src->b);
-}
+static const struct common_glue_ctx camellia_dec_cbc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = -1,
+
+ .funcs = { {
+ .num_blocks = 2,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
+ } }
+};
-static inline void be128_to_u128(u128 *dst, const be128 *src)
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
{
- dst->a = be64_to_cpu(src->a);
- dst->b = be64_to_cpu(src->b);
+ return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes);
}
-static inline void u128_inc(u128 *i)
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
{
- i->b++;
- if (!i->b)
- i->a++;
+ return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes);
}
-static void ctr_crypt_final(struct blkcipher_desc *desc,
- struct blkcipher_walk *walk)
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
{
- struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
- u8 keystream[CAMELLIA_BLOCK_SIZE];
- u8 *src = walk->src.virt.addr;
- u8 *dst = walk->dst.virt.addr;
- unsigned int nbytes = walk->nbytes;
- u128 ctrblk;
-
- memcpy(keystream, src, nbytes);
- camellia_enc_blk_xor(ctx, keystream, walk->iv);
- memcpy(dst, keystream, nbytes);
-
- be128_to_u128(&ctrblk, (be128 *)walk->iv);
- u128_inc(&ctrblk);
- u128_to_be128((be128 *)walk->iv, &ctrblk);
+ return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc,
+ dst, src, nbytes);
}
-static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
- struct blkcipher_walk *walk)
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
{
- struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
- unsigned int bsize = CAMELLIA_BLOCK_SIZE;
- unsigned int nbytes = walk->nbytes;
- u128 *src = (u128 *)walk->src.virt.addr;
- u128 *dst = (u128 *)walk->dst.virt.addr;
- u128 ctrblk;
- be128 ctrblocks[2];
-
- be128_to_u128(&ctrblk, (be128 *)walk->iv);
-
- /* Process two block batch */
- if (nbytes >= bsize * 2) {
- do {
- if (dst != src) {
- dst[0] = src[0];
- dst[1] = src[1];
- }
-
- /* create ctrblks for parallel encrypt */
- u128_to_be128(&ctrblocks[0], &ctrblk);
- u128_inc(&ctrblk);
- u128_to_be128(&ctrblocks[1], &ctrblk);
- u128_inc(&ctrblk);
-
- camellia_enc_blk_xor_2way(ctx, (u8 *)dst,
- (u8 *)ctrblocks);
-
- src += 2;
- dst += 2;
- nbytes -= bsize * 2;
- } while (nbytes >= bsize * 2);
-
- if (nbytes < bsize)
- goto done;
- }
-
- /* Handle leftovers */
- do {
- if (dst != src)
- *dst = *src;
-
- u128_to_be128(&ctrblocks[0], &ctrblk);
- u128_inc(&ctrblk);
-
- camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)ctrblocks);
-
- src += 1;
- dst += 1;
- nbytes -= bsize;
- } while (nbytes >= bsize);
-
-done:
- u128_to_be128((be128 *)walk->iv, &ctrblk);
- return nbytes;
+ return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src,
+ nbytes);
}
static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
- struct blkcipher_walk walk;
- int err;
-
- blkcipher_walk_init(&walk, dst, src, nbytes);
- err = blkcipher_walk_virt_block(desc, &walk, CAMELLIA_BLOCK_SIZE);
-
- while ((nbytes = walk.nbytes) >= CAMELLIA_BLOCK_SIZE) {
- nbytes = __ctr_crypt(desc, &walk);
- err = blkcipher_walk_done(desc, &walk, nbytes);
- }
-
- if (walk.nbytes) {
- ctr_crypt_final(desc, &walk);
- err = blkcipher_walk_done(desc, &walk, 0);
- }
-
- return err;
+ return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes);
}
static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
View
307 arch/x86/crypto/glue_helper.c
@@ -0,0 +1,307 @@
+/*
+ * Shared glue code for 128bit block ciphers
+ *
+ * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/module.h>
+#include <crypto/b128ops.h>
+#include <crypto/lrw.h>
+#include <crypto/xts.h>
+#include <asm/crypto/glue_helper.h>
+#include <crypto/scatterwalk.h>
+
+static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
+ struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ void *ctx = crypto_blkcipher_ctx(desc->tfm);
+ const unsigned int bsize = 128 / 8;
+ unsigned int nbytes, i, func_bytes;
+ bool fpu_enabled = false;
+ int err;
+
+ err = blkcipher_walk_virt(desc, walk);
+
+ while ((nbytes = walk->nbytes)) {
+ u8 *wsrc = walk->src.virt.addr;
+ u8 *wdst = walk->dst.virt.addr;
+
+ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+ desc, fpu_enabled, nbytes);
+
+ for (i = 0; i < gctx->num_funcs; i++) {
+ func_bytes = bsize * gctx->funcs[i].num_blocks;
+
+ /* Process multi-block batch */
+ if (nbytes >= func_bytes) {
+ do {
+ gctx->funcs[i].fn_u.ecb(ctx, wdst,
+ wsrc);
+
+ wsrc += func_bytes;
+ wdst += func_bytes;
+ nbytes -= func_bytes;
+ } while (nbytes >= func_bytes);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+ }
+
+done:
+ err = blkcipher_walk_done(desc, walk, nbytes);
+ }
+
+ glue_fpu_end(fpu_enabled);
+ return err;
+}
+
+int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
+ struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ return __glue_ecb_crypt_128bit(gctx, desc, &walk);
+}
+EXPORT_SYMBOL_GPL(glue_ecb_crypt_128bit);
+
+static unsigned int __glue_cbc_encrypt_128bit(const common_glue_func_t fn,
+ struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ void *ctx = crypto_blkcipher_ctx(desc->tfm);
+ const unsigned int bsize = 128 / 8;
+ unsigned int nbytes = walk->nbytes;
+ u128 *src = (u128 *)walk->src.virt.addr;
+ u128 *dst = (u128 *)walk->dst.virt.addr;
+ u128 *iv = (u128 *)walk->iv;
+
+ do {
+ u128_xor(dst, src, iv);
+ fn(ctx, (u8 *)dst, (u8 *)dst);
+ iv = dst;
+
+ src += 1;
+ dst += 1;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+ u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
+ return nbytes;
+}
+
+int glue_cbc_encrypt_128bit(const common_glue_func_t fn,
+ struct blkcipher_desc *desc,
+ struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ while ((nbytes = walk.nbytes)) {
+ nbytes = __glue_cbc_encrypt_128bit(fn, desc, &walk);
+ err = blkcipher_walk_done(desc, &walk, nbytes);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(glue_cbc_encrypt_128bit);
+
+static unsigned int
+__glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
+ struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ void *ctx = crypto_blkcipher_ctx(desc->tfm);
+ const unsigned int bsize = 128 / 8;
+ unsigned int nbytes = walk->nbytes;
+ u128 *src = (u128 *)walk->src.virt.addr;
+ u128 *dst = (u128 *)walk->dst.virt.addr;
+ u128 last_iv;
+ unsigned int num_blocks, func_bytes;
+ unsigned int i;
+
+ /* Start of the last block. */
+ src += nbytes / bsize - 1;
+ dst += nbytes / bsize - 1;
+
+ last_iv = *src;
+
+ for (i = 0; i < gctx->num_funcs; i++) {
+ num_blocks = gctx->funcs[i].num_blocks;
+ func_bytes = bsize * num_blocks;
+
+ /* Process multi-block batch */
+ if (nbytes >= func_bytes) {
+ do {
+ nbytes -= func_bytes - bsize;
+ src -= num_blocks - 1;
+ dst -= num_blocks - 1;
+
+ gctx->funcs[i].fn_u.cbc(ctx, dst, src);
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ goto done;
+
+ u128_xor(dst, dst, src - 1);
+ src -= 1;
+ dst -= 1;
+ } while (nbytes >= func_bytes);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+ }
+
+done:
+ u128_xor(dst, dst, (u128 *)walk->iv);
+ *(u128 *)walk->iv = last_iv;
+
+ return nbytes;
+}
+
+int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
+ struct blkcipher_desc *desc,
+ struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ const unsigned int bsize = 128 / 8;
+ bool fpu_enabled = false;
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ while ((nbytes = walk.nbytes)) {
+ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+ desc, fpu_enabled, nbytes);
+ nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
+ err = blkcipher_walk_done(desc, &walk, nbytes);
+ }
+
+ glue_fpu_end(fpu_enabled);
+ return err;
+}
+EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
+
+static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr,
+ struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ void *ctx = crypto_blkcipher_ctx(desc->tfm);
+ u8 *src = (u8 *)walk->src.virt.addr;
+ u8 *dst = (u8 *)walk->dst.virt.addr;
+ unsigned int nbytes = walk->nbytes;
+ u128 ctrblk;
+ u128 tmp;
+
+ be128_to_u128(&ctrblk, (be128 *)walk->iv);
+
+ memcpy(&tmp, src, nbytes);
+ fn_ctr(ctx, &tmp, &tmp, &ctrblk);
+ memcpy(dst, &tmp, nbytes);
+
+ u128_to_be128((be128 *)walk->iv, &ctrblk);
+}
+EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit);
+
+static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
+ struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ const unsigned int bsize = 128 / 8;
+ void *ctx = crypto_blkcipher_ctx(desc->tfm);
+ unsigned int nbytes = walk->nbytes;
+ u128 *src = (u128 *)walk->src.virt.addr;
+ u128 *dst = (u128 *)walk->dst.virt.addr;
+ u128 ctrblk;
+ unsigned int num_blocks, func_bytes;
+ unsigned int i;
+
+ be128_to_u128(&ctrblk, (be128 *)walk->iv);
+
+ /* Process multi-block batch */
+ for (i = 0; i < gctx->num_funcs; i++) {
+ num_blocks = gctx->funcs[i].num_blocks;
+ func_bytes = bsize * num_blocks;
+
+ if (nbytes >= func_bytes) {
+ do {
+ gctx->funcs[i].fn_u.ctr(ctx, dst, src, &ctrblk);
+
+ src += num_blocks;
+ dst += num_blocks;
+ nbytes -= func_bytes;
+ } while (nbytes >= func_bytes);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+ }
+
+done:
+ u128_to_be128((be128 *)walk->iv, &ctrblk);
+ return nbytes;
+}
+
+int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
+ struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ const unsigned int bsize = 128 / 8;
+ bool fpu_enabled = false;
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, bsize);
+
+ while ((nbytes = walk.nbytes) >= bsize) {
+ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+ desc, fpu_enabled, nbytes);
+ nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
+ err = blkcipher_walk_done(desc, &walk, nbytes);
+ }
+
+ glue_fpu_end(fpu_enabled);
+
+ if (walk.nbytes) {
+ glue_ctr_crypt_final_128bit(
+ gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit);
+
+MODULE_LICENSE("GPL");
View
704 arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -0,0 +1,704 @@
+/*
+ * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Based on arch/x86/crypto/serpent-sse2-x86_64-asm_64.S by
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+.file "serpent-avx-x86_64-asm_64.S"
+.text
+
+#define CTX %rdi
+
+/**********************************************************************
+ 8-way AVX serpent
+ **********************************************************************/
+#define RA1 %xmm0
+#define RB1 %xmm1
+#define RC1 %xmm2
+#define RD1 %xmm3
+#define RE1 %xmm4
+
+#define tp %xmm5
+
+#define RA2 %xmm6
+#define RB2 %xmm7
+#define RC2 %xmm8
+#define RD2 %xmm9
+#define RE2 %xmm10
+
+#define RNOT %xmm11
+
+#define RK0 %xmm12
+#define RK1 %xmm13
+#define RK2 %xmm14
+#define RK3 %xmm15
+
+
+#define S0_1(x0, x1, x2, x3, x4) \
+ vpor x0, x3, tp; \
+ vpxor x3, x0, x0; \
+ vpxor x2, x3, x4; \
+ vpxor RNOT, x4, x4; \
+ vpxor x1, tp, x3; \
+ vpand x0, x1, x1; \
+ vpxor x4, x1, x1; \
+ vpxor x0, x2, x2;
+#define S0_2(x0, x1, x2, x3, x4) \
+ vpxor x3, x0, x0; \
+ vpor x0, x4, x4; \
+ vpxor x2, x0, x0; \
+ vpand x1, x2, x2; \
+ vpxor x2, x3, x3; \
+ vpxor RNOT, x1, x1; \
+ vpxor x4, x2, x2; \
+ vpxor x2, x1, x1;
+
+#define S1_1(x0, x1, x2, x3, x4) \
+ vpxor x0, x1, tp; \
+ vpxor x3, x0, x0; \
+ vpxor RNOT, x3, x3; \
+ vpand tp, x1, x4; \
+ vpor tp, x0, x0; \
+ vpxor x2, x3, x3; \
+ vpxor x3, x0, x0; \
+ vpxor x3, tp, x1;
+#define S1_2(x0, x1, x2, x3, x4) \
+ vpxor x4, x3, x3; \
+ vpor x4, x1, x1; \
+ vpxor x2, x4, x4; \
+ vpand x0, x2, x2; \
+ vpxor x1, x2, x2; \
+ vpor x0, x1, x1; \
+ vpxor RNOT, x0, x0; \
+ vpxor x2, x0, x0; \
+ vpxor x1, x4, x4;
+
+#define S2_1(x0, x1, x2, x3, x4) \
+ vpxor RNOT, x3, x3; \
+ vpxor x0, x1, x1; \
+ vpand x2, x0, tp; \
+ vpxor x3, tp, tp; \
+ vpor x0, x3, x3; \
+ vpxor x1, x2, x2; \
+ vpxor x1, x3, x3; \
+ vpand tp, x1, x1;
+#define S2_2(x0, x1, x2, x3, x4) \
+ vpxor x2, tp, tp; \
+ vpand x3, x2, x2; \
+ vpor x1, x3, x3; \
+ vpxor RNOT, tp, tp; \
+ vpxor tp, x3, x3; \
+ vpxor tp, x0, x4; \
+ vpxor x2, tp, x0; \
+ vpor x2, x1, x1;
+
+#define S3_1(x0, x1, x2, x3, x4) \
+ vpxor x3, x1, tp; \
+ vpor x0, x3, x3; \
+ vpand x0, x1, x4; \
+ vpxor x2, x0, x0; \
+ vpxor tp, x2, x2; \
+ vpand x3, tp, x1; \
+ vpxor x3, x2, x2; \
+ vpor x4, x0, x0; \
+ vpxor x3, x4, x4;
+#define S3_2(x0, x1, x2, x3, x4) \
+ vpxor x0, x1, x1; \
+ vpand x3, x0, x0; \
+ vpand x4, x3, x3; \
+ vpxor x2, x3, x3; \
+ vpor x1, x4, x4; \
+ vpand x1, x2, x2; \
+ vpxor x3, x4, x4; \
+ vpxor x3, x0, x0; \
+ vpxor x2, x3, x3;
+
+#define S4_1(x0, x1, x2, x3, x4) \
+ vpand x0, x3, tp; \
+ vpxor x3, x0, x0; \
+ vpxor x2, tp, tp; \
+ vpor x3, x2, x2; \
+ vpxor x1, x0, x0; \
+ vpxor tp, x3, x4; \
+ vpor x0, x2, x2; \
+ vpxor x1, x2, x2;
+#define S4_2(x0, x1, x2, x3, x4) \
+ vpand x0, x1, x1; \
+ vpxor x4, x1, x1; \
+ vpand x2, x4, x4; \
+ vpxor tp, x2, x2; \
+ vpxor x0, x4, x4; \
+ vpor x1, tp, x3; \
+ vpxor RNOT, x1, x1; \
+ vpxor x0, x3, x3;
+
+#define S5_1(x0, x1, x2, x3, x4) \
+ vpor x0, x1, tp; \
+ vpxor tp, x2, x2; \
+ vpxor RNOT, x3, x3; \
+ vpxor x0, x1, x4; \
+ vpxor x2, x0, x0; \
+ vpand x4, tp, x1; \
+ vpor x3, x4, x4; \
+ vpxor x0, x4, x4;
+#define S5_2(x0, x1, x2, x3, x4) \
+ vpand x3, x0, x0; \
+ vpxor x3, x1, x1; \
+ vpxor x2, x3, x3; \
+ vpxor x1, x0, x0; \
+ vpand x4, x2, x2; \
+ vpxor x2, x1, x1; \
+ vpand x0, x2, x2; \
+ vpxor x2, x3, x3;
+
+#define S6_1(x0, x1, x2, x3, x4) \
+ vpxor x0, x3, x3; \
+ vpxor x2, x1, tp; \
+ vpxor x0, x2, x2; \
+ vpand x3, x0, x0; \
+ vpor x3, tp, tp; \
+ vpxor RNOT, x1, x4; \
+ vpxor tp, x0, x0; \
+ vpxor x2, tp, x1;
+#define S6_2(x0, x1, x2, x3, x4) \
+ vpxor x4, x3, x3; \
+ vpxor x0, x4, x4; \
+ vpand x0, x2, x2; \
+ vpxor x1, x4, x4; \
+ vpxor x3, x2, x2; \
+ vpand x1, x3, x3; \
+ vpxor x0, x3, x3; \
+ vpxor x2, x1, x1;
+
+#define S7_1(x0, x1, x2, x3, x4) \
+ vpxor RNOT, x1, tp; \
+ vpxor RNOT, x0, x0; \
+ vpand x2, tp, x1; \
+ vpxor x3, x1, x1; \
+ vpor tp, x3, x3; \
+ vpxor x2, tp, x4; \
+ vpxor x3, x2, x2; \
+ vpxor x0, x3, x3; \
+ vpor x1, x0, x0;
+#define S7_2(x0, x1, x2, x3, x4) \
+ vpand x0, x2, x2; \
+ vpxor x4, x0, x0; \
+ vpxor x3, x4, x4; \
+ vpand x0, x3, x3; \
+ vpxor x1, x4, x4; \
+ vpxor x4, x2, x2; \
+ vpxor x1, x3, x3; \
+ vpor x0, x4, x4; \
+ vpxor x1, x4, x4;
+
+#define SI0_1(x0, x1, x2, x3, x4) \
+ vpxor x0, x1, x1; \
+ vpor x1, x3, tp; \
+ vpxor x1, x3, x4; \
+ vpxor RNOT, x0, x0; \
+ vpxor tp, x2, x2; \
+ vpxor x0, tp, x3; \
+ vpand x1, x0, x0; \
+ vpxor x2, x0, x0;
+#define SI0_2(x0, x1, x2, x3, x4) \
+ vpand x3, x2, x2; \
+ vpxor x4, x3, x3; \
+ vpxor x3, x2, x2; \
+ vpxor x3, x1, x1; \
+ vpand x0, x3, x3; \
+ vpxor x0, x1, x1; \
+ vpxor x2, x0, x0; \
+ vpxor x3, x4, x4;
+
+#define SI1_1(x0, x1, x2, x3, x4) \
+ vpxor x3, x1, x1; \
+ vpxor x2, x0, tp; \
+ vpxor RNOT, x2, x2; \
+ vpor x1, x0, x4; \
+ vpxor x3, x4, x4; \
+ vpand x1, x3, x3; \
+ vpxor x2, x1, x1; \
+ vpand x4, x2, x2;
+#define SI1_2(x0, x1, x2, x3, x4) \
+ vpxor x1, x4, x4; \
+ vpor x3, x1, x1; \
+ vpxor tp, x3, x3; \
+ vpxor tp, x2, x2; \
+ vpor x4, tp, x0; \
+ vpxor x4, x2, x2; \
+ vpxor x0, x1, x1; \
+ vpxor x1, x4, x4;
+
+#define SI2_1(x0, x1, x2, x3, x4) \
+ vpxor x1, x2, x2; \
+ vpxor RNOT, x3, tp; \
+ vpor x2, tp, tp; \
+ vpxor x3, x2, x2; \
+ vpxor x0, x3, x4; \
+ vpxor x1, tp, x3; \
+ vpor x2, x1, x1; \
+ vpxor x0, x2, x2;
+#define SI2_2(x0, x1, x2, x3, x4) \
+ vpxor x4, x1, x1; \
+ vpor x3, x4, x4; \
+ vpxor x3, x2, x2; \
+ vpxor x2, x4, x4; \
+ vpand x1, x2, x2; \
+ vpxor x3, x2, x2; \
+ vpxor x4, x3, x3; \
+ vpxor x0, x4, x4;
+
+#define SI3_1(x0, x1, x2, x3, x4) \
+ vpxor x1, x2, x2; \
+ vpand x2, x1, tp; \
+ vpxor x0, tp, tp; \
+ vpor x1, x0, x0; \
+ vpxor x3, x1, x4; \
+ vpxor x3, x0, x0; \
+ vpor tp, x3, x3; \
+ vpxor x2, tp, x1;
+#define SI3_2(x0, x1, x2, x3, x4) \
+ vpxor x3, x1, x1; \
+ vpxor x2, x0, x0; \
+ vpxor x3, x2, x2; \
+ vpand x1, x3, x3; \
+ vpxor x0, x1, x1; \
+ vpand x2, x0, x0; \
+ vpxor x3, x4, x4; \
+ vpxor x0, x3, x3; \
+ vpxor x1, x0, x0;
+
+#define SI4_1(x0, x1, x2, x3, x4) \
+ vpxor x3, x2, x2; \
+ vpand x1, x0, tp; \
+ vpxor x2, tp, tp; \
+ vpor x3, x2, x2; \
+ vpxor RNOT, x0, x4; \
+ vpxor tp, x1, x1; \
+ vpxor x2, tp, x0; \
+ vpand x4, x2, x2;
+#define SI4_2(x0, x1, x2, x3, x4) \
+ vpxor x0, x2, x2; \
+ vpor x4, x0, x0; \
+ vpxor x3, x0, x0; \
+ vpand x2, x3, x3; \
+ vpxor x3, x4, x4; \
+ vpxor x1, x3, x3; \
+ vpand x0, x1, x1; \
+ vpxor x1, x4, x4; \
+ vpxor x3, x0, x0;
+
+#define SI5_1(x0, x1, x2, x3, x4) \
+ vpor x2, x1, tp; \
+ vpxor x1, x2, x2; \
+ vpxor x3, tp, tp; \
+ vpand x1, x3, x3; \
+ vpxor x3, x2, x2; \
+ vpor x0, x3, x3; \
+ vpxor RNOT, x0, x0; \
+ vpxor x2, x3, x3; \
+ vpor x0, x2, x2;
+#define SI5_2(x0, x1, x2, x3, x4) \
+ vpxor tp, x1, x4; \
+ vpxor x4, x2, x2; \
+ vpand x0, x4, x4; \
+ vpxor tp, x0, x0; \
+ vpxor x3, tp, x1; \
+ vpand x2, x0, x0; \
+ vpxor x3, x2, x2; \
+ vpxor x2, x0, x0; \
+ vpxor x4, x2, x2; \
+ vpxor x3, x4, x4;
+
+#define SI6_1(x0, x1, x2, x3, x4) \
+ vpxor x2, x0, x0; \
+ vpand x3, x0, tp; \
+ vpxor x3, x2, x2; \
+ vpxor x2, tp, tp; \
+ vpxor x1, x3, x3; \
+ vpor x0, x2, x2; \
+ vpxor x3, x2, x2; \
+ vpand tp, x3, x3;
+#define SI6_2(x0, x1, x2, x3, x4) \
+ vpxor RNOT, tp, tp; \
+ vpxor x1, x3, x3; \
+ vpand x2, x1, x1; \
+ vpxor tp, x0, x4; \
+ vpxor x4, x3, x3; \
+ vpxor x2, x4, x4; \
+ vpxor x1, tp, x0; \
+ vpxor x0, x2, x2;
+
+#define SI7_1(x0, x1, x2, x3, x4) \
+ vpand x0, x3, tp; \
+ vpxor x2, x0, x0; \
+ vpor x3, x2, x2; \
+ vpxor x1, x3, x4; \
+ vpxor RNOT, x0, x0; \
+ vpor tp, x1, x1; \
+ vpxor x0, x4, x4; \
+ vpand x2, x0, x0; \
+ vpxor x1, x0, x0;
+#define SI7_2(x0, x1, x2, x3, x4) \
+ vpand x2, x1, x1; \
+ vpxor x2, tp, x3; \
+ vpxor x3, x4, x4; \
+ vpand x3, x2, x2; \
+ vpor x0, x3, x3; \
+ vpxor x4, x1, x1; \
+ vpxor x4, x3, x3; \
+ vpand x0, x4, x4; \
+ vpxor x2, x4, x4;
+
+#define get_key(i, j, t) \
+ vbroadcastss (4*(i)+(j))*4(CTX), t;
+
+#define K2(x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ get_key(i, 1, RK1); \
+ get_key(i, 2, RK2); \
+ get_key(i, 3, RK3); \
+ vpxor RK0, x0 ## 1, x0 ## 1; \
+ vpxor RK1, x1 ## 1, x1 ## 1; \
+ vpxor RK2, x2 ## 1, x2 ## 1; \
+ vpxor RK3, x3 ## 1, x3 ## 1; \
+ vpxor RK0, x0 ## 2, x0 ## 2; \
+ vpxor RK1, x1 ## 2, x1 ## 2; \
+ vpxor RK2, x2 ## 2, x2 ## 2; \
+ vpxor RK3, x3 ## 2, x3 ## 2;
+
+#define LK2(x0, x1, x2, x3, x4, i) \
+ vpslld $13, x0 ## 1, x4 ## 1; \
+ vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $3, x2 ## 1, x4 ## 1; \
+ vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $13, x0 ## 2, x4 ## 2; \
+ vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $3, x2 ## 2, x4 ## 2; \
+ vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $1, x1 ## 1, x4 ## 1; \
+ vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
+ vpor x4 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $3, x0 ## 1, x4 ## 1; \
+ vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
+ vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
+ get_key(i, 1, RK1); \
+ vpslld $1, x1 ## 2, x4 ## 2; \
+ vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
+ vpor x4 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $3, x0 ## 2, x4 ## 2; \
+ vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
+ vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
+ get_key(i, 3, RK3); \
+ vpslld $7, x3 ## 1, x4 ## 1; \
+ vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
+ vpor x4 ## 1, x3 ## 1, x3 ## 1; \
+ vpslld $7, x1 ## 1, x4 ## 1; \
+ vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
+ get_key(i, 0, RK0); \
+ vpslld $7, x3 ## 2, x4 ## 2; \
+ vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
+ vpor x4 ## 2, x3 ## 2, x3 ## 2; \
+ vpslld $7, x1 ## 2, x4 ## 2; \
+ vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
+ get_key(i, 2, RK2); \
+ vpxor RK1, x1 ## 1, x1 ## 1; \
+ vpxor RK3, x3 ## 1, x3 ## 1; \
+ vpslld $5, x0 ## 1, x4 ## 1; \
+ vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpslld $22, x2 ## 1, x4 ## 1; \
+ vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor RK0, x0 ## 1, x0 ## 1; \
+ vpxor RK2, x2 ## 1, x2 ## 1; \
+ vpxor RK1, x1 ## 2, x1 ## 2; \
+ vpxor RK3, x3 ## 2, x3 ## 2; \
+ vpslld $5, x0 ## 2, x4 ## 2; \
+ vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpslld $22, x2 ## 2, x4 ## 2; \
+ vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor RK0, x0 ## 2, x0 ## 2; \
+ vpxor RK2, x2 ## 2, x2 ## 2;
+
+#define KL2(x0, x1, x2, x3, x4, i) \
+ vpxor RK0, x0 ## 1, x0 ## 1; \
+ vpxor RK2, x2 ## 1, x2 ## 1; \
+ vpsrld $5, x0 ## 1, x4 ## 1; \
+ vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor RK3, x3 ## 1, x3 ## 1; \
+ vpxor RK1, x1 ## 1, x1 ## 1; \
+ vpsrld $22, x2 ## 1, x4 ## 1; \
+ vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor RK0, x0 ## 2, x0 ## 2; \
+ vpxor RK2, x2 ## 2, x2 ## 2; \
+ vpsrld $5, x0 ## 2, x4 ## 2; \
+ vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor RK3, x3 ## 2, x3 ## 2; \
+ vpxor RK1, x1 ## 2, x1 ## 2; \
+ vpsrld $22, x2 ## 2, x4 ## 2; \
+ vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
+ vpslld $7, x1 ## 1, x4 ## 1; \
+ vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpsrld $1, x1 ## 1, x4 ## 1; \
+ vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
+ vpor x4 ## 1, x1 ## 1, x1 ## 1; \
+ vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
+ vpslld $7, x1 ## 2, x4 ## 2; \
+ vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpsrld $1, x1 ## 2, x4 ## 2; \
+ vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
+ vpor x4 ## 2, x1 ## 2, x1 ## 2; \
+ vpsrld $7, x3 ## 1, x4 ## 1; \
+ vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
+ vpor x4 ## 1, x3 ## 1, x3 ## 1; \
+ vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $3, x0 ## 1, x4 ## 1; \
+ vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
+ vpsrld $7, x3 ## 2, x4 ## 2; \
+ vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
+ vpor x4 ## 2, x3 ## 2, x3 ## 2; \
+ vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $3, x0 ## 2, x4 ## 2; \
+ vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
+ vpsrld $13, x0 ## 1, x4 ## 1; \
+ vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
+ vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
+ vpsrld $3, x2 ## 1, x4 ## 1; \
+ vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpsrld $13, x0 ## 2, x4 ## 2; \
+ vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
+ vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
+ vpsrld $3, x2 ## 2, x4 ## 2; \
+ vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2;
+
+#define S(SBOX, x0, x1, x2, x3, x4) \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
+
+#define SP(SBOX, x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 2, RK2); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 3, RK3); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ get_key(i, 1, RK1); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ vpunpckldq x1, x0, t0; \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x3; \
+ \
+ vpunpcklqdq t1, t0, x0; \
+ vpunpckhqdq t1, t0, x1; \
+ vpunpcklqdq x3, t2, x2; \
+ vpunpckhqdq x3, t2, x3;
+
+#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
+ vmovdqu (0*4*4)(in), x0; \
+ vmovdqu (1*4*4)(in), x1; \
+ vmovdqu (2*4*4)(in), x2; \
+ vmovdqu (3*4*4)(in), x3; \
+ \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ vmovdqu x0, (0*4*4)(out); \
+ vmovdqu x1, (1*4*4)(out); \
+ vmovdqu x2, (2*4*4)(out); \
+ vmovdqu x3, (3*4*4)(out);
+
+#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ vpxor (0*4*4)(out), x0, x0; \
+ vmovdqu x0, (0*4*4)(out); \
+ vpxor (1*4*4)(out), x1, x1; \
+ vmovdqu x1, (1*4*4)(out); \
+ vpxor (2*4*4)(out), x2, x2; \
+ vmovdqu x2, (2*4*4)(out); \
+ vpxor (3*4*4)(out), x3, x3; \
+ vmovdqu x3, (3*4*4)(out);
+
+.align 8
+.global __serpent_enc_blk_8way_avx
+.type __serpent_enc_blk_8way_avx,@function;
+
+__serpent_enc_blk_8way_avx:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: bool, if true: xor output
+ */
+
+ vpcmpeqd RNOT, RNOT, RNOT;
+
+ leaq (4*4*4)(%rdx), %rax;
+ read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 0);
+ S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
+ S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
+ S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
+ S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
+ S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
+ S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
+ S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
+ S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
+ S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
+ S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
+ S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
+ S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
+ S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
+ S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
+ S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
+ S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
+ S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
+ S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
+ S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
+ S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
+ S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
+ S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
+ S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
+ S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
+ S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
+ S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
+ S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
+ S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
+ S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
+ S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
+ S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
+ S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
+
+ leaq (4*4*4)(%rsi), %rax;
+
+ testb %cl, %cl;
+ jnz __enc_xor8;
+
+ write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ ret;
+
+__enc_xor8:
+ xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ ret;
+
+.align 8
+.global serpent_dec_blk_8way_avx
+.type serpent_dec_blk_8way_avx,@function;
+
+serpent_dec_blk_8way_avx:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ vpcmpeqd RNOT, RNOT, RNOT;
+
+ leaq (4*4*4)(%rdx), %rax;
+ read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 32);
+ SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
+ SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
+ SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
+ SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
+ SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
+ SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
+ SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
+ SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
+ SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
+ SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
+ SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
+ SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
+ SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
+ SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
+ SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
+ SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
+ SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
+ SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
+ SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
+ SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
+ SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
+ SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
+ SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
+ SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
+ SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
+ SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
+ SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
+ SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
+ SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
+ SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
+ SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
+ S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
+
+ leaq (4*4*4)(%rsi), %rax;
+ write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+ write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+ ret;
View
636 arch/x86/crypto/serpent_avx_glue.c
@@ -0,0 +1,636 @@
+/*
+ * Glue Code for AVX assembler versions of Serpent Cipher
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Glue code based on serpent_sse2_glue.c by:
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/serpent.h>
+#include <crypto/cryptd.h>
+#include <crypto/b128ops.h>
+#include <crypto/ctr.h>
+#include <crypto/lrw.h>
+#include <crypto/xts.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <asm/crypto/serpent-avx.h>
+#include <asm/crypto/ablk_helper.h>
+#include <asm/crypto/glue_helper.h>
+
+static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
+{
+ u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
+ unsigned int j;
+
+ for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
+ ivs[j] = src[j];
+
+ serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
+
+ for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
+ u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
+}
+
+static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
+{
+ be128 ctrblk;
+
+ u128_to_be128(&ctrblk, iv);
+ u128_inc(iv);
+
+ __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+ u128_xor(dst, src, (u128 *)&ctrblk);
+}
+
+static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
+ u128 *iv)
+{
+ be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
+ unsigned int i;
+
+ for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
+ if (dst != src)
+ dst[i] = src[i];
+
+ u128_to_be128(&ctrblks[i], iv);
+ u128_inc(iv);
+ }
+
+ serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
+}
+
+static const struct common_glue_ctx serpent_enc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
+ } }
+};
+
+static const struct common_glue_ctx serpent_ctr = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
+ } }
+};
+
+static const struct common_glue_ctx serpent_dec = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
+ } }
+};
+
+static const struct common_glue_ctx serpent_dec_cbc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
+ } }
+};
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes);
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc,
+ dst, src, nbytes);
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src,
+ nbytes);
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes);
+}
+
+static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+ return glue_fpu_begin(SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS,
+ NULL, fpu_enabled, nbytes);
+}
+
+static inline void serpent_fpu_end(bool fpu_enabled)
+{
+ glue_fpu_end(fpu_enabled);
+}
+
+struct crypt_priv {
+ struct serpent_ctx *ctx;
+ bool fpu_enabled;
+};
+
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+ const unsigned int bsize = SERPENT_BLOCK_SIZE;
+ struct crypt_priv *ctx = priv;
+ int i;
+
+ ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
+
+ if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+ serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+ return;
+ }
+
+ for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+ __serpent_encrypt(ctx->ctx, srcdst, srcdst);
+}
+
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+ const unsigned int bsize = SERPENT_BLOCK_SIZE;
+ struct crypt_priv *ctx = priv;
+ int i;
+
+ ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
+
+ if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+ serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+ return;
+ }
+
+ for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+ __serpent_decrypt(ctx->ctx, srcdst, srcdst);
+}
+
+struct serpent_lrw_ctx {
+ struct lrw_table_ctx lrw_table;
+ struct serpent_ctx serpent_ctx;
+};
+
+static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
+ int err;
+
+ err = __serpent_setkey(&ctx->serpent_ctx, key, keylen -
+ SERPENT_BLOCK_SIZE);
+ if (err)
+ return err;
+
+ return lrw_init_table(&ctx->lrw_table, key + keylen -
+ SERPENT_BLOCK_SIZE);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ be128 buf[SERPENT_PARALLEL_BLOCKS];
+ struct crypt_priv crypt_ctx = {
+ .ctx = &ctx->serpent_ctx,
+ .fpu_enabled = false,
+ };
+ struct lrw_crypt_req req = {
+ .tbuf = buf,
+ .tbuflen = sizeof(buf),
+
+ .table_ctx = &ctx->lrw_table,
+ .crypt_ctx = &crypt_ctx,
+ .crypt_fn = encrypt_callback,
+ };
+ int ret;
+
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ ret = lrw_crypt(desc, dst, src, nbytes, &req);
+ serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+ return ret;
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ be128 buf[SERPENT_PARALLEL_BLOCKS];
+ struct crypt_priv crypt_ctx = {
+ .ctx = &ctx->serpent_ctx,
+ .fpu_enabled = false,
+ };
+ struct lrw_crypt_req req = {
+ .tbuf = buf,
+ .tbuflen = sizeof(buf),
+
+ .table_ctx = &ctx->lrw_table,
+ .crypt_ctx = &crypt_ctx,
+ .crypt_fn = decrypt_callback,
+ };
+ int ret;
+
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ ret = lrw_crypt(desc, dst, src, nbytes, &req);
+ serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+ return ret;
+}
+
+static void lrw_exit_tfm(struct crypto_tfm *tfm)
+{
+ struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ lrw_free_table(&ctx->lrw_table);
+}
+
+struct serpent_xts_ctx {
+ struct serpent_ctx tweak_ctx;
+ struct serpent_ctx crypt_ctx;
+};
+
+static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+ u32 *flags = &tfm->crt_flags;