From 557294eb774ee0bda1817fc8249cae62e916eafa Mon Sep 17 00:00:00 2001
From: Rohan Yadav <rohany@cs.stanford.edu>
Date: Tue, 16 Mar 2021 15:59:21 -0700
Subject: [PATCH] cuda: fix windowing test with cuda

Fixes #422.

This commit applies the allocation-clearing logic to the CUDA backend as
well: when an Allocate node has `clear` set, the generated code now zeroes
the buffer with cudaMemset. The windowing test caught the missing clear
because TACO was automatically parallelizing the loop onto the GPU.
---
 src/codegen/codegen_cuda.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp
index 971410b98..77cf0cd88 100644
--- a/src/codegen/codegen_cuda.cpp
+++ b/src/codegen/codegen_cuda.cpp
@@ -1096,6 +1096,20 @@ void CodeGen_CUDA::visit(const Allocate* op) {
   op->num_elements.accept(this);
   parentPrecedence = TOP;
   stream << "));" << endl;
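+  // If the allocation requests clearing, memset the allocated memory to zero. NOTE(review): op->var and variable_name are both emitted as the pointer below - confirm variable_name is empty unless is_realloc.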
+  if (op->clear) {
+    doIndent();
+    stream << "gpuErrchk(cudaMemset(";
+    op->var.accept(this);
+    stream << variable_name;
+    stream << ", 0, ";
+    stream << "sizeof(" << elementType << ")";
+    stream << " * ";
+    parentPrecedence = MUL;
+    op->num_elements.accept(this);
+    parentPrecedence = TOP;
+    stream << "));" << endl;
+  }
 
   if(op->is_realloc) {
     doIndent();