<a href="https://colab.research.google.com/github/tonystz/gitpod/blob/main/strings_To_Colaboratory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pycuda # install cuda
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

In [68]:
import numpy as np
ttt = np.asarray([ "stuff" + str(i)  for i in range(0,2) ])

print( ttt.dtype, type(ttt[0]) ) 
print(ttt)

s= np.array(['abcdf'], dtype=np.str_)
print( s.dtype, type(s[0]) )

<U6 <class 'numpy.str_'>
['stuff0' 'stuff1']
<U5 <class 'numpy.str_'>


In [186]:
%%writefile s.py
#!python 
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray
import numpy as np

mod = SourceModule("""
    #include <stdio.h>

    __global__ void say_hi( char *out_gpu)
    { 
      int idx = threadIdx.x + blockIdx.x * blockDim.x;
      printf("thread id: [%d]  %s\\n",idx,out_gpu[idx]);
      //print c-str
      for (int i=0;i<4;i++){
        printf("%c",out_gpu[i]);
      }

      printf(" >end\\n");
      printf("test:%s\\n",out_gpu);

    }
    """)

func = mod.get_function("say_hi")


s= np.array(['s12345'], dtype=np.str_)
print('shape:',s.shape)
print(s,s.data)

s_gpu = gpuarray.to_gpu(s)
func(s_gpu,block=(s.size,1,1),grid=(1,1,1))
print('modify:',s_gpu.get())

Overwriting s.py


In [187]:
!python s.py

shape: (1,)
['s12345'] <memory at 0x7f800023c580>
thread id: [0]  (null)
s    >end
test:s
modify: ['s12345']


In [188]:
%%writefile t389.cu
#include <stdio.h>
#include <string.h>

#define nTPB 256

__global__ void kern_1D(char *data, unsigned *indices, unsigned num_strings){

  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < num_strings)
    printf("Hello from thread %d, my string is %s\n", idx, data+indices[idx]);
}

__global__ void kern_2D(char **data, unsigned num_strings){

  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < num_strings)
    printf("Hello from thread %d, my string is %s\n", idx, data[idx]);
}

int main(){

  const int num_strings = 3;
  const char s0[] = "s1\0";
  const char s1[] = "s2\0";
  const char s2[] = "sstz3\0";
  int ds[num_strings];
  ds[0] = sizeof(s0)/sizeof(char);
  ds[1] = sizeof(s1)/sizeof(char);
  ds[2] = sizeof(s2)/sizeof(char);
  // pretend we have a dynamically allocated char** array
  char **data;
  data = (char **)malloc(num_strings*sizeof(char *));
  data[0] = (char *)malloc(ds[0]*sizeof(char));
  data[1] = (char *)malloc(ds[1]*sizeof(char));
  data[2] = (char *)malloc(ds[2]*sizeof(char));
  // initialize said array
  strcpy(data[0], s0);
  strcpy(data[1], s1);
  strcpy(data[2], s2);
  // method 1: "flattening"
  char *fdata = (char *)malloc((ds[0]+ds[1]+ds[2])*sizeof(char));
  unsigned *ind   = (unsigned *)malloc(num_strings*sizeof(unsigned));
  unsigned next = 0;
  for (int i = 0; i < num_strings; i++){
    strcpy(fdata+next, data[i]);
    ind[i] = next;
    next += ds[i];}
  //copy to device
  char *d_fdata;
  unsigned *d_ind;
  cudaMalloc(&d_fdata, next*sizeof(char));
  cudaMalloc(&d_ind, num_strings*sizeof(unsigned));
  cudaMemcpy(d_fdata, fdata, next*sizeof(char), cudaMemcpyHostToDevice);
  cudaMemcpy(d_ind, ind, num_strings*sizeof(unsigned), cudaMemcpyHostToDevice);
  printf("method 1:\n");
  kern_1D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_fdata, d_ind, num_strings);
  cudaDeviceSynchronize();
  //method 2: "2D" (pointer-to-pointer) array
  char **d_data;
  cudaMalloc(&d_data, num_strings*sizeof(char *));
  char **d_temp_data;
  d_temp_data = (char **)malloc(num_strings*sizeof(char *));
  for (int i = 0; i < num_strings; i++){
    cudaMalloc(&(d_temp_data[i]), ds[i]*sizeof(char));
    cudaMemcpy(d_temp_data[i], data[i], ds[i]*sizeof(char), cudaMemcpyHostToDevice);
    cudaMemcpy(d_data+i, &(d_temp_data[i]), sizeof(char *), cudaMemcpyHostToDevice);}
  printf("method 2:\n");
  kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_data, num_strings);
  cudaDeviceSynchronize();
  // method 3: managed allocations
  // start over with a managed char** array
  char **m_data;
  cudaMallocManaged(&m_data, num_strings*sizeof(char *));
  cudaMallocManaged(&(m_data[0]), ds[0]*sizeof(char));
  cudaMallocManaged(&(m_data[1]), ds[1]*sizeof(char));
  cudaMallocManaged(&(m_data[2]), ds[2]*sizeof(char));
  // initialize said array
  strcpy(m_data[0], s0);
  strcpy(m_data[1], s1);
  strcpy(m_data[2], s2);
  // call kernel directly on managed data
  printf("method 3:\n");
  kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(m_data, num_strings);
  cudaDeviceSynchronize();

  return 0;
}

Overwriting t389.cu


In [189]:
!nvcc -o t389 t389.cu
!compute-sanitizer ./t389

method 1:
Hello from thread 0, my string is s1
Hello from thread 1, my string is s2
Hello from thread 2, my string is sstz3
method 2:
Hello from thread 0, my string is s1
Hello from thread 1, my string is s2
Hello from thread 2, my string is sstz3
method 3:
Hello from thread 0, my string is s1
Hello from thread 1, my string is s2
Hello from thread 2, my string is sstz3


In [174]:
%%writefile s2.py
import time
import numpy as np
from pycuda import driver, compiler, gpuarray, tools
import math
from sys import getsizeof

import pycuda.autoinit

kernel_code1 = """
__global__ void test1(char** d_wordList) {
    (d_wordList[blockIdx.x][threadIdx.x])++;      
}
    """

kernel_code2 = """
__global__ void test2(char* d_wordList, size_t *offsets) {

    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    printf("Hello from thread %d, my string is %s\\n", idx, d_wordList+offsets[idx]);
    (d_wordList[offsets[blockIdx.x] + threadIdx.x])++;
}
    """




mod = compiler.SourceModule(kernel_code1)
ker_test1 = mod.get_function("test1")



wordList = ['asd','bsd','csd']

d_words = []

for word in wordList:
    d_words.append(gpuarray.to_gpu(np.array(word, dtype=str)))

d_wordList = gpuarray.to_gpu(np.array([word.ptr for word in d_words], dtype=np.uintp))

ker_test1(d_wordList, block=(3,1,1), grid=(3,1,1))

for word in d_words:
  result = word.get()
  print(result)

mod2 = compiler.SourceModule(kernel_code2)
ker_test2 = mod2.get_function("test2")
wordlist2 = np.array(['asdbsdcsd'], dtype=np.str_)
d_words2 = gpuarray.to_gpu(np.array(['asd','bsd','csd'], dtype=np.str_))
offsets = gpuarray.to_gpu(np.array([0,3,6,9], dtype=np.uint64))
ker_test2(d_words2, offsets, block=(3,1,1), grid=(1,1,1))
h_words2 = d_words2.get()
print(h_words2)


Overwriting s2.py


In [175]:
!python s2.py

𐅢sd
𐅣sd
𐅤sd
Hello from thread 0, my string is a
Hello from thread 1, my string is 
Hello from thread 2, my string is 
['𐅢sd' 'bsd' 'csd']
