# CUDA编程模型---利用Shared Memory优化程序

![](shared_memory.png)

当我们在处理矩阵乘法时，假设矩阵M(m,n)\*N(n,k) = P(m,k)（与下面核函数中参数m、n、k的含义一致）。那么，矩阵M中位于第y行的一个数值m(x,y)，就要被grid中所有满足threadIdx.y+blockIdx.y\*blockDim.y = y的线程（也就是负责计算P中第y行各元素的线程）从Global Memory中各读一次，一共就是k次。既然存在这么多重复读取，我们就可以先把这些数据放到Shared Memory中，供同一个block内的线程共享，从而极大地减少读取时间。我们可以按照下面的代码来修改matrix_mul的核函数：

```c
/*
 * Tiled integer matrix multiplication using shared memory: c = a * b.
 *
 * Layout (all row-major, as implied by the indexing below):
 *   a : m x n   (read-only)
 *   b : n x k   (read-only)
 *   c : m x k   (written)
 *
 * Launch requirements:
 *   - blockDim.x == blockDim.y == BLOCK_SIZE. The shared tiles are
 *     BLOCK_SIZE x BLOCK_SIZE and are indexed directly with threadIdx, and
 *     the tile stride along the shared dimension is BLOCK_SIZE.
 *   - The grid must cover at least k threads in x and m threads in y;
 *     threads outside the m x k result only help load tiles and write nothing.
 *   - Uses 2 * BLOCK_SIZE * BLOCK_SIZE * sizeof(int) bytes of static shared
 *     memory per block.
 */
__global__ void gpu_matrix(int* a, int* b, int* c, int m, int n, int k)
{
    __shared__ int sub_a[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ int sub_b[BLOCK_SIZE][BLOCK_SIZE];

    // Global column (x) and row (y) of the element of c owned by this thread.
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Number of tiles needed to cover the shared dimension n (ceil division).
    // Using n / BLOCK_SIZE + 1 would add a useless all-zero tile pass (two
    // barriers plus BLOCK_SIZE multiply-adds) whenever n is an exact multiple
    // of BLOCK_SIZE.
    int num_tiles = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;

    int tmp = 0;
    for (int step = 0; step < num_tiles; step++)
    {
        // Load one element of the current tile of a: row y, column a_col.
        // Out-of-range positions are zero-filled so they add nothing to tmp.
        int a_col = step * BLOCK_SIZE + threadIdx.x;
        if (a_col < n && y < m)
        {
            // Offset computed in size_t so large matrices do not overflow int.
            sub_a[threadIdx.y][threadIdx.x] = a[(size_t)y * n + a_col];
        }
        else
        {
            sub_a[threadIdx.y][threadIdx.x] = 0;
        }

        // Load one element of the current tile of b: row b_row, column x.
        int b_row = step * BLOCK_SIZE + threadIdx.y;
        if (x < k && b_row < n)
        {
            sub_b[threadIdx.y][threadIdx.x] = b[(size_t)b_row * k + x];
        }
        else
        {
            sub_b[threadIdx.y][threadIdx.x] = 0;
        }

        // Both tiles must be fully written before any thread reads them.
        // num_tiles is uniform across the block, so every thread reaches
        // this barrier the same number of times.
        __syncthreads();

        for (int i = 0; i < BLOCK_SIZE; i++)
        {
            tmp += sub_a[threadIdx.y][i] * sub_b[i][threadIdx.x];
        }

        // Do not let any thread overwrite the tiles for the next step while
        // others are still reading the current ones.
        __syncthreads();
    }

    // Only threads that map to a real element of c store a result.
    if (x < k && y < m)
    {
        c[(size_t)y * k + x] = tmp;
    }
}
```

修改[matrix_mul.cu](matrix_mul.cu)文件，编译并执行。如果遇到困难，请参考[result_1.cu](result_1.cu)

In [None]:
!/usr/local/cuda/bin/nvcc matrix_mul.cu -o matrix_mul

In [None]:
!./matrix_mul

课后作业:

* 请大家尝试利用shared memory优化矩阵转置的示例, 如果遇到困难, 请参考[result_2.cu](result_2.cu)