X, A, B
X => C1 = A1 * B1
C2 = A2 * B2

In [None]:
%%writefile matmul.cu
#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>

using namespace std;

/**
 * Batched dense matrix multiply: each thread computes one complete product
 * C_k = A_k * B_k, where k = threadIdx.x + offset.
 *
 * Layout: A, B and C each hold the batch back to back in row-major order, so
 * matrix k of A starts at A + k*M*N, of B at B + k*N*P and of C at C + k*M*P.
 *
 * Launch: only threadIdx.x is used to pick the matrix (blockIdx is ignored),
 * so launch a single 1-D block. There is no bounds check in the kernel; the
 * caller must guarantee that offset + blockDim.x does not exceed the number
 * of matrices actually stored in the buffers.
 *
 * @param A      device pointer to the input matrices A_k (each M x N)
 * @param B      device pointer to the input matrices B_k (each N x P)
 * @param C      device pointer to the output matrices C_k (each M x P);
 *               previous contents are overwritten, no zero-fill required
 * @param M      rows of A_k / C_k
 * @param N      columns of A_k, rows of B_k
 * @param P      columns of B_k / C_k
 * @param offset index of the first matrix handled by this launch
 */
__global__ void matrixMul(float* A, float* B, float* C, int M, int N, int P, int offset) {
    // Widen before multiplying so the batch offset cannot overflow 32-bit int.
    const size_t k = static_cast<size_t>(threadIdx.x + offset);

    const float* a = A + k * M * N;
    const float* b = B + k * N * P;
    float* c = C + k * M * P;

    for (int i = 0; i < M; i++) {
        for (int l = 0; l < P; l++) {
            // c[i][l] = sum over j of a[i][j] * b[j][l]
            // Accumulate in a register and store once: this both fixes the
            // original "last term only" result and removes any dependence on
            // the initial contents of C.
            float sum = 0.0f;
            for (int j = 0; j < N; j++) {
                sum += a[i * N + j] * b[j * P + l];
            }
            c[i * P + l] = sum;
        }
    }
}

// Abort with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
        exit(EXIT_FAILURE);
    }
}

/**
 * Multiplies K pairs of 400x400 matrices on the GPU, at most T pairs per
 * kernel launch (one thread per pair), and writes every product to
 * output.txt.
 *
 * Usage: ./matmul <T> <K>
 *   T  maximum number of threads (matrix products) per launch, > 0
 *   K  total number of matrix products to compute, > 0
 *
 * Returns 0 on success and 1 on bad arguments or file errors; CUDA failures
 * terminate the process through checkCuda().
 */
int main(int argc, char *argv[]) {

    if (argc < 3) {
        cerr << "Usage: " << (argc > 0 ? argv[0] : "matmul")
             << " <threads-per-launch> <matrix-count>" << endl;
        return 1;
    }

    int T = atoi(argv[1]); // how many threads we may use per launch
    int K = atoi(argv[2]); // how many matrix multiplications to perform

    // A non-positive T would make every batch empty and the launch loop
    // below would never finish; a non-positive K means there is no work.
    if (T <= 0 || K <= 0) {
        cerr << "Both the thread count and the matrix count must be positive" << endl;
        return 1;
    }

    // A single block cannot hold more threads than the device allows; a
    // larger request would make the launch fail, so clamp it instead.
    int device = 0;
    checkCuda(cudaGetDevice(&device), "cudaGetDevice");
    int maxThreads = 0;
    checkCuda(cudaDeviceGetAttribute(&maxThreads, cudaDevAttrMaxThreadsPerBlock, device),
              "cudaDeviceGetAttribute");
    if (T > maxThreads) {
        cerr << "Requested " << T << " threads per block; clamping to device limit "
             << maxThreads << endl;
        T = maxThreads;
    }

    const int M = 400, N = 400, P = 400;

    // Element counts in size_t so M*N*K cannot overflow 32-bit int.
    const size_t SizeA = static_cast<size_t>(M) * N * K;
    const size_t SizeB = static_cast<size_t>(N) * P * K;
    const size_t SizeC = static_cast<size_t>(M) * P * K;

    // Host buffers (released automatically when main returns).
    vector<float> h_A(SizeA);
    vector<float> h_B(SizeB);
    vector<float> h_C(SizeC);

    // Data initialisation.
    for (size_t i = 0; i < SizeA; i++) {
        h_A[i] = rand();
    }
    for (size_t i = 0; i < SizeB; i++) {
        h_B[i] = rand();
    }

    // Device buffers.
    float *d_A = nullptr;
    float *d_B = nullptr;
    float *d_C = nullptr;
    checkCuda(cudaMalloc(&d_A, SizeA * sizeof(float)), "cudaMalloc d_A");
    checkCuda(cudaMalloc(&d_B, SizeB * sizeof(float)), "cudaMalloc d_B");
    checkCuda(cudaMalloc(&d_C, SizeC * sizeof(float)), "cudaMalloc d_C");

    // Copy inputs from host to device.
    checkCuda(cudaMemcpy(d_A, h_A.data(), SizeA * sizeof(float), cudaMemcpyHostToDevice),
              "cudaMemcpy h_A -> d_A");
    checkCuda(cudaMemcpy(d_B, h_B.data(), SizeB * sizeof(float), cudaMemcpyHostToDevice),
              "cudaMemcpy h_B -> d_B");

    // Process the K products in batches of at most T threads each.
    int remaining = K; // products still to be computed
    int offset = 0;    // index of the first matrix of the current batch
    while (remaining > 0) {

        const int currentBatch = std::min(remaining, T);

        matrixMul<<<1, currentBatch>>>(d_A, d_B, d_C, M, N, P, offset);
        // Launch-configuration errors are only visible through cudaGetLastError().
        checkCuda(cudaGetLastError(), "matrixMul launch");

        remaining -= currentBatch;
        offset += currentBatch;
    }
    // Launches on the same stream already run in order, so one
    // synchronisation after the loop is enough; it also surfaces any fault
    // raised while the kernels were executing.
    checkCuda(cudaDeviceSynchronize(), "matrixMul execution");

    // Copy the results back to the host.
    checkCuda(cudaMemcpy(h_C.data(), d_C, SizeC * sizeof(float), cudaMemcpyDeviceToHost),
              "cudaMemcpy d_C -> h_C");

    // The device buffers are no longer needed.
    checkCuda(cudaFree(d_A), "cudaFree d_A");
    checkCuda(cudaFree(d_B), "cudaFree d_B");
    checkCuda(cudaFree(d_C), "cudaFree d_C");

    // Save results to file.
    ofstream fout("output.txt");

    if (!fout) {
        cerr << "Error opening output file!" << endl;
        return 1;
    }

    for (int k = 0; k < K; k++) {
        fout << "Matrix C[" << k << "]:" << '\n';
        const size_t base = static_cast<size_t>(k) * M * P;
        for (int i = 0; i < M; i++) {
            for (int j = 0; j < P; j++) {
                fout << h_C[base + static_cast<size_t>(i) * P + j] << " ";
            }
            fout << '\n';
        }
        fout << '\n';
    }

    fout.close();
    if (!fout) {
        cerr << "Error writing output file!" << endl;
        return 1;
    }
    cout << "Results written to output.txt" << endl;


    cout << "All operation done" << endl;

    return 0;
}

Overwriting matmul.cu


In [None]:
!nvcc -arch=sm_75 matmul.cu -o matmul

In [None]:
!time ./matmul 1 10 && sleep 2

All operation done

real	0m29.922s
user	0m29.468s
sys	0m0.283s


In [None]:
!time ./matmul 3 10 && sleep 2

All operation done

real	0m12.954s
user	0m12.633s
sys	0m0.228s


In [None]:
!time ./matmul 7 10 && sleep 2

All operation done

real	0m8.002s
user	0m7.700s
sys	0m0.229s


In [None]:
!time ./matmul 10 10 && sleep 2

All operation done

real	0m5.295s
user	0m5.000s
sys	0m0.223s


In [None]:
!time ./matmul 20 10 && sleep 2

All operation done

real	0m5.251s
user	0m5.026s
sys	0m0.210s
