# `streams` task

In [30]:
import pandas as pd
import pandasql as ps
import plotly.graph_objects as go
import numpy as np

## Results

In [31]:
# Load the raw benchmark measurements. The SQL cells further down refer to
# this frame by the name `df`, so the variable name must stay as it is.
df = pd.read_csv(filepath_or_buffer="results.csv")
df

Unnamed: 0,rep,size,num_streams,block_size,section,dur
0,0,1024,0,32,total,0.000557
1,0,1024,0,32,stream-0,0.000090
2,0,1024,0,32,kernel-0,0.000055
3,0,1024,1,32,total,0.000521
4,0,1024,1,32,stream-0,0.000071
...,...,...,...,...,...,...
126715,15,67108864,64,1024,stream-60,0.010627
126716,15,67108864,64,1024,kernel-61,0.009516
126717,15,67108864,64,1024,kernel-62,0.009466
126718,15,67108864,64,1024,kernel-26,0.007415


## Code sections

In [32]:
# Distinct section labels, in order of first appearance in the results.
pd.unique(df["section"])

array(['total', 'stream-0', 'kernel-0', 'stream-1', 'kernel-1',
       'stream-3', 'kernel-3', 'stream-2', 'kernel-2', 'stream-7',
       'kernel-7', 'stream-6', 'kernel-4', 'stream-5', 'kernel-6',
       'kernel-5', 'stream-4', 'kernel-15', 'kernel-12', 'stream-15',
       'kernel-14', 'stream-9', 'kernel-8', 'stream-8', 'kernel-9',
       'stream-14', 'kernel-10', 'stream-12', 'stream-10', 'kernel-11',
       'stream-11', 'kernel-13', 'stream-13', 'stream-30', 'stream-29',
       'stream-19', 'kernel-30', 'stream-28', 'kernel-16', 'kernel-27',
       'kernel-25', 'stream-31', 'kernel-28', 'kernel-21', 'stream-27',
       'kernel-26', 'stream-21', 'kernel-17', 'stream-18', 'stream-25',
       'stream-17', 'stream-24', 'stream-16', 'kernel-31', 'kernel-18',
       'kernel-20', 'stream-20', 'kernel-19', 'kernel-22', 'stream-22',
       'kernel-23', 'stream-26', 'stream-23', 'kernel-24', 'kernel-29',
       'stream-63', 'stream-42', 'kernel-63', 'kernel-42', 'kernel-47',
       'stream-4

Explanations:
- `total`: total time of execution;
- `stream-${stream_idx}`: total duration of a given stream (including copying data);
- `kernel-${stream_idx}`: duration of kernel execution in a given stream.

In [33]:
# Collapse the repetitions of every (size, num_streams, block_size, section)
# configuration into the mean duration and its population standard deviation.
#
# The deviation is computed with the E[x^2] - E[x]^2 identity. When all samples
# of a group are (nearly) equal, floating-point rounding can push that
# difference slightly below zero; SQLite's sqrt() returns NULL for a negative
# argument, which would surface as a missing std instead of 0. The two-argument
# scalar max() clamps the variance at 0 before the square root is taken.
avg_df = ps.sqldf("""
select size, num_streams, block_size, section, avg(dur) mean, sqrt(max(avg(dur*dur)-avg(dur)*avg(dur), 0.0)) std
from df
group by size, num_streams, block_size, section;
""")

avg_df

Unnamed: 0,size,num_streams,block_size,section,mean,std
0,1024,0,32,kernel-0,0.000044,0.000005
1,1024,0,32,stream-0,0.000080,0.000007
2,1024,0,32,total,0.000519,0.000017
3,1024,0,64,kernel-0,0.000043,0.000005
4,1024,0,64,stream-0,0.000079,0.000006
...,...,...,...,...,...,...
7915,67108864,64,1024,stream-63,0.010592,0.000075
7916,67108864,64,1024,stream-7,0.006080,0.000036
7917,67108864,64,1024,stream-8,0.010213,0.000071
7918,67108864,64,1024,stream-9,0.010252,0.000065


In [34]:
# For every (size, num_streams) pair keep the 'total' row of the block size
# that achieved the smallest mean duration, together with that row's std.
#
# The rows are ranked explicitly with a window function. Combining GROUP BY
# with bare `mean`/`std` columns and `having mean = min(mean)` only yields the
# argmin row because of SQLite's special treatment of bare columns next to
# min()/max(); it is rejected by stricter SQL engines and reads like a filter
# that could drop groups. row_number() returns exactly one row per group.
best_df = ps.sqldf("""
select size, num_streams, mean, std
from (
    select size, num_streams, mean, std,
        row_number() over (partition by size, num_streams order by mean) rk
    from avg_df
    where section = 'total'
) ranked
where rk = 1
order by size, num_streams;
""")
best_df.tail(8)

Unnamed: 0,size,num_streams,mean,std
32,67108864,0,0.15493,0.000709
33,67108864,1,0.155035,0.000826
34,67108864,2,0.121485,0.000757
35,67108864,4,0.104046,0.000606
36,67108864,8,0.095436,0.000654
37,67108864,16,0.09113,0.000644
38,67108864,32,0.088834,0.000655
39,67108864,64,0.087576,0.000637


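We can see that, for the largest input size, the total execution time improves by a factor of almost two (about 1.8×: roughly 0.155 s without streams versus 0.088 s with 64 streams) as the number of streams grows. Let's investigate it further.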

In [35]:
# Mean wall-clock time of the whole run for each configuration.
# avg_df is already grouped by (size, num_streams, block_size, section), so
# after filtering on section = 'total' the GROUP BY here keeps one row per
# configuration and `mean` is simply renamed to `dur`.
total_df = ps.sqldf("""
select size, num_streams, block_size, mean dur
from avg_df
where section = 'total'
group by size, num_streams, block_size;
""")

# Kernel time per configuration: the mean durations of all 'kernel-<idx>'
# sections are added up, i.e. the result is the sum over streams, not the
# duration of the longest kernel.
kernel_df = ps.sqldf("""
select size, num_streams, block_size, sum(mean) dur
from avg_df
where section like 'kernel-%'
group by size, num_streams, block_size;
""")

# Stream time per configuration: same aggregation as for the kernels, but over
# the 'stream-<idx>' sections.
stream_df = ps.sqldf("""
select size, num_streams, block_size, sum(mean) dur
from avg_df
where section like 'stream-%'
group by size, num_streams, block_size;
""")

# Side-by-side comparison of total, kernel and stream time for the best block
# size of every (size, num_streams) pair.
#
# Columns:
#   total          - mean total run time of the chosen configuration
#   kernel         - summed kernel time over all streams
#   stream         - summed stream time over all streams
#   memory         - stream - kernel, i.e. the part of the stream time that is
#                    not kernel execution
#   total/stream   - ratio of total time to summed stream time
#   kernel/memory  - ratio of kernel time to the remaining stream time
#
# The inner query joins the three per-configuration frames on
# (size, num_streams, block_size) and ranks the block sizes of each
# (size, num_streams) pair by total time; the outer query keeps the fastest.
# The explicit row_number() ranking avoids relying on SQLite's behaviour of
# filling bare columns from the min() row in a GROUP BY ... HAVING query,
# which is not valid on stricter SQL engines.
comp_df = ps.sqldf("""
select R.size size, R.num_streams num_streams, R.total total, R.kernel kernel, R.stream stream, R.stream-R.kernel memory, R.total/R.stream "total/stream", R.kernel/(R.stream-R.kernel) "kernel/memory"
from (
    select S.size size, S.num_streams num_streams, T.dur total, K.dur kernel, S.dur stream,
        row_number() over (partition by S.size, S.num_streams order by T.dur) rk
    from stream_df S
    join kernel_df K
        on S.size = K.size
            and S.num_streams = K.num_streams
            and S.block_size = K.block_size
    join total_df T
        on S.size = T.size
            and S.num_streams = T.num_streams
            and S.block_size = T.block_size
) R
where R.rk = 1
order by R.size, R.num_streams;
""")

comp_df

Unnamed: 0,size,num_streams,total,kernel,stream,memory,total/stream,kernel/memory
0,1024,0,0.000519,4.4e-05,8e-05,3.6e-05,6.486323,1.229518
1,1024,1,0.00051,4.5e-05,6.2e-05,1.7e-05,8.19093,2.565396
2,1024,2,0.000529,8.7e-05,0.000123,3.6e-05,4.307052,2.432853
3,1024,4,0.000573,0.00017,0.000232,6.2e-05,2.468609,2.729676
4,1024,8,0.000665,0.000337,0.000452,0.000116,1.470724,2.911988
5,1024,16,0.000843,0.000666,0.000889,0.000223,0.947779,2.993173
6,1024,32,0.001187,0.001353,0.001779,0.000426,0.667118,3.174435
7,1024,64,0.001908,0.00271,0.003566,0.000856,0.534923,3.166549
8,16384,0,0.000517,4.2e-05,8.8e-05,4.6e-05,5.90107,0.91051
9,16384,1,0.000517,4.3e-05,7.5e-05,3.2e-05,6.901425,1.32358


The main point is that, when kernel execution and memory transfers take roughly equal time, increasing the number of streams should indeed reduce the execution time by a factor of around two, since the two can then overlap. This indeed seems to happen.