In [42]:
import glob

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

import plotly.express as px
import plotly.graph_objects as go

In [39]:
# Column names for the benchmark CSV. They are supplied explicitly through
# `names=`, so pandas does not take them from the file itself.
columns = [
    'approach',
    'sort',
    'world_size',
    'construct',
    'threads',
    'size',
    'duration',
]

# One row per timed run.
results = pd.read_csv(
    '../results/mpi_sort_results_backup.csv',
    names=columns,
)

In [40]:
# Show the loaded per-run results as the cell output (pandas truncates long frames).
results

Unnamed: 0,approach,sort,world_size,construct,threads,size,duration
0,serial,quick_sort,1,single,1,10000,0.001775
1,serial,merge_sort,1,single,1,10000,0.001781
2,serial,enumeration_sort,1,single,1,10000,0.229131
3,serial,quick_sort,1,single,1,10000,0.001310
4,serial,merge_sort,1,single,1,10000,0.001192
...,...,...,...,...,...,...,...
475,partition,quick_sort,2,tasks,8,100000000,15.840319
476,merge,merge_sort,2,tasks,8,100000000,14.003287
477,partition,quick_sort,2,tasks,8,100000000,15.828794
478,merge,merge_sort,2,tasks,8,100000000,14.047082


In [41]:
# List the distinct values seen in every column except the measured
# 'duration', to get an overview of the experiment grid.
for col in results.columns:
    if col == 'duration':
        continue
    print(f'{col}: {results[col].unique()}')

approach: ['serial' 'merge' 'partition']
sort: ['quick_sort' 'merge_sort' 'enumeration_sort']
world_size: [1 2]
construct: ['single' 'tasks' 'parallel']
threads: [1 2 4 8]
size: [    10000    100000   1000000  10000000 100000000]


In [54]:
# Average the repeated runs: one row per
# (approach, sort, world_size, construct, threads, size) combination,
# with 'duration' holding the mean over the rows in that group.
df = results.groupby(
    ['approach', 'sort', 'world_size', 'construct', 'threads', 'size'],
    as_index=False,  # keep the grouping keys as ordinary columns
)['duration'].mean()
df

Unnamed: 0,approach,sort,world_size,construct,threads,size,duration
0,merge,enumeration_sort,2,parallel,2,10000,0.066696
1,merge,enumeration_sort,2,parallel,2,100000,6.133736
2,merge,enumeration_sort,2,parallel,4,10000,0.034632
3,merge,enumeration_sort,2,parallel,4,100000,3.110979
4,merge,enumeration_sort,2,parallel,8,10000,0.021942
...,...,...,...,...,...,...,...
91,serial,quick_sort,1,tasks,8,10000,0.001535
92,serial,quick_sort,1,tasks,8,100000,0.018962
93,serial,quick_sort,1,tasks,8,1000000,0.233033
94,serial,quick_sort,1,tasks,8,10000000,2.643697


In [57]:
# Single-threaded MPI algorithms only: keep the rows whose construct is
# 'single' and whose input size is at most 1,000,000.
df = df[df['construct'].eq('single') & df['size'].le(1_000_000)]

In [58]:
# Show the filtered frame as the cell output.
df

Unnamed: 0,approach,sort,world_size,construct,threads,size,duration
6,merge,enumeration_sort,2,single,1,10000,0.051456
7,merge,enumeration_sort,2,single,1,100000,5.919773
8,merge,merge_sort,2,single,1,10000,0.000908
9,merge,merge_sort,2,single,1,100000,0.010127
10,merge,merge_sort,2,single,1,1000000,0.093235
28,partition,quick_sort,2,single,1,10000,0.001101
29,partition,quick_sort,2,single,1,100000,0.01092
30,partition,quick_sort,2,single,1,1000000,0.195689
54,serial,enumeration_sort,1,single,1,10000,0.219281
55,serial,enumeration_sort,1,single,1,100000,23.860981


In [None]:
# NOTE(review): this cell has no execution count and groups on the columns
# 'Algorithm', 'Construct', 'Threads', 'Size', 'Cutoff' and 'Duration'. The
# column list passed to read_csv earlier in this notebook is
# approach/sort/world_size/construct/threads/size/duration, so against that
# `results` frame the groupby below is expected to raise a KeyError.
# Presumably this cell was written for a different results schema -- confirm
# before running, and adapt or remove it.

# take average over trials
# (result is a one-column 'Duration' frame indexed by the five grouping keys)
df = results.groupby(['Algorithm', 'Construct', 'Threads', 'Size', 'Cutoff'])['Duration'].mean().to_frame('Duration')

# Cutoff value whose runs are kept by the filter below.
cutoff = 100
# Human-readable label derived from `cutoff`; it is not used again in this cell.
if_cutoff = 'No Hybrid Parallelism' if cutoff == 0 else 'Hybrid Parallelism'

# exclude parallel implementations without cutoffs
# Keep rows that are either 'serial' or measured at the chosen cutoff, and
# whose Size is at most 1,000,000.
df = df.loc[
    (
        (df.index.get_level_values('Construct') == 'serial')
        | (df.index.get_level_values('Cutoff') == cutoff)
    )
#     & (df.index.get_level_values('Algorithm') != 'enumeration')
#     & (df.index.get_level_values('Construct') != 'sections')
    & (df.index.get_level_values('Size') <= 1000000)
]

# Drop the 'Cutoff' index level; the remaining index levels are
# Algorithm / Construct / Threads / Size.
df = df.reset_index(['Cutoff'], drop=True)

# copy single threaded "serial" to one each for tasks and sections for visualisation
# `sections` starts as the 'serial' rows with the index flattened to columns;
# `tasks` is an independent copy of the same rows.
sections = df.loc[df.index.get_level_values('Construct') == 'serial'].reset_index()
tasks = sections.copy()

# Relabel the serial rows as 'sections' and restore the same index levels as df.
sections.Construct = sections.Construct.str.replace('serial', 'sections')
sections.set_index(df.index.names, inplace=True)
# sections.set_index(df.'Construct', inplace=True, append=True)

# Relabel the serial rows as 'tasks', except for the 'enumeration' algorithm,
# whose rows keep the 'serial' label at this point.
tasks.loc[tasks.Algorithm != 'enumeration', 'Construct'] = (
    tasks.Construct.str.replace('serial', 'tasks')
)
tasks.set_index(df.index.names, inplace=True)
# sections.set_index('Construct', inplace=True, append=True)

# merge copies together
# (the original 'serial' rows are left out; only their relabelled copies are kept)
df = pd.concat([df.loc[df.index.get_level_values('Construct') != 'serial'], sections, tasks]).reset_index()
df = df.sort_values(['Threads', 'Size'])

# the way we have copied serial implementations to tasks/sections for visualisation
# has meant we have duplicated num_threads==1 entries for enumeration
# this is not the most elegant solution, but this should replace duplicate enumeration serial entries
# Every 'enumeration' row gets Construct == 'parallel', which makes the two
# copied serial rows identical so that drop_duplicates below can remove one.
df.loc[df['Algorithm'] == 'enumeration', 'Construct'] = (
    np.repeat('parallel', len(df.loc[df['Algorithm'] == 'enumeration', 'Construct']))
)

# NOTE(review): the expected count of 5 duplicate rows is specific to the
# dataset this was written for (presumably one duplicated enumeration serial
# row per input size) -- confirm it still holds if the input data changes.
assert (len(df) - len(df.drop_duplicates())) == 5

# Remove the duplicated rows counted by the assertion above.
df = df.drop_duplicates()