# SDP HPSO Scheduling

Last run with Jupyter Notebook 5.6.0 running Python 3.7.0

In [None]:
# Imports
import matplotlib.pyplot as plt
import sys
import os
import pickle
import random
import numpy as np
import csv

sys.path += ['..']
from sdp_par_model import reports as iapi
from sdp_par_model.parameters.definitions import *
from sdp_par_model.parameters.definitions import Constants as c

from sdp_par_model.scheduler import Definitions as sdefs
from sdp_par_model.scheduler import Scheduler

import scheduling.scheduling as sched

import collections
import warnings
import bisect

%matplotlib inline
plt.rcParams['figure.figsize'] = 16, 8

## Let's create a sequence of tasks 
### (using letters A..G to define scheduling blocks - see Google Drive or the Python code for Definitions)

In [None]:
# Compute capacity available to each telescope's SDP, in PetaFlops.
flops_capacity_low, flops_capacity_mid = 13.8, 12.1

# Buffer sizes in PetaBytes, one (cold, hot) pair per telescope.
cold_buffer_size_low, hot_buffer_size_low = 30, 20
cold_buffer_size_mid, hot_buffer_size_mid = 30, 20

# Observation sequences: every character is one scheduling-block letter, and
# list() splits the concatenated runs into one single-letter entry per block.
seqL = list('BAA' + 'B' * 32 + 'A' * 2 + 'B' * 73 + 'A' + 'B' * 43)
seqM = list('BG' + 'B' * 34 + 'GCF' + 'B' * 110 + 'F' * 91 + 'G' * 2 + 'E' * 4 + 'D')

In [None]:
# Shuffle a private copy so that seqL itself keeps its original order.
sequence_to_simulate = list(seqL)
random.shuffle(sequence_to_simulate)  # in-place random permutation of the block letters

# Translate the shuffled letters into HPSOs for the scheduler.
hpso_list = Scheduler.hpso_letters_to_hpsos(sequence_to_simulate)

# Flag handed to Scheduler.hpsos_to_sdp_task_list when the task list is built.
keep_data_in_cold_buffer = False

In [None]:
# Select the capacities and the plot label of the telescope being simulated.
tel_str = "LOW"
flops_cap, coldbuf_cap, hotbuf_cap = (
    flops_capacity_low,
    cold_buffer_size_low,
    hot_buffer_size_low,
)

# The block below caches the performance dictionary on disk; it is disabled for now (debugging).
'''
## Read  performace requirement lookup for all HPSOs. 
### If this lookup table does not exist, we create it, and save it to disk (to save time re-computing)
performance_lookup_filename = "performance_dict.data"
if os.path.isfile(performance_lookup_filename):
    performance_dict = None
    with open(performance_lookup_filename, "rb") as f:
        performance_dict = pickle.load(f)
else:
    # Create a performance dictionary and write it to file
    performance_dict = Scheduler.compute_performance_dictionary()
    with open(performance_lookup_filename, "wb") as f:
        pickle.dump(performance_dict, f, pickle.HIGHEST_PROTOCOL)
'''

# With the cache disabled, the dictionary is recomputed every time this cell runs.
performance_dictionary = Scheduler.compute_performance_dictionary()

In [None]:
# Turn the HPSO list into SDP tasks and schedule them against the configured
# FLOP/s, hot-buffer and cold-buffer capacities.
task_list = Scheduler.hpsos_to_sdp_task_list(hpso_list, performance_dictionary, keep_data_in_cold_buffer)
# To show how the tasks are created, print the sequence of Task objects:
#for task in task_list:
#    print(task)

schedule = Scheduler.schedule(task_list, flops_cap, hotbuf_cap, coldbuf_cap,
                              assign_flops_fraction=0.5, assign_bw_fraction=0.5, max_nr_iterations=1000)

# The largest key of preserve_deltas is used as the completion time of the
# whole run. It is computed once and reused below; max() avoids sorting every
# key just to read the last one.
last_preservation_timestamp = max(schedule.preserve_deltas.keys())
max_t = last_preservation_timestamp
print("SDP task sequence completes at t = %g hrs" % (max_t / 3600))

# Now we plot the results

# Axis limits: 5% headroom beyond the last timestamp and beyond the largest
# preservation value; the y-range never collapses below 1.
max_preservation = max(schedule.preserve_deltas.values())
xrange = [0, last_preservation_timestamp * 1.05]
preserv_yrange = [0, max(max_preservation * 1.05, 1)]

iapi.plot_deltas(schedule.flops_deltas, xrange=xrange, max_t=max_t,
                 title='%s SDP FLOP/s (capped at %.3g PetaFLOPS)' % (tel_str, flops_cap),
                 xlabel='wall clock time (hours)', ylabel='PetaFLOP/s')
# Disabled plot (working memory):
#iapi.plot_deltas(schedule.memory_deltas, xrange=xrange, max_t=max_t,
#                 title='Evolution of SDP working memory (RAM)', xlabel='wall clock time (hours)', ylabel='TeraByte')
iapi.plot_deltas(schedule.cold_buffer_deltas, xrange=xrange, max_t=max_t,
                 title='%s SDP Cold buffer usage (capped at %.3g PetaByte)' % (tel_str, coldbuf_cap),
                 xlabel='wall clock time (hours)', ylabel='PetaByte')
iapi.plot_deltas(schedule.hot_buffer_deltas, xrange=xrange, max_t=max_t,
                 title='%s SDP Hot Buffer usage (capped at %.0f PetaByte)' % (tel_str, hotbuf_cap),
                 xlabel='wall clock time (hours)', ylabel='PetaByte')
iapi.plot_deltas(schedule.preserve_deltas, xrange=xrange, yrange=preserv_yrange, max_t=max_t,
                 title='%s SDP Preservation usage (uncapped)' % tel_str, xlabel='wall clock time (hours)', ylabel='TeraByte')

iapi.plot_deltas(schedule.ingest_pipe_deltas, xrange=xrange, max_t=max_t,
                 title='Bandwidth of %s (Ingest pipeline -> Cold Buffer)' % tel_str,
                 xlabel='wall clock time (hours)', ylabel='TeraByte/s', colour='c')
# Disabled plot:
## Ingest -> Working memory pipeline is identical to Working Memory -> Cold Buffer (streaming)
#iapi.plot_deltas(schedule.mem_cold_pipe_deltas, xrange=xrange, max_t=max_t,
#                 title='Bandwidth of Ingest working memory -> Cold Buffer',
#                 xlabel='wall clock time (hours)', ylabel='TeraByte/s', colour='c')
iapi.plot_deltas(schedule.cold_hot_pipe_deltas, xrange=xrange, max_t=max_t,
                 title='Bandwidth of %s (Cold Buffer -> Hot Buffer)' % tel_str,
                 xlabel='wall clock time (hours)', ylabel='TeraByte/s', colour='c')
# Disabled plots (hot buffer <-> working memory):
#iapi.plot_deltas(schedule.hot_mem_pipe_delta, xrange=xrange, max_t=max_t,
#                 title='Bandwidth usage of pipeline from hot buffer to working memory',
#                 xlabel='wall clock time (hours)', ylabel='TeraByte/s', colour='c')
#iapi.plot_deltas(schedule.mem_hot_pipe_delta, xrange=xrange, max_t=max_t,
#                 title='Bandwidth usage of pipeline from working memory to hot buffer',
#                 xlabel='wall clock time (hours)', ylabel='TeraByte/s', colour='c')
# NOTE(review): this attribute ends in `_delta` while the other schedule
# attributes plotted above end in `_deltas` - confirm the spelling against
# the schedule object returned by Scheduler.schedule.
iapi.plot_deltas(schedule.hot_preserve_pipe_delta, xrange=xrange, max_t=max_t,
                 title='Bandwidth %s (Hot Buffer -> Preservation)' % tel_str,
                 xlabel='wall clock time (hours)', ylabel='TeraByte/s', colour='c')

### Using dynamic generation, loosely based on Mark Ashdown's code

In [None]:
print("Working directory = %s" % os.getcwd())

# Record layout for one HPSO column: three fixed-width unicode text fields
# followed by six 8-byte float fields.
dtype_pmout = np.dtype(
    [('name', 'U30'), ('tele', 'U10'), ('pipe', 'U10')]
    + [(field, 'f8') for field in ('tobs', 'tpoint', 'texp', 'rflop', 'mvis', 'mout')]
)

In [None]:
# Parse the HPSO CSV export into the structured array `pmout`.
# The file is laid out with one HPSO per column: the first row holds the
# column headers and each later row holds one quantity for every HPSO, with
# the row title in the first cell.
ifile = os.path.join('..','data','csv','2017-12-18-0d8d518_hpsos.csv')

# Row title (first cell of a CSV row) -> pmout field filled from that row.
pmout_row_fields = {
    'Telescope': 'tele',
    'Pipeline': 'pipe',
    'Observation Time [s]': 'tobs',
    'Pointing Time [s]': 'tpoint',
    'Total Time [s]': 'texp',
    'Total Compute Requirement [PetaFLOP/s]': 'rflop',
    'Visibility Buffer [PetaBytes]': 'mvis',
    'Output size [TB]': 'mout',
}

# newline='' is what the csv module expects of files handed to csv.reader.
with open(ifile, 'r', newline='') as f:
    reader = csv.reader(f)
    line = next(reader)
    # One record per HPSO column (the first column holds the row titles).
    pmout = np.zeros(len(line)-1, dtype=dtype_pmout)

    # Force non-defined entries to have NaN values: the last six fields start
    # as NaN so that rows missing from the file stay recognisable.
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.
    for field in dtype_pmout.names[-6:]:
        pmout[field] = np.nan

    # Read all the column names, discarding everything from the first space onwards
    pmout['name'] = [x.split()[0] for x in line[1:]]
    print('Column headers:\n')
    print(line)
    print('\nExtracted HPSO names:\n')
    print(pmout['name'])

    # Now iterate across all rows. If a row title is one we're interested in,
    # copy its values for all columns into the matching pmout field.
    # Empty or unrecognized rows are skipped.
    for line in reader:
        if not line:
            continue
        field = pmout_row_fields.get(line[0])
        if field is not None:
            pmout[field] = line[1:]
            

In [None]:
# Display the structured array that was filled from the HPSO CSV file.
pmout

In [None]:
# Display the result of Scheduler.compute_performance_dictionary().
performance_dictionary

In [None]:
# Lookup containers; the only code in this cell that would fill them sits in
# the disabled (triple-quoted) block at the bottom.
tel_to_hpsos, hpso_params = {}, {}

# Input CSV with the HPSO parameters; the alternative export is kept for reference.
#ifile = os.path.join('..','data','csv','2018-09-17-2c761d6_hpsos.csv')
ifile = os.path.join('..','data','csv','2017-12-18-0d8d518_hpsos.csv')

# Template for the output file name; {t} receives the short telescope tag.
oform = 'hpsos_{t}.txt'

# Pairs of (short tag used in the output file name, telescope name given to extract_projects).
tele = [('low', 'SKA1_Low'),
        ('mid', 'SKA1_Mid')]

for tel, tinp in tele:
    ofile = oform.format(t=tel)

    # Pull this telescope's projects out of the CSV, then write them to ofile.
    proj = sched.extract_projects(ifile, tinp)
    sched.write_projects(ofile, proj)

    print("Processed %s, %s" % (tel, tinp))
    # NOTE(review): this unconditional break ends the loop after the first
    # entry, so only ('low', 'SKA1_Low') is processed - confirm this is intended.
    break

'''
for hpso in HPSOs.hpsos_original:
    p = ParameterContainer()
    hpso_params = apply_hpso_parameters(p, hpso, HPSOs.hpso_tasks[hpso][0])
    tel = hpso_params.telescope
    texp = hpso_params.Texp
    tpoint = hpso_params.Tpoint
    
    if not tel in tel_to_hpsos:
        tel_to_hpsos[tel] = set()
    tel_to_hpsos[tel].add(hpso)
'''

In [None]:
# NOTE(review): hpso_params is initialised to {} and the only visible code that
# reassigns it is inside a disabled triple-quoted block, so this presumably
# prints an empty dict - confirm.
print(hpso_params)

In [None]:
# NOTE(review): tel_to_hpsos is initialised to {} and the only visible code that
# fills it is inside a disabled triple-quoted block, so this presumably shows
# an empty dict - confirm.
tel_to_hpsos

### This can also be read from e.g. a CSV file if you want flexible schedules to be simulated
#### Using a variant of Mark Ashdown's schedule generating code

In [None]:
# Collect the projects of every telescope, keyed by telescope name.
projects = {}
ifile = os.path.join('..', 'scheduling', 'hpsos.csv')

for tinp in ('SKA1_Low', 'SKA1_Mid'):
    # Keep the latest result in `proj` as well as storing it under its telescope.
    proj = projects[tinp] = sched.extract_projects(ifile, tinp)

In [None]:
# Parameters of the generated observation sequence.
allow_short_tobs = False
tseq = 10.0 * 24.0 * 3600.0   # length of the sequence to generate (10 days' worth of seconds)
tsched = 6.0 * 3600.0         # length of one scheduling block (6 hours' worth of seconds)

# Telescope name -> list of HPSO identifiers in generated order.
generated_sequences = {}

for t in ('SKA1_Low', 'SKA1_Mid'):
    proj = projects[t]
    seq = sched.generate_sequence(proj, tsched, tseq, allow_short_tobs=allow_short_tobs)
    # Element 1 of every generated item is the HPSO identifier; keep only that.
    sequence = [item[1] for item in seq]
    generated_sequences[t] = sequence

# Old code follows below - may not work since updates

# Old-school letter sequence method:
## Create the Scheduler object, set it up, execute the scheduler, and plot the results
At the moment the Scheduler object contains the functionality of the SDP simulator, the scheduling code itself, as well as the generated schedule objects. In future we may wish to split them into separate classes and objects.

## Run a number of randomized sequences, looking at spread of results

In [None]:
# Schedule many random permutations of the LOW sequence and record the
# completion time of each run in `runtimes`.
# NOTE(review): other cells call Scheduler methods on the class and pass the
# performance dictionary explicitly; the instance-style calls below may no
# longer match the current Scheduler API - confirm before relying on this cell.
nr_of_runs = 100

# Shuffle a private copy: random.shuffle works in place, so aliasing seqL here
# would permute the module-level sequence as a side effect of running this cell.
sequence_to_simulate = seqL.copy()

# Caps are all in "Peta" units (FLOPS, or Bytes)
flops_cap   = flops_capacity_low
coldbuf_cap = cold_buffer_size_low
hotbuf_cap  = hot_buffer_size_low
tel_str = "LOW"

## Read performance requirement lookup for all HPSOs.
### This cell does not create the lookup table; it must already exist on disk.
performance_lookup_filename = "performance_dict.data"
if not os.path.isfile(performance_lookup_filename):
    raise FileNotFoundError("No Performance dictionary file found!")
# NOTE: unpickling can execute arbitrary code - only load a file that was
# written by a trusted run of this project.
with open(performance_lookup_filename, "rb") as f:
    performance_dict = pickle.load(f)

runtimes = np.zeros(nr_of_runs)
for i in range(nr_of_runs):
    # Randomly shuffles the sequence of processing blocks (letters)
    random.shuffle(sequence_to_simulate)
    sdp_scheduler = Scheduler()  # Create a new scheduler, because the Scheduler has a state
    sdp_scheduler.set_performance_dictionary(performance_dict)
    task_list = sdp_scheduler.hpso_letters_to_sdp_task_list(sequence_to_simulate)

    schedule = sdp_scheduler.schedule(task_list, flops_cap, hotbuf_cap, coldbuf_cap,
                                      assign_flops_fraction=0.5, assign_bw_fraction=0.5, max_nr_iterations=1000)
    # The largest preservation timestamp marks the end of the run; max() avoids a full sort.
    max_t = max(schedule.preserve_deltas.keys())
    runtimes[i] = max_t
    print("Run %d of %d : SDP task seq completed at t = %g hrs" % (i+1, nr_of_runs, (max_t / 3600)))

print("Done!")

In [None]:
# Histogram of the recorded completion times, with the median in the title.
runtimes_hours = runtimes / 3600  # same conversion used for the "hrs" printouts

plt.hist(runtimes_hours)
# Keyword names must be lower case ('fontsize'): current Matplotlib releases
# reject the mixed-case spelling 'Fontsize' as an unknown property.
plt.title('Distribution of execution times (median = %.1f hours)' % np.median(runtimes_hours), fontsize=20)
plt.xlabel('Hours', fontsize=16)
plt.ylabel('Nr of occurrences', fontsize=16)
plt.show()

## Hard-coded performance costs and requirements from Rosie's Excel sheet
### These were previously used in rev [3372fdd] to approximately replicate Rosie's results. Check (rerun) the notebook at that repository revision to regenerate those results - not repeated here.

In [None]:
# Per-letter lookup tables copied by hand from the Excel sheet; they should
# eventually be computed with the parametric model instead.

# Ingest rate of each HPSO letter, in TeraByte/s.
hpso_ingest_rates = dict(A=0.459, B=3e-3, C=0.117, D=0.112, E=0.0603, F=0.244, G=0.438)

# PetaFLOPs required to process one second of ingested data (PetaFLOP/s).
hpso_flopcounts = dict(A=50.4, B=2.0, C=7.5, D=6.2, E=2.9833, F=17.689, G=27.698)

# Duration of each HPSO letter, in hours.
hpso_durations = dict(A=6, B=0.17, C=6, D=6, E=4.4, F=0.1233, G=6)

# Minimum amount of time between processing tasks on the SDP (seconds).
sdp_setup_time = 60

# TODO is this correct?
telecope_setup_time = 0