# SDP system sizing

This notebook generates the base numbers that go into SDP system sizing. We use the list of high priority science objectives (HPSO) to derive the average SDP computational load and output data rate. 

In [1]:
import sys
from IPython.display import display, Markdown

sys.path.insert(0, "..")
from sdp_par_model import reports, config
from sdp_par_model.parameters.definitions import Telescopes, Pipelines, Constants, HPSOs

## Read HPSO performance characteristics

Loads the high priority science objective (HPSO) characteristics generated by the export notebook. This always picks up the latest file checked into Git.

In [2]:
# Find the available result CSVs, read the newest one and pass it through
# reports.strip_csv before any lookups are made against it.
csv = reports.strip_csv(
    reports.read_csv(
        reports.newest_csv(
            reports.find_csvs())))
def lookup(name, hpso):
    """Look up one named quantity for every pipeline of an HPSO.

    :param name: name of the quantity, passed through to reports.lookup_csv
        (e.g. 'Total Time')
    :param hpso: HPSO identifier; must be a key of HPSOs.hpso_pipelines
    :returns: dict mapping each pipeline of the HPSO to the looked-up value,
        converted to float
    """
    values = {}
    for pipeline in HPSOs.hpso_pipelines[hpso]:
        # The CSV is addressed by the description of the HPSO/pipeline configuration
        description = config.PipelineConfig(hpso=hpso, pipeline=pipeline).describe()
        values[pipeline] = float(reports.lookup_csv(csv, description, name))
    return values
# Total time per telescope: the sum of the Ingest 'Total Time' over all HPSOs
# assigned to that telescope. It is the denominator of each HPSO's time fraction.
total_time = {
    tel: sum(
        # Default to 0 when an HPSO has no Ingest entry, the same way the
        # per-HPSO 'Total Time' is read in the rest of this notebook; without
        # the default, sum() would fail on None.
        lookup('Total Time', hpso).get(Pipelines.Ingest, 0)
        for hpso in HPSOs.all_hpsos
        if HPSOs.hpso_telescopes[hpso] == tel)
    for tel in Telescopes.available_teles
}

## Compute requirements

Sum up computational requirements of HPSOs, weighted by allocated fraction of observation time. Note that HPSOs do not cover the whole breadth of what SKA SDP will be required to do, but we assume that the load is representative.

In [3]:
def make_compute_table(tel):
    """Build a Markdown table of compute requirements for one telescope.

    One row is produced per HPSO assigned to `tel` (in sorted order), showing
    its share of the telescope's total time, its observation time divided by
    3600 (the "Tobs [h]" column), the 'Total Compute Requirement' of each
    pipeline and the realtime / batch / overall sums. A closing row holds the
    time-weighted average of every column.

    :param tel: telescope to report on, compared against HPSOs.hpso_telescopes
    :returns: the complete table as one newline-joined Markdown string
    """
    header = ("| HPSO | Time [%] | Tobs [h] | Ingest [Pflop/s] | RCAL [Pflop/s] | FastImg [Pflop/s] | ICAL [Pflop/s] " +
              "| DPrepA [Pflop/s] | DPrepB [Pflop/s] | DPrepC [Pflop/s] | DPrepD [Pflop/s] " +
              "| Total RT [Pflop/s] | Total Batch [Pflop/s] | Total [Pflop/s] | ")
    # Joining one '|' per header pipe with '-' yields the "|-|-|...|" separator row
    lines = [header, "-".join("|" * header.count('|'))]

    # Time-weighted compute per pipeline, accumulated over the HPSOs below
    weighted = dict.fromkeys(Pipelines.available_pipelines, 0)
    # Pipelines that get a column of their own, in display order
    columns = [
        Pipelines.Ingest, Pipelines.RCAL, Pipelines.FastImg, Pipelines.ICAL,
        Pipelines.DPrepA, Pipelines.DPrepB, Pipelines.DPrepC, Pipelines.DPrepD,
    ]

    def realtime_part(per_pipeline):
        # Sum of the entries whose pipeline is listed in Pipelines.realtime
        return sum([value for pipeline, value in per_pipeline.items()
                    if pipeline in Pipelines.realtime])

    for hpso in sorted(HPSOs.all_hpsos):
        if HPSOs.hpso_telescopes[hpso] == tel:
            obs_time = lookup('Observation Time', hpso).get(Pipelines.Ingest, 0)
            exp_time = lookup('Total Time', hpso).get(Pipelines.Ingest, 0)
            per_pipeline = lookup('Total Compute Requirement', hpso)
            overall = sum(per_pipeline.values())
            realtime = realtime_part(per_pipeline)
            share = exp_time / total_time[tel]
            for pipeline, value in per_pipeline.items():
                weighted[pipeline] += share * value
            # Pipelines that this HPSO does not run are shown as '-'
            cells = []
            for pipeline in columns:
                if pipeline in per_pipeline:
                    cells.append("{:.2f}".format(per_pipeline[pipeline]))
                else:
                    cells.append('-')
            lines.append("|{}|{:.1f}|{:.1f}|{}|{}|{}|{}|{}|{}|{}|{}|{:.2f}|{:.2f}|{:.2f}|".format(
                hpso, share * 100, obs_time / 3600, *cells, realtime, overall - realtime, overall))

    average_cells = [weighted.get(pipeline, 0) for pipeline in columns]
    average_realtime = realtime_part(weighted)
    average_batch = sum([value for pipeline, value in weighted.items()
                         if pipeline not in Pipelines.realtime])
    average_overall = sum(weighted.values())
    lines.append("| **Average** | - | - | {:.2f}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|".format(
        *average_cells, average_realtime, average_batch, average_overall))
    return "\n".join(lines)
# Render the compute table of every telescope under its own Markdown heading.
for tel in Telescopes.available_teles:
    compute_markdown = "##### {}:\n\n".format(tel) + make_compute_table(tel)
    display(Markdown(compute_markdown))

##### SKA1_Low:

| HPSO | Time [%] | Tobs [h] | Ingest [Pflop/s] | RCAL [Pflop/s] | FastImg [Pflop/s] | ICAL [Pflop/s] | DPrepA [Pflop/s] | DPrepB [Pflop/s] | DPrepC [Pflop/s] | DPrepD [Pflop/s] | Total RT [Pflop/s] | Total Batch [Pflop/s] | Total [Pflop/s] | 
|-|-|-|-|-|-|-|-|-|-|-|-|-|-|
|hpso01|15.6|5.0|0.63|0.75|0.38|6.88|2.35|2.50|5.12|0.30|1.76|17.16|18.92|
|hpso02a|15.6|5.0|0.63|0.75|0.38|4.01|2.35|2.50|5.12|0.30|1.76|14.29|16.05|
|hpso02b|15.6|5.0|0.63|0.75|0.38|4.01|2.35|2.50|5.12|0.30|1.76|14.29|16.05|
|hpso04a|39.8|0.7|0.63|0.22|0.12|-|-|-|-|-|0.97|0.00|0.97|
|hpso05a|13.4|0.7|0.63|0.22|0.12|-|-|-|-|-|0.97|0.00|0.97|
| **Average** | - | - | 0.63|0.47|0.24|2.33|1.10|1.17|2.40|0.14|1.34|7.14|8.47|

##### SKA1_Mid:

| HPSO | Time [%] | Tobs [h] | Ingest [Pflop/s] | RCAL [Pflop/s] | FastImg [Pflop/s] | ICAL [Pflop/s] | DPrepA [Pflop/s] | DPrepB [Pflop/s] | DPrepC [Pflop/s] | DPrepD [Pflop/s] | Total RT [Pflop/s] | Total Batch [Pflop/s] | Total [Pflop/s] | 
|-|-|-|-|-|-|-|-|-|-|-|-|-|-|
|hpso04b|1.0|0.2|0.60|0.94|0.36|-|-|-|-|-|1.91|0.00|1.91|
|hpso04c|3.1|0.2|0.60|0.56|0.23|-|-|-|-|-|1.39|0.00|1.39|
|hpso05b|2.1|0.2|0.60|0.95|0.55|-|-|-|-|-|2.11|0.00|2.11|
|hpso13|6.5|8.0|0.14|0.04|0.02|0.40|0.14|0.13|0.48|-|0.20|1.16|1.36|
|hpso14|2.6|8.0|0.15|0.03|0.01|0.15|0.09|0.09|0.31|-|0.19|0.63|0.83|
|hpso15|16.5|4.4|0.08|0.01|0.00|0.04|0.03|0.03|0.09|-|0.10|0.19|0.28|
|hpso18|13.1|0.0|0.60|0.94|0.36|-|-|-|-|-|1.91|0.00|1.91|
|hpso22|7.9|8.0|0.60|0.75|0.38|6.80|2.99|3.08|-|-|1.74|12.88|14.62|
|hpso27and33|13.1|0.1|0.19|0.09|0.05|0.26|0.31|0.47|-|-|0.33|1.04|1.37|
|hpso32|13.1|2.2|0.12|0.09|0.04|0.24|-|0.29|-|-|0.25|0.53|0.78|
|hpso37a|13.1|3.8|0.60|0.88|0.39|5.44|3.43|3.57|-|-|1.88|12.45|14.33|
|hpso37b|2.6|8.0|0.60|0.88|0.39|7.96|3.43|3.57|-|-|1.88|14.96|16.84|
|hpso37c|2.6|8.0|0.60|0.88|0.39|7.96|3.43|3.57|-|-|1.88|14.96|16.84|
|hpso38a|1.3|8.0|0.60|0.77|0.43|5.24|3.38|3.50|-|-|1.80|12.12|13.92|
|hpso38b|1.3|8.0|0.60|0.77|0.43|6.93|3.38|3.50|-|-|1.80|13.81|15.61|
| **Average** | - | - | 0.36|0.44|0.20|1.92|1.01|1.10|0.06|0.00|1.00|4.09|5.09|

Note that these are minimum requirements; in reality we will need substantially more capacity to compensate for unavoidable inefficiencies in scheduling. See the scheduling notebook for a model of this.

## Data Product generation

Similar analysis, but for the data rate at which SDP might generate data products.

In [5]:
def make_data_table(tel, return_average=False):
    """Build a Markdown table of data product sizes and rates for one telescope.

    One row is produced per HPSO assigned to `tel` (in sorted order). The
    output rate of an HPSO is its summed 'Output size' divided by its
    observation time; the DPrepD share of it is reported separately as the
    visibility rate. A closing row holds the time-weighted average rates.

    :param tel: telescope to report on, compared against HPSOs.hpso_telescopes
    :param return_average: if true, return the time-weighted average total
        output rate instead of the table
    :returns: the table as one newline-joined Markdown string, or the average
        total rate (a number) when `return_average` is set
    """
    table = [ "| HPSO | Time [%] | Tobs [h] | Npix (side) | Channels (DPrepB) | Channels (DPrepC) | " +
              " Image size [GB] | Non-Vis Rate [Gbit/s] | " +
              " Visibility Size [TB] | Visibility Rate [Gbit/s] | Total Rate [Gbit/s] | "]
    # Joining one '|' per header pipe with '-' yields the "|-|-|...|" separator row
    table.append("-".join("|"*table[0].count('|')))
    vis_sum = 0
    output_sum = 0
    for hpso in sorted(HPSOs.all_hpsos):
        if HPSOs.hpso_telescopes[hpso] != tel:
            continue
        Tobs = lookup('Observation Time', hpso).get(Pipelines.Ingest,0)
        Texp = lookup('Total Time', hpso).get(Pipelines.Ingest,0)
        time_frac = Texp / total_time[tel]
        # Look the output sizes up once; both the total and the DPrepD share use them
        output_sizes = lookup('Output size', hpso)
        Mout = sum(output_sizes.values())
        Mvis = output_sizes.get(Pipelines.DPrepD,0)
        # Going by the column labels, sizes are in TB and rates in Gbit/s, so
        # 8000 would be 8 bit/byte times 1000 GB/TB.
        # NOTE(review): Tobs defaults to 0 when the HPSO has no Ingest entry,
        # in which case these divisions raise ZeroDivisionError.
        Rout = 8000 * Mout / Tobs
        Rvis = 8000 * Mvis / Tobs
        vis_sum += Rvis * time_frac
        output_sum += Rout * time_frac
        row = "|{}|{:.1f}|{:.2f}|".format(hpso,time_frac*100,Tobs/3600)
        # NOTE(review): all image columns are gated on DPrepA alone, so an HPSO
        # that runs DPrepB/DPrepC without DPrepA shows '-' for the channel
        # counts as well - confirm this is intended.
        if Pipelines.DPrepA in HPSOs.hpso_pipelines[hpso]:
            Npix = lookup('Image side length', hpso).get(Pipelines.DPrepA,0)
            Nchan_B = lookup('Channels out', hpso).get(Pipelines.DPrepB,0)
            Nchan_C = lookup('Channels out', hpso).get(Pipelines.DPrepC,0)
            Mimage = lookup('Image size', hpso).get(Pipelines.DPrepA,0)
            row += "{:.0f}|{:.0f}|{:.0f}|{:.1f}|{:.1f}|".format(
                Npix,Nchan_B,Nchan_C,Mimage,Rout-Rvis)
        else:
            row += "- | - | - | - |{:.1f}|".format(Rout-Rvis)
        if Pipelines.DPrepD in HPSOs.hpso_pipelines[hpso]:
            row += "{:.1f}|{:.1f}|".format(Mvis, Rvis)
        else:
            row += "-|-|"
        row += "{:.1f}|".format(Rout)
        table.append(row)
    table.append("| **Average** | - | - | - | - | - | - | {:.1f} | - |{:.1f}|{:.1f}|".format(
        output_sum-vis_sum, vis_sum, output_sum))
    if return_average:
        return output_sum
    return "\n".join(table)
# Render the data product table of every telescope under its own Markdown heading.
for tel in Telescopes.available_teles:
    data_markdown = "##### {}:\n\n".format(tel) + make_data_table(tel)
    display(Markdown(data_markdown))

##### SKA1_Low:

| HPSO | Time [%] | Tobs [h] | Npix (side) | Channels (DPrepB) | Channels (DPrepC) |  Image size [GB] | Non-Vis Rate [Gbit/s] |  Visibility Size [TB] | Visibility Rate [Gbit/s] | Total Rate [Gbit/s] | 
|-|-|-|-|-|-|-|-|-|-|-|
|hpso01|15.6|5.00|18344|500|1500|2.7|8.5|205.8|91.4|99.9|
|hpso02a|15.6|5.00|18344|500|1500|2.7|8.5|205.8|91.4|99.9|
|hpso02b|15.6|5.00|18344|500|1500|2.7|8.5|205.8|91.4|99.9|
|hpso04a|39.8|0.67|- | - | - | - |0.7|-|-|0.7|
|hpso05a|13.4|0.67|- | - | - | - |2.6|-|-|2.6|
| **Average** | - | - | - | - | - | - | 4.6 | - |42.8|47.4|

##### SKA1_Mid:

| HPSO | Time [%] | Tobs [h] | Npix (side) | Channels (DPrepB) | Channels (DPrepC) |  Image size [GB] | Non-Vis Rate [Gbit/s] |  Visibility Size [TB] | Visibility Rate [Gbit/s] | Total Rate [Gbit/s] | 
|-|-|-|-|-|-|-|-|-|-|-|
|hpso04b|1.0|0.17|- | - | - | - |2.3|-|-|2.3|
|hpso04c|3.1|0.17|- | - | - | - |2.3|-|-|2.3|
|hpso05b|2.1|0.25|- | - | - | - |6.9|-|-|6.9|
|hpso13|6.5|8.00|25339|160|3200|5.1|4.2|-|-|4.2|
|hpso14|2.6|8.00|18814|300|5000|2.8|2.8|-|-|2.8|
|hpso15|16.5|4.40|10837|260|2500|0.9|0.8|-|-|0.8|
|hpso18|13.1|0.02|- | - | - | - |0.1|-|-|0.1|
|hpso22|7.9|8.00|110601|1000|0|97.9|48.1|-|-|48.1|
|hpso27and33|13.1|0.12|23549|700|0|4.4|99.3|-|-|99.3|
|hpso32|13.1|2.20|- | - | - | - |1.3|-|-|1.3|
|hpso37a|13.1|3.80|94195|700|0|71.0|60.6|-|-|60.6|
|hpso37b|2.6|8.00|94195|700|0|71.0|28.8|-|-|28.8|
|hpso37c|2.6|8.00|94195|700|0|71.0|28.8|-|-|28.8|
|hpso38a|1.3|8.00|113204|1000|0|102.5|50.4|-|-|50.4|
|hpso38b|1.3|8.00|113204|1000|0|102.5|50.4|-|-|50.4|
| **Average** | - | - | - | - | - | - | 28.4 | - |0.0|28.4|

Note that this assumes that we manage to produce usable data at all times.

We can make a similar table where we sum the weighted contributions of all HPSOs per pipeline:

In [6]:
def mk_projection(rate):
    """Format a data rate and the growth it implies as Markdown table cells.

    :param rate: data rate in Gbit/s
    :returns: four '|'-separated cells: the rate itself, then the growth in
        TB/day, PB/year and PB per five years
    """
    day_rate = rate * 3600 * 24 / 8 / 1000 # TB/day
    year_rate = day_rate * 365 / 1000 # PB/year
    return "{:.2f}|{:.1f}|{:.1f}|{:.1f}".format(rate, day_rate, year_rate, 5*year_rate)

table = [ "| Telescope | Pipeline | Data Rate [Gbit/s] | Daily Growth [TB/day] | Yearly Growth [PB/year] | 5-year Growth [PB/(5 year)] |"]
# Joining one '|' per header pipe with '-' yields the "|-|-|...|" separator row
table.append("-".join("|"*table[0].count('|')))
total_data_rate = 0
for tel in Telescopes.available_teles:
    # Times and output sizes of an HPSO do not depend on the pipeline, so look
    # them up once per telescope instead of once per (pipeline, HPSO) pair.
    hpso_inputs = []
    for hpso in HPSOs.all_hpsos:
        if HPSOs.hpso_telescopes[hpso] != tel:
            continue
        hpso_inputs.append((
            hpso,
            lookup('Total Time', hpso).get(Pipelines.Ingest,0),
            lookup('Observation Time', hpso).get(Pipelines.Ingest,0),
            lookup('Output size', hpso)))
    subtotal_data_rate = 0
    for pip in Pipelines.all:
        # Time-weighted output rate of this pipeline over the telescope's HPSOs
        data_rate = 0
        for hpso, Texp, Tobs, output_sizes in hpso_inputs:
            if pip in HPSOs.hpso_pipelines[hpso]:
                Mout = output_sizes.get(pip)
                Rout = 8000 * Mout / Tobs
                time_frac = Texp / total_time[tel]
                data_rate += Rout * time_frac
        # Pipelines that produce no output for this telescope get no row
        if data_rate > 0:
            table.append("|{}|{}|{}|".format(tel, pip, mk_projection(data_rate)))
            subtotal_data_rate += data_rate
            total_data_rate += data_rate
    table.append("|**{}**|**Sub-Total**|{}|".format(tel, mk_projection(subtotal_data_rate)))
table.append("|&nbsp;|**Total**|{}|".format(mk_projection(total_data_rate)))
display(Markdown("\n".join(table)))

| Telescope | Pipeline | Data Rate [Gbit/s] | Daily Growth [TB/day] | Yearly Growth [PB/year] | 5-year Growth [PB/(5 year)] |
|-|-|-|-|-|-|
|SKA1_Low|DPrepA|0.02|0.2|0.1|0.4|
|SKA1_Low|DPrepB|0.99|10.7|3.9|19.5|
|SKA1_Low|DPrepC|2.96|32.0|11.7|58.4|
|SKA1_Low|DPrepD|42.80|462.2|168.7|843.6|
|SKA1_Low|PSS|0.28|3.1|1.1|5.6|
|SKA1_Low|PST|0.35|3.7|1.4|6.8|
|**SKA1_Low**|**Sub-Total**|47.40|511.9|186.9|934.3|
|SKA1_Mid|DPrepA|0.15|1.6|0.6|3.0|
|SKA1_Mid|DPrepB|27.59|298.0|108.8|543.8|
|SKA1_Mid|DPrepC|0.45|4.9|1.8|8.9|
|SKA1_Mid|PSS|0.10|1.0|0.4|1.9|
|SKA1_Mid|PST|0.14|1.6|0.6|2.8|
|SKA1_Mid|SinglePulse|0.01|0.1|0.0|0.1|
|**SKA1_Mid**|**Sub-Total**|28.44|307.1|112.1|560.5|
|&nbsp;|**Total**|75.84|819.1|299.0|1494.8|

This gives a good idea of how the archive will need to grow, and what types of data products it will contain. For instance, DPrepD will produce visibility data products (mostly for EoR search) that might eventually be discarded after delivery.