# SDP Parametric Model Data Export

(Last tested with Jupyter Notebook 5.7.4 using Python 3.7.2 - if you have trouble, check version compatibility)

In [None]:
from __future__ import print_function
import functools
import sys
import os
import re
from ipywidgets import interact_manual, Select, SelectMultiple
from IPython.display import display, Markdown
sys.path+=['..']
from sdp_par_model import reports
from sdp_par_model.parameters.definitions import *

import subprocess, string
# Commit date (first field of git's %ci output) and abbreviated hash of the
# checked-out commit. Both become part of the generated CSV file names below.
git_date = subprocess.check_output(["git", "log", "-1", "--format=%ci"]).split()[0].decode()
git_rev = subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]).split()[0].decode()

# Check for changes in Python. We assume notebooks and other files don't matter for the result.
# Porcelain lines have the form "XY path": X is the status in the index (staged), Y the status
# in the working tree. A file counts as changed if either column reports a modification, so
# edits that are staged but not yet committed also mark the revision as unclean.
changed_python = [ line[3:]
   for line in subprocess.check_output(["git", "status", "--porcelain"]).decode().split('\n')
   if 'M' in line[:2] and line.endswith('.py')]
if changed_python:
    print("Unclean Git revision %s from %s (files changes: %s)" % (git_rev, git_date, ",".join(changed_python)))
    # Tag the revision so results from a modified tree are not mistaken for the clean commit
    git_rev += "_HEAD"
else:
    print("Clean Git revision %s from %s" % (git_rev, git_date))

# Directory for generated CSV files. The trailing '' makes the path end in a
# separator, so file names can simply be appended to it.
csv_path = os.path.join('..','data','csv','')
if not os.path.isdir(csv_path):
    os.makedirs(csv_path)
output_pipelines_filename = csv_path + git_date + "-" + git_rev + "_pipelines.csv"
output_hpsos_filename = csv_path + git_date + "-" + git_rev + "_hpsos.csv"

# Directory for the HTML comparison reports written further down
out_path = "out"
if not os.path.isdir(out_path):
    os.makedirs(out_path)

In [None]:
# Degree of parallelism passed to the CSV writers below. It stays at 0 unless
# both imports and the CPU count query succeed.
parallel = 0
try:
    # pymp is imported first: if it is missing we fall straight through to
    # the handler and keep parallel at 0.
    import pymp
    import multiprocessing
    parallel = multiprocessing.cpu_count()
    print("Using parallelism degree of %d" % (parallel,))
except Exception as exc:
    print("Not utilising parallelism (%s)" % (exc,))

## Generate CSV data for HPSOs and Pipelines

Equivalent to leaving all HPSOs/pipelines selected in the interactive export cells further below. This might take a bit of time!

In [None]:
# Write the HPSO table for every available HPSO to the revision-named CSV file
reports.write_csv_hpsos(
    output_hpsos_filename,
    HPSOs.available_hpsos,
    parallel=parallel,
    verbose=False,
)

In [None]:
# Write the pipeline table for all available telescopes, bands and pipelines
# to the revision-named CSV file
reports.write_csv_pipelines(
    output_pipelines_filename,
    Telescopes.available_teles,
    Bands.available_bands,
    Pipelines.available_pipelines,
    parallel=parallel,
    verbose=False,
)

## Calculation results can be exported as CSV

These files can be opened by standard spreadsheet programs such as LibreOffice Calc or Microsoft Excel.

In [None]:
# Multi-select list of HPSOs: options are shown sorted, everything is pre-selected
hpso_choice = SelectMultiple(
    options=sorted(tuple(HPSOs.available_hpsos)),
    value=tuple(HPSOs.available_hpsos),
)
# Manual-trigger form around write_csv_hpsos; `parallel` is offered as the
# range 0..32 in steps of 1. The trailing semicolon suppresses the cell output.
interact_manual(
    reports.write_csv_hpsos,
    filename=output_hpsos_filename,
    hpsos=hpso_choice,
    parallel=(0,32,1),
);

In [None]:
# One multi-select list per dimension: options are shown sorted, everything is pre-selected
telescope_choice = SelectMultiple(
    options=sorted(tuple(Telescopes.available_teles)),
    value=tuple(Telescopes.available_teles),
)
band_choice = SelectMultiple(
    options=sorted(tuple(Bands.available_bands)),
    value=tuple(Bands.available_bands),
)
pipeline_choice = SelectMultiple(
    options=sorted(tuple(Pipelines.available_pipelines)),
    value=tuple(Pipelines.available_pipelines),
)
# Manual-trigger form around write_csv_pipelines; `parallel` is offered as the
# range 0..32 in steps of 1. The trailing semicolon suppresses the cell output.
interact_manual(
    reports.write_csv_pipelines,
    filename=output_pipelines_filename,
    telescopes=telescope_choice,
    bands=band_choice,
    pipelines=pipeline_choice,
    parallel=(0,32,1),
);

## CSV tables can be compared to track changes

Note that you might have to re-execute the cell if freshly generated CSV files don't appear in the list. The reference file selected by default is the last one that was checked into version control.

In [None]:
# All CSV files in the CSV directory, most recently modified first
ls = [ csv_path+"%s" % n for n in os.listdir(csv_path) if n.endswith(".csv") ]
ls = sorted(ls, key=lambda n: os.stat(n).st_mtime)
results = list(reversed(ls))

# Paths of every file below csv_path that appears in the Git log. They are re-based
# relative to "iPython" so that they are spelled like the entries of `results`.
# NOTE(review): this presumes the notebook is run from the repository's iPython directory.
refs = subprocess.check_output(["git", "log", "--pretty=format:", "--name-only", csv_path]).split()
refs = [os.path.relpath(r.decode(), "iPython") for r in refs]
result_refs = [ r for r in results if r in refs ]

if results:
    # Default reference: the most recently modified CSV known to Git. Fall back to the
    # newest CSV overall when none has been checked in yet, instead of failing with an
    # IndexError.
    default_ref = result_refs[0] if result_refs else results[0]
    interact_manual(reports.compare_csv,
                    result_file=Select(options=results, value=results[0]),
                    ref_file=Select(options=results, value=default_ref))
else:
    # Nothing to offer in the selection lists - say so rather than raising an IndexError
    print("No CSV files found in %s - generate some first, then re-execute this cell." % csv_path)

## Compare *all* pipelines and HPSO versions

We go back through the Git history and compare the versions pairwise. We start with some routines for interacting with Git:

In [None]:
def get_ref(file):
    """ Identify Git revision that corresponds to the given historical file.

    Resolution order:

    1. The CSV files generated by this notebook run map to "HEAD".
    2. If the file name contains a hexadecimal run between "-" and "_",
       that run is taken to be the revision.
    3. Otherwise Git is asked for the commits that touched the file and
       the last line of the log output is used, i.e. the revision where
       the file was checked in.

    :param file: Path of the CSV file
    :returns: Revision string, or '' if Git reports no commits for the
       file. get_log() treats '' as "revision unknown".
    """
    if file in [output_pipelines_filename, output_hpsos_filename]:
        return "HEAD"
    # Has a git revision in the name?
    match = re.search(r"-([0-9A-Fa-f]+)_", file)
    if match is not None:
        return match.group(1)
    # Otherwise use the revision where the CSV file was checked in
    out = subprocess.check_output(["git", "log", "--follow", "--format=%h", file]).decode().split('\n')
    # Drop blank lines (the output ends with a newline) so that the last
    # entry is an actual revision
    revisions = [l for l in out if l != '']
    # A file without any history yields no revisions at all; report that
    # as '' instead of raising an IndexError
    return revisions[-1] if revisions else ''
def get_log(file0, file1):
    """ Return the Git log between the revisions of two CSV files.

    Both files are resolved to revisions using get_ref().

    :param file0: CSV file marking the start of the revision range (excluded)
    :param file1: CSV file marking the end of the revision range
    :returns: Output of "git log <ref0>..<ref1>" with one
       "hash date committer: subject" line per commit, or '' if
       either revision is the empty string
    """
    revs = [get_ref(file0), get_ref(file1)]
    if '' in revs:
        return ''
    rev_range = revs[0] + ".." + revs[1]
    log_bytes = subprocess.check_output(["git", "log", rev_range, '--format=%h %ci %cn: %s'])
    return log_bytes.decode()
def cmp(file0, file1):
    """ Three-way comparison of two CSV files by their Git revisions.

    :param file0: First CSV file
    :param file1: Second CSV file
    :returns: 0 if both files resolve to the same revision, 1 if the
       Git log from the revision of file0 to the revision of file1
       is non-empty, -1 otherwise
    """
    if get_ref(file0) == get_ref(file1):
        return 0
    return 1 if get_log(file0, file1) != '' else -1

Finally, compare each CSV file against its closest parent:

In [None]:
ref_map = reports.find_csvs()
csvsCurrent = True
for typ in ['hpsos', 'pipelines']:

    # Known CSVs of this type. A file generated by this notebook run goes to the front
    # as the 'HEAD' entry, provided it exists on disk and is of the right type.
    suffix = "_%s.csv" % typ
    my_refs = [entry for entry in ref_map.items() if entry[1].endswith(suffix)]
    for current in (output_hpsos_filename, output_pipelines_filename):
        if os.path.isfile(current) and current.endswith(suffix):
            my_refs.insert(0, (('HEAD', typ), current))

    # Order all CSVs by Git history first. The check-ins should already be in roughly the
    # right order, but the few exceptions are bothersome enough to spend a few seconds here.
    print('Sorting...')
    by_history = functools.cmp_to_key(lambda n1, n2: cmp(n1[1], n2[1]))
    my_refs = sorted(my_refs, key=by_history)

    # Compare every CSV with the newest CSV of its parent revision, tracking changes
    for i, ((rev,_), results) in enumerate(my_refs):
        reference = reports.newest_csv(ref_map, typ, rev+'^')
        if reference is None:
            continue

        display(Markdown("### Comparing %s with %s" % (results, reference)))
        html_path = os.path.join(out_path,"compare_%s_%d.html" % (typ,i))
        diff = reports.compare_csv(results, reference, return_diffs=True,
                                   export_html=html_path)

        # Only differences in files generated by this run can invalidate the check-in
        is_current = results in [output_hpsos_filename, output_pipelines_filename]
        for row, me,mi,ma in diff:
            if 'Total Compute' in row:
                display(Markdown("%s: mean %+.5f%% (min %+.5f%% max %+.5f%%)" % (row, me,mi,ma)))
                if (abs(mi) > 1e-6 or abs(ma) > 1e-6) and is_current:
                    display(Markdown('### <span style="color:red">Result mismatch. Commit %s, or Jenkins will complain!</span>'
                                     % results))
                    csvsCurrent = False
        print(get_log(reference, results))

We always want to keep the current numbers under source control for easy comparisons. Therefore, the CSV file checked into Git should exactly match the current results.

In [None]:
# Raise explicitly instead of using `assert`: assert statements are removed when Python
# runs with -O, which would silently disable this check.
if not csvsCurrent:
    raise AssertionError("Last checked in CSV does not match current results!")