This notebook supports the analysis of the results from the first application of the value correction.
The research questions are:

- Is there a correlation between degradation and overshooting of predictions?
- Should the restricted area be calculated using the truth@t-1 and truth@t-2 instead of truth@t-1 and pred@t-1?
- Should the restricted area depend on the length of the runs of 1s/0s in the prediction?
- How is the degradation distributed?

- How big is the influence of the correction if the restriction is too big?
- How to calculate beta and parts of the correction?
- How to consider dimension jumps in the correction?

## Reading log files

In [None]:
from glob import glob
import pandas as pd
from io import StringIO
import numpy as np
from collections import namedtuple

Read the data from the log files into pandas `Panel` objects: one panel per correction method, with one item (a DataFrame) per log file.

In [None]:
# Names of the correction methods to analyse; get_panel maps each one to a
# log-file suffix.
methods = [
    "Delta2PowerTwo",
    "PreviousError",
]

In [None]:
def read_file(filename, methodshorthand):
    """Parse one correction log file into a DataFrame.

    Every kept line has its first comma-separated field dropped; the next
    field becomes the index (named "index") and the remaining six fields
    become the data columns.

    Lines without a comma are skipped, as are lines containing the
    method-specific marker word ("offset" for "prer", "restricted" for
    "d2p2") -- presumably the column-header lines of the log.

    Parameters
    ----------
    filename : str
        Path of the log file to read.
    methodshorthand : str
        Either "prer" or "d2p2"; selects the marker word and column names.

    Returns
    -------
    pandas.DataFrame
        One row per kept log line.

    Raises
    ------
    ValueError
        If ``methodshorthand`` is not a known shorthand (previously this
        silently returned None).
    """
    # shorthand -> (marker word of lines to skip, names of the six data columns)
    layouts = {
        "prer": ("offset",
                 ["uncorrected", "corrected", "truth", "overshot", "offset", "empty"]),
        "d2p2": ("restricted",
                 ["uncorrected", "corrected", "truth", "overshot", "restricted", "delta"]),
    }
    if methodshorthand not in layouts:
        raise ValueError(
            "unknown method shorthand {!r}; expected one of {}".format(
                methodshorthand, sorted(layouts)))
    skip_marker, column_names = layouts[methodshorthand]

    # Only the filtering needs the file handle; parsing happens after it is closed.
    with open(filename, 'r') as f:
        payload = [line.split(',', 1)[1] for line in f
                   if "," in line and skip_marker not in line]

    df = pd.read_csv(StringIO("".join(payload)), index_col=0, header=None)
    df.columns = column_names
    df.index.name = "index"
    return df

def get_panel(method):
    """Load every log file of one correction method into a pandas Panel.

    ``method`` must be "PreviousError" or "Delta2PowerTwo" (a KeyError is
    raised otherwise). All files in the current directory matching
    ``./*<shorthand>.log`` are read with ``read_file``, in sorted order, and
    become the items of the returned panel, keyed by their path.

    NOTE(review): ``pd.Panel`` was removed from pandas in 0.25, so this
    requires an older pandas release. Callers index the result Panel-style
    (``panel[fname, :, column]`` and ``panel.items``).
    """
    shorthand = {"PreviousError": "prer", "Delta2PowerTwo": "d2p2"}[method]
    frames = {}
    for fname in sorted(glob('./*{}.log'.format(shorthand))):
        frames[fname] = read_file(fname, shorthand)
    return pd.Panel(frames)

## Actual processing of the data

In [None]:
def lzc(val, bits=32):
    """Count the leading zero bits of ``val`` in a ``bits``-wide word.

    Bit positions ``bits - 1`` down to 0 are tested in turn and counting
    stops at the first set bit. If none of them is set, ``bits`` is returned.
    """
    leading = 0
    for shift in range(bits - 1, -1, -1):
        if (val & (1 << shift)) != 0:
            break
        leading += 1
    return leading


# Element-wise wrapper around lzc: two inputs (value, bit width), one output.
lzcu = np.frompyfunc(lzc, 2, 1)

### Correlation of overshooting and performance

In [None]:
def calculate_performance_correlation_with_overshooting(filename, panel):
    """Correlate the "overshot" flag with the outcome of the correction.

    For the log ``filename`` in ``panel``, the XOR residuals of the
    uncorrected and corrected predictions against the truth are computed. A
    row counts as "better" when the corrected residual has more leading
    zeros (32-bit LZC) than the uncorrected one, "worse" when it has fewer
    and "same" when both counts are equal.

    Returns
    -------
    CPOS namedtuple
        ``fname``; the number of ``better``/``worse``/``same`` rows; and the
        correlation coefficients ``bcorr``/``wcorr``/``scorr`` between the
        boolean "overshot" column and the respective outcome mask.
    """
    frame = panel[filename,:,:]
    truth = frame['truth']

    # lzcu runs a Python-level function per element, so evaluate it exactly
    # once per residual and reuse the result for all three comparisons.
    uncorrected_lzc = lzcu(np.bitwise_xor(frame['uncorrected'], truth), 32)
    corrected_lzc = lzcu(np.bitwise_xor(frame['corrected'], truth), 32)

    better_cases = corrected_lzc > uncorrected_lzc
    same_cases = corrected_lzc == uncorrected_lzc
    worse_cases = corrected_lzc < uncorrected_lzc

    overshooting = panel[filename,:,"overshot"].astype(bool)
    # Off-diagonal entry of the 2x2 correlation matrix.
    b_correlation = np.corrcoef(overshooting, better_cases.astype(bool))[0,1]
    s_correlation = np.corrcoef(overshooting, same_cases.astype(bool))[0,1]
    w_correlation = np.corrcoef(overshooting, worse_cases.astype(bool))[0,1]

    correlation_performance_overshooting = namedtuple("CPOS", "fname,better,worse,same,bcorr,wcorr,scorr")
    return correlation_performance_overshooting(filename,better_cases.sum(),worse_cases.sum(),same_cases.sum(),
                                                b_correlation,w_correlation, s_correlation)

In [None]:
# Absolute correlation above which a coefficient is flagged as True in the report.
threshold = .1
for m in methods:
    panel = get_panel(m)
    all_performances = [calculate_performance_correlation_with_overshooting(x, panel) for x in panel.items]
    print(m)
    # Plain loop: the printing is a side effect, so there is no list worth keeping.
    for perf in all_performances:
        if np.isnan(perf.bcorr):
            # No usable "better" correlation for this file: list its name only.
            print("{:60}".format(perf.fname))
            continue
        print("{:60} {:+.2f} {:+.2f} {:+.2f}     {} {} {}".format(
            perf.fname, perf.bcorr, perf.wcorr, perf.scorr,
            abs(perf.bcorr) > threshold, abs(perf.wcorr) > threshold, abs(perf.scorr) > threshold))

### Is the correction an actual improvement of the prediction (one that is simply not reflected in the LZC)?

In [None]:
# Path of one PreviousError ("prer") log file.
# NOTE(review): not referenced by the code visible in this section -- confirm
# whether a later cell relies on it.
filename = "./icon.pl.qv.f32.little.4x47x351x901_2.prer.log"

def calculate_absolute_performance(filename, panel):
    """Compare corrected and uncorrected predictions by absolute difference to the truth.

    For the log ``filename`` in ``panel`` the element-wise absolute
    differences ``|corrected - truth|`` and ``|uncorrected - truth|`` are
    computed and compared row by row.

    Returns
    -------
    CAP namedtuple
        ``fname`` plus the fraction of rows where the corrected value is
        closer to the truth (``better``), farther from it (``worse``) or
        equally far (``same``).
    """
    truth = panel[filename, :, "truth"]
    error_corrected = (panel[filename, :, "corrected"] - truth).abs()
    error_uncorrected = (panel[filename, :, "uncorrected"] - truth).abs()

    absolute_performance = namedtuple("CAP", "fname,better,worse,same")
    return absolute_performance(
        fname=filename,
        better=(error_corrected < error_uncorrected).mean(),
        worse=(error_corrected > error_uncorrected).mean(),
        same=(error_corrected == error_uncorrected).mean(),
    )

for m in methods:
    panel = get_panel(m)
    all_performances = [calculate_absolute_performance(x, panel) for x in panel.items]
    print(m)
    # Plain loop: the printing is a side effect, so there is no list worth keeping.
    for perf in all_performances:
        # Last column is True when at most half of the rows got worse.
        print("{:60} {:03.2f} {:03.2f} {:03.2f}     {}".format(
            perf.fname, perf.better, perf.worse, perf.same, perf.worse <= .5))

### Correlation/Relationship of calculated restricted area and actual restricted area