This notebook should help with the analysis of the results from the first application of the correction of values.
The research questions are:

- Is there a correlation between degradation and overshooting of predictions?
- Should the restricted area be calculated using the truth@t-1 and truth@t-2 instead of truth@t-1 and pred@t-1?
- Should the restricted area depend on the length of the runs of 1s/0s in the prediction?
- How is the degradation distributed?

- How big is the influence of the correction if the restriction is too big?
- How to calculate beta and parts of the correction?
- How to consider dimension jumps in the correction?

## Reading log files

In [None]:
from glob import glob
import pandas as pd
from io import StringIO
import numpy as np
from collections import namedtuple
from matplotlib import pyplot as plt
%matplotlib inline

Reading in the data from the log files as pandas panel objects.

In [None]:
# Method names analysed below; get_panel maps each one to a log-file shorthand.
methods = [
    "Delta2PowerTwo",
    "PreviousError",
]

In [None]:
def read_file(filename, methodshorthand):
    """Load one correction log file into a DataFrame.

    Only lines that contain a comma are considered. Everything up to and
    including the first comma of such a line is discarded; the remainder is
    parsed as header-less CSV whose first field becomes the index. Lines that
    contain the method's marker word ("offset" for "prer", "restricted" for
    "d2p2") are skipped.

    Parameters
    ----------
    filename : str
        Path of the log file to read.
    methodshorthand : str
        Either "prer" or "d2p2"; selects the skipped marker word and the
        column names.

    Returns
    -------
    pandas.DataFrame
        Frame with six named columns and an index named "index".

    Raises
    ------
    ValueError
        If ``methodshorthand`` is not one of the known shorthands.
    """
    # shorthand -> (marker word of lines to skip, column names of the payload)
    layouts = {
        "prer": ("offset",
                 ["uncorrected", "corrected", "truth", "overshot", "offset", "empty"]),
        "d2p2": ("restricted",
                 ["uncorrected", "corrected", "truth", "overshot", "restricted", "delta"]),
    }
    if methodshorthand not in layouts:
        raise ValueError(
            "unknown method shorthand {!r}; expected one of {}".format(
                methodshorthand, sorted(layouts)))
    skip_marker, columns = layouts[methodshorthand]

    with open(filename, 'r') as f:
        # Drop the prefix before the first comma; keep only data lines.
        payload = [line.split(',', 1)[1] for line in f
                   if "," in line and skip_marker not in line]

    df = pd.read_csv(StringIO("".join(payload)), index_col=0, header=None)
    df.columns = columns
    df.index.name = "index"
    return df

def get_panel(method):
    """Collect all log files of one method into a single ``pd.Panel``.

    ``method`` must be "PreviousError" or "Delta2PowerTwo" (any other value
    raises ``KeyError``). Every file in the current directory matching
    ``./*<shorthand>.log`` is read with ``read_file``; the panel items are the
    file names in sorted order.

    NOTE(review): ``pd.Panel`` is not available in recent pandas releases
    (removed in 0.25) - confirm the pandas version this notebook is pinned to.
    """
    shorthand = {"PreviousError": "prer", "Delta2PowerTwo": "d2p2"}[method]
    frames = {}
    for path in sorted(glob('./*{}.log'.format(shorthand))):
        frames[path] = read_file(path, shorthand)
    return pd.Panel(frames)

## Actual processing of the data

In [None]:
def lzc(val, bits=32):
    """Return the number of leading zero bits of ``val`` in a ``bits``-wide word.

    Bit positions are inspected from ``bits - 1`` down to 0 and the scan stops
    at the first set bit. The result is ``bits`` when none of those bits is set.
    """
    leading = 0
    # Only `&` and `<<` are used so that numpy integer scalars work as well.
    while leading < bits and not val & (1 << (bits - 1 - leading)):
        leading += 1
    return leading


# Element-wise wrapper around lzc: two inputs (value, bit width), one output.
lzcu = np.frompyfunc(lzc, 2, 1)

### Correlation of overshooting and performance

In [None]:
def calculate_performance_correlation_with_overshooting(filename, panel):
    """Relate the LZC effect of the correction to the "overshot" flag of one file.

    The residual of a prediction is its bitwise XOR with the truth. A row is
    "better" when the corrected residual has more leading zeros than the
    uncorrected one, "worse" when it has fewer and "same" when both are equal.

    Parameters
    ----------
    filename : str
        Item of ``panel`` to analyse.
    panel : pd.Panel
        Panel providing the columns "uncorrected", "corrected", "truth" and
        "overshot" for ``filename``.

    Returns
    -------
    CPOS (namedtuple)
        ``fname``; the counts ``better``, ``worse``, ``same``; and the
        correlation coefficients ``bcorr``, ``wcorr``, ``scorr`` between the
        boolean "overshot" column and the respective boolean case mask.
    """
    log_frame = panel[filename,:,:]
    truth = log_frame['truth']

    # lzcu calls a Python function per element, so evaluate it only once per
    # residual instead of once per comparison.
    uncorrected_lzc = lzcu(np.bitwise_xor(log_frame['uncorrected'], truth), 32)
    corrected_lzc = lzcu(np.bitwise_xor(log_frame['corrected'], truth), 32)

    better_cases = corrected_lzc > uncorrected_lzc
    same_cases = corrected_lzc == uncorrected_lzc
    worse_cases = corrected_lzc < uncorrected_lzc

    overshooting = panel[filename,:,"overshot"].astype(bool)
    b_correlation = np.corrcoef(overshooting, better_cases.astype(bool))[0,1]
    s_correlation = np.corrcoef(overshooting, same_cases.astype(bool))[0,1]
    w_correlation = np.corrcoef(overshooting, worse_cases.astype(bool))[0,1]

    correlation_performance_overshooting = namedtuple("CPOS", "fname,better,worse,same,bcorr,wcorr,scorr")
    return correlation_performance_overshooting(filename,better_cases.sum(),worse_cases.sum(),same_cases.sum(),
                                                b_correlation,w_correlation, s_correlation)

In [None]:
# A correlation counts as noteworthy when its absolute value exceeds this.
threshold = .1
correlation_row = "{:60} {:+.2f} {:+.2f} {:+.2f}     {} {} {}"

for m in methods:
    panel = get_panel(m)
    all_performances = [calculate_performance_correlation_with_overshooting(x, panel) for x in panel.items]
    print(m)
    # Plain loop: the rows are printed for their side effect only, so there is
    # no list worth building.
    for perf in all_performances:
        if np.isnan(perf.bcorr):
            # No usable correlation for this file: print the file name only.
            print("{:60}".format(perf.fname))
            continue
        print(correlation_row.format(perf.fname, perf.bcorr, perf.wcorr, perf.scorr,
                                     abs(perf.bcorr) > threshold,
                                     abs(perf.wcorr) > threshold,
                                     abs(perf.scorr) > threshold))

### Is the correction an actual improvement of the prediction (which simply is not represented in the LZC)

In [None]:
# Path of one "prer" log file; the code visible in this cell does not read it
# (the function parameter below shadows the name).
filename = './icon.pl.qv.f32.little.4x47x351x901_2.prer.log'

def calculate_absolute_performance(filename, panel):
    """Share of rows in which the correction moved the prediction closer to the truth.

    The absolute differences |corrected - truth| and |uncorrected - truth| are
    compared row by row for the item ``filename`` of ``panel``.

    Returns a ``CAP`` namedtuple ``(fname, better, worse, same)`` whose last
    three fields are the fractions of rows where the corrected difference is
    smaller than, larger than, or equal to the uncorrected one.
    """
    truth = panel[filename, :, "truth"]
    corrected_error = (panel[filename, :, "corrected"] - truth).abs()
    uncorrected_error = (panel[filename, :, "uncorrected"] - truth).abs()

    shares = namedtuple("CAP", "fname,better,worse,same")
    return shares(
        fname=filename,
        better=(corrected_error < uncorrected_error).mean(),
        worse=(corrected_error > uncorrected_error).mean(),
        same=(corrected_error == uncorrected_error).mean(),
    )

share_row = "{:60} {:03.2f} {:03.2f} {:03.2f}     {:>6}  {:>6}"

for m in methods:
    panel = get_panel(m)
    all_performances = [calculate_absolute_performance(x, panel) for x in panel.items]
    print(m)
    print("{:60} {:>4} {:>4} {:>4}      better  not worse".format("file","b","w","s"))
    # Plain loop: the rows are printed for their side effect only, so there is
    # no list worth building.
    for perf in all_performances:
        # "better": at least half of the rows improved;
        # "not worse": at least half of the rows improved or stayed equal.
        print(share_row.format(perf.fname, perf.better, perf.worse, perf.same,
                               str(perf.better >= .5),
                               str(perf.better + perf.same >= .5)))

### What is the "average" range of LZCs back to back of each other?

In [None]:
percentile_levels = [90, 95, 99]

# Defined up front so the plotting code below works even if no method is processed.
ninties = nintyfives = nintynines = np.zeros(0)

for m in methods:
    panel = get_panel(m)
    print(m)
    # Size the result arrays from the panel that was just loaded, not from a
    # `panel` left behind by an earlier cell: the methods may have different
    # numbers of log files.
    ninties = np.zeros(panel.items.size)
    nintyfives = np.zeros(panel.items.size)
    nintynines = np.zeros(panel.items.size)
    for i,item in enumerate(panel.items):
        df = panel.loc[item,:,:][["uncorrected","corrected","truth"]].astype("int")
        df["uncorrected_residue"] = df["truth"] ^ df["uncorrected"]
        df["uncorrected_lzc"] = lzcu(df["uncorrected_residue"], 32)
        # Absolute LZC change between consecutive rows. diff() leaves a NaN in
        # the first row, which must not reach np.percentile.
        lzc_jumps = df["uncorrected_lzc"].astype("int").diff().abs().dropna()
        if lzc_jumps.empty:
            ninety = ninetyfive = ninetynine = np.nan
        else:
            ninety, ninetyfive, ninetynine = np.percentile(lzc_jumps, percentile_levels)
        ninties[i] = ninety
        nintyfives[i] = ninetyfive
        nintynines[i] = ninetynine
        print("{:60} {:5.2f} {:5.2f} {:5.2f}".format(item, ninety, ninetyfive, ninetynine))

# The arrays (and therefore the plot) hold the values of the last method processed.
pd.Series(nintynines).plot.line(label='99%')
pd.Series(nintyfives).plot.line(label='95%')
pd.Series(ninties).plot.line(label='90%')
plt.legend();

In [None]:
# Summary statistics of the per-file 90th-percentile values filled in by the previous cell.
pd.Series(ninties).describe()

### Is there a correlation of the back-to-back fluctuation of LZC and the length of MS1?

In [None]:
def ms1(num, bits=32):
    """Length of the run of 1-bits that starts at the most significant set bit.

    Only the lowest ``bits`` bit positions of ``num`` are inspected, from
    position ``bits - 1`` down to 0.

    Parameters
    ----------
    num : int
        Value whose bit pattern is examined.
    bits : int, optional
        Word width in bits (default 32).

    Returns
    -------
    int
        Number of consecutive set bits beginning at the highest set bit, or 0
        when none of the inspected bits is set.
    """
    run_length = 0
    # One bounded scan from the top bit downwards: leading zeros are skipped,
    # then ones are counted until the first zero after them (or bit 0).
    for position in range(bits - 1, -1, -1):
        if num & (1 << position):
            run_length += 1
        elif run_length:
            break
    return run_length

In [None]:
for m in methods:
    panel = get_panel(m)
    print(m)
    for item in panel.items:
        df = panel.loc[item,:,:][["uncorrected","corrected","truth"]].astype("int")
        df["uncorrected_residue"] = df["truth"] ^ df["uncorrected"]
        df["uncorrected_lzc"] = lzcu(df["uncorrected_residue"], 32)
        df["ms1"] = [ms1(x) for x in df["uncorrected_residue"]]

        # Integer views of the series that enter the correlations.
        residue = df["uncorrected_residue"].astype("int")
        run_of_ones = df["ms1"].astype("int")
        lzc_values = df["uncorrected_lzc"].astype("int")
        # Absolute LZC change to the previous row; the first (NaN) entry is dropped.
        lzc_jump = lzc_values.diff().abs()[1:]

        # The jump series is paired positionally with the other series minus its last row.
        correlations = [
            np.corrcoef(lzc_jump, run_of_ones[:-1])[0,1],
            np.corrcoef(residue, run_of_ones)[0,1],
            np.corrcoef(lzc_jump, residue[:-1])[0,1],
            np.corrcoef(lzc_values, residue)[0,1],
        ]
        print("{:60} {:5.2f} {:5.2f} {:5.2f} {:5.2f}".format(item, *correlations))

### Predict the most likely domain of error 

In [None]:
from termcolor import colored

In [None]:
def colored_binary(num, center, domain=4, bits=32):
    """Render ``num`` as a ``bits``-wide binary string split into three coloured parts.

    Characters before index ``center - domain`` (clamped to 0) are green, the
    characters up to index ``center + domain`` (clamped to ``bits``) are yellow
    and the remaining characters are red.
    """
    bit_string = np.binary_repr(num, bits)
    window_start = max(center - domain, 0)
    window_end = min(center + domain, bits)
    segments = (
        (bit_string[:window_start], 'green'),
        (bit_string[window_start:window_end], 'yellow'),
        (bit_string[window_end:], 'red'),
    )
    return "".join(colored(text, colour) for text, colour in segments)

In [None]:
# Element-wise version of colored_binary: four inputs (num, center, domain, bits), one output.
test = np.frompyfunc(colored_binary, 4, 1)

In [None]:
# Half-width of the highlighted window (passed to colored_binary as `domain`) and the word width in bits.
domain,bits = 4,32
# Second item of `panel`. `panel` is not assigned in this cell; it is the value
# left over from a previously executed cell.
df = panel.iloc[1,:,:][["uncorrected","corrected","truth"]].astype("int")
# Bits in which the uncorrected prediction differs from the truth, and their leading-zero count.
df["uncorrected_residue"] = df["truth"] ^ df["uncorrected"]
df["uncorrected_residue_lzc"] = lzcu(df["uncorrected_residue"],  bits)
# Shift the LZC column down by one row so each row carries the previous row's LZC;
# the first row has no predecessor and is set to 0.
tmp = df[["uncorrected_residue_lzc"]].shift(1)
tmp.iloc[0] =  0
# Colour the uncorrected and the true bit patterns around the previous row's LZC.
df["uncorrected_domain"] = test(df[["uncorrected"]],tmp,domain, bits)
df["truth_colored"] = test(df[["truth"]],tmp, domain, bits)
# Keep the window centre (previous row's LZC) as a column for the following cells.
df["_s"] = tmp['uncorrected_residue_lzc']

In [None]:
# Side-by-side listing of the coloured uncorrected/truth bit patterns for rows 330..749,
# followed by the LZC of the row's own residue and the window centre.
header_template = "{:>4} {:>32} {:>32} lzc ctr"
row_template = "{:>4} {:>32} {:>32} {:>3} {:>3}"

print(header_template.format("ix","uncorrected", "truth"))
for i in range(330,750):
    residue_lzc = lzc(df["uncorrected"][i] ^ df["truth"][i])
    print(row_template.format(i,
                              df["uncorrected_domain"][i],
                              df["truth_colored"][i],
                              residue_lzc,
                              df['_s'][i]))

The most likely domain for improving >90% of the predictions is the range within 4 bit positions of the previous LZC. This gives us the range of the possible bit-flip area in which we need to be careful. Additional information we have is the following:

- Value range of the appropriate floating point values
- The average case of being either too high or too low
- The number of back-to-back zeros and ones in the area of interest

In [None]:
# Per-row work arrays; rows 0 and 1 keep their initial zeros because the loop
# below needs two predecessors.
row_count = df.truth.size
uncorrected_yellows = np.zeros(row_count)
truth_yellows = np.zeros(row_count)
centers = np.zeros(row_count)
rise_tendency = np.zeros(row_count)
not_fall_tendency = np.zeros(row_count)

for i in range(2, df["uncorrected"].size):
    center = df["_s"][i]
    # Window of +/- `domain` bit positions around `center`, counted from the
    # most significant bit and clamped to the word exactly like colored_binary.
    # Without the clamp a center above `bits - domain` (e.g. LZC 32 after a
    # perfect prediction) would ask for a negative shift.
    window_start = max(center - domain, 0)
    window_end = min(center + domain, bits)
    move = bits - window_end
    selection = ((1 << (window_end - window_start)) - 1) << move

    # Bits of both values that fall inside the window, shifted down to bit 0.
    uncorrected_yellows[i] = (selection & df["uncorrected"][i]) >> move
    truth_yellows[i] = (selection & df["truth"][i]) >> move
    centers[i] = center
    # Did the truth rise over the two previous rows / not fall in this row?
    rise_tendency[i] = df["truth"][i-1] > df["truth"][i-2]
    not_fall_tendency[i] = df["truth"][i] >= df["truth"][i-1]

In [None]:
# Gather the per-row work arrays in one frame, casting the float arrays to
# integer or boolean columns.
df2 = pd.DataFrame({
    "uncorrected_yellows": uncorrected_yellows.astype(int),
    "truth_yellows": truth_yellows.astype(int),
    "centers": centers.astype(int),
    "rise_tendency": rise_tendency.astype(bool),
    "not_fall_tendency": not_fall_tendency.astype(bool),
})

In [None]:
# Signed difference of the window values (uncorrected minus truth) and its magnitude.
window_difference = df2["uncorrected_yellows"] - df2["truth_yellows"]
df2["adiff"] = window_difference.abs()
df2["diff"] = window_difference