In [None]:
import numpy as np
import pandas as pd
import struct
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Element-wise XOR wrapped as a ufunc with 2 inputs and 1 output;
# mean_lzc uses it to XOR every sampled bit pattern with a reference pattern.
npxor = np.frompyfunc(np.bitwise_xor, 2, 1)

In [None]:
def lzc(t):
    """Count the leading zero bits of ``t`` viewed as a 32-bit word.

    Parameters
    ----------
    t : int
        Integer value (Python or NumPy integer). Only its low 32 bits are
        considered, so a negative value is treated as its two's-complement
        32-bit pattern.

    Returns
    -------
    int
        Number of leading zeros, from 0 (top bit set) to 32 (``t == 0``).
    """
    # Masking maps negative inputs onto their unsigned 32-bit pattern, and
    # bit_length() is 0 for zero, so an all-zero word correctly yields 32.
    word = int(t) & 0xFFFFFFFF
    return 32 - word.bit_length()
# Element-wise version of lzc for use on arrays.
lzcv = np.frompyfunc(lzc, 1, 1)

In [None]:
def to_u32(f):
    """Return the IEEE-754 single-precision bit pattern of ``f`` as an unsigned int.

    Parameters
    ----------
    f : float
        Value to encode; it is packed as a big-endian 32-bit float.

    Returns
    -------
    int
        The 32 bits of the packed float read as an unsigned integer
        (0 .. 2**32 - 1).
    """
    s = struct.pack('>f', f)
    # 'I' is the unsigned 32-bit code; the signed 'l' turned every value with
    # the sign bit set (i.e. every negative float) into a negative integer.
    return struct.unpack('>I', s)[0]
# Element-wise version of to_u32 for use on arrays.
to_u32v = np.frompyfunc(to_u32, 1, 1)

In [None]:
def mean_lzc(m, o, size=1000, rng=None):
    """Average leading-zero count of normally distributed samples XORed with ``m``.

    Draws ``size`` samples from a normal distribution with mean ``m`` and
    standard deviation ``o``, converts the samples and ``m`` with ``to_u32``,
    XORs each sample pattern with the pattern of ``m`` and returns the mean
    of ``lzc`` over those XOR results.

    Parameters
    ----------
    m : float
        Mean of the distribution; also the reference value for the XOR.
    o : float
        Standard deviation of the distribution.
    size : int, optional
        Number of samples to draw (default 1000, the previously fixed value).
    rng : object with a ``normal(loc, scale, size=...)`` method, optional
        Random source, e.g. a seeded ``np.random.default_rng(...)`` for
        reproducible results. Defaults to the global ``np.random`` state.

    Returns
    -------
    The mean of the per-sample leading-zero counts.
    """
    sampler = np.random if rng is None else rng
    dist = sampler.normal(m, o, size=size)
    truth = to_u32(m)
    vals = to_u32v(dist)
    xors = npxor(vals, truth)
    lzcs = lzcv(xors)
    return lzcs.mean()

## TODO

- [x] Plot LZC($\mu$)
- [x] Plot LZC($\sigma$)
- [ ] Find out which $\sigma$ values occur for the different predictors on climate data
- [ ] Plot the compression ratio as a function of $\mu$
- [ ] Plot the compression ratio as a function of $\sigma$
- [x] Mark the 0101010101 shift values in the plot
- [x] Mark the 1010101010 shift values in the plot

# LZC($\mu$)

In [None]:
# Sweep mu over 0..999 with sigma tied to mu by a constant multiplier.
mu = np.arange(1000)
sigma1 =  64 * mu
sigma2 = 128 * mu
sigma3 = 256 * mu
sigma4 = 512 * mu

# Mean LZC for every (mu, sigma) pair of each sweep.
result1_x_is_mu = [mean_lzc(m,o) for m,o in zip(mu,sigma1)]
result2_x_is_mu = [mean_lzc(m,o) for m,o in zip(mu,sigma2)]
result3_x_is_mu = [mean_lzc(m,o) for m,o in zip(mu,sigma3)]
result4_x_is_mu = [mean_lzc(m,o) for m,o in zip(mu,sigma4)]

# Guide lines are drawn at factor * 2**x; each factor has its own colour.
factors = [1.75,1.625,1.5,1.25,1.125,1]
fcolors = ['crimson','magenta','skyblue','limegreen','olivedrab', 'goldenrod']

# LZC(mu)
_, ax = plt.subplots(figsize=(15,5))
# [1:] drops the first row (mu == 0, where sigma is 0 as well).
sns.lineplot(data=pd.DataFrame({' x64':result1_x_is_mu, 'x128':result2_x_is_mu,
                                'x256':result3_x_is_mu, 'x512':result4_x_is_mu})[1:],
             ax=ax)
for f, color in zip(factors, fcolors):
    for x in range(4,10):
        ax.axvline(2**x*f, alpha=.5, color=color, ls=":")
ax.set_xlabel(r"$\mu$")
ax.set_ylabel("mean LZC")
ax.legend();

In [None]:
# Sweep mu over 0..999 with sigma set to a fixed fraction of mu.
mu = np.arange(1000)
sigma1 = mu*.20
sigma2 = mu*.10
sigma3 = mu*.05
sigma4 = mu*.01

# Mean LZC for every (mu, sigma) pair of each sweep.
result1_x_is_mu = [mean_lzc(m,o) for m,o in zip(mu,sigma1)]
result2_x_is_mu = [mean_lzc(m,o) for m,o in zip(mu,sigma2)]
result3_x_is_mu = [mean_lzc(m,o) for m,o in zip(mu,sigma3)]
result4_x_is_mu = [mean_lzc(m,o) for m,o in zip(mu,sigma4)]

# Guide lines are drawn at factor * 2**x; each factor has its own colour.
factors = [1.75,1.625,1.5,1.25,1.125,1]
fcolors = ['crimson','magenta','skyblue','limegreen','olivedrab', 'goldenrod']

# LZC(mu)
_, ax = plt.subplots(figsize=(15,5))
# [1:] drops the first row (mu == 0, where sigma is 0 as well).
sns.lineplot(data=pd.DataFrame({'20%':result1_x_is_mu, '10%':result2_x_is_mu,
                                ' 5%':result3_x_is_mu, ' 1%':result4_x_is_mu})[1:],
             ax=ax)
for f, color in zip(factors, fcolors):
    for x in range(4,10):
        ax.axvline(2**x*f, alpha=.5, color=color, ls=":")
ax.set_xlabel(r"$\mu$")
ax.set_ylabel("mean LZC")
ax.legend();

In [None]:
# Legend for the guide lines: factor, the 8-bit pattern of base * factor
# (shown for base 128) and the colour used for that factor.
base = 128
for factor, color in zip(factors, fcolors):
    bits = np.binary_repr(int(base*factor), width=8)
    print("{:.4f} {:>9} {:>10}".format(factor, bits, color))

# LZC($\sigma$)

In [None]:
# Sweep sigma over 0..999 with mu tied to sigma by a constant multiplier.
sigma = np.arange(1000)
mu1 = 10 * sigma
mu2 = 50 * sigma
mu3 = 100 * sigma
mu4 = 500 * sigma
# mu_test = [175] * sigma.size  # because of the high gap @x~175
# mu_corr = [get_flipping_value(128)] *sigma.size

# Mean LZC for every (mu, sigma) pair of each sweep.
result1_x_is_sigma = [mean_lzc(m,o) for m,o in zip(mu1,sigma)]
result2_x_is_sigma = [mean_lzc(m,o) for m,o in zip(mu2,sigma)]
result3_x_is_sigma = [mean_lzc(m,o) for m,o in zip(mu3,sigma)]
result4_x_is_sigma = [mean_lzc(m,o) for m,o in zip(mu4,sigma)]
# result_test = [mean_lzc(m,o) for m,o in zip(mu_test,sigma)]
# result_corr = [mean_lzc(m,o) for m,o in zip(mu_corr,sigma)]

# LZC(sigma)
_, ax = plt.subplots(figsize=(15,5))
df = pd.DataFrame({' x10':result1_x_is_sigma, 
                   ' x50':result2_x_is_sigma, 
                   'x100':result3_x_is_sigma, 
                   'x500':result4_x_is_sigma, 
#                    mu_test[0]:result_test, 
#                    mu_corr[0]:result_corr
})
# Only rows from sigma == 100 onwards are plotted.
sns.lineplot(data=df[100:], ax=ax)
# `factors` and `fcolors` are the lists defined in the LZC(mu) cells.
for f, color in zip(factors, fcolors):
    ax.axvline(128*f, alpha=.5, color=color, ls=":")
ax.set_xlabel(r"$\sigma$")
ax.set_ylabel("mean LZC")
ax.legend();

In [None]:
# Sweep sigma over 0..999 while mu stays fixed at a multiple of 128.
sigma = np.arange(1000)
mu1 = [128 * 2] * sigma.size
mu2 = [128 * 1.5] * sigma.size
mu3 = [128 * 1.125] * sigma.size
mu4 = [128] * sigma.size
# mu_test = [175] * sigma.size  # because of the high gap @x~175
# mu_corr = [get_flipping_value(128)] *sigma.size

# Mean LZC for every (mu, sigma) pair of each sweep.
result1_x_is_sigma = [mean_lzc(m,o) for m,o in zip(mu1,sigma)]
result2_x_is_sigma = [mean_lzc(m,o) for m,o in zip(mu2,sigma)]
result3_x_is_sigma = [mean_lzc(m,o) for m,o in zip(mu3,sigma)]
result4_x_is_sigma = [mean_lzc(m,o) for m,o in zip(mu4,sigma)]
# result_test = [mean_lzc(m,o) for m,o in zip(mu_test,sigma)]
# result_corr = [mean_lzc(m,o) for m,o in zip(mu_corr,sigma)]

# LZC(sigma); each column is labelled with the fixed mu of its sweep.
_, ax = plt.subplots(figsize=(15,5))
df = pd.DataFrame({mu1[0]:result1_x_is_sigma, 
                   mu2[0]:result2_x_is_sigma, 
                   mu3[0]:result3_x_is_sigma, 
                   mu4[0]:result4_x_is_sigma, 
#                    mu_test[0]:result_test, 
#                    mu_corr[0]:result_corr
})
# Only rows from sigma == 100 onwards are plotted.
sns.lineplot(data=df[100:], ax=ax)
# `factors` and `fcolors` are the lists defined in the LZC(mu) cells.
for f, color in zip(factors, fcolors):
    ax.axvline(128*f, alpha=.5, color=color, ls=":")
ax.set_xlabel(r"$\sigma$")
ax.set_ylabel("mean LZC")
ax.legend();

In [None]:
# Legend for the guide lines of the plot above: factor, the 8-bit pattern of
# base * factor and the colour used for that factor.
base = 128
for factor, color in zip(factors, fcolors):
    bits = np.binary_repr(int(base*factor), width=8)
    print("{:.4f} {:>9} {:>10}".format(factor, bits, color))

# Calculate shift goals

In [None]:
def get_flipping_value(val):
    """Fill every second bit below the leading bit of a power of two.

    Starting two positions below the leading bit, each second bit is set,
    which yields an alternating pattern, e.g. 128 (10000000) -> 170
    (10101010) and 16 (10000) -> 21 (10101).

    Parameters
    ----------
    val : int
        A power of two (0 also passes the check and is returned unchanged).

    Returns
    -------
    int
        ``val`` with the alternating bits added.

    Raises
    ------
    AssertionError
        If ``val`` has more than one bit set.
    """
    assert val & (val-1) == 0, "Not power of two"
    # The highest bit to set sits two places under the leading bit.
    top = len(np.binary_repr(val)) - 3
    for shift in range(top, -1, -2):
        val += 1 << shift
    return val

def get_mirrored_flipping_value(val, version=0):
    """Reflect ``get_flipping_value(val)`` around ``val + (val >> 2)``.

    The pivot ``val + (val >> 2)`` is the integer form of ``1.25 * val``; the
    result lies as far on the other side of the pivot as the flipping value
    lies on its own side.

    Parameters
    ----------
    val : int
        A power of two, as required by ``get_flipping_value``.
    version : int, optional
        Not used by the function body.

    Returns
    -------
    int
        The mirrored flipping value.
    """
    flip = get_flipping_value(val)
    pivot = val + (val >> 2)
    # Twice the distance from pivot to flip, subtracted from flip, lands on the mirror image.
    return flip - ((flip - pivot) << 1)

In [None]:
# LZC(mu) with the powers of two and their flip values marked.
# Re-plots the result*_x_is_mu lists computed in the LZC(mu) section
# (the percentage sweep, going by the column labels).
_, ax = plt.subplots(figsize=(15,5))
sns.lineplot(data=pd.DataFrame({'20%':result1_x_is_mu, '10%':result2_x_is_mu,
                                ' 5%':result3_x_is_mu, ' 1%':result4_x_is_mu})[1:],
             ax=ax)
for x, color in zip(range(4,10), fcolors):
    pow_two = 2**x
    # Dotted: the power of two itself; solid: its flipping value;
    # dashed: its mirrored flipping value (so the two can be told apart).
    ax.axvline(pow_two, alpha=.5, color=color, ls=":")
    ax.axvline(get_flipping_value(pow_two), alpha=.5, color=color, ls="-")
    ax.axvline(get_mirrored_flipping_value(pow_two), alpha=.5, color=color, ls="--")
ax.set_xlabel(r"$\mu$")
ax.set_ylabel("mean LZC")
ax.legend();

In [None]:
def get_closest_pow_2(val):
    """Return whichever neighbouring power of two is nearer to ``val``.

    The candidates come from ``next_smaller_pow_two`` and
    ``next_bigger_pow_two``; on a tie the smaller candidate is returned.
    """
    lower = next_smaller_pow_two(val)
    upper = next_bigger_pow_two(val)
    # Only a strictly shorter distance to the upper candidate selects it.
    return upper if (upper - val) < (val - lower) else lower

def next_bigger_pow_two(val):
    """Return 2 raised to the number of binary digits of ``val``.

    For a positive ``val`` this is the smallest power of two strictly greater
    than ``val`` (e.g. 5 -> 8, 8 -> 16).
    """
    digits = len(np.binary_repr(val))
    return 1 << digits

def next_smaller_pow_two(val):
    """Return the power of two given by the leading binary digit of ``val``.

    For a positive ``val`` this is the largest power of two not exceeding
    ``val``; a power of two is therefore returned unchanged (5 -> 4, 8 -> 8).
    """
    digits = len(np.binary_repr(val))
    return 1 << (digits - 1)

In [None]:
def get_shifted_goal(val, method='closest', mode='flip'):
    """
    Return the flip value of a power of two chosen from ``val``.

    Parameters
    ----------
    val : int
        Input value. A non-zero power of two is used as is; any other value
        is first replaced by a power of two selected via ``method``.
    method : str, optional
        How to pick the power of two when ``val`` is not one already:
        ``'closest'``/``'c'`` (``get_closest_pow_2``), ``'smaller'``/``'s'``
        (``next_smaller_pow_two``) or ``'bigger'``/``'b'``
        (``next_bigger_pow_two``). It is not looked at when ``val`` already
        is a non-zero power of two.
    mode : str, optional
        ``'flip'`` returns ``get_flipping_value`` of the chosen power of two,
        ``'mflip'`` returns ``get_mirrored_flipping_value`` of it.

    Returns
    -------
    int
        The flip value selected by ``mode``.

    Raises
    ------
    ValueError
        If ``method`` (when it is needed) or ``mode`` is not recognised.
    """

    # Calculate power of two on which the shift should be based
    is_pow_two = val != 0 and val & (val-1) == 0
    if not is_pow_two:
        if method in ('closest', 'c'):
            val = get_closest_pow_2(val)
        elif method in ('smaller', 's'):
            val = next_smaller_pow_two(val)
        elif method in ('bigger', 'b'):
            val = next_bigger_pow_two(val)
        else:
            raise ValueError("Could not understand method '{}'".format(method))

    if mode == 'flip':
        return get_flipping_value(val)
    elif mode == 'mflip':
        return get_mirrored_flipping_value(val)
    else:
        raise ValueError("Could not understand mode '{}'".format(mode))

In [None]:
# 256 is already a power of two, so the 'b' (bigger) method is not applied;
# the cell shows the mirrored flipping value of 256, which is 299.
get_shifted_goal(256, 'b', 'mflip')

# $\sigma$ for different climate data

In [None]:
from glob import glob

In [None]:
def describe_data(regex):
    """Summarise every file matching a glob pattern as unsigned 32-bit data.

    Parameters
    ----------
    regex : str
        Pattern handed to ``glob`` (a shell-style glob pattern despite the
        parameter name), e.g. ``"./*.residual"``.

    Returns
    -------
    pandas.DataFrame
        One row per matching file, in sorted path order, with the columns
        ``unique`` (number of distinct values), ``mean``, ``std`` and
        ``pcent`` (``std / mean * 100``). Rows are labelled with the file
        path; a leading ``./`` is stripped from the label.
    """
    stats = {}
    for path in sorted(glob(regex)):
        values = pd.Series(np.fromfile(path, dtype='uint32'))
        # Only drop a real "./" prefix; slicing two characters off blindly
        # mangled labels for patterns such as "../../dir/*.raw".
        label = path[2:] if path.startswith(('./', '.\\')) else path
        stats[label] = [values.unique().size, values.mean(), values.std()]
    # Build the frame once, then transpose so that each file becomes a row.
    df = pd.DataFrame(stats, index=['unique', 'mean', 'std']).T
    df['pcent'] = (df['std'] / df['mean']) * 100
    return df

In [None]:
# describe_data("./*.residual")

In [None]:
# describe_data('../../pzip/data/*.raw')

In [None]:
# describe_data("./*.bplanes")

In [None]:
# describe_data("./*.nlzc")