In [None]:
import scipy
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Load the raw field as 32-bit integer *bit patterns*; the cells below work on
# those bits (XOR, leading zeros), not on the numeric values.
# The file name ("f32.little") suggests little-endian float32 content, so the
# byte order is pinned with '<u4' instead of relying on the host's native
# order. astype() then hands pandas a native uint32 array (a no-op on
# little-endian machines).
truth = pd.Series(
    np.fromfile(
        "../../pzip/data/icon.pl.u.f32.little.4x47x351x901_3.raw", dtype="<u4"
    ).astype("uint32", copy=False),
    name="truth",
)

In [None]:
# "Previous value" predictor: every element is predicted by its predecessor;
# the first element has none and is predicted as 0.
# shift(fill_value=0) keeps the integer dtype. A bare shift() would insert NaN
# and upcast the whole series to float64 before the 0 is patched in.
prediction = truth.shift(fill_value=0).rename("prediction")

In [None]:
# Working frame: the truth column (named after the series) plus a
# 'prediction' column, both stored as uint32.
df = truth.to_frame().assign(prediction=prediction).astype('uint32')

In [None]:
# Spot-check the last rows: each prediction should equal the previous truth.
df.tail()

In [None]:
def _lzc(t, bits=32):
    """Count the leading zero bits of ``t`` in a ``bits``-wide word.

    Parameters
    ----------
    t : int
        Non-negative integer that fits into ``bits`` bits.
    bits : int, optional
        Width of the word in bits. Defaults to 32.

    Returns
    -------
    int
        ``bits`` when ``t`` is 0, otherwise ``bits`` minus the number of bits
        needed to represent ``t``.
    """
    # int.bit_length() is 0 for zero, so zero needs no special case and no
    # intermediate binary string has to be built per element.
    return bits - int(t).bit_length()


# Element-wise leading-zero count for 32-bit values. This is a frompyfunc
# ufunc, so its results come back with object dtype.
lzc = np.frompyfunc(_lzc, 1, 1)
# np.bitwise_xor already is a ufunc. Wrapping it in frompyfunc would only force
# one Python call per element and turn the result into object dtype.
npxor = np.bitwise_xor

In [None]:
# Bitwise XOR of truth and prediction: the set bits mark where the two differ.
# The native ufunc runs vectorised on the uint32 arrays and keeps that dtype,
# instead of calling into Python per element and yielding an object column.
df['xor'] = np.bitwise_xor(df['truth'].values, df['prediction'].values)
df.tail()

In [None]:
# Absolute difference between truth and prediction, formed as row-wise
# maximum minus row-wise minimum of the two columns.
df['diff'] = df[['prediction', 'truth']].pipe(
    lambda pair: pair.max(axis=1) - pair.min(axis=1)
)

In [None]:
# Reference answer by direct comparison: True where truth >= prediction
# (equality counts as "bigger").
df['truth_bigger'] = df['truth'].ge(df['prediction'])

In [None]:
# Leading-zero count of every XOR value, i.e. how many of the top bits truth
# and prediction have in common (32 when they are identical).
df['xor_lzc'] = lzc(df['xor'])

In [None]:
# Preview the last rows including the columns derived so far.
df.tail()

In [None]:
def _truth_bigger_or_same(prediction, lzc):
    """Decide whether truth >= prediction without looking at truth itself.

    Parameters
    ----------
    prediction : int
        The predicted 32-bit value.
    lzc : int
        Number of leading zeros of ``truth XOR prediction`` (0..32).

    Returns
    -------
    bool
        ``True`` if truth equals the prediction (``lzc == 32``) or if the
        highest bit in which both differ is clear in ``prediction``; ``False``
        if that bit is set in ``prediction``.
    """
    if lzc == 32:
        # No differing bit at all: truth and prediction are identical.
        return True
    # Mask for the most significant bit in which truth and prediction differ.
    first_diff_bit = 1 << (31 - lzc)
    # Exactly one of the two has this bit set; if it is the prediction, the
    # prediction is the larger value.
    return not (prediction & first_diff_bit)
truth_bigger_or_same = np.frompyfunc(_truth_bigger_or_same, 2, 1)

In [None]:
# Re-derive "truth >= prediction" from the prediction and the XOR's
# leading-zero count alone.
df['truth_bigger_lzc'] = truth_bigger_or_same(df['prediction'], df['xor_lzc'])

In [None]:
# # ix = -3532
# # ix = -142
# ix = -24
# t = np.binary_repr(df['truth'].iloc[ix], 32)
# p = np.binary_repr(df['prediction'].iloc[ix], 32)
# l = np.binary_repr(1 << 32 - df['xor_lzc'].iloc[ix] - 1, 32)
# o = np.binary_repr(df['prediction'].iloc[ix] & 1 << 32 - df['xor_lzc'].iloc[ix] - 1, 32)
# print(t,p,l,o, sep='\n')

In [None]:
# Sanity check: the bit-based result must agree with the direct comparison
# in every row (expected output: True).
np.array_equal(df['truth_bigger_lzc'], df['truth_bigger'])

# Calculate padding zeros for diff

In [None]:
# Display the frame with all columns derived so far.
df

In [None]:
def _paddingzeros(lzc, diff):
    """Count the zero bits between a leading-zero run and the top bit of ``diff``.

    Parameters
    ----------
    lzc : int
        Number of leading zeros already accounted for (the caller passes the
        leading-zero count of the XOR).
    diff : int
        Non-negative difference value.

    Returns
    -------
    int
        32 when ``diff`` is 0; otherwise what remains of a 32-bit word after
        removing ``lzc`` bits and the bits needed to represent ``diff``.

    NOTE(review): a zero ``diff`` is reported as 32 regardless of ``lzc`` --
    confirm this convention is intended where the counts are summed.
    """
    return 32 if diff == 0 else 32 - (lzc + len(np.binary_repr(diff)))
paddingzeros = np.frompyfunc(_paddingzeros, 2, 1)

In [None]:
# Per row: zero bits between the XOR's leading-zero run and the highest set
# bit of diff (32 where diff is 0).
df['padding_zeros'] = paddingzeros(df['xor_lzc'].values,df['diff'].values)

In [None]:
# Preview the last rows including the new padding_zeros column.
df.tail()

# Probabilities

In [None]:
from collections import Counter
import operator as op

## Number of zeros

In [None]:
# Relative frequency of every padding-zero count ...
distribution = {
    zeros: count / df.index.size
    for zeros, count in Counter(df['padding_zeros']).items()
}
# ... and the same mapping ordered from most to least frequent.
distribution_value_sorted = dict(
    sorted(distribution.items(), key=lambda item: item[1], reverse=True)
)

In [None]:
# Show the relative frequencies, most frequent padding-zero count first.
distribution_value_sorted

## First k bits of xor

In [None]:
# k = 4
# np.binary_repr(value)

In [None]:
def _get_first_k_bits_after_1(k, val):
    """Return the ``k`` bits that follow the most significant set bit of ``val``.

    Parameters
    ----------
    k : int
        Number of bits to extract.
    val : int
        Value to take the bits from.

    Returns
    -------
    int
        ``val`` itself when it is smaller than ``1 << k`` (there are not
        ``k`` bits after its leading 1); otherwise the ``k`` bits directly
        below the leading 1, as an integer in ``range(1 << k)``.

    Examples: with ``k = 4``, ``0b10011001 -> 0b0011`` and
    ``0b11110000 -> 0b1110``.
    """
    val = int(val)
    if val < 1 << k:
        return val
    # Position the window from the bit length, so it always starts right
    # below the leading 1. Comparing against a shifted all-ones mask instead
    # lets the window slide up onto the leading 1 whenever the top k bits are
    # all set, and never terminates for k == 0.
    low_bits_dropped = val.bit_length() - 1 - k
    return (val >> low_bits_dropped) & ((1 << k) - 1)
get_first_k_bits_after_1 = np.frompyfunc(_get_first_k_bits_after_1, 2 , 1)

In [None]:
# for num in [153, 212,513,292,82732, 0, 100, 9, 17]:
#     print(np.binary_repr(num), np.binary_repr(_get_first_k_bits_after_1(k, num), k))

In [None]:
# First four bits that follow the leading 1 of every XOR value.
# The input has to be the XOR column itself: 'xor_lzc' only holds the
# leading-zero *count* (0..32), whose bits say nothing about the XOR pattern.
df['first_four_xor'] = get_first_k_bits_after_1(4, df['xor'].values)

In [None]:
# Preview the last rows including the new first_four_xor column.
df.tail()

In [None]:
# Relative frequency of every 4-bit pattern in 'first_four_xor' ...
distribution = {
    pattern: count / df.index.size
    for pattern, count in Counter(df['first_four_xor']).items()
}
# ... ordered from most to least frequent and keyed by the pattern written
# as a 4-digit binary string.
distribution_value_sorted = {
    np.binary_repr(pattern, 4): share
    for pattern, share in sorted(
        distribution.items(), key=lambda item: item[1], reverse=True
    )
}

In [None]:
# Total number of padding zeros over all rows.
df['padding_zeros'].sum()

In [None]:
# Total number of leading zero bits of the XOR over all rows.
df['xor_lzc'].sum()

In [None]:
# Total number of bits in the data set (32 per row), as a reference value
# for the two sums.
df.index.size*32