# Data Compression Exploration

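In this notebook we define some basic concepts of information theory (entropy, self-information, prefix codes) and implement Huffman coding, with compression and decompression of a sample message.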

### Entropy 
A measure of the randomness (disorder) of a system, in bits: $H(S) = \sum_{s \in S} p(s) \log_2 \frac{1}{p(s)}$

In [1]:
from math import *

# Entropy defined by Shannon
def H(P):
    """
    Shannon entropy, in bits, of a probability distribution.

    S    -> Set of possible states
    P    -> Iterable with the probability p(s) of each state s in S
    p(s) -> Probability of state s in S

    Returns the sum of p * log2(1/p) over the probabilities in P.
    States with probability 0 contribute nothing (the limit of
    p * log2(1/p) when p -> 0 is 0), so when we know exactly what state
    the system is in, e.g. H([1, 0]), the result is 0.

    In this context:
        - A "state" is a "message", so S is the set of possible messages
          and p(s) is the probability of message s
        - All messages are supposed to have the same length
    """

    # Skip p == 0: 1/p would raise ZeroDivisionError and the term is 0 anyway.
    return sum(p*log((1/p),2) for p in P if p != 0)

In [2]:
H([0.25,0.25,0.25,0.125,0.125])

2.25

In [3]:
H([0.5,0.125,0.125,0.125,0.125])

2.0

In [4]:
H([0.75,0.0625,0.0625,0.0625,0.0625])

1.3112781244591327

### Self information
Number of bits of information contained in a message, i.e. the number of bits we should use to encode it: $i(s) = \log_2 \frac{1}{p(s)}$

In [5]:
def i(p):
    """
    Self-information, in bits, of a state.

    p -> Probability of the state

    Returns log2(1/p).
    """
    inverse = 1/p
    return log(inverse, 2)

In [6]:
i(1/4)

2.0

### Prefix code
Special kind of uniquely decodable code in which no bit-string is a prefix of another one.


In [7]:
# Average length of the code
def len_average(C):
    """
    Expected codeword length of a code.

    C    -> Iterable of (p, w) pairs: w is a codeword and p is the weight
            (here, the probability) it is multiplied by
    l(w) -> Length of the codeword w, i.e. len(w)

    Returns the sum of p * l(w) over every pair in C (0 when C is empty).
    """
    total = 0
    for weight, codeword in C:
        total += weight * len(codeword)
    return total

In [8]:
# Directly replaced the component s by p(s)
len_average([(0.25,'1'),(0.5,"01"),(0.25,"000")])

2.0

# Huffman codes implementation

In [9]:
# List every distinct item of the data, in order of first appearance
def parse_bytes(data):
    """
    data -> Iterable of items (characters of a string, entries of a list, ...)

    Returns a new list holding each distinct item once, in the order it is
    first met. Items are compared with ==, so unhashable items such as
    lists are accepted.
    """
    distinct = []
    for item in data:
        if item in distinct:
            continue
        distinct.append(item)
    return distinct

# Generate the list of [item, count] pairs, least frequent first
def frequency(data):
    """
    data -> Sequence providing .count() (a string in this notebook)

    Returns a list of [item, occurrences] lists, one per distinct item,
    sorted by increasing number of occurrences. Items with the same count
    keep their order of first appearance in data.
    """
    pairs = []
    for symbol in parse_bytes(data):
        pairs.append([symbol, data.count(symbol)])
    pairs.sort(key=lambda pair: pair[1])
    return pairs

In [10]:
frequency("hello_world")

[['h', 1],
 ['e', 1],
 ['_', 1],
 ['w', 1],
 ['r', 1],
 ['d', 1],
 ['o', 2],
 ['l', 3]]

In [11]:
# Sample text used by the compression and decompression cells.
message="Hello this is a test.!?"
# [character, count] pairs of the sample text, least frequent first.
# NOTE: tree() pops from and appends to the list it receives, so DATA is
# modified once it is passed to tree().
DATA=frequency(message)

# Generate the merge stages of the tree from frequencies / probabilities
def tree(Q):
    """
    Q -> Priority queue: list of [label, weight] lists sorted by increasing
         weight, where label is a string

    Repeatedly removes the two first nodes of Q, merges them into a node
    whose label is the concatenation of both labels and whose weight is the
    sum of both weights, puts it back and re-sorts Q by weight.

    Q is modified in place: when the function returns it holds only the
    final merged node (or is unchanged if it had fewer than two nodes).

    Returns the snapshots of the queue as a list of tuples, from the last
    stage that still had two nodes back to the initial queue. The final
    single-node stage is not included.
    """
    stages = [tuple(Q)]
    while len(Q) > 1:
        first = Q.pop(0)
        second = Q.pop(0)
        merged = ["".join((first[0], second[0])), sum((first[1], second[1]))]
        Q.append(merged)
        Q.sort(key=lambda node: node[1])
        stages.append(tuple(Q))
    # Drop the last snapshot (single merged node), then go from top to bottom.
    return list(reversed(stages[:-1]))

In [12]:
# Build the merge stages of the sample message.
# NOTE: tree() consumes DATA in place; after this line DATA holds only the
# final merged node, which build() reads through parse_bytes(DATA)[0][0].
TREE=tree(DATA)
TREE

[(['Hoha.!t', 9], ['s?e li', 14]),
 (['s?e', 6], [' li', 8], ['Hoha.!t', 9]),
 (['Hoha', 4], ['.!t', 5], ['s?e', 6], [' li', 8]),
 ([' ', 4], ['li', 4], ['Hoha', 4], ['.!t', 5], ['s?e', 6]),
 (['s', 3], ['?e', 3], [' ', 4], ['li', 4], ['Hoha', 4], ['.!t', 5]),
 (['.!', 2], ['t', 3], ['s', 3], ['?e', 3], [' ', 4], ['li', 4], ['Hoha', 4]),
 (['Ho', 2],
  ['ha', 2],
  ['.!', 2],
  ['t', 3],
  ['s', 3],
  ['?e', 3],
  [' ', 4],
  ['li', 4]),
 (['l', 2],
  ['i', 2],
  ['Ho', 2],
  ['ha', 2],
  ['.!', 2],
  ['t', 3],
  ['s', 3],
  ['?e', 3],
  [' ', 4]),
 (['?', 1],
  ['e', 2],
  ['l', 2],
  ['i', 2],
  ['Ho', 2],
  ['ha', 2],
  ['.!', 2],
  ['t', 3],
  ['s', 3],
  [' ', 4]),
 (['.', 1],
  ['!', 1],
  ['?', 1],
  ['e', 2],
  ['l', 2],
  ['i', 2],
  ['Ho', 2],
  ['ha', 2],
  ['t', 3],
  ['s', 3],
  [' ', 4]),
 (['h', 1],
  ['a', 1],
  ['.', 1],
  ['!', 1],
  ['?', 1],
  ['e', 2],
  ['l', 2],
  ['i', 2],
  ['Ho', 2],
  ['t', 3],
  ['s', 3],
  [' ', 4]),
 (['H', 1],
  ['o', 1],
  ['h', 1],
  ['

In [13]:
# Generate a bit value ('0' or '1') for each node of each stage of the tree
def encode(T):
    """
    T -> Tree: sequence of stages, each stage a sequence of nodes whose
         first element is the node label

    Scans the stages in order and, the first time a label is met, assigns
    it '0' when its position in the stage is even and '1' when it is odd.

    Returns a dict label -> bit, ordered by decreasing label length; labels
    of equal length keep the order in which they were first met.
    """
    bits = {}
    for stage in T:
        for position, node in enumerate(stage):
            label = node[0]
            if label not in bits:
                bits[label] = str(position % 2)

    by_length = sorted(bits.items(), key=lambda item: len(item[0]), reverse=True)
    return dict(by_length)

# Build the prefix code dict (symbol -> codeword)
def build(e):
    """
    e -> Dict label -> bit, as returned by encode()

    Reads the global DATA: the label of its first distinct entry is used as
    the string of symbols to encode. This expects DATA to have been reduced
    by tree() to its final merged node, whose label contains every symbol.

    For each symbol, the bits of every label of e containing that symbol
    are concatenated in the iteration order of e.

    Returns a dict symbol -> codeword.
    """
    symbols = parse_bytes(DATA)[0][0]
    codes = dict.fromkeys(symbols, '')
    for symbol in symbols:
        codes[symbol] += "".join(bit for label, bit in e.items() if symbol in label)
    return codes
                
# Assign a bit to every node of TREE, then join those bits into one codeword
# per symbol. The last line displays the resulting dict.
PREFIX_CODE=build(encode(TREE))
PREFIX_CODE

{'H': '0000',
 'o': '0001',
 'h': '0010',
 'a': '0011',
 '.': '0100',
 '!': '0101',
 't': '011',
 's': '100',
 '?': '1010',
 'e': '1011',
 ' ': '110',
 'l': '1110',
 'i': '1111'}

## Compression

In [14]:
# Generate the sequence of codewords from a message
def compress(prefix_code, data=None):
    """
    Encode a message as a string of '0'/'1' characters.

    prefix_code -> Dict mapping each symbol to its codeword
    data        -> Iterable of symbols to encode; when omitted (None), the
                   notebook-level `message` is encoded, as before

    Returns the codewords of the symbols, concatenated in order.
    Raises KeyError when a symbol has no codeword in prefix_code.
    """
    if data is None:
        # Backward-compatible default: encode the global sample message.
        data = message
    return "".join(prefix_code[symbol] for symbol in data)

# Encode the sample message with PREFIX_CODE; the result is a string of
# '0'/'1' characters. The last line displays the original text and its encoding.
COMPRESSED=compress(PREFIX_CODE)
message,COMPRESSED

('Hello this is a test.!?',
 '0000101111101110000111001100101111100110111110011000111100111011100011010001011010')

### Compression rate 

In [15]:
len(COMPRESSED)/(len(message)*8) * 100

44.565217391304344

## Decompression

In [16]:
# Generate the message back from a sequence of codewords
def decompress(prefix_code, bits=None):
    """
    Decode a string of '0'/'1' characters produced with a prefix code.

    prefix_code -> Dict mapping each symbol to its codeword
    bits        -> String to decode; when omitted (None), the notebook-level
                   `COMPRESSED` is decoded, as before

    Returns the decoded symbols joined into one string.
    Raises ValueError when no codeword matches at some position, instead of
    looping forever on input that cannot be decoded.
    """
    if bits is None:
        # Backward-compatible default: decode the global compressed sample.
        bits = COMPRESSED

    # Invert the table: codeword -> symbol.
    decode_table = {code: symbol for symbol, code in prefix_code.items()}
    # default=0 keeps an empty table from crashing max().
    longest = max(map(len, decode_table), default=0)

    symbols, pos = [], 0
    while pos < len(bits):
        # Try the longest candidate first; with a real prefix code at most
        # one codeword can match at a given position.
        for width in range(longest, 0, -1):
            chunk = bits[pos:pos+width]
            if chunk in decode_table:
                symbols.append(decode_table[chunk])
                pos += width
                break
        else:
            # Nothing matched: pos would never advance, so stop explicitly.
            raise ValueError("no codeword matches input at position %d" % pos)
    return "".join(symbols)

In [17]:
decompress(PREFIX_CODE)

'Hello this is a test.!?'