In [13]:
# Data compression
# Entropy -> Representation of randomness or disorder of a system

from collections import Counter
from math import *

# Entropy defined by Shannon
def H(P):
    """
    Shannon entropy, in bits, of a discrete probability distribution.

    S    -> Set of possible states
    P    -> Iterable of probabilities, one per state
    p(s) -> Probability of state s in S

    Returns the sum of p * log2(1/p) over every p in P.
    States with probability 0 are skipped: the limit of p * log2(1/p) as
    p -> 0 is 0, so they contribute nothing (and 1/p would otherwise raise
    ZeroDivisionError).
    If we know exactly which state the system is in -> H returns 0.

    In this context:
        - A "state" is a "message", so S is the set of possible messages and
          p(s) is the probability of message s
        - All messages are assumed to have the same length
    """

    # `p != 0` (rather than `p > 0`) so that an invalid negative probability
    # still fails loudly inside log() instead of being silently ignored.
    return sum([p*log((1/p),2) for p in P if p != 0])

In [14]:
H([0.25,0.25,0.25,0.125,0.125])

2.25

In [15]:
H([0.5,0.125,0.125,0.125,0.125])

2.0

In [16]:
H([0.75,0.0625,0.0625,0.0625,0.0625])

1.3112781244591327

In [17]:
# Self-information of a message, in the information-theory sense
def i(p):
    """
    Self-information, in bits, of a state with probability p.

    p -> Probability of the state (message)

    The result, log2(1/p), is the number of bits of information the message
    carries, i.e. the number of bits we should use to encode it.
    """

    inverse_probability = 1/p
    return log(inverse_probability, 2)

In [18]:
i(1/4)

2.0

In [19]:
# A prefix code is a special kind of uniquely decodable code in which no bit-string is a prefix of another one.

# Average length of the code

def len_average(C):
    """
    Average codeword length of a prefix code.

    C -> Iterable of (probability, codeword) pairs
    Each codeword's length len(codeword) is weighted by its probability and
    the weighted lengths are summed.
    """

    return sum(probability*len(codeword) for probability, codeword in C)

In [20]:
# Directly replaced the component s by p(s)
len_average([(0.25,'1'),(0.5,"01"),(0.25,"000")])

2.0

In [21]:
# Huffman Codes

# Generate a list of the distinct chars in the data, in order of first appearance
def parse_bytes(data):
    """
    Collect the distinct items of data, in order of first appearance.

    data -> Iterable of symbols (e.g. a str)
    Returns a list holding each distinct item once.
    """
    distinct = []
    for symbol in data:
        # Already recorded: nothing to do for this occurrence.
        if symbol in distinct:
            continue
        distinct.append(symbol)
    return distinct

# Generate a list of [char, count] pairs, sorted by ascending count
def frequency(data):
    """
    Count how often each distinct symbol occurs in data.

    data -> Iterable of hashable symbols (e.g. a str or bytes)
    Returns a list of [symbol, count] pairs sorted by ascending count;
    symbols with equal counts keep their order of first appearance in data.
    """
    # Counter tallies every symbol in a single pass and remembers first-seen
    # order, replacing one full data.count() scan per distinct symbol.
    counts = Counter(data)
    pairs = [[symbol, occurrences] for symbol, occurrences in counts.items()]
    # sorted() is stable, so ties stay in first-appearance order.
    return sorted(pairs, key=lambda pair: pair[1])
    

In [22]:
frequency("hello_world")

[['h', 1],
 ['e', 1],
 ['_', 1],
 ['w', 1],
 ['r', 1],
 ['d', 1],
 ['o', 2],
 ['l', 3]]

In [25]:
DATA=frequency("hello_world")

# Generate the tree from the symbol frequencies

def tree(Q):
    """
    Repeatedly merge the two lowest-weight nodes and record every step.

    Q -> Iterable of [symbols, weight] nodes, e.g. the list returned by
         frequency(). It is expected to be sorted by ascending weight already:
         the first two nodes are merged as-is, without an initial sort.
         `symbols` must be a str, since merged nodes concatenate them.

    Returns a list of tuples: the first is the queue before any merge, and
    each following one is the queue (re-sorted by weight) after one merge.
    The last tuple holds the single remaining node, or is empty if Q was empty.

    Q itself is left untouched, so calling tree() twice on the same input
    gives the same result.
    """
    # Work on a shallow copy: popping from the caller's list would leave it
    # collapsed to a single node after the call.
    queue = list(Q)
    snapshots = [tuple(queue)]
    while len(queue) > 1:
        # The two front nodes are the lightest because the queue is kept sorted.
        first = queue.pop(0)
        second = queue.pop(0)
        merged = [''.join((first[0], second[0])), first[1] + second[1]]

        queue.append(merged)
        # Stable sort: among equal weights, the freshly merged node goes last.
        queue.sort(key=lambda node: node[1])
        snapshots.append(tuple(queue))
    return snapshots
            

In [26]:
tree(DATA)

[(['h', 1],
  ['e', 1],
  ['_', 1],
  ['w', 1],
  ['r', 1],
  ['d', 1],
  ['o', 2],
  ['l', 3]),
 (['_', 1], ['w', 1], ['r', 1], ['d', 1], ['o', 2], ['he', 2], ['l', 3]),
 (['r', 1], ['d', 1], ['o', 2], ['he', 2], ['_w', 2], ['l', 3]),
 (['o', 2], ['he', 2], ['_w', 2], ['rd', 2], ['l', 3]),
 (['_w', 2], ['rd', 2], ['l', 3], ['ohe', 4]),
 (['l', 3], ['ohe', 4], ['_wrd', 4]),
 (['_wrd', 4], ['lohe', 7]),
 (['_wrdlohe', 11],)]

In [None]:
def 