In [54]:
A_PDB = "../data/A.pdb"
data = None
with open(A_PDB, "r") as in_file:
	data = in_file.read()

In [55]:
def mb(string: str, k = 1024):
	return len(string) / (k*k)

In [56]:
# first I want to parse the string to read the unique characters in the file
def char_frequencies(string: str) -> dict[str, int]:
	freqs = {}
	for char in string:
		if char not in freqs:
			freqs[char] = 0
		else:
			freqs[char] += 1
	return freqs

In [57]:
# todo: make MODEL a character so I don't encode those separately and ATOM no need to model char separately
char_frequencies(data)

{'M': 27120,
 'O': 37836,
 'D': 3393,
 'E': 10295,
 'L': 14156,
 ' ': 887763,
 '1': 127074,
 '\n': 26754,
 'A': 64749,
 'T': 29876,
 'N': 11958,
 '-': 38823,
 '2': 95921,
 '5': 69215,
 '.': 133754,
 '8': 60099,
 '0': 111676,
 '4': 71186,
 '3': 76945,
 '9': 57091,
 '6': 64746,
 'C': 34267,
 '7': 62000,
 'B': 3201,
 'G': 8583,
 'S': 9036,
 'U': 4521,
 'H': 4195,
 'I': 2907,
 'P': 4253,
 'R': 6146,
 'V': 1462,
 'Z': 715,
 'Y': 5405}

In [58]:
freqs = char_frequencies(data)

In [59]:
class HuffNode():
	def __init__(self, freq: int, lchild: "HuffNode" = None, rchild: "HuffNode" = None):
		self.freq = freq
		self.lchild = lchild
		self.rchild = rchild
	def __repr__(self):
		return self.__class__.__name__ + "(" + self.__dict__.__str__() + ")"

class HuffLeaf(HuffNode):
	def __init__(self, freq: int, char: str):
		super().__init__(freq)
		self.char = char

def is_leaf(node: HuffNode):
	return node.__class__ is HuffLeaf

a = HuffLeaf(1, 'A')
b = HuffNode(12)

print(is_leaf(a))
print(is_leaf(b))
a

True
False


HuffLeaf({'freq': 1, 'lchild': None, 'rchild': None, 'char': 'A'})

In [60]:
from heapq import heapify, heappop, heappush

def node_to_heapq_format(node: HuffNode):
	return (node.freq, node) # high frequency ones should come up first


def heapq_format_to_node(heapq_item: tuple[int, HuffNode]):
	return heapq_item[1] # (priority, node)[1] selects node

class HuffQueue():
	def __init__(self, freqs: dict[str, int]):
		self.priority_queue = []
		heapify(self.priority_queue)
		
		# add all the leaves (characters) first
		for c, f in freqs.items():
			self.push(HuffLeaf(freq=f, char=c))

	def pop(self) -> HuffNode:
		return heapq_format_to_node(heappop(self.priority_queue))

	def push(self, new_node: HuffNode):
		heappush(self.priority_queue, node_to_heapq_format(new_node))
	
	def peak(self) -> HuffNode:
		return heapq_format_to_node(self.priority_queue[0])
	
	def __len__(self) -> int:
		return len(self.priority_queue)
	
	def __repr__(self) -> str:
		return f"HuffQueue(len={len(self)}, top={self.peak()})"
		
q = HuffQueue(freqs)

In [61]:
def huffman_code(freqs: dict[str, int]) -> HuffNode:
    """Returns the root node of the tree"""

    q = HuffQueue(freqs)
    while len(q) > 1:
        # pop two smallest nodes, they are now the bottom of the tree
        child_a, child_b = q.pop(), q.pop()
        parent = HuffNode(
            freq=child_a.freq + child_b.freq, 
            lchild=child_a, 
            rchild=child_b
        )
        # push the nodes (almost think of merged) back into circulation
        q.push(parent)

    return q.pop() # return the root


r = huffman_code(freqs)
r

HuffNode({'freq': 2167121, 'lchild': HuffLeaf({'freq': 887763, 'lchild': None, 'rchild': None, 'char': ' '}), 'rchild': HuffNode({'freq': 1279358, 'lchild': HuffNode({'freq': 523681, 'lchild': HuffNode({'freq': 255963, 'lchild': HuffLeaf({'freq': 127074, 'lchild': None, 'rchild': None, 'char': '1'}), 'rchild': HuffNode({'freq': 128889, 'lchild': HuffNode({'freq': 64143, 'lchild': HuffLeaf({'freq': 29876, 'lchild': None, 'rchild': None, 'char': 'T'}), 'rchild': HuffLeaf({'freq': 34267, 'lchild': None, 'rchild': None, 'char': 'C'})}), 'rchild': HuffLeaf({'freq': 64746, 'lchild': None, 'rchild': None, 'char': '6'})})}), 'rchild': HuffNode({'freq': 267718, 'lchild': HuffLeaf({'freq': 133754, 'lchild': None, 'rchild': None, 'char': '.'}), 'rchild': HuffNode({'freq': 133964, 'lchild': HuffLeaf({'freq': 64749, 'lchild': None, 'rchild': None, 'char': 'A'}), 'rchild': HuffLeaf({'freq': 69215, 'lchild': None, 'rchild': None, 'char': '5'})})})}), 'rchild': HuffNode({'freq': 755677, 'lchild': Huff

In [75]:
from treelib import Tree
def node_fmt(node: HuffNode):
	return f"Node(freq=`{node.freq:,}`)"

def huffman_code_to_treelib(root: HuffNode):
	# add parent first
	global_id = 0
	t = Tree()
	t.create_node(node_fmt(root), global_id)

	# then add children
	def traverse(node: HuffNode, parent_id):
		nonlocal global_id, t
		if is_leaf(node):
			global_id += 1
			t.create_node(f"Leaf(char=`{node.char}`, freq=`{node.freq:,}`)", global_id, parent=parent_id)
			return

		global_id += 1
		t.create_node(node_fmt(node.lchild), global_id, parent=parent_id)
		traverse(node.lchild, parent_id=global_id)

		global_id += 1
		t.create_node(node_fmt(node.rchild), global_id, parent=parent_id)
		traverse(node.rchild, parent_id=global_id)

	traverse(root, global_id)

	return t

print(huffman_code_to_treelib(r))
huffman_code_to_treelib(r).save2file("example")

Node(freq=`2,167,121`)
├── Node(freq=`1,279,358`)
│   ├── Node(freq=`523,681`)
│   │   ├── Node(freq=`255,963`)
│   │   │   ├── Node(freq=`127,074`)
│   │   │   │   └── Leaf(char=`1`, freq=`127,074`)
│   │   │   └── Node(freq=`128,889`)
│   │   │       ├── Node(freq=`64,143`)
│   │   │       │   ├── Node(freq=`29,876`)
│   │   │       │   │   └── Leaf(char=`T`, freq=`29,876`)
│   │   │       │   └── Node(freq=`34,267`)
│   │   │       │       └── Leaf(char=`C`, freq=`34,267`)
│   │   │       └── Node(freq=`64,746`)
│   │   │           └── Leaf(char=`6`, freq=`64,746`)
│   │   └── Node(freq=`267,718`)
│   │       ├── Node(freq=`133,754`)
│   │       │   └── Leaf(char=`.`, freq=`133,754`)
│   │       └── Node(freq=`133,964`)
│   │           ├── Node(freq=`64,749`)
│   │           │   └── Leaf(char=`A`, freq=`64,749`)
│   │           └── Node(freq=`69,215`)
│   │               └── Leaf(char=`5`, freq=`69,215`)
│   └── Node(freq=`755,677`)
│       ├── Node(freq=`312,572`)
│       │   ├── N