In [1]:
A_PDB = "../data/A.pdb"
data = None
with open(A_PDB, "r") as in_file:
	data = in_file.read()

In [2]:
def mb(string: str, k = 1024):
	return len(string) / (k*k)

In [3]:
# first I want to parse the string to read the unique characters in the file
def char_frequencies(string: str) -> dict[str, int]:
	freqs = {}
	for char in string:
		if char not in freqs:
			freqs[char] = 0
		else:
			freqs[char] += 1
	return freqs

In [4]:
# todo: make MODEL a character so I don't encode those separately and ATOM no need to model char separately
char_frequencies(data)

{'M': 27120,
 'O': 37836,
 'D': 3393,
 'E': 10295,
 'L': 14156,
 ' ': 887763,
 '1': 127074,
 '\n': 26754,
 'A': 64749,
 'T': 29876,
 'N': 11958,
 '-': 38823,
 '2': 95921,
 '5': 69215,
 '.': 133754,
 '8': 60099,
 '0': 111676,
 '4': 71186,
 '3': 76945,
 '9': 57091,
 '6': 64746,
 'C': 34267,
 '7': 62000,
 'B': 3201,
 'G': 8583,
 'S': 9036,
 'U': 4521,
 'H': 4195,
 'I': 2907,
 'P': 4253,
 'R': 6146,
 'V': 1462,
 'Z': 715,
 'Y': 5405}

In [5]:
freqs = char_frequencies(data)

In [6]:
class HuffNode():
	def __init__(self, freq: int, lchild: "HuffNode" = None, rchild: "HuffNode" = None):
		self.freq = freq
		self.lchild = lchild
		self.rchild = rchild
	def __repr__(self):
		return self.__class__.__name__ + "(" + self.__dict__.__str__() + ")"

class HuffLeaf(HuffNode):
	def __init__(self, freq: int, char: str):
		super().__init__(freq)
		self.char = char

def is_leaf(node: HuffNode):
	return node.__class__ is HuffLeaf

a = HuffLeaf(1, 'A')
b = HuffNode(12)

print(is_leaf(a))
print(is_leaf(b))
a

True
False


HuffLeaf({'freq': 1, 'lchild': None, 'rchild': None, 'char': 'A'})

In [18]:
from heapq import heapify, heappop, heappush

def node_to_heapq_format(node: HuffNode):
	return (-node.freq, node) # prioritize long frequencies so reverse priority queue order

def heapq_format_to_node(heapq_item: tuple[int, HuffNode]):
	return heapq_item[1] # (priority, node)[1] selects node

class HuffQueue():
	def __init__(self, freqs: dict[str, int]):
		self.priority_queue = []
		heapify(self.priority_queue)
		
		# add all the leaves (characters) first
		for c, f in freqs.items():
			self.push(HuffLeaf(freq=f, char=c))

	def pop(self) -> HuffNode:
		return heapq_format_to_node(heappop(self.priority_queue))

	def push(self, new_node: HuffNode):
		heappush(self.priority_queue, node_to_heapq_format(new_node))
	
	def peak(self) -> HuffNode:
		return heapq_format_to_node(self.priority_queue[0])
	
	def __repr__(self) -> str:
		return f"HuffQueue(len={len(self.priority_queue)}, top={self.peak()})"
		
q = HuffQueue(freqs)

In [24]:
def huffman_code(freqs: dict[str, int]) -> HuffNode:
	"""Returns the root node of the tree"""

	q = HuffQueue(freqs)
	print(q)
	
	return

huffman_code(freqs)

HuffQueue(len=34, top=HuffLeaf({'freq': 887763, 'lchild': None, 'rchild': None, 'char': ' '}))
