In [11]:
A_PDB = "../data/A.pdb"
data = None
with open(A_PDB, "r") as in_file:
	data = in_file.read()

In [12]:
def mb(string: str, k = 1024):
	return len(string) / (k*k)

In [13]:
# first I want to parse the string to read the unique characters in the file
def char_frequencies(string: str) -> dict[str, int]:
	freqs = {}
	for char in string:
		if char not in freqs:
			freqs[char] = 0
		else:
			freqs[char] += 1
	return freqs

In [14]:
# todo: make MODEL a character so I don't encode those separately and ATOM no need to model char separately
char_frequencies(data)

{'M': 27120,
 'O': 37836,
 'D': 3393,
 'E': 10295,
 'L': 14156,
 ' ': 887763,
 '1': 127074,
 '\n': 26754,
 'A': 64749,
 'T': 29876,
 'N': 11958,
 '-': 38823,
 '2': 95921,
 '5': 69215,
 '.': 133754,
 '8': 60099,
 '0': 111676,
 '4': 71186,
 '3': 76945,
 '9': 57091,
 '6': 64746,
 'C': 34267,
 '7': 62000,
 'B': 3201,
 'G': 8583,
 'S': 9036,
 'U': 4521,
 'H': 4195,
 'I': 2907,
 'P': 4253,
 'R': 6146,
 'V': 1462,
 'Z': 715,
 'Y': 5405}

In [15]:
freqs = char_frequencies(data)

In [16]:
# the most least frequent we can assume will come up not that many times, so can afford to be a longer encoding
# the more frequent should have less characters it represents
def sort_freqs(freqs: dict[str, int]) -> list[tuple[str, int]]:
	return sorted([(k, v) for k,v in freqs.items()], key=lambda x: x[1], reverse=True)

sorted_freqs = sort_freqs(freqs)
sorted_freqs

[(' ', 887763),
 ('.', 133754),
 ('1', 127074),
 ('0', 111676),
 ('2', 95921),
 ('3', 76945),
 ('4', 71186),
 ('5', 69215),
 ('A', 64749),
 ('6', 64746),
 ('7', 62000),
 ('8', 60099),
 ('9', 57091),
 ('-', 38823),
 ('O', 37836),
 ('C', 34267),
 ('T', 29876),
 ('M', 27120),
 ('\n', 26754),
 ('L', 14156),
 ('N', 11958),
 ('E', 10295),
 ('S', 9036),
 ('G', 8583),
 ('R', 6146),
 ('Y', 5405),
 ('U', 4521),
 ('P', 4253),
 ('H', 4195),
 ('D', 3393),
 ('B', 3201),
 ('I', 2907),
 ('V', 1462),
 ('Z', 715)]

In [30]:
class HuffNode():
	def __init__(self, freq: int, lchild: "HuffNode" = None, rchild: "HuffNode" = None):
		self.freq = freq
		self.lchild = lchild
		self.rchild = rchild
	def __repr__(self):
		return self.__class__.__name__ + "(" + self.__dict__.__str__() + ")"

class HuffLeaf(HuffNode):
	def __init__(self, freq: int, char: str):
		super().__init__(freq)
		self.char = char

def is_leaf(node: HuffNode):
	return node.__class__ is HuffLeaf

a = HuffLeaf(1, 'A')
b = HuffNode(12)

print(is_leaf(a))
print(is_leaf(b))
a

True
False


HuffLeaf({'freq': 1, 'lchild': None, 'rchild': None, 'char': 'A'})

In [31]:
bottom1 = sorted_freqs[-1]
bottom2 = sorted_freqs[-2]
c1 = HuffLeaf(bottom1[1], bottom1[0])
c2 = HuffLeaf(bottom2[1], bottom2[0])
c1

HuffLeaf({'freq': 715, 'lchild': None, 'rchild': None, 'char': 'Z'})

In [32]:
n1 = HuffNode(freq=c1.freq+c2.freq, lchild=c1, rchild=c2)

In [33]:
n1

HuffNode({'freq': 2177, 'lchild': HuffLeaf({'freq': 715, 'lchild': None, 'rchild': None, 'char': 'Z'}), 'rchild': HuffLeaf({'freq': 1462, 'lchild': None, 'rchild': None, 'char': 'V'})})