In [26]:
class Node:
    """Binary tree node used to build a Huffman tree.

    Attributes
    ----------
    frequency : int
      Count carried by the node: the symbol's own count for a leaf, or the
      combined count of both subtrees for an internal node.
    symbol : char or None
      The symbol stored at a leaf; None for internal nodes.
    left, right : Node or None
      Child links, both None until set.
    """

    def __init__(self, frequency, symbol=None):
        """Create a childless node.

        Inputs
        ------
        frequency : int
          Count represented by this node (see class docstring).
        symbol : char
          Symbol whose count is stored, or None when the node only
          aggregates the counts of its subtrees.
        """
        self.symbol = symbol
        self.frequency = frequency
        # Children are attached later, through the setters or directly.
        self.left = self.right = None

    def __lt__(self, other):
        """Order nodes by frequency so that `a < b` compares their counts."""
        return other.frequency > self.frequency

    def set_left(self, left):
        """Attach `left` as the left child."""
        self.left = left

    def set_right(self, right):
        """Attach `right` as the right child."""
        self.right = right

    def has_left(self):
        """Return True when a left child is attached."""
        return not (self.left is None)

    def has_right(self):
        """Return True when a right child is attached."""
        return not (self.right is None)

    def get_left(self):
        """Return the left child (None if absent)."""
        return self.left

    def get_right(self):
        """Return the right child (None if absent)."""
        return self.right

    def get_frequency(self):
        """Return the frequency stored in this node."""
        return self.frequency

    def __str__(self):
        """Render the node as "[symbol | frequency]"."""
        return "[{} | {}]".format(self.symbol, self.frequency)

def get_frequencies(message):
    """Count how often each character occurs in a message.

    Inputs
    ------
    message : str or None
      The text to analyse. None and the empty string are treated alike.

    Returns
    -------
    list of int
      A table indexed by character code (`ord`), where entry `i` is the number
      of times `chr(i)` appears in the message. The table always has at least
      256 entries; it is longer only when the message contains a character
      whose code is 256 or above.
    """

    # Start with one slot per 8-bit code. Indexing by code keeps both updates
    # and look-ups O(1), at the cost of a mostly empty table.
    NUMBER_OF_SYMBOLS = 256
    freq = [0] * NUMBER_OF_SYMBOLS

    if not message:
        return freq

    for character in message:
        code = ord(character)
        # Characters outside the 8-bit range used to raise IndexError here.
        # Grow the table instead so any Unicode text can be counted.
        if code >= len(freq):
            freq.extend([0] * (code + 1 - len(freq)))
        freq[code] += 1
    return freq


def get_huffman_roots(frequencies):
    """Build the initial forest of single-node trees for a Huffman encoder.

    `frequencies` is a sequence indexed by character code. Every position with
    a count above zero yields one leaf Node holding that count and the
    character `chr(position)`. Leaves appear in increasing code order.
    """
    return [
        Node(count, chr(code))
        for code, count in enumerate(frequencies)
        if count > 0
    ]


def get_smallest_root(forest):
    """Remove and return the smallest element of `forest`.

    `forest` is a non-empty list whose elements are compared with `<` (for
    Node objects that means by frequency). When several elements tie for
    smallest, the one that appears first in the list is chosen. The list is
    modified in place.
    """
    best_pos = 0
    # Linear scan; strict `<` keeps the earliest candidate on ties.
    for pos in range(1, len(forest)):
        if forest[pos] < forest[best_pos]:
            best_pos = pos
    return forest.pop(best_pos)


def huffman(message):
    """Build the Huffman tree for a message and return its root node.

    Returns None when the message is None or empty. Otherwise the symbol
    counts are turned into a forest of leaves, and the two lowest-frequency
    trees are merged repeatedly until a single tree remains.
    """
    # Nothing to encode: no tree.
    if message is None or len(message) == 0:
        return None

    # One leaf per distinct symbol, weighted by its number of occurrences.
    forest = get_huffman_roots(get_frequencies(message))

    while len(forest) > 1:
        # Pull out the two lightest trees; the first one removed goes left.
        lighter = get_smallest_root(forest)
        heavier = get_smallest_root(forest)

        # Join them under a symbol-less parent carrying the combined weight,
        # then put the merged tree back into the forest.
        parent = Node(lighter.frequency + heavier.frequency)
        parent.set_left(lighter)
        parent.set_right(heavier)
        forest.append(parent)

    # The sole survivor is the root of the Huffman tree.
    return forest[0]

"""
Takes the root node of a Huffman tree and returns the Huffman codes for all its
symbols.
"""
#path and codes must be passed as parameters to avoid being reset each recursion
def get_huffman_codes(root, path="", codes=None, left="0", right="1"):
    """Traverse a Huffman tree and return the code of every leaf symbol.

    Inputs
    ------
    root : node or None
      Root of the (sub)tree to traverse. A node needs `left`, `right` and
      `symbol` attributes. None yields no codes.
    path : str
      Code prefix accumulated on the way down to `root`.
    codes : dict or None
      Dictionary that collects the results. It is created on the first call
      and shared by the recursive calls.
    left, right : str
      Labels appended to the path when descending to the left or right child.

    Returns
    -------
    dict mapping each leaf symbol to its code string.
    """
    # Create the result dictionary only on the outermost call.
    if codes is None:
        codes = dict()

    # An empty tree (e.g. built from an empty message) has no codes.
    if root is None:
        return codes

    # Leaf reached: the accumulated path is this symbol's code.
    if root.left is None and root.right is None:
        # A tree made of a single leaf would otherwise get the empty string
        # as its code, and the encoded message would carry no information.
        # Give that lone symbol a one-label code instead.
        codes[root.symbol] = path if path else left
        return codes

    # Descend into whichever children exist, extending the path accordingly.
    if root.left:
        get_huffman_codes(root.left, path + left, codes, left, right)
    if root.right:
        get_huffman_codes(root.right, path + right, codes, left, right)
    return codes

#recurse_tree(node, path-string)
#note to self: 0 = left, 1 = right
#base case
"""if node.symbol is not None
    return something --> add to dict
   else
    if node.left : recurse node.left + 0
    if node.right : recurse node.right + 1 """

def trav(root, path):
  """Print the path ("0" = left, "1" = right) leading to every symbol node.

  Debug helper: a node with a truthy symbol is reported and not descended
  into; any other node forwards the call to whichever children it has.
  """
  if root.symbol:
    print(f' For {root} the path is {path}')
    return
  # Left child first, then right, each with its own bit appended.
  for child, bit in ((root.left, "0"), (root.right, "1")):
    if child:
      trav(child, path + bit)

"""
Takes a message and the Huffman codes of its symbols, and returns the Huffman
encoding of the message.
"""
def encode(message, code):
  """Encode a message with a symbol-to-code table.

  Inputs
  ------
  message : str
    The text to encode.
  code : dict
    Maps each symbol of the message to its code string.

  Returns
  -------
  str : the codes of all symbols, concatenated in message order.

  Raises
  ------
  KeyError if the message contains a symbol that has no entry in `code`.
  """
  # Join once instead of growing a string with += for every symbol, which
  # can degrade to quadratic time on long messages.
  return "".join(code[char] for char in message)

"""
takes a message and its Huffman encoding and reports the efficiency of the
compression
"""
def report_compression_efficiency(encoded, message, bits_per_symbol=8):
  """Return the fraction of space saved by the encoding.

  The result is 1 - (compressed size / uncompressed size), with both sizes
  measured in bits.

  Inputs
  ------
  encoded : str
    The encoded message, one character per bit, so its length is the
    compressed size.
  message : str
    The original message.
  bits_per_symbol : int
    Bits used per character in the uncompressed message. Defaults to 8,
    the size of an ASCII character.

  Returns
  -------
  float : the space saving, or 0.0 when the uncompressed size is zero (an
  empty message), where the ratio is undefined.
  """
  uncompressed = bits_per_symbol * len(message)
  # Avoid dividing by zero for an empty message: nothing was saved.
  if uncompressed == 0:
    return 0.0
  return 1 - (len(encoded) / uncompressed)

"""
Takes the root of a Huffman tree together with a message encoded with that
tree's Huffman codes, and returns the original message.
"""
#use tree to decode not look up table
def decode(root, message):
  """Decode a bit string by walking the Huffman tree.

  Inputs
  ------
  root : node
    Root of the Huffman tree. A node needs `left`, `right` and `symbol`
    attributes.
  message : str
    The encoded message, made of '0' (go left) and '1' (go right).

  Returns
  -------
  str : the decoded symbols, concatenated. Characters other than '0' and '1'
  are skipped, and trailing bits that do not complete a code are dropped.
  """
  # A tree made of a single leaf has no branches to follow. In that case
  # every bit stands for one occurrence of the only symbol.
  root_is_leaf = root.left is None and root.right is None

  symbols = []
  current = root
  for bit in message:
    if bit not in ("0", "1"):
      continue
    if not root_is_leaf:
      current = current.left if bit == "0" else current.right
    # Reaching a leaf completes one symbol; restart from the root.
    if current.left is None and current.right is None:
      symbols.append(current.symbol)
      current = root
  return "".join(symbols)

#testing
# Smoke test: run the whole pipeline on a sample message.
root = huffman("HELLO WORLD")
# Show the root and its two children, one per line.
print(root, root.left, root.right, sep="\n")
# Dump the path to every symbol.
trav(root, "")

# Symbol -> code table derived from the tree.
codes = get_huffman_codes(root)
print(codes)

# Bit string for the sample message.
encoded = encode("HELLO WORLD", codes)
print(encoded)

# Space saved relative to 8 bits per character.
compress_eff = report_compression_efficiency(encoded, "HELLO WORLD")
print(compress_eff)

# Round trip: decoding must give the sample message back.
decoded = decode(root, encoded)
print(decoded)

[None | 11]
[None | 4]
[None | 7]
 For [E | 1] the path is 000
 For [H | 1] the path is 001
 For [R | 1] the path is 010
 For [W | 1] the path is 011
 For [L | 3] the path is 10
 For [O | 2] the path is 110
 For [  | 1] the path is 1110
 For [D | 1] the path is 1111
{'E': '000', 'H': '001', 'R': '010', 'W': '011', 'L': '10', 'O': '110', ' ': '1110', 'D': '1111'}
00100010101101110011110010101111
0.6363636363636364
HELLO WORLD


Assignment 7 Ungrading:

Overall, I believe my assignment 7 submission was well done. Comparing my code to the technical notes, I see that they are very similar. Like the technical notes, my kruskal method uses two for loops to traverse G, adds each edge and its weight to E, sorts the edges in non-decreasing order using merge sort, and then traverses the sorted edges to see whether each edge joins two different components, adding the edge to T if it does. One difference between my code and the technical notes is that I created merge and mergesort methods to sort my edges outside of my kruskal method. I did this because I'm used to separating my sorting methods in Java, but now I know I can have a method within a method in Python. The other difference between my code and the technical notes is that I used a for loop instead of a while loop to check if each edge was already in the component. The pseudo-code did say to use a while loop, so in retrospect I should've used a while loop. I used a for loop because I had a lot of trouble debugging my code, and I opted to use a for loop so that I would know every edge was visited, since my bug was related to how my code was adding edges to components. Besides these two differences, my code produces the same output as the technical notes, so I know my code works. I also included significant comments in my code to explain what I was coding. Going to office hours also helped me a lot, and I'm glad I used my resources.