In [10]:
class Max_Heap:
    """A basic max heap structure."""

    def __init__(self):
        """Basic constructor setting up the underlying array for the heap."""
        self.heap = []

    def size(self):
        """Reports the size of the heap."""
        return len(self.heap)

    def insert(self, value):
        """Adds a new element to the heap."""
        self.heap.append(value)
        # Now reorganize the heap to make sure that the newly added element
        # does not violate the max heap property.
        self.heapify_up()

    # Helper methods for parent-children pointers
    def left_child(self, parent):
        return 2*parent+1
    def right_child(self, parent):
        return 2*(parent+1)
    def parent(self, child):
        return (child-1)//2

    def pop(self):
        """Remove the root of the tree, replace it with the last element,
        and ensure that the new root does not violate the heap property."""

        # Guard statements
        if self.size() == 0: return None
        if self.size() == 1: return self.heap.pop()

        # heap has two or more elements
        popped = self.heap[0]
        self.heap[0] = self.heap.pop()
        self.heapify_down()
        return popped

    def heapify_up(self):
        """Restores the max-heap property by shifting the last element up."""
        # Position of latest item
        index = self.size() - 1
        # Traverse from last item to root via parents
        while index > 0:
            parent_index = self.parent(index)
            if self.heap[index] > self.heap[parent_index]:
                # Element is larger than its parent; swap them
                self.heap[index], self.heap[parent_index] = self.heap[parent_index], self.heap[index]
                # Update the item's index
                index = parent_index
            else:
                # No need to swap, heap property is satisfied
                index = 0  # Stop the while loop

    def heapify_down(self):
        """Restores the max-heap property by shifting the root element down."""
        index = 0
        size = self.size()
        while 2 * index + 1 < size:
            # Children of current node at position index
            left = self.left_child(index)
            right = self.right_child(index)
            # Assume the parent node is larger than its children
            largest = index

            # If left child is greater than the currently largest element,
            # update the position of the largest element
            if left < size and self.heap[left] > self.heap[largest]:
                largest = left

            # If right child is greater than the currently largest element,
            # update the position of the largest element
            if right < size and self.heap[right] > self.heap[largest]:
                largest = right

            # If the largest element is not the parent, it means that one of its
            # children is larger -- swap parent with the largest element
            if largest != index:
                self.heap[index], self.heap[largest] = self.heap[largest], self.heap[index]
                index = largest
            else:
                # Heap property is satisfied
                index = size  # Stop the while loop

   #using the code above, build a min-heap. In a min-heap a parent is always
   #smaller than or equal to its children.
   #min heaps operate the same as max heaps excpet the root node is the smallest

    def insert_min(self, value):
      """Adds a new element to the heap."""
      self.heap.append(value)
      # Now reorganize the heap to make sure that the newly added element
      # does not violate the min heap property.
      self.min_heapify_up()


    def min_heapify_up(self):
     """Restores the max-heap property by shifting the last element up."""
     # Position of latest item
     index = self.size() - 1
     # Traverse from last item to root via parents
     while index > 0:
      parent_index = self.parent(index)
      #see if element is smaller than parent
      """
      I'm aware this equation should be <= but I would have to edit class Node
      to make <= possible
      """
      if self.heap[index] < self.heap[parent_index]:
        # Element is smaller than its parent; swap them
        self.heap[index], self.heap[parent_index] = self.heap[parent_index], self.heap[index]
        # Update the item's index
        index = parent_index
      else:
        # No need to swap, heap property is satisfied
        index = 0  # Stop the while loop

class Node:
    """A simple binary tree node for a basic Huffman encoder."""

    def __init__(self, frequency, symbol=None):
        """Object constructor.

        Inputs
        ------
        frequency : int
          The frequency represented by this node. If the node has also a symbol,
          this is the frequency of the symbol. If no symbol is present, this is
          the sum of frequencies of the node's subtrees.
        symbol : char
          The symbol whose frequency we capture. If symbol is None, the node
          captures frequencies for subtrees under the node.

        Returns
        -------
        Instance of Node object with fields:
          frequency : as described above
          symbol : as described above
          left : pointer to left node child (default none)
          right : pointer to right node child (default none)
        """
        self.frequency = frequency
        self.symbol = symbol
        self.left = None
        self.right = None

    def __lt__(self, other):
        """Redefine < for node to be based on frequency value"""
        return self.frequency < other.frequency

    def set_left(self, left):
        """Setter for left child."""
        self.left = left

    def set_right(self, right):
        """Setter for right child."""
        self.right = right

    def has_left(self):
        return self.left is not None

    def has_right(self):
        return self.right is not None

    def get_left(self):
        return self.left

    def get_right(self):
        return self.right

    def get_frequency(self):
        return self.frequency

    def has_symbol(self):
        return self.symbol is not None

    def get_symbol(self):
        return self.symbol

    def __str__(self):
        """String representation of object."""
        return f"[{self.symbol} | {self.frequency}]"

def get_huffman_roots(frequencies):
    """Creates an array of initial nodes with the symbols from a message and
    their frequencies"""

    # Initialize the return array
    forest = list()

    # Consider every ASCII code with non-zero frequency
    for ascii_code in range(len(frequencies)):
        if frequencies[ascii_code] > 0:

            # create a node for this symbol and its frequency then add it
            # to the return array
            new_node = Node(frequencies[ascii_code], chr(ascii_code))
            forest.append(new_node)
    # Done
    return forest

def get_frequencies(message):
    """Obtain symbol frequencies from a message given as a string. This method
    parses the string, one character at a time and measures how many time the
    character appears in the message. The method returns an array with the
    frequency counts for each ASCII code present in the message."""

    # Initialize an array for counts of every ASCII code. This may be an
    # overkill. Not all 256 ASCII codes are used in strings. However it
    # guarantees O(1) operations when storing or recalling the frequency count
    # of each symbol in the message.
    NUMBER_OF_SYMBOLS = 256
    freq = [0] * NUMBER_OF_SYMBOLS

    # Parse the message one character at a time.
    if message is not None and len(message) > 0:
        for character in message:
            freq[ord(character)] += 1
    # Done
    return freq

def get_smallest_root(forest):
    """Find the smallest node in a forest, remove it from the forest, and
    return it. The forest is represented by an array of nodes. Nodes are
    compared based on their frequency values."""

    # Assume first element in array is the smallest one
    smallest_index = 0
    smallest_root = forest[smallest_index]

    # Traverse rest of array seeking a node that is smaller than the
    # smallest one.
    for i in range(1,len(forest)):
        if forest[i] < smallest_root: # Node.__lt__ takes care of things here.
            smallest_root = forest[i]
            smallest_index = i
    # Done
    return forest.pop(smallest_index)

def get_huffman_codes(node, path, table):
    """Builds a lookup table for symbols and their Huffman codes by traversing
    a Huffman tree. The method traverses the tree recursively tracking the path
    towards each leaf node with LEFT and RIGHT information. When a leaf node is
    found, the path is saved together with the symbol in the node."""
    LEFT = '0'; RIGHT = '1'
    if node.has_symbol():
        # Base case; node has a symbol. Add symbol and its path to lookup table.
        table[node.get_symbol()] = path
    else:
        # Continue traversing the tree by following the left and right children
        # of the present node, updating the path with the direction information.
        get_huffman_codes(node.left, path+LEFT, table)
        get_huffman_codes(node.right, path+RIGHT, table)
    # Done
    return table

def huffman(message):
    """Encodes a message using Huffman codes."""

    # Basic guard statements
    if message is not None and len(message) > 0:

        # Obtain the frequencies of symbols in this message
        message_frequencies = get_frequencies(message)

        # Initialize the forest of the symbol nodes
        forest = get_huffman_roots(message_frequencies)

        # Keep removing smallest two nodes from forest and combine them, until
        # there is only one node left.
        while len(forest) > 1:
            # Remove two nodes with lowest frequencies
            t1 = get_smallest_root(forest)
            t2 = get_smallest_root(forest)
            # Use them to create a new node with their combined frequency and
            # no symbol. The removed nodes become the new nodes left and right
            # children.
            new_node = Node(t1.frequency + t2.frequency)
            new_node.set_left(t1)
            new_node.set_right(t2)
            # Add the new node to the forest
            forest.append(new_node)

        # Done. The loop ends when there is only one node in the forest array.
        # That's the root of the huffman tree.
        return forest[0]

def encode(message):
  """Compresses a message with Huffman codes based on the frequency of symbols
  in the message."""
  # Initialize output string
  encoded = ""
  # Obtain the Huffman tree for this message
  huffman_tree_root = huffman(message)
  # Build a lookup table of symbols and Huffman codes based on the tree
  codes = get_huffman_codes(huffman_tree_root, "", dict())
  # Encode the message by looking up the Huffman code of each character in it.
  for character in message:
    encoded += codes[character]
  # Done
  return encoded, huffman_tree_root, codes

def report_compression_efficiency(raw, encoded):
    return 100*(1-len(encoded)/(8*len(raw))) if len(raw) > 0 else 0

def decode(encoded_message, huffman_tree_root, left='0', right='1'):
    """Decodes a compressed message based on a provided Huffman tree."""
    # Initialize decoded string
    decoded = ""
    # Start from the root of the tree
    current = huffman_tree_root
    # Parse the encoded message one character at a time
    for char in encoded_message:
        # If character points left
        if char == left:
            # Move to the left and down the tree
            current = current.get_left()
        else:
            # Otherwise move to the right and down the tree
            current = current.get_right()
        # Check if tree node is leaf node (has a symbol)
        if current.has_symbol():
            # Add symbol to decoded message
            decoded += current.get_symbol()
            # Reset to the top of the tree
            current = huffman_tree_root
    # Done
    return decoded

"""
the forest is a list, so use a min heap instead of a list

to encode the message
1. get huffamn roots, use min heap instead of forest
2. get smallest root by pop() min heap since the first element of a min heap is
the smallest

"""
def efficient_get_huffman_roots(frequencies):
  #instead of initializing a list initialize a min heap
  min_heap = Max_Heap()
  min_heap.min_heapify_up()

  # Consider every ASCII code with non-zero frequency
  for ascii_code in range(len(frequencies)):
    if frequencies[ascii_code] > 0:

      # create a node for this symbol and its frequency then add it
      # to the return array
     new_node = Node(frequencies[ascii_code], chr(ascii_code))
     min_heap.insert_min(new_node)
     # Done
  return min_heap

def efficient_get_smallest_root(min_heap):
  return min_heap.pop()

def efficient_huffman(message):
    """Encodes a message using Huffman codes."""

    # Basic guard statements
    if message is not None and len(message) > 0:

        # Obtain the frequencies of symbols in this message
        message_frequencies = get_frequencies(message)

        # Initialize the forest of the symbol nodes
        #use method that uses min heap
        forest = efficient_get_huffman_roots(message_frequencies)

        # Keep removing smallest two nodes from forest and combine them, until
        # there is only one node left.
        while forest.size() > 1:
            # Remove two nodes with lowest frequencies
            #use method that uses min heap
            t1 = efficient_get_smallest_root(forest)
            t2 = efficient_get_smallest_root(forest)
            # Use them to create a new node with their combined frequency and
            # no symbol. The removed nodes become the new nodes left and right
            # children.
            new_node = Node(t1.frequency + t2.frequency)
            new_node.set_left(t1)
            new_node.set_right(t2)
            # Add the new node to the forest
            forest.insert_min(new_node)

        # Done. The loop ends when there is only one node in the forest array.
        # That's the root of the huffman tree.
        return forest.pop()

def efficient_encode(message):
  """Compresses a message with Huffman codes based on the frequency of symbols
  in the message using min heap"""
  # Initialize output string
  encoded = ""
  # Obtain the Huffman tree for this message using method with min heap
  huffman_tree_root = efficient_huffman(message)
  # Build a lookup table of symbols and Huffman codes based on the tree
  codes = get_huffman_codes(huffman_tree_root, "", dict())
  # Encode the message by looking up the Huffman code of each character in it.
  for character in message:
    encoded += codes[character]
  # Done
  return encoded, huffman_tree_root, codes


#testing min heap
max_heap = Max_Heap()
max_heap.insert(0)
max_heap.insert(1)
max_heap.insert(2)
max_heap.insert(3)

for i in range(max_heap.size()):
  print(max_heap.pop())
print()

min_heap = Max_Heap()
min_heap.min_heapify_up()
min_heap.insert_min(0)
min_heap.insert_min(1)
min_heap.insert_min(2)
min_heap.insert_min(3)

for i in range(min_heap.size()):
  print(min_heap.pop())

#testing huffman
messages = [
    "A",
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
    "HI",
    "HELLO WORLD",
    "Now is the winter of our discontent made glorious by this son of York",
    "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness"
]
for message in messages:
    encoded_message, huffman_tree_root, codes = efficient_encode(message)
    decoded_message = decode(encoded_message, huffman_tree_root)
    efficiency = report_compression_efficiency(message, encoded_message)
    print(f'\n                Encode: {message}')
    print(f'   Decode (using tree): {decoded_message}')
    print(f'         Huffman codes: {codes}')
    print(f'       Encoded message: {encoded_message}')
    print(f'Compression efficiency: {efficiency:.2f}%\n')

3
2
1
0

0
3
2
1

                Encode: A
   Decode (using tree): 
         Huffman codes: {'A': ''}
       Encoded message: 
Compression efficiency: 100.00%


                Encode: ABCDEFGHIJKLMNOPQRSTUVWXYZ
   Decode (using tree): ABCDEFGHIJKLMNOPQRSTUVWXYZ
         Huffman codes: {'B': '0', 'C': '10', 'D': '110', 'E': '1110', 'F': '11110', 'G': '111110', 'H': '1111110', 'I': '11111110', 'J': '111111110', 'K': '1111111110', 'L': '11111111110', 'M': '111111111110', 'N': '1111111111110', 'O': '11111111111110', 'P': '111111111111110', 'Q': '1111111111111110', 'R': '11111111111111110', 'S': '111111111111111110', 'T': '1111111111111111110', 'U': '11111111111111111110', 'V': '111111111111111111110', 'W': '1111111111111111111110', 'X': '11111111111111111111110', 'Y': '111111111111111111111110', 'A': '1111111111111111111111110', 'Z': '1111111111111111111111111'}
       Encoded message: 1111111111111111111111110010110111011110111110111111011111110111111110111111111011111111110111111111110

Assignment 8 Ungrading:

Although there are some differences between my code and the technical notes, my code follows the directions and excecutes the correct solutions. Starting with my get_huffman_codes method, I struggled with the recusion, but eventually I was able to find a way to get my method to work how I wanted it to. The main difference between my method and the technical notes are the parameters. My method's parameters are root, path="", codes = None, left = "0", right = "1" while the technical notes parameters are node, path, table. Although my code works, the technical notes accomplish the same task in a more concise way. My main takeaway from this method is that I need to make sure the variable I am updating through recursion does not get reset in every call. My encode method is also similar to the technical notes besides the parameters -- mine uses message, code and the notes use message. My report_efficiency_compression method takes the same parameters (encoded and raw message) as the notes, but the notes do what my method does in less lines. I think my takeaway here is to try to find where I can make my code more concise. Since I'm strill relatively new to Python, sometimes writing out unnecessary lines of code helps me comprehend what Python is doing better. Lastly, my decode method functions the same as the decode method in the technical notes, except the notes' method take mroe parameters which means the variables do not have to written again in the if statements. Overall, my code works, my code looks organized, I added comments, and testing. However, I could've made better use of the Node object.