# 4.4.3 BAC Encoder and Decoder

In [None]:
import os
import signal

# Define the timeout duration in seconds
timeout_duration = 60

# Function to run model on a file with a timeout
def run_model_on_file(model, file_path, output_len, timeout_duration=30):
    try:
        # Execute the function with a timeout
        signal.signal(signal.SIGALRM, lambda signum, frame: _handle_timeout())
        signal.alarm(timeout_duration)  # Set the timeout duration in seconds (e.g., 5 seconds)

        model.run_onfile(file_path, output_len)
    except TimeoutError:
        print(f"File: {file_path} timed out after {timeout_duration} seconds. Skipping\n-------------------------------")
    except Exception as e:
        print(f"File: {file_path} failed on output_len: {output_len}. Error: {e}")
    finally:
        signal.alarm(0)  # Disable the alarm

# Function to handle timeout
def _handle_timeout():
    raise TimeoutError()

# Loop over models and output lengths
list_models= [BAC(), BAC_fly(), BAC_fly_markov()]
for model in list_models:
    for output_len in [8, 12]:
        for root, dirs, files in os.walk("filtered_datasets/music"):
            for file in files:
                if file.endswith(".txt"):
                    file_path = os.path.join(root, file)
                    print(f"file_path: {file_path}")
                    run_model_on_file(model, file_path, output_len, timeout_duration)
                   
        for root, dirs, files in os.walk("filtered_datasets/books"):
            for file in files:
                if file.endswith(".txt"):
                    file_path = os.path.join(root, file)
                    print(f"file_path: {file_path}")
                    
                    run_model_on_file(model, file_path, output_len, timeout_duration)
                    

file_path: filtered_datasets/music/around_the_world.txt
-------------------
Running BAC Model on input file: filtered_datasets/music/around_the_world.txt of length: 2519 
-------------------
output_len: 8
Codebook length: 169
Codebook contains 169 codewords, out of a maximum of 256
Suggests an inefficiency of 0.5991205637178156 bits per block of 8 bits
Error: block length too long, block_len: 3, max_block_len: 2
File: filtered_datasets/music/around_the_world.txt failed on output_len: 8. Error: 
file_path: filtered_datasets/music/God_save_the_king.txt
-------------------
Running BAC Model on input file: filtered_datasets/music/God_save_the_king.txt of length: 327 
-------------------
output_len: 8
Codebook length: 69
Codebook contains 69 codewords, out of a maximum of 256
Suggests an inefficiency of 1.8914755432218309 bits per block of 8 bits
Input-Ouput Verification: True
num_bits_data_utf8: 2616, num_bits_code: 2216 num_bits_dictionary: 1384
Compression Ratio: 0.7266666666666667
-----

In [2]:
#import numpy as np
from scipy.special import xlogy
import pandas as pd
import matplotlib.pyplot as plt
from numpy import log2
import bitstring as bt
import numpy as np
import os

In [3]:
data = pd.read_csv('datasets/single_counts.csv', index_col=0)['Count']
transitions = pd.read_csv('datasets/transitions.csv', index_col=0)



In [4]:
probs = (data/sum(data)).sort_values()

In [5]:
output_len = 8

In [6]:
def _BAC_splitter(K, probs, probs_sorted = True):
    #Uses Boncelet's 1993 heuristic rule for splitting K messages into subsets using the probabilities
    if not probs_sorted:
        probs = probs.sort_values()

    m = len(probs)
    L = int(np.floor((K-1)/(m-1)))
    tL = L
    q=1.
    #Construct Li series, to preserve labelling in pandas
    Li= pd.Series(index = probs.index, dtype = np.int64, data = np.ones(len(probs))) 

    for i in range(len(probs)):
        tp = probs[i]/q
        Li[i] = np.int64(max(0, np.floor(tp*tL + (tp*(2-i)-1)/(m-1) +0.5)))
    return 1+Li*(m-1)


In [7]:
def BAC_codebook_builder(output_len, probs):
    #Builds the BAC codebook using Boncelet's heuristic
    #Does not give the same codebook as the on-the-fly encoding
    #Moderately slow
    
    K= 2**output_len
    working_blocks = {('',K)}
    final_codewords = set()

    probs = probs.sort_values()

    while len(working_blocks)>0:
        current_block = working_blocks.pop()
        split = _BAC_splitter(current_block[1], probs)
        # print(f"current block: {current_block}, type split: {type(split)}\nsplit:\n{split} ")    
        new_blocks = {(current_block[0]+x, split[x]) for x in split.index if split[x]>1}
        working_blocks = working_blocks.union(new_blocks)
        new_words = {current_block[0]+x for x in split.index if split[x]==1}
        # print(f"new words: {new_words}")
        final_codewords = final_codewords.union(new_words)

    final_codewords = list(final_codewords)
    return {final_codewords[x]:bt.Bits(uint=x, length =output_len) for x in range(len(final_codewords))}

In [8]:
codewords= BAC_codebook_builder(output_len, probs) #Calculate the codebook
print('Codebook contains ' + str(len(codewords)) + ' codewords, out of a maximum of ' + str(2**output_len))
print('Suggests an inefficiency of ' + str(output_len - log2(len(codewords)))+ ' bits per block of ' +str(output_len) +' bits')
codewords

Codebook contains 105 codewords, out of a maximum of 256
Suggests an inefficiency of 1.2857544823338776 bits per block of 8 bits


{'_O': Bits('0x00'),
 'EY': Bits('0x01'),
 'O': Bits('0x02'),
 'W': Bits('0x03'),
 'EV': Bits('0x04'),
 'I': Bits('0x05'),
 'C': Bits('0x06'),
 'ER': Bits('0x07'),
 'EA': Bits('0x08'),
 'G': Bits('0x09'),
 'EP': Bits('0x0a'),
 '_I': Bits('0x0b'),
 '_M': Bits('0x0c'),
 'TL': Bits('0x0d'),
 'X': Bits('0x0e'),
 '_R': Bits('0x0f'),
 'TY': Bits('0x10'),
 '_A': Bits('0x11'),
 'TG': Bits('0x12'),
 '_E': Bits('0x13'),
 '_X': Bits('0x14'),
 'TU': Bits('0x15'),
 '_Y': Bits('0x16'),
 'EN': Bits('0x17'),
 'N': Bits('0x18'),
 'R': Bits('0x19'),
 '_L': Bits('0x1a'),
 'A': Bits('0x1b'),
 '_Z': Bits('0x1c'),
 'EG': Bits('0x1d'),
 'TS': Bits('0x1e'),
 'V': Bits('0x1f'),
 'EC': Bits('0x20'),
 '_V': Bits('0x21'),
 'EU': Bits('0x22'),
 'TT': Bits('0x23'),
 'M': Bits('0x24'),
 'U': Bits('0x25'),
 'EW': Bits('0x26'),
 'Z': Bits('0x27'),
 'D': Bits('0x28'),
 'EL': Bits('0x29'),
 'TX': Bits('0x2a'),
 'EH': Bits('0x2b'),
 'T_': Bits('0x2c'),
 'EJ': Bits('0x2d'),
 'TC': Bits('0x2e'),
 '_H': Bits('0x2f'),
 'P': 

In [9]:
def BAC_encoder(message, codewords):
    # Applies the BAC code
    # Pads with spaces if necessary
    # Inefficient use of memory, but easy to follow

    # Compute max and min input codeword lengths
    min_block_len = min(len(x) for x in codewords.keys()) 
    max_block_len = max(len(x) for x in codewords.keys())

    # Set up padding variable
    pad = '_'*(max_block_len)

    # Set up working in and out-put variables
    remaining_message = message+""
    output = bt.Bits(bin='')
    
    while len(remaining_message) >0:
        # Iterate until the whole message is processed

        block_len = min_block_len
        increase_block = True
        while increase_block:
            # Iterate through increasing input message lengths

            # If the message being processed is too short, pad it
            if len(remaining_message)>=block_len:
                message_block = remaining_message[:block_len]
            else:
                message_block = (remaining_message+pad)[:block_len]

            # Check if input message is in codewords
            if message_block in codewords.keys():
                # If input is in codewords, add corresponding output and move to next section
                output = output + codewords[message_block]
                remaining_message = remaining_message[block_len:]
                increase_block=False
            else:
                block_len = block_len+1

            # If the block length has gotten too long, then throw an error 
            try:   
                assert(block_len<=max_block_len)
            except:
                print(f"Error: block length too long, block_len: {block_len}, max_block_len: {max_block_len}")
                assert(block_len<=max_block_len)
    return(output)

In [10]:
def BAC_decoder(coded_message, codewords):
    # Decodes a message encoded using BAC
    # Similar to Huffman decoder, but fixed-length coded messages makes this easier
    
    # Construct reverse dictionary
    decode_dict = {codewords[x]:x for x in codewords}
    
    # Compute the size of a block from an example codeword
    block_len = len(codewords[sorted(codewords.keys())[0]])
    
    # If the coded message doesn't break into blocks, then throw an error
    assert(len(coded_message)% block_len == 0)

    # Decode each block and output
    output = [decode_dict[coded_message[(block_len*i):(block_len*(i+1))]] for i in range(len(coded_message)//block_len)]
    return ''.join(output)


In [11]:
message = 'THE_RAIN_IN_SPAIN_FALLS_MAINLY_ON_THE_PLAIN'

In [12]:
coded_message = BAC_encoder(message, codewords)
print(coded_message.bin)
print(BAC_decoder(coded_message, codewords))

01011011010111110001100100011011000001010001100000001011000110000110000000110000000110110000010100011000001101000001101101000101010001010011001000001100000110110000010100011000010001010011101100000000000110000100110001100010010111110011000001000101000110110000010100011000
THE_RAIN_IN_SPAIN_FALLS_MAINLY_ON_THE_PLAIN


In [13]:

def BAC_encoder_fly(message, output_len, probs, debug= False):
    # Implements Boncelet's BAC encoder in an on-the-fly manner (no codebook saved)
    # Computes a number corresponding to each input message, then sends this number in binary

    # Compute total number of outputs and sorts probabilities
    K= 2**output_len
    probs = probs.sort_values()
    if debug:
        #print all parameters
        print(f"message: {message}, output_len: {output_len}, probs: {probs}, K: {K}")

    output=bt.Bits()
    while len(message)>0:
        # Iterate through the input message

        # Initialize variables 
        block_len = 1    # Size of message block
        cur_K = K        # Number of potential outputs for this block
        cur_sum_K = 0    # Number of outputs excluded before current iteration
        increase_block = True
        while increase_block:
            # Iterate through increasing block sizes
            # print(f"block_len: {block_len}, cur_K: {cur_K}, cur_sum_K: {cur_sum_K}")

            # If message is too short, then pad with '_'
            if len(message)>=block_len:
                cur_mess = message[:block_len]
            else:
                cur_mess = message + '_'*(block_len-len(message))

            # Split the current set of potential outputs using the heuristic
            split = _BAC_splitter(cur_K, probs)

            # Compute the number of message possibilities for this block
            # print(f"cur_mess: {cur_mess}, split: {split}")
            cur_K = split[cur_mess[-1]]

            # Compute how many messages there are above the current block's group
            # This is needed so that we can keep track of the position of our final output message
            cur_sum_K = cur_sum_K+split.cumsum().shift(fill_value=0)[cur_mess[-1]]

            # If there is only one possible message for the current block, 
            # compute the number of this message (accounting for previous messages, output it 
            # and move to the next block. 
            if split[cur_mess[-1]]==1:
                output = output+bt.Bits(uint = cur_sum_K + cur_K-1, length = output_len)
                if len(message)>block_len:
                    message = message[block_len:] # Remove processed block from message
                else:
                    message = '' # If our block was longer than the message (so we padded it), just make the message empty
                increase_block=False
            else:
                block_len = block_len+1

    return output
    


In [14]:
def _BAC_decode_single(coded_message, probs):
    # Uses the BAC algorithm, in on-the-fly mode, to decode a single message block

    # Set up variables
    cur_K = 2**len(coded_message)    # Total number of possible messages
    cur_message = coded_message.uint # Coded message expressed as an integer
    cur_output=''       # Current partial output string
    cur_output_loc = 0  # Current partial output value

    while cur_output_loc+1<cur_message:
        # Iterate until output value agrees with coded message value

        # Calculate the BAC heuristic split of messages
        split = _BAC_splitter(cur_K, probs)
        split_sum = split.cumsum()

        # Compute the next character of the message, by comparing with the values assigned to different partial strings
        cur_char = split_sum[(split_sum>cur_message-cur_output_loc) 
                             &(split_sum.shift(fill_value=0)<=cur_message-cur_output_loc )].index[0]
        
        # Add next character to output
        cur_output = cur_output + cur_char

        # Compute number of messages following current partial string
        cur_K = split[cur_char]

        # Compute the value of the current partial string
        cur_output_loc = cur_output_loc + split_sum.shift(fill_value=0)[cur_char]
    return cur_output

In [15]:
def BAC_decoder_fly(coded_message, output_len, probs):
    # Decodes a message encoded using the BAC algorithm, in on-the-fly mode (no codebook stored)

    # Check probabilities are sorted
    probs = probs.sort_values()

    # Check coded_message can be cleanly split into fixed-length codeblocks, if not throw an error
    assert(len(coded_message) % output_len ==0)

    # Compute the decoding of each codeblock
    output = [_BAC_decode_single(coded_message[i*output_len:(i+1)*output_len], probs) for i in range(len(coded_message)//output_len)]

    # Concatenate decoded values and output
    return ''.join(output)
        
    


In [16]:
coded_message_fly = BAC_encoder_fly(message, output_len, probs)
print(coded_message_fly.bin)

00101010010011010001000100010111000101000001010101100010000101010110000100000111000101110001010000010101010110000001011100001111000011110001001101011011000101110001010000010101000011110000100101100100000101010110011000010010010011010000011100001111000101110001010000010101


In [17]:
BAC_decoder_fly(coded_message_fly, output_len, probs)

'THE_RAIN_IN_SPAIN_FALLS_MAINLY_ON_THE_PLAIN'

# BAC runner class 

In [18]:
class BAC:
    def __init__(self):
        self.output_len = None
        self.probs = None
        self.K = None
        self.codewords = {}
        self.message = ''
        self.coded_message= bt.Bits()
        
    def run(self, message, output_len, probs= None, debug= False, file_name = None):
        #runs the BAC model 
        #returns the compression ratio
        #Inputs: filename: string of the file to be encoded 
        #        debug [True, False, "Verbose"]: boolean to print debug statements
        #Prints: Input-Ouput Verification: True if the input and output are the same, False otherwise
        #        Compression ratio: the compression ratio of the input and output
        #        code: the encoded data
        #        encoder_codebook: the codebook of the encoder
        #        decoder_codebook: the codebook of the decoder
        if file_name is not None and len(file_name)>0:
            print(f"-------------------\nRunning BAC Model on input file: {file_name} of length: {len(message)} \n-------------------")
        
        else: 
            print(f"---------------------\nRunning BAC Model on data of length {len(message)}\n---------------------")
        #print params all in one line
        print(f"output_len: {output_len}")
        if probs is not None:
            self.probs = probs.sort_values()
        else: 
            self.probs = self.get_probs(message)
        self.output_len = output_len
        self.K = 2**output_len
        self.message = message
        self.codewords = BAC_codebook_builder(self.output_len, self.probs) #build codebook
        print(f"Codebook length: {len(self.codewords)}")
        # print(f"Codebook: {self.codewords}")
        print('Codebook contains ' + str(len(self.codewords)) + ' codewords, out of a maximum of ' + str(2**output_len))
        print('Suggests an inefficiency of ' + str(output_len - log2(len(self.codewords)))+ ' bits per block of ' +str(output_len) +' bits')
        self.coded_message = BAC_encoder(self.message, self.codewords)
        try:
            if debug.lower() == "verbose":
                print(f"Encoding: {self.coded_message.bin}")
                print(f"Codebook: {self.codewords}")
        except:
            if debug:
                print(f"Encoding: {self.coded_message.bin}")
        self.decoded_message = BAC_decoder(self.coded_message, self.codewords)
        if debug:
            print(f"Decoding: {self.decoded_message}")
            
        print(f"Input-Ouput Verification: {self.decoded_message == message}") 
        print(f"Compression Ratio: {self.get_compression_ratio()}")
        print(f"---------------------\nDone\n---------------------")
        
    def get_compression_ratio(self):
        num_bits_data_utf8 = len(self.message.encode('utf-8'))*8
        num_bits_dictionary = sum([len(self.codewords[key]) + len(key.encode('utf-8'))*8 for key in self.codewords]) 
        #value is in binary, key is in utf-8, we add key + value bits for each key.
        num_bits_code = len(self.coded_message) #code is already in bits.
        print(f"num_bits_data_utf8: {num_bits_data_utf8}, num_bits_code: {num_bits_code} num_bits_dictionary: {num_bits_dictionary}")
        compression_ratio = num_bits_data_utf8/ (num_bits_code+ num_bits_dictionary) 
        return compression_ratio
    
    def run_onfile(self, file_name, output_len, probs=None, debug= False):
        with open(file_name, 'r') as f:
            data = f.read()
        self.run(data, output_len, probs, debug, file_name)
        
    def get_probs(self, message):
        #returns the probability of each character in the message
        probs = pd.Series(list(message)).value_counts()/len(message)
        probs = probs.sort_values()
        return probs

        

### Testing BAC

In [19]:
message = 'THE_RAIN_IN_SPAIN_FALLS_MAINLY_ON_THE_PLAIN'

bac = BAC()
bac.run(message, output_len, probs)

---------------------
Running BAC Model on data of length 43
---------------------
output_len: 8
Codebook length: 105
Codebook contains 105 codewords, out of a maximum of 256
Suggests an inefficiency of 1.2857544823338776 bits per block of 8 bits
Input-Ouput Verification: True
num_bits_data_utf8: 344, num_bits_code: 272 num_bits_dictionary: 2328
Compression Ratio: 0.13230769230769232
---------------------
Done
---------------------


## BAC Encoder Decoder Fly

In [20]:
class BAC_fly:
    def __init__(self):
        self.output_len = None
        self.probs = None
        self.K = None
        self.message = ''
        self.coded_message= bt.Bits()
        
    def run(self, message, output_len, probs= None, debug= False, file_name= None):
        #runs the BAC model 
        #returns the compression ratio
        #Inputs: filename: string of the file to be encoded 
        #        debug [True, False, "Verbose"]: boolean to print debug statements
        #Prints: Input-Ouput Verification: True if the input and output are the same, False otherwise
        #        Compression ratio: the compression ratio of the input and output
        #        code: the encoded data
        #        encoder_codebook: the codebook of the encoder
        #        decoder_codebook: the codebook of the decoder
        
        if file_name is not None:
            print(f"-------------------\nRunning BAC Fly Model on input file: {file_name} of length: {len(message)}\n-------------------")
        
        else: print(f"---------------------\nRunning BAC Fly model on data of length {len(message)}\n---------------------")
        #print params all in one line
        print(f"output_len: {output_len}")
        if probs is not None:
            self.probs = probs.sort_values()
        else: 
            self.probs = self.get_probs(message)
        self.output_len = output_len
        self.K = 2**output_len
        self.message = message
        self.coded_message = BAC_encoder_fly(self.message, self.output_len, self.probs, debug)
        
        if debug:
            print(f"Encoding: {self.coded_message.bin}")
        self.decoded_message = BAC_decoder_fly(self.coded_message, self.output_len, self.probs)
        if debug:
            print(f"Decoding: {self.decoded_message}")
            
        #verification to make sure the input and output are the same
        #account for padding at the end of decoded message
        print(f"Input-Ouput Verification: {self.decoded_message == message}")
        if self.decoded_message != message:
            #save output to diff/BAC_fly_{file_name}_decoded.txt
            #create diff folder if it doesn't exist and file too
            if not os.path.exists('diff'):
                os.makedirs('diff')
            with open(f"diff/{file_name}", 'w') as f:
                f.write(self.decoded_message)
                      
        print(f"Compression Ratio: {self.get_compression_ratio()}")
        print(f"---------------------\nDone\n---------------------")
        
    def get_compression_ratio(self):
        num_bits_data_utf8 = len(self.message.encode('utf-8'))*8
        num_bits_code = len(self.coded_message) #code is already in bits.
        num_bits_output_len = len(bt.Bits(uint=self.output_len, length = 8))
        num_bits_probs = len(bt.Bits(bytes = self.probs.to_csv().encode('utf-8')))
        print(f"num_bits_data_utf8: {num_bits_data_utf8}, num_bits_code: {num_bits_code} num_bits_output_len: {num_bits_output_len} num_bits_probs: {num_bits_probs}")
        compression_ratio = num_bits_data_utf8 /(num_bits_code + num_bits_output_len + num_bits_probs) 
        return compression_ratio
    
    def run_onfile(self, file_name, output_len, probs=None, debug= False):
        with open(file_name, 'r') as f:
            data = f.read()
            #drop new line characters:
            data = data.replace('\n', '')
        self.run(data, output_len, probs, debug= debug, file_name = file_name)
        
    def get_probs(self, message):
        #returns the probability of each character in the message
        probs = pd.Series(list(message)).value_counts()/len(message)
        probs = probs.sort_values()
        return probs

        

In [21]:
message = 'THE_RAIN_IN_SPAIN_FALLS_MAINLY_ON_THE_PLAIN'
bac_fly = BAC_fly()
bac_fly.run(message, output_len, probs)

---------------------
Running BAC Fly model on data of length 43
---------------------
output_len: 8
Input-Ouput Verification: True
num_bits_data_utf8: 344, num_bits_code: 272 num_bits_output_len: 8 num_bits_probs: 4936
Compression Ratio: 0.06595092024539877
---------------------
Done
---------------------


## BAC Encoder Decoder Fly Markov
Instead of fixed transition probabilities we use markov probabilities given by the transition matrix.

### Define helper functions

In [22]:

def BAC_encoder_fly_markov(message, output_len, probs, transitions= None, debug= False):
    # Implements Boncelet's BAC encoder in an on-the-fly manner (no codebook saved)
    # Computes a number corresponding to each input message, then sends this number in binary

    # Compute total number of outputs and sorts probabilities
    K= 2**output_len
    probs = probs.sort_values()

    output=bt.Bits()
    while len(message)>0:
        # Iterate through the input message

        # Initialize variables 
        cur_K = K        # Number of potential outputs for this block
        cur_sum_K = 0    # Number of outputs excluded before current iteration
        block_len = 1
        increase_block = True
        while increase_block:
            # Iterate through increasing block sizes
            # If message is too short, then pad with '_'
            if len(message)>=block_len:
                cur_mess = message[:block_len]
            else:
                cur_mess = message + '_'*(block_len-len(message))
            if block_len == 1 or transitions is None:
                cur_probs = probs.sort_values()
            else:
                # print(f"using transitions for {cur_mess[-1]}")
                transitions.loc[cur_mess[-1]] if cur_mess[-1] in transitions.index else probs
            # Split the current set of potential outputs using the heuristic
            split = _BAC_splitter(cur_K, cur_probs)
            
    
            # Compute the number of message possibilities for this block
            cur_K = split[cur_mess[-1]]

            # Compute how many messages there are above the current block's group
            # This is needed so that we can keep track of the position of our final output message
            cur_sum_K = cur_sum_K+split.cumsum().shift(fill_value=0)[cur_mess[-1]]

            # If there is only one possible message for the current block, 
            # compute the number of this message (accounting for previous messages, output it 
            # and move to the next block. 
            if split[cur_mess[-1]]==1:
                output = output+bt.Bits(uint = cur_sum_K + cur_K-1, length = output_len)
                if len(message)>block_len:
                    message = message[block_len:] # Remove processed block from message
                else:
                    message = '' # If our block was longer than the message (so we padded it), just make the message empty
                increase_block=False
            else:
                block_len = block_len+1
    return output
    


In [23]:
def _BAC_decode_single_markov(coded_message, probs, transitions= None):
    # Uses the BAC algorithm, in on-the-fly mode, to decode a single message block

    # Set up variables
    cur_K = 2**len(coded_message)    # Total number of possible messages
    cur_message = coded_message.uint # Coded message expressed as an integer
    cur_output=''       # Current partial output string
    cur_output_loc = 0  # Current partial output value
    j = 0

    while cur_output_loc+1<cur_message:
        # Iterate until output value agrees with coded message value
        if j == 0 or transitions is None:
            cur_probs = probs
        else:
            cur_probs = transitions.loc[cur_output[-1]] if cur_output[-1] in transitions.index else probs
        # Calculate the BAC heuristic split of messages
        split = _BAC_splitter(cur_K, cur_probs)
        split_sum = split.cumsum()

        # Compute the next character of the message, by comparing with the values assigned to different partial strings
        cur_char = split_sum[(split_sum>cur_message-cur_output_loc) 
                             &(split_sum.shift(fill_value=0)<=cur_message-cur_output_loc )].index[0]
        
        # Add next character to output
        cur_output = cur_output + cur_char

        # Compute number of messages following current partial string
        cur_K = split[cur_char]

        # Compute the value of the current partial string
        cur_output_loc = cur_output_loc + split_sum.shift(fill_value=0)[cur_char]
    return cur_output

In [24]:
def BAC_decoder_fly_markov(coded_message, output_len, probs, transitions = None):
    # Decodes a message encoded using the BAC algorithm, in on-the-fly mode (no codebook stored)

    # Check probabilities are sorted
    probs = probs.sort_values()

    # Check coded_message can be cleanly split into fixed-length codeblocks, if not throw an error
    assert(len(coded_message) % output_len ==0)

    # Compute the decoding of each codeblock
    output = [_BAC_decode_single_markov(coded_message[i*output_len:(i+1)*output_len], probs, transitions) for i in range(len(coded_message)//output_len)]

    # Concatenate decoded values and output
    return ''.join(output)
        
    


## BAC Fly Markov

In [25]:
from regex import F


class BAC_fly_markov:
    def __init__(self):
        self.output_len = None
        self.probs = None
        self.K = None
        self.codewords = {}
        self.message = ''
        self.coded_message= bt.Bits()
        self.transitions = None
        
    def run(self, message, output_len, probs, transitions =None , debug= False, file_name=None):
        #runs the BAC fly markov model 
        #returns the compression ratio
        #Inputs: filename: string of the file to be encoded 
        #        debug [True, False, "Verbose"]: boolean to print debug statements
        #Prints: Input-Ouput Verification: True if the input and output are the same, False otherwise
        #        Compression ratio: the compression ratio of the input and output
        #        code: the encoded data
        #        encoder_codebook: the codebook of the encoder
        #        decoder_codebook: the codebook of the decoder
        
        if file_name is not None:
            print(f"-------------------\nRunning BAC Fly Markov Model on input file: {file_name} of length: {len(message)}\n-------------------")
        
        else: print(f"---------------------\nRunning BAC Fly Markov model on data of length {len(message)}\n---------------------")
        print(f"output_len: {output_len}")
        #print params all in one line
        if probs is not None:
            self.probs = probs.sort_values()
        else: 
            self.probs = self.get_probs(message)
        if transitions is not None:
            transitions = transitions.apply(lambda row: row.sort_values(), axis=1)
            self.transitions =  transitions.transpose()
        else:
            self.transitions = self.get_transition(message)
            
           
        self.output_len = output_len
        self.K = 2**output_len
        self.message = message
        self.coded_message = BAC_encoder_fly_markov(self.message, self.output_len, self.probs, self.transitions, debug)
        
        if debug:
            print(f"Encoding: {self.coded_message.bin}")
            
        self.decoded_message = BAC_decoder_fly_markov(self.coded_message, self.output_len, self.probs, self.transitions)
        if debug:
            print(f"Input: {self.message}")
            print(f"Decoding: {self.decoded_message}")
            
        print(f"Input-Ouput Verification: {self.decoded_message == message}")
        if self.decoded_message != message:
            #save output to diff/BAC_fly_markov_{file_name}_decoded.txt
            #create diff folder if it doesn't exist and file too

            if not os.path.exists('diff'):
                os.makedirs('diff')
            with open(f"diff/{file_name}", 'w') as f:
                f.write(self.decoded_message)
            

        print(f"Compression Ratio: {self.get_compression_ratio()}")
        print(f"---------------------\nDone\n---------------------")
        
    def get_compression_ratio(self):
        num_bits_data_utf8 = len(self.message.encode('utf-8'))*8
        num_bits_code = len(self.coded_message) #code is already in bits.
        num_bits_output_len = len(bt.Bits(uint=self.output_len, length = 8))
        num_bits_probs = len(bt.Bits(bytes = self.probs.to_csv().encode('utf-8')))
        num_bits_transition =  0 if self.transitions is None else len(bt.Bits(bytes = self.transitions.to_csv().encode('utf-8')))
        print(f"num_bits_data_utf8: {num_bits_data_utf8}, num_bits_code: {num_bits_code} num_bits_output_len: {num_bits_output_len} num_bits_probs: {num_bits_probs} num_bits_transition: {num_bits_transition}")
        compression_ratio = num_bits_data_utf8 /(num_bits_code + )
        return compression_ratio
    
    def run_onfile(self, file_name, output_len, probs=None, debug= False):
        with open(file_name, 'r') as f:
            data = f.read()
            data = data.replace('\n', '')
        self.run(data, output_len, probs, debug= debug, file_name = file_name)
        
    def get_probs(self, message):
        #returns the probability of each character in the message
        probs = pd.Series(list(message)).value_counts()/len(message)
        probs = probs.sort_values()
        return probs
    
    def get_transition(self, message):
        #returns the probability of each character following another character from 0 to 255 in ascii
        #returns a dataframe of the transition probabilities
        # Input: message: string of the message 
        # Output: dataframe of the transition probabilities

        #initialize the transition matrix
        transition = pd.DataFrame(index = range(0,256), columns = range(0,256))
        transition = transition.fillna(0)
        #iterate through the message
        for i in range(len(message)-1):
            #get the current and next character
            cur_char = ord(message[i])
            next_char = ord(message[i+1])
            #increment the transition matrix
            transition.loc[cur_char, next_char] += 1 # row: next_char, col: cur_char
        #normalize the transition matrix over current characters
        transition = transition.div(transition.sum(axis=0), axis=1)
        
        return transition.transpose() #t(next, cur)
        

        

In [26]:
message = 'THE_RAIN_IN_SPAIN_FALLS_MAINLY_ON_THE_PLAIN'
output_len = 8
bac_fly_markov = BAC_fly_markov()
bac_fly_markov.run(message, output_len, probs, transitions= transitions, debug= False)

---------------------
Running BAC Fly Markov model on data of length 43
---------------------
output_len: 8
Input-Ouput Verification: True
num_bits_data_utf8: 344, num_bits_code: 272 num_bits_output_len: 8 num_bits_probs: 4936 num_bits_transition: 115776
Compression Ratio: 0.0028431631843427664
---------------------
Done
---------------------


In [27]:
import os
import signal

# Define the timeout duration in seconds
timeout_duration = 60

# Function to run model on a file with a timeout
def run_model_on_file(model, file_path, output_len, timeout_duration=30):
    try:
        # Execute the function with a timeout
        signal.signal(signal.SIGALRM, lambda signum, frame: _handle_timeout())
        signal.alarm(timeout_duration)  # Set the timeout duration in seconds (e.g., 5 seconds)

        model.run_onfile(file_path, output_len)
    except TimeoutError:
        print(f"File: {file_path} timed out after {timeout_duration} seconds. Skipping\n-------------------------------")
    except Exception as e:
        print(f"File: {file_path} failed on output_len: {output_len}. Error: {e}")
    finally:
        signal.alarm(0)  # Disable the alarm

# Function to handle timeout
def _handle_timeout():
    raise TimeoutError()

# Loop over models and output lengths
list_models= [BAC(), BAC_fly(), BAC_fly_markov()]
for model in list_models:
    for output_len in [8, 12]:
        for root, dirs, files in os.walk("filtered_datasets/music"):
            for file in files:
                if file.endswith(".txt"):
                    file_path = os.path.join(root, file)
                    print(f"file_path: {file_path}")
                    run_model_on_file(model, file_path, output_len, timeout_duration)
                   
        for root, dirs, files in os.walk("filtered_datasets/books"):
            for file in files:
                if file.endswith(".txt"):
                    file_path = os.path.join(root, file)
                    print(f"file_path: {file_path}")
                    
                    run_model_on_file(model, file_path, output_len, timeout_duration)
                    

file_path: filtered_datasets/music/around_the_world.txt
-------------------
Running BAC Model on input file: filtered_datasets/music/around_the_world.txt of length: 2519 
-------------------
output_len: 8
Codebook length: 169
Codebook contains 169 codewords, out of a maximum of 256
Suggests an inefficiency of 0.5991205637178156 bits per block of 8 bits
Error: block length too long, block_len: 3, max_block_len: 2
File: filtered_datasets/music/around_the_world.txt failed on output_len: 8. Error: 
file_path: filtered_datasets/music/God_save_the_king.txt
-------------------
Running BAC Model on input file: filtered_datasets/music/God_save_the_king.txt of length: 327 
-------------------
output_len: 8
Codebook length: 69
Codebook contains 69 codewords, out of a maximum of 256
Suggests an inefficiency of 1.8914755432218309 bits per block of 8 bits
Input-Ouput Verification: True
num_bits_data_utf8: 2616, num_bits_code: 2216 num_bits_dictionary: 1384
Compression Ratio: 0.7266666666666667
-----

## Individual Tests:

In [27]:
bac = BAC()
bac.run_onfile("filtered_datasets/music/around_the_world.txt", 8)

-------------------
Running BAC Model on input file: filtered_datasets/music/around_the_world.txt of length: 2519 
-------------------
output_len: 8
Codebook length: 169
Codebook contains 169 codewords, out of a maximum of 256
Suggests an inefficiency of 0.5991205637178156 bits per block of 8 bits
Error: block length too long, block_len: 3, max_block_len: 2


AssertionError: 

In [None]:

bac_fly = BAC_fly()
bac_fly.run_onfile("filtered_datasets/music/around_the_world.txt", 16, debug= True)

-------------------
Running BAC Fly Model on input file: filtered_datasets/music/around_the_world.txt of length: 2448
-------------------
output_len: 16
message: Around the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, around the worldAround the world, arou

In [None]:
bac_fly_markov = BAC_fly_markov()
bac_fly_markov.run_onfile("filtered_datasets/music/around_the_world.txt", 16, debug= True)

-------------------
Running BAC Fly Markov Model on input file: filtered_datasets/music/around_the_world.txt of length: 2448
-------------------
output_len: 16
Encoding: 00000100010111000011001000100111010010010100001010101110100110010000111000011110101000001101011111100010100000011110100111000010011110111101001110100000110101111110001010000001111010011100001001111011111001000001001101100110001100100010011101001001010000101010111010011001000001000101110000110010001001110100100101000010101011101001100100001110000111101010000011010111111000101000000111101001110000100111101111010011101000001101011111100010100000011110100111000010011110111110010000010011011001100011001000100111010010010100001010101110100110010000010001011100001100100010011101001001010000101010111010011001000011100001111010100000110101111110001010000001111010011100001001111011110100111010000011010111111000101000000111101001110000100111101111100100000100110110011000110010001001110100100101000010101011101001100100000100010111

In [None]:
bac_fly_markov = BAC_fly_markov()
bac_fly_markov.run_onfile("filtered_datasets/music/God_save_the_king.txt", 22, debug= True)

-------------------
Running BAC Fly Markov Model on input file: filtered_datasets/music/God_save_the_king.txt of length: 313
-------------------
output_len: 22
Encoding: 00011101010000101000011000010000101001111000101100010011010001000001100000110001010011011011000101001011110111101001010011111011011000001110110110010010101101110001010100001101111100110110100101110110010110011000000101011101001111110110111110011100110000010110001001001001101110010110001101010010001101000100110111001111111111111100001010101001000011001001000001010000000111000100011010111100011100100111111111010011001010011101000110100111000000111001100110010100010100110010000000011001000001010010100001101101000010110101010110000000111111110000101001001101111000001011111001110011101110001010011010101000001011110110010111000000110001000111010100001010000110000100001010011110000100110110011000111011101001010011111011100100000111111001010111001101110110000010111101001010100101100010001011101010100100111011010100111101001101