# Functions

In [24]:
import zlib
import lzma
import bz2
import gzip

# Read a UTF-16-LE encoded text file and return its contents as a str
def open_txt(path="/content/test_file.txt"):
    """Read a UTF-16-LE text file and return its decoded contents.

    Args:
        path: Location of the file to read. Defaults to
            "/content/test_file.txt", the location this function
            previously had hard-coded, so existing ``open_txt()`` calls
            behave exactly as before.

    Returns:
        The whole file decoded to a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the bytes are not valid UTF-16-LE
            (decoding uses ``errors="strict"``).
    """
    # The file is read in binary mode and decoded explicitly, so the bytes
    # reach the codec untouched (no newline translation by the text layer).
    with open(path, "rb") as f:
        return f.read().decode("utf-16-le", errors="strict")

# Turn the text into UTF-16-LE bytes (the form that gets compressed)
def encodes(text):
    """Encode ``text`` to UTF-16-LE bytes.

    Args:
        text: The ``str`` to encode.

    Returns:
        The UTF-16-LE representation of ``text`` as ``bytes``.

    Raises:
        UnicodeEncodeError: If ``text`` cannot be encoded; the ``strict``
            error handler is used, so nothing is replaced or dropped.
    """
    utf16_bytes = text.encode("utf-16-le", "strict")
    return utf16_bytes

# Turn UTF-16-LE bytes back into a Python string
def decodes(text):
    """Decode UTF-16-LE bytes to a ``str``.

    Args:
        text: Despite the parameter name, this is the bytes-like object
            to decode (``.decode`` is called on it).

    Returns:
        The decoded ``str``.

    Raises:
        UnicodeDecodeError: If the data is not valid UTF-16-LE; the
            ``strict`` error handler is used.
    """
    decoded = text.decode("utf-16-le", "strict")
    return decoded

# Fragment the data into fixed-size blocks (32 bytes by default)
def frament_the_text(text, block_size=32):
    """Split ``text`` into consecutive slices of at most ``block_size`` items.

    Args:
        text: Any sliceable sequence with a length; the notebook passes
            ``bytes``, so each slice is one packet payload.
        block_size: Maximum length of each slice. Defaults to 32, the
            value that used to be hard-coded.

    Returns:
        A list of slices in original order. Every slice has length
        ``block_size`` except possibly the last, which holds the
        remainder. An empty input yields an empty list.

    Raises:
        ValueError: If ``block_size`` is not positive.
    """
    # A negative step would make range() empty and silently drop all data,
    # so reject every non-positive size explicitly.
    if block_size <= 0:
        raise ValueError("block_size must be a positive integer")
    return [text[start:start + block_size] for start in range(0, len(text), block_size)]

In [29]:
# Round-trip check for zlib: decompress, decode and compare with the original
def are_strings_equal_zlib(original, compressed):
    """Report whether zlib-compressed data restores to ``original``.

    ``compressed`` is passed through ``zlib.decompress`` and then
    ``decodes``; the result is compared with ``original`` and the outcome
    is printed. Nothing is returned. Errors raised by decompression or
    decoding are not caught here.

    Args:
        original: The text that was compressed.
        compressed: The zlib-compressed data to verify.
    """
    restored = decodes(zlib.decompress(compressed))
    # Only the two section labels are printed; the texts themselves are
    # not echoed.
    print("Original:")
    print("Decompressed:")
    verdict = "compression ok" if restored == original else "compression wrong"
    print(verdict)

# Round-trip check for LZMA: decompress, decode and compare with the original
def are_strings_equal_lzma(original, compressed):
    """Report whether LZMA-compressed data restores to ``original``.

    ``compressed`` is passed through ``lzma.decompress`` and then
    ``decodes``; the outcome of the comparison with ``original`` is
    printed. Nothing is returned. Errors raised by decompression or
    decoding are not caught here.

    Args:
        original: The text that was compressed.
        compressed: The LZMA-compressed data to verify.
    """
    raw_bytes = lzma.decompress(compressed)
    restored = decodes(raw_bytes)
    message = "compression ok" if restored == original else "compression wrong"
    print(message)

# Round-trip check for bz2: decompress, decode and compare with the original
def are_strings_equal_bz2(original, compressed):
    """Report whether bz2-compressed data restores to ``original``.

    ``compressed`` is passed through ``bz2.decompress`` and then
    ``decodes``; the outcome of the comparison with ``original`` is
    printed. Nothing is returned. Errors raised by decompression or
    decoding are not caught here.

    Args:
        original: The text that was compressed.
        compressed: The bz2-compressed data to verify.
    """
    restored = decodes(bz2.decompress(compressed))
    if restored == original:
        print("compression ok")
        return
    print("compression wrong")

# Round-trip check for gzip: decompress, decode and compare with the original
def are_strings_equal_gzip(original, compressed):
    """Report whether gzip-compressed data restores to ``original``.

    ``compressed`` is passed through ``gzip.decompress`` and then
    ``decodes``; the outcome of the comparison with ``original`` is
    printed. Nothing is returned. Errors raised by decompression or
    decoding are not caught here.

    Args:
        original: The text that was compressed.
        compressed: The gzip-compressed data to verify.
    """
    unpacked = gzip.decompress(compressed)
    round_trip_ok = decodes(unpacked) == original
    print("compression ok" if round_trip_ok else "compression wrong")

# Create a random text file of 10,000 lines

In [26]:
import random
import string
import unicodedata

# Output file name and total number of lines to generate
filename = "test_file.txt"
num_lines = 10000

# Fixed text of the first three lines
line1 = "Fitxer de prova de la competició Quick Mode MTP-S’23 TEAM C, editat el 13 d’Abril del 2023, a veure si tot va bé i la línia arriba sencera i sense errors incloent caràcters curiosos com æ, ę, î, ô, ü."
line2 = "… i aquesta és la segona línia, a veure que tal va."
line3 = "… i ja no posem més línies perque de fet, amb una n’hi ha prou."

# Every code point from U+00A1 to U+00FF. This is not limited to letters and
# digits: symbols, punctuation and the soft hyphen (U+00AD) are included.
# The previous unicodedata filter tested for a category starting with 'A',
# which no Unicode general category does, so it never removed anything.
non_ascii_chars = [chr(i) for i in range(0x00A1, 0x0100)]

print(non_ascii_chars)

# ASCII part of the alphabet; it is the same for every line, so build it once
ascii_alphabet = string.ascii_letters + string.digits + string.punctuation

# NOTE(review): the random generator is not seeded, so every run produces a
# different file and therefore different compressed sizes downstream.

# Open the file for writing (UTF-16-LE, no BOM is written by this codec)
with open(filename, 'w', encoding='utf-16-le') as file:
    # Write the first three lines, each prefixed with its number and a tab
    file.write(f"1\t{line1}\n")
    file.write(f"2\t{line2}\n")
    file.write(f"3\t{line3}\n")

    # Write the remaining lines
    for i in range(4, num_lines+1):
        # Random string of 10 to 100 characters. A single non-ASCII character
        # is drawn per line and added to the ASCII alphabet, so each line
        # contains at most one distinct non-ASCII character.
        line_alphabet = ascii_alphabet + random.choice(non_ascii_chars)
        rand_string = ''.join(random.choices(line_alphabet, k=random.randint(10, 100)))
        # Write the line number, a tab character and the generated string
        file.write(f"{i}\t{rand_string}\n")

print("File created successfully.")

['¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', 'ª', '«', '¬', '\xad', '®', '¯', '°', '±', '²', '³', '´', 'µ', '¶', '·', '¸', '¹', 'º', '»', '¼', '½', '¾', '¿', 'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', '÷', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ']
File created successfully.


# No Compression

In [30]:
# Load the test file and re-encode it to the UTF-16-LE bytes to transmit
original_text = open_txt()
text_to_tx = encodes(original_text)

# Baseline without compression: split the raw bytes into 32-byte packets
payload = frament_the_text(text_to_tx)

print(f'length: {len(text_to_tx)}')
print(f'num. packets: {len(payload)}')

length: 1216848
num. packets: 38027


# Compressions

In [31]:
# Load the test file and re-encode it to the UTF-16-LE bytes to transmit
original_text = open_txt()
text_to_tx = encodes(original_text)

# zlib at level 9: the strongest (and slowest) setting
text_compressed = zlib.compress(text_to_tx, level=9)
payload = frament_the_text(text_compressed)

print(f'length: {len(text_compressed)}')
print(f'num. packets: {len(payload)}')

# Confirm that the compressed data restores to the original text
are_strings_equal_zlib(original_text, text_compressed)

length: 622259
num. packets: 19446
Original:
Decompressed:
compression ok


In [36]:
# Load the test file and re-encode it to the UTF-16-LE bytes to transmit
original_text = open_txt()
text_to_tx = encodes(original_text)

# LZMA with preset 9: the strongest (and slowest) setting
text_compressed = lzma.compress(text_to_tx, preset=9)

# Split the compressed bytes into packets
payload = frament_the_text(text_compressed)

print(f'length: {len(text_compressed)}')
print(f'num. packets: {len(payload)}')

# Confirm that the compressed data restores to the original text
are_strings_equal_lzma(original_text, text_compressed)

length: 503584
num. packets: 15737
compression ok


In [33]:
# Load the test file and re-encode it to the UTF-16-LE bytes to transmit
original_text = open_txt()
text_to_tx = encodes(original_text)

# bz2 with compresslevel 9: the strongest (and slowest) setting
text_compressed = bz2.compress(text_to_tx, compresslevel=9)

# Split the compressed bytes into packets
payload = frament_the_text(text_compressed)

print(f'length: {len(text_compressed)}')
print(f'num. packets: {len(payload)}')

# Confirm that the compressed data restores to the original text
are_strings_equal_bz2(original_text, text_compressed)

length: 498529
num. packets: 15580
compression ok


In [34]:
# Load the test file and re-encode it to the UTF-16-LE bytes to transmit
original_text = open_txt()
text_to_tx = encodes(original_text)

# gzip with compresslevel 9: the strongest (and slowest) setting
text_compressed = gzip.compress(text_to_tx, compresslevel=9)

# Split the compressed bytes into packets
payload = frament_the_text(text_compressed)

print(f'length: {len(text_compressed)}')
print(f'num. packets: {len(payload)}')

# Confirm that the compressed data restores to the original text
are_strings_equal_gzip(original_text, text_compressed)

length: 622271
num. packets: 19446
compression ok
