Refactored queue & added tests ⚡

typio · Feb 26, 2024 · 5394be0 · 5394be0
1 parent 8da8947
commit 5394be0
Show file tree

Hide file tree

Showing 10 changed files with 305 additions and 218 deletions.
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -33,25 +33,25 @@ jobs:
     strategy:
       matrix:
         include:
-            - name: linux-x86_64
-              target: x86_64-linux
-              strip: strip=true
-              optimize: optimize=ReleaseSafe
+          - name: linux-x86_64
+            target: x86_64-linux
+            strip: strip=true
+            optimize: optimize=ReleaseSafe
 
-            - name: windows-x86_64
-              target: x86_64-windows
-              strip: strip=true
-              optimize: optimize=ReleaseSafe
+          - name: windows-x86_64
+            target: x86_64-windows
+            strip: strip=true
+            optimize: optimize=ReleaseSafe
 
-            - name: macos-aarch64
-              target: aarch64-macos
-              strip: strip=true
-              optimize: optimize=ReleaseSafe
+          - name: macos-aarch64
+            target: aarch64-macos
+            strip: strip=true
+            optimize: optimize=ReleaseSafe
 
-            - name: macos-x86
-              target: x86_64-macos
-              strip: strip=true
-              optimize: optimize=ReleaseSafe
+          - name: macos-x86
+            target: x86_64-macos
+            strip: strip=true
+            optimize: optimize=ReleaseSafe
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
@@ -81,4 +81,4 @@ jobs:
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
-          gh release create v0.5.0 -t "0.5.0" -n "This is the first release right here :zap:" entreepy/entreepy*
+          gh release create v0.5.1 -t "0.5.1" entreepy/entreepy*
diff --git a/.gitignore b/.gitignore
@@ -10,3 +10,5 @@ res/*
 !res/test.txt
 !res/nice.shakespeare.txt
 !res/a_midsummer_nights_dream.txt
+
+decoded_*
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@ entreepy<br/>
 [![Actions Status](https://github.com/typio/entreepy/workflows/release/badge.svg)](https://github.com/typio/entreepy/actions)
 ====
 
-> ⚡ Huffman compression
+> ⚡ Fast Huffman coding text compression
 
 The name is from entropy coding + binary trees.
 
@@ -15,7 +15,7 @@ Options:
     -h, --help     show help
     -p, --print    print decompressed text to stdout
     -t, --test     test/dry run, does not write to file
-    -d, --debug    print huffman code dictionary and performance times to stdout
+    -d, --debug    print huffman code dictionary and performance times
 
 Commands:
     c    compress a file
@@ -29,32 +29,28 @@ Examples:
     entreepy -ptd d text.txt.et
 ```
 
-Input file must be < 1 terabyte. I recommend keeping an uncompressed backup or testing the program's
-decompression before deleting the original, the program hasn't been robustly tested. Be sure to use
-the same version of the program to decompress as compress.
+Input file must be < 1 terabyte. I recommend keeping an uncompressed backup or testing the program's decompression before deleting the original, as the program hasn't been robustly tested. Be sure to decompress with the same version of the program that was used to compress.
 
 ### Performance
 
-<!-- Time performance is quite good, memory is not optimal compared to other -->
-<!-- solutions but still relatively nothing. The main time bottlenecks are the -->
-<!-- heap allocations for file I/O. -->
+<!-- Time performance is good; memory use is not optimal but still negligible. The main time bottlenecks are the heap allocations for file I/O. -->
 
-I've developed a novel approach to decoding that utilizes a decode map. This map is keyed by the integer value of the code and stores a subarray of letters with matching code integer value - that is, the letters that correspond to codes with the same integer value - indexed by length minus one. For example, the map might include the following entries:
+I use a decode map that is keyed by the integer value of the code and stores a subarray of letters with a matching code integer value (that is, the letters whose codes have the same integer value), indexed by code length minus one. For example, the map might include the following entries:
 
 `{ 2: [_, a (10), e (010), ...], 13: [_, _, _, _, z (01101), ...] }.`
 
-By utilizing this decode map, decoding can be performed much more quickly than by traversing a binary tree bit by bit. I haven't come across a faster decoding approach than this one.
+By utilizing this decode map, decoding can be performed much more quickly than by traversing a binary tree.
 
-#### Current Performance on MacBook Air M2, 8 GB RAM
+#### Performance on MacBook Air M2, 8 GB RAM - v0.5.0
 | File | Original File Size | Compressed Size | Compression Time | Decompression Time |
 | ---- | :----------------: | :-------------: | :--------------: | :----------------: |
 | [Macbeth, Act V, Scene V](https://github.com/typio/entreepy/blob/main/res/nice.shakespeare.txt)   | 477 bytes | 374 bytes | 240μs | 950μs |
 | [A Midsummer Night's Dream](https://github.com/typio/entreepy/blob/main/res/a_midsummer_nights_dream.txt) | ~ 115 KB | ~ 66 KB | 2.2ms | 150ms |
 | [The Complete Works of Shakespeare](https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt) | ~ 5.5 MB | ~ 3.2 MB | 0.1s | 7s |
 
-### Compressed File Format (tentative)
+### Compressed File Format
 
-Introduces the `.et` file format, identified by the magic number `e7 c0 de`.
+Uses the `.et` file format, identified by the magic number `e7 c0 de`.
 
 ```bf
 | magic number -> 3 bytes |
@@ -67,6 +63,4 @@ for n symbols
 | symbol code -> m bits |
 
 | packed big-endian bitstream of codes | starting on new byte
-
-| 0 padding -> <=3 bytes |
 ```
diff --git a/build.zig b/build.zig
@@ -8,16 +8,10 @@ pub fn build(b: *std.Build) void {
     defer _ = gpa.deinit();
     const allocator = gpa.allocator();
 
-    const strip = b.option(bool, "strip", "") orelse false;
+    const os = @tagName(target.result.os.tag);
+    const arch = @tagName(target.result.cpu.arch);
 
-    // https://ziglang.org/documentation/master/std/src/target.zig.html
-    const os_table = [_][]const u8{ "freestanding", "ananas", "cloudabi", "dragonfly", "freebsd", "fuchsia", "ios", "kfreebsd", "linux", "lv2", "macos", "netbsd", "openbsd", "solaris", "windows", "zos", "haiku", "minix", "rtems", "nacl", "aix", "cuda", "nvcl", "amdhsa", "ps4", "ps5", "elfiamcu", "tvos", "watchos", "driverkit", "mesa3d", "contiki", "amdpal", "hermit", "hurd", "wasi", "emscripten", "shadermodel", "uefi", "opencl", "glsl450", "vulkan", "plan9", "other" };
-    const arch_table = [_][]const u8{ "arm", "armeb", "aarch64", "aarch64_be", "aarch64_32", "arc", "avr", "bpfel", "bpfeb", "csky", "dxil", "hexagon", "loongarch32", "loongarch64", "m68k", "mips", "mipsel", "mips64", "mips64el", "msp430", "powerpc", "powerpcle", "powerpc64", "powerpc64le", "r600", "amdgcn", "riscv32", "riscv64", "sparc", "sparc64", "sparcel", "s390x", "tce", "tcele", "thumb", "thumbeb", "x86", "x86_64", "xcore", "nvptx", "nvptx64", "le32", "le64", "amdil", "amdil64", "hsail", "hsail64", "spir", "spir64", "spirv32", "spirv64", "kalimba", "shave", "lanai", "wasm32", "wasm64", "renderscript32", "renderscript64", "ve", "spu_2" };
-
-    var os = os_table[@enumToInt(target.getOsTag())];
-    var arch = arch_table[@enumToInt(target.getCpuArch())];
-
-    var name = std.fmt.allocPrint(allocator, "entreepy-{s}-{s}", .{ os, arch }) catch "e";
+    const name = std.fmt.allocPrint(allocator, "entreepy-{s}-{s}", .{ os, arch }) catch "e";
     defer allocator.free(name);
 
     const exe = b.addExecutable(.{
@@ -27,11 +21,10 @@ pub fn build(b: *std.Build) void {
         .target = target,
         .optimize = optimize,
     });
-    exe.strip = strip;
 
-    exe.install();
+    b.installArtifact(exe);
 
-    const run_cmd = exe.run();
+    const run_cmd = b.addRunArtifact(exe);
 
     run_cmd.step.dependOn(b.getInstallStep());
 
@@ -42,12 +35,14 @@ pub fn build(b: *std.Build) void {
     const run_step = b.step("run", "Run the app");
     run_step.dependOn(&run_cmd.step);
 
-    const exe_tests = b.addTest(.{
-        .root_source_file = .{ .path = "src/main.zig" },
+    const unit_tests = b.addTest(.{
+        .root_source_file = .{ .path = "src/test.zig" },
         .target = target,
         .optimize = optimize,
     });
 
+    const run_unit_tests = b.addRunArtifact(unit_tests);
+
     const test_step = b.step("test", "Run unit tests");
-    test_step.dependOn(&exe_tests.step);
+    test_step.dependOn(&run_unit_tests.step);
 }
diff --git a/build.zig.zon b/build.zig.zon
@@ -0,0 +1,11 @@
+.{
+    .name = "entreepy",
+
+    .version = "0.5.1",
+
+    .dependencies = .{},
+
+    .minimum_zig_version = "0.12.0",
+
+    .paths = .{""},
+}
diff --git a/src/decode.zig b/src/decode.zig
@@ -2,15 +2,16 @@ const std = @import("std");
 
 const Allocator = std.mem.Allocator;
 
-pub const DecodeFlags = packed struct {
+pub const DecodeFlags = struct {
     write_output: bool = false,
     print_output: bool = false,
     debug: bool = false,
-    _padding: u30 = 0,
 };
 
-pub fn decode(allocator: Allocator, compressed_text: []const u8, out_writer: std.fs.File.Writer,
-std_out: std.fs.File, flags: DecodeFlags) !void {
+// TODO: Add checks that return an error if the input isn't in valid .et file format (e.g. below the minimum length)
+
+pub fn decode(allocator: Allocator, compressed_text: []const u8, out_writer: anytype, std_out: std.fs.File, flags: DecodeFlags) !usize {
+    var bytes_written: u32 = 0;
     const start_time = std.time.microTimestamp();
     defer if (flags.debug) std_out.writer().print("\ntime taken: {d}μs\n", .{std.time.microTimestamp() -
         start_time}) catch {};
@@ -19,7 +20,9 @@ std_out: std.fs.File, flags: DecodeFlags) !void {
     var reading_dict_code_len: bool = false;
     var reading_dict_code: bool = false;
 
-    var decode_dictionary_length: usize = compressed_text[3] + 1;
+    const decode_dictionary_length: u8 = compressed_text[3] + 1;
+
+    std.debug.print("decode_dictionary_length: {}\n", .{decode_dictionary_length});
 
     var decode_body_length: u32 = compressed_text[4];
     decode_body_length <<= 8;
@@ -29,6 +32,8 @@ std_out: std.fs.File, flags: DecodeFlags) !void {
     decode_body_length <<= 8;
     decode_body_length |= compressed_text[7];
 
+    std.debug.print("decode body length: {}\n", .{decode_body_length});
+
     var longest_code: u8 = 0;
     var shortest_code: usize = std.math.maxInt(usize);
 
@@ -56,12 +61,12 @@ std_out: std.fs.File, flags: DecodeFlags) !void {
                 while (i <= 7) {
                     if (pos > 7) break :read;
                     build_bits <<= 1;
-                    build_bits |= (byte >> @truncate(u3, 7 - pos)) & 1;
+                    build_bits |= (byte >> @as(u3, @truncate(7 - pos))) & 1;
                     pos += 1;
                     i += 1;
                 }
 
-                current_letter = @truncate(u8, build_bits);
+                current_letter = @as(u8, @truncate(build_bits));
 
                 reading_dict_letter = false;
                 reading_dict_code_len = true;
@@ -74,12 +79,12 @@ std_out: std.fs.File, flags: DecodeFlags) !void {
                 while (i <= 7) {
                     if (pos > 7) break :read;
                     build_bits <<= 1;
-                    build_bits |= (byte >> @truncate(u3, 7 - pos)) & 1;
+                    build_bits |= (byte >> @as(u3, @truncate(7 - pos))) & 1;
                     pos += 1;
                     i += 1;
                 }
 
-                current_code_length = @truncate(u8, build_bits);
+                current_code_length = @as(u8, @truncate(build_bits));
 
                 if (current_code_length > longest_code) longest_code = current_code_length;
                 if (current_code_length < shortest_code) shortest_code = current_code_length;
@@ -95,7 +100,7 @@ std_out: std.fs.File, flags: DecodeFlags) !void {
                 while (i < current_code_length) {
                     if (pos > 7) break :read;
                     build_bits <<= 1;
-                    build_bits |= (byte >> @truncate(u3, 7 - pos)) & 1;
+                    build_bits |= (byte >> @as(u3, @truncate(7 - pos))) & 1;
 
                     pos += 1;
                     i += 1;
@@ -140,29 +145,32 @@ std_out: std.fs.File, flags: DecodeFlags) !void {
         decode_text: while (window_len >= longest_code) {
             // loop through all possible code lengths, checking start of window for match
             checking_code_len = shortest_code;
-            while (checking_code_len <= longest_code and window_len >= longest_code) {
+            while (window_len >= checking_code_len) {
                 if (decoded_letters_read >= decode_body_length or
                     window_len < checking_code_len)
                 {
                     break :decode_text;
                 }
 
                 testing_code = window &
-                    ((@as(u32, 0b1) << @truncate(u5, checking_code_len)) - 1) << @truncate(u5, window_len - checking_code_len);
+                    ((@as(u32, 0b1) << @as(u5, @truncate(checking_code_len))) - 1) << @as(u5, @truncate(window_len - checking_code_len));
 
-                testing_code >>= @truncate(u6, window_len - checking_code_len);
+                testing_code >>= @as(u6, @truncate(window_len - checking_code_len));
 
                 if (decode_table.get(testing_code)) |entry| {
                     if (entry[checking_code_len - 1] > 0) {
-                        var c = entry[checking_code_len - 1];
+                        const c = entry[checking_code_len - 1];
 
-                        if (flags.write_output) try out_writer.writeByte(c);
+                        if (flags.write_output) {
+                            try out_writer.writeByte(c);
+                            bytes_written += 1;
+                        }
                         if (flags.print_output) try std_out.writer().print("{c}", .{c});
 
                         decoded_letters_read += 1;
 
                         window = window & ((@as(u32, 0b1) <<
-                            @truncate(u5, window_len - checking_code_len)) - 1);
+                            @as(u5, @truncate(window_len - checking_code_len))) - 1);
                         window_len -= checking_code_len;
                         checking_code_len = shortest_code;
                     }
@@ -171,4 +179,5 @@ std_out: std.fs.File, flags: DecodeFlags) !void {
             }
         }
     }
+    return bytes_written;
 }