Load the dataset splits and cache them in `dataset.pkl`: if the file does not exist yet it is created, otherwise the whole dataset dictionary is loaded from it.

In [None]:
import pickle
import os
from datasets import load_dataset, DatasetDict
# Dataset cache: reuse the pickled DatasetDict when it exists, otherwise load
# the two splits with load_dataset and pickle them for the next run.
num_proc = 40
ds_filename = 'dataset.pkl'
splits = ['0', '1']
if os.path.exists(ds_filename):
    # NOTE: unpickling runs arbitrary code; only load a cache file you created.
    with open(ds_filename, 'rb') as cache_file:
        ds = pickle.load(cache_file)
else:
    split_datasets = load_dataset('commaai/commavq', num_proc=num_proc, split=splits)
    # Key each loaded split by its name ('0', '1').
    ds = DatasetDict(zip(splits, split_datasets))
    with open(ds_filename, 'wb') as cache_file:
        pickle.dump(ds, cache_file)

In [None]:
# Write the loaded dataset object to the relative path 'dataset'.
ds.save_to_disk('dataset')

Preprocessing: create one empty nested-dictionary file for each possible value (-1023 to 1023).

In [None]:
import gc
import os
import numpy as np

def create_nested_dict():
    """Build a fresh two-level dictionary of empty lists.

    Returns:
        dict: outer keys 0..127; each maps to an inner dict with keys 0..4999
        whose values are new, independent empty lists.
    """
    nested = {}
    for outer_key in range(128):
        inner = {}
        for inner_key in range(5000):
            inner[inner_key] = []
        nested[outer_key] = inner
    return nested

os.makedirs('commavq/dictionary_method_all_data/', exist_ok=True)
file_template = 'commavq/dictionary_method_all_data/number_{}.pkl'

# Initialize files (only need to do this once).
# Every file starts from the same empty structure, so it is built and pickled
# a single time and the serialized bytes are written to each of the 2047 files
# (one per value in -1023..1023) instead of rebuilding 128 x 5000 empty lists
# and re-serializing them on every iteration.
# Note: running this again overwrites any data already accumulated in the files.
empty_payload = pickle.dumps(create_nested_dict())
for i in range(-1023, 1024):
    with open(file_template.format(i), 'wb') as f:
        f.write(empty_payload)

# make a copy after this step runs

Preprocessing to store the pixel data differences between adjacent frames instead of the frames themselves (in the hope that the differences compress better)

In [None]:
base_path = 'commavq/data_0_to_5000_diff/'

# Process data in larger chunks
chunk_size = 25  # Adjust based on your available memory
files_per_split = 2500

for split_idx, split in enumerate(['0', '1']):
    for chunk_start in range(0, files_per_split, chunk_size):
        chunk_end = min(chunk_start + chunk_size, files_per_split)

        # Sparse accumulator for this chunk:
        #   number -> {(k, file_idx): [frame indices j]}
        # Only combinations that actually occur are stored; a dense
        # 2047 x 128 x 5000 structure of empty lists (about 1.3 billion lists)
        # would have to be allocated per chunk otherwise.
        chunk_dict = {}

        for i in range(chunk_start, chunk_end):
            print(f'i: {i}, split: {split}')
            # Global file slot: split '0' fills 0..2499 and split '1' fills
            # 2500..4999, so the two splits never append to the same list.
            file_idx = split_idx * files_per_split + i
            temp_arr = np.load(base_path + ds[split][i]['path'])

            # Each file is expected to hold 1200 frames of 8 x 16 values.
            # Flattening a frame row-major gives position k == row * 16 + col.
            flat = temp_arr.reshape(1200, 128)

            # np.nonzero walks the array in row-major order, so for a fixed
            # (number, k) the frame indices are appended in ascending order.
            frame_ids, positions = np.nonzero(flat)
            numbers = flat[frame_ids, positions]
            for j, k, number in zip(frame_ids.tolist(), positions.tolist(), numbers.tolist()):
                chunk_dict.setdefault(number, {}).setdefault((k, file_idx), []).append(j)

            del temp_arr, flat

        # Update files with chunk data (only the numbers seen in this chunk)
        for number in sorted(chunk_dict):
            with open(file_template.format(number), 'rb') as f:
                file_dict = pickle.load(f)

            for (k, file_idx), frames in chunk_dict[number].items():
                file_dict[k][file_idx].extend(frames)

            with open(file_template.format(number), 'wb') as f:
                pickle.dump(file_dict, f)

        del chunk_dict
        gc.collect()

print("Processing complete.")

In [None]:
import numpy as np
import os

base_path = 'commavq/data_0_to_5000_diff/'

# Histogram over all difference files: counts[v + offset] accumulates how often
# the value v occurs (2047 bins, shifted by `offset` so indices start at 0).
offset = 1023
counts = np.zeros(2047, dtype=np.int64)
for split in ('0', '1'):
    split_rows = ds[split]
    for i in range(2500):
        print(f'i: {i}, split: {split}')
        shifted_values = np.load(base_path + split_rows[i]['path']).flatten() + offset
        counts += np.bincount(shifted_values, minlength=2047)

print(counts)

In [None]:
import matplotlib.pyplot as plt

# Plot the histogram against the value axis -1023..1023 (one point per bin).
value_axis = np.linspace(-1023, 1023, 2047)
plt.plot(value_axis, counts)

zero_count = counts[0+offset]
print(zero_count)                    # occurrences of the value 0
print(np.sum(counts)-zero_count)     # occurrences of every non-zero value

In [None]:
import numpy as np
import os
import pickle
import multiprocessing
from datasets import load_dataset, DatasetDict
from pathlib import Path

base_path = 'commavq/data_0_to_5000_diff/'
ds_filename = 'dataset.pkl'
splits = ['0', '1']
num_proc = multiprocessing.cpu_count()

# Load the dataset index, reusing the pickle cache when it is present.
if not os.path.exists(ds_filename):
    ds = load_dataset('commaai/commavq', num_proc=num_proc, split=splits)
    ds = DatasetDict(zip(splits, ds))
    with open(ds_filename, 'wb') as f:
        pickle.dump(ds, f)
else:
    with open(ds_filename, 'rb') as f:
        ds = pickle.load(f)

# Load the array of every dataset row, split by split.
# The loop variable is called `minute` (not `min`) so that the builtin min()
# is not rebound in the notebook's global namespace.
data = []
for split in ds:
    for minute in ds[split]:
        path = Path(base_path + minute['path'])
        tokens = np.load(path)
        data.append(tokens)

data = np.array(data)
data_shape = data.shape
print(data_shape)

output_dir = 'commavq/pixel_data_diff/'
os.makedirs(output_dir, exist_ok=True)

# One row of 128 positions per frame: (files, frames, positions).
data = data.reshape(5000, 1200, 128)
# Iterate over each pixel position
for pixel_idx in range(128):
    print("pixel:"+str(pixel_idx))
    # Extract data for the current pixel position across all frames and minutes
    pixel_data = data[:, :, pixel_idx].flatten()
    file = f'pixel_{pixel_idx}.npy'
    # Save the extracted data to a .npy file
    np.save(output_dir+file, pixel_data)

In [None]:
import lzma
import shutil
from tqdm import tqdm

base_path = 'commavq/pixel_data_diff/'
output_dir = Path('./temp/')
os.makedirs(output_dir, exist_ok=True)

for pixel_idx in tqdm(range(128)):
    # Use a fresh compressor for every file and finish it with flush().
    # compress() may hold data back in the compressor's internal buffers, so
    # without flush() the written file is an incomplete stream, and sharing one
    # compressor would spread a single stream across all files.
    lzmaobj = lzma.LZMACompressor(preset=9)
    file = f'pixel_{pixel_idx}.npy'
    tokens = np.load(base_path + file)
    compressed = lzmaobj.compress(tokens.tobytes()) + lzmaobj.flush()
    with open(output_dir / file, 'wb') as f:
        f.write(compressed)
    print("pixel:"+str(pixel_idx))

shutil.make_archive('temp', 'zip', output_dir)
final_zip = 'temp.zip'
# Reference size: rows * 1200 * 128 values at 10 bits each, expressed in bytes.
rate = (sum(ds.num_rows.values()) * 1200 * 128 * 10 / 8) / os.path.getsize(final_zip)
print(f"Compression rate: {rate:.1f}")

In [None]:
import matplotlib.pyplot as plt

# Plot a 100-sample window of the series saved for pixel position 0.
base_path = 'commavq/pixel_data_diff/'
pixel_data = np.load(f'{base_path}pixel_0.npy')

i = 1600
window = slice(i, i + 100)
plt.plot(pixel_data[window])

In [None]:
import os
import subprocess

# Add every per-pixel .npy file to one zpaq archive, one `zpaq add` call per file.
data_dir = 'commavq/pixel_data_diff'
archive_name = 'temp.zpaq'

for i in range(128):
    file_name = f'pixel_{i}.npy'
    file_path = os.path.join(data_dir, file_name)

    # Report a missing input and move on to the next file.
    if not os.path.exists(file_path):
        print(f'{file_name} does not exist in the directory')
        continue

    command = f'zpaq add {archive_name} {file_path} -method 5'
    print(f'Running command: {command}')

    # A non-zero exit status is reported but does not stop the loop.
    if subprocess.run(command, shell=True).returncode != 0:
        print(f'Error compressing {file_name}')

print("Compression complete.")


In [None]:
import numpy as np

base_path = 'commavq/data_0_to_5000/'
temp_arr = np.load(base_path + ds['0'][0]['path'])

# The first positional argument of np.set_printoptions is `precision`, so this
# single call sets the same two options as two separate calls would.
# NOTE(review): 1100 may have been intended as `threshold` -- confirm.
np.set_printoptions(precision=1100, linewidth=200)

# Print three consecutive frames of the first file, starting at frame i.
i = 1
for step in range(3):
    print(temp_arr[i + step])

In [None]:
import os
import subprocess
from tqdm import tqdm

# Add every regular file found directly inside base_path to the zpaq archive.
base_path = 'commavq/data_0_to_5000/'
archive_name = 'temp.zpaq'

for file_name in tqdm(os.listdir(base_path)):
    file_path = os.path.join(base_path, file_name)

    # Anything that is not a regular file is reported and skipped.
    if not os.path.isfile(file_path):
        print(f'{file_name} is not a file')
        continue

    command = f'zpaq add {archive_name} {file_path} -method 5'

    # Failures are reported per file; the loop keeps going.
    result = subprocess.run(command, shell=True)
    if result.returncode != 0:
        print(f'Error compressing {file_name}')

print("Compression complete.")


In [None]:
import os
import numpy as np
from tqdm import tqdm

# Directory the per-pixel input arrays are read from.
input_dir = 'commavq/pixel_data/'
# Directory the encoded arrays are written to; created if it does not exist.
output_dir = 'commavq/pixel_data_6bits/'
os.makedirs(output_dir, exist_ok=True)
# File name pattern; the placeholder is filled with the pixel index.
template = 'pixel_{}.npy'

def write_to_6bits(data):
    """Run-length encode a 1-D integer sequence into packed 16-bit words.

    Consecutive equal values are collapsed into runs of at most 63 elements.
    Each run becomes one word holding the run length in the upper 6 bits and
    the value in the lower 10 bits: ``(count << 10) + value``. A longer run is
    emitted as several words of 63 followed by the remainder.

    Args:
        data: 1-D array-like of integers. Values are expected to fit in
            10 bits (0..1023); other values would spill into the count bits.

    Returns:
        np.ndarray of dtype uint16 with one word per run, in input order.
        An empty input yields an empty array.
    """
    from itertools import groupby

    max_run = 63  # largest count that fits in 6 bits

    # tolist() yields plain Python ints, so the packing arithmetic below is not
    # carried out in the (possibly 16-bit signed) dtype of the input, where
    # count << 10 does not fit once count reaches 32.
    new_data = []
    for value, group in groupby(np.asarray(data).tolist()):
        remaining = sum(1 for _ in group)
        while remaining > max_run:
            new_data.append((max_run << 10) + value)
            remaining -= max_run
        new_data.append((remaining << 10) + value)

    return np.array(new_data, dtype=np.uint16)

# Encode each of the 128 per-pixel files and store the result under the same
# file name in the output directory.
for i in range(128):
    file = template.format(i)
    source_path = os.path.join(input_dir, file)
    target_path = os.path.join(output_dir, file)
    data = np.load(source_path)
    compressed_data = write_to_6bits(data)
    np.save(target_path, compressed_data)


In [None]:
import numpy as np
import os

number_of_files_in_one_merge = 128
number_of_merges = 128 // number_of_files_in_one_merge

data_dir = 'commavq/pixel_data/'
output_dir = f'commavq/pixel_data_merged_{number_of_files_in_one_merge}/'
# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Merge group i is the concatenation, in index order, of the pixel files
# i * n .. i * n + n - 1, where n is number_of_files_in_one_merge.
for i in range(number_of_merges):
    first_pixel = i * number_of_files_in_one_merge
    parts = [
        np.load(os.path.join(data_dir, f'pixel_{j}.npy'))
        for j in range(first_pixel, first_pixel + number_of_files_in_one_merge)
    ]
    merged_data = np.concatenate(parts, axis=0)
    np.save(os.path.join(output_dir, f'pixel_merged_{i}.npy'), merged_data)


In [None]:
import os
import subprocess
from tqdm import tqdm

# Directory of merged files; its name depends on the merge size chosen here.
number_of_files_in_one_merge = 8
number_of_merges = 128 // number_of_files_in_one_merge
data_dir = f'commavq/pixel_data_merged_{number_of_files_in_one_merge}/'

archive_name = 'temp.zpaq'
# Custom zpaq method string, passed unchanged to the `-method` option.
zpaq_method = 'x6.0w1c256ci1,1,1,1,1,1,2ac0,2,0,255i1c0,3,0,0,255i2c0,4,0,0,0,255i2mm20ts22t0'

for i in tqdm(range(number_of_merges)):
    file_name = f'pixel_merged_{i}.npy'
    file_path = os.path.join(data_dir, file_name)

    # Report a missing merged file and continue with the next one.
    if not os.path.exists(file_path):
        print(f'{file_name} does not exist in the directory')
        continue

    command = f'zpaq add {archive_name} {file_path} -method {zpaq_method}'
    print(f'Running command: {command}')

    # Output is captured as text so stderr can be shown on failure.
    result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    # The first failing command ends the loop.
    if result.returncode != 0:
        print(f'Error compressing {file_name}')
        print(result.stderr)
        break

print("Code complete.")

In [None]:
import lzma
import shutil
from tqdm import tqdm
from pathlib import Path
import numpy as np
import pickle

# Reload the cached dataset object; only its row counts are used below.
with open('dataset.pkl', 'rb') as cache_file:
    ds = pickle.load(cache_file)

base_path = 'commavq/pixel_data_6bits/'
output_dir = Path('./temp/')
os.makedirs(output_dir, exist_ok=True)

for pixel_idx in tqdm(range(128)):
    file = f'pixel_{pixel_idx}.npy'
    tokens = np.load(base_path + file)
    # One-shot compression: a complete, flushed stream for each file.
    compressed = lzma.compress(tokens.tobytes(), preset=9)
    (output_dir / file).write_bytes(compressed)
    print("pixel:"+str(pixel_idx))

shutil.make_archive('temp', 'zip', output_dir)
final_zip = 'temp.zip'
# Reference size: rows * 1200 * 128 values at 10 bits each, expressed in bytes.
reference_size = sum(ds.num_rows.values()) * 1200 * 128 * 10 / 8
rate = reference_size / os.path.getsize(final_zip)
print(f"Compression rate: {rate:.1f}")

In [None]:
# Convert pixel_data to pixel_bit_planes

import numpy as np
import os
from tqdm import tqdm

# Input: per-pixel arrays. Output: one boolean array per (pixel, bit) pair.
data_folder = 'commavq/pixel_data'
output_folder = 'commavq/pixel_bit_planes_byte_arrays'
os.makedirs(output_folder, exist_ok=True)

for pixel_index in tqdm(range(128)):
    pixel_data = np.load(os.path.join(data_folder, f'pixel_{pixel_index}.npy'))

    # Bits 0..9: True wherever that bit is set in the value.
    for bit_index in range(10):
        mask = 1 << bit_index
        bit_data = np.bitwise_and(pixel_data, mask) > 0
        plane_name = f'pixel_{pixel_index}_bit_{bit_index}.npy'
        np.save(os.path.join(output_folder, plane_name), bit_data)

print('Conversion complete.')

In [None]:
import numpy as np
import os
from tqdm import tqdm

input_dir = 'commavq/pixel_data'
output_dir = 'commavq/pixel_data_split'

os.makedirs(output_dir, exist_ok=True)

for i in tqdm(range(128)):
    # Load the per-pixel array and force it to 16-bit integers.
    data = np.load(os.path.join(input_dir, f'pixel_{i}.npy')).astype(np.int16)

    # Low and high byte of every value, each stored as int8
    # (so byte values 128..255 show up as negative numbers).
    byte_planes = {
        'lower': np.bitwise_and(data, 0x00FF).astype(np.int8),
        'upper': np.bitwise_and(np.right_shift(data, 8), 0x00FF).astype(np.int8),
    }

    # Written in insertion order: the lower file first, then the upper file.
    for suffix, plane in byte_planes.items():
        np.save(os.path.join(output_dir, f'pixel_{i}_{suffix}.npy'), plane)

print('All files processed.')


In [None]:
import numpy as np

# Show 100 consecutive low-byte values of pixel 0, starting at index i.
data_dir = 'commavq/pixel_data_split'
temp_arr = np.load(f'{data_dir}/pixel_0_lower.npy')
i = 1000
window = slice(i, i + 100)
print(temp_arr[window])

In [None]:
import os
import subprocess
from tqdm import tqdm

# Add the lower and upper byte files of every pixel to the zpaq archive.
data_dir = 'commavq/pixel_data_split'
archive_name = 'temp.zpaq'

for i in tqdm(range(128)):
    lower_file_name = f'pixel_{i}_lower.npy'
    upper_file_name = f'pixel_{i}_upper.npy'
    lower_file_path = os.path.join(data_dir, lower_file_name)
    upper_file_path = os.path.join(data_dir, upper_file_name)

    # Both halves must exist; otherwise neither of them is added.
    if not (os.path.exists(lower_file_path) and os.path.exists(upper_file_path)):
        print(f'{lower_file_name} or {upper_file_name} does not exist in the directory')
        continue

    # Lower file first, then upper. A failure is reported and the loop goes on.
    for part_name, part_path in ((lower_file_name, lower_file_path),
                                 (upper_file_name, upper_file_path)):
        command = f'zpaq add {archive_name} {part_path} -method 5'
        print(f'Running command: {command}')
        result = subprocess.run(command, shell=True)
        if result.returncode != 0:
            print(f'Error compressing {part_name}')

print("Compression complete.")

In [None]:
# Merge the higher and lower splits
import numpy as np
import os

number_of_files_in_one_merge = 16
number_of_merges = 128 // number_of_files_in_one_merge

data_dir = 'commavq/pixel_data_split/'
output_dir = f'commavq/pixel_data_split_merged_{number_of_files_in_one_merge}/'
# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Merge group i concatenates, in index order, the lower (and separately the
# upper) files of pixels i * n .. i * n + n - 1, n = number_of_files_in_one_merge.
for i in range(number_of_merges):
    first_pixel = i * number_of_files_in_one_merge
    lower_parts = []
    upper_parts = []
    for j in range(first_pixel, first_pixel + number_of_files_in_one_merge):
        lower_parts.append(np.load(os.path.join(data_dir, f'pixel_{j}_lower.npy')))
        upper_parts.append(np.load(os.path.join(data_dir, f'pixel_{j}_upper.npy')))

    merged_data_lower = np.concatenate(lower_parts, axis=0)
    merged_data_upper = np.concatenate(upper_parts, axis=0)
    np.save(os.path.join(output_dir, f'pixel_merged_{i}_lower.npy'), merged_data_lower)
    np.save(os.path.join(output_dir, f'pixel_merged_{i}_upper.npy'), merged_data_upper)

In [None]:
import os
import subprocess
from tqdm import tqdm

# Directory of merged lower/upper files; its name depends on the merge size.
number_of_files_in_one_merge = 16
number_of_merges = 128 // number_of_files_in_one_merge
data_dir = f'commavq/pixel_data_split_merged_{number_of_files_in_one_merge}/'

archive_name = 'temp.zpaq'

for i in tqdm(range(number_of_merges)):
    file_name_lower = f'pixel_merged_{i}_lower.npy'
    file_name_upper = f'pixel_merged_{i}_upper.npy'
    file_path_lower = os.path.join(data_dir, file_name_lower)
    file_path_upper = os.path.join(data_dir, file_name_upper)

    # Both merged files must exist, although only the lower one is added below.
    if not (os.path.exists(file_path_lower) and os.path.exists(file_path_upper)):
        print(f'{file_name_lower} or {file_name_upper} does not exist in the directory')
        continue

    command_lower = f'zpaq add {archive_name} {file_path_lower} -method 5'
    print(f'Running command: {command_lower}')
    # Output is captured as text so stderr can be shown on failure.
    result = subprocess.run(command_lower, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    # The first failing command ends the loop.
    if result.returncode != 0:
        print(f'Error compressing {file_name_lower}')
        print(result.stderr)
        break

    # Adding the upper file is currently disabled:
    # command_upper = f'zpaq add {archive_name} {file_path_upper} -method 5'
    # print(f'Running command: {command_upper}')
    # result = subprocess.run(command_upper, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    # if result.returncode != 0:
    #     print(f'Error compressing {file_name_upper}')
    #     print(result.stderr)
    #     break

print("Code complete.")