In [None]:
import os
import numpy as np
import tiktoken
import matplotlib.pyplot as plt
from tqdm import tqdm

# number of workers in .map() call
# NOTE(review): nothing in this cell calls .map() or starts worker processes;
# the name looks inherited from another script — confirm before relying on it.
num_proc = 8

# initialize the tokenizer (GPT-2 encoding); shared by process_files() and
# restore_text_from_bin() below
enc = tiktoken.get_encoding("gpt2")

# path to your data folder containing .txt files (relative to the current
# working directory)
data_folder = 'data'

def process_files(data_folder):
    """Tokenize every ``.txt`` file in *data_folder* into one token stream.

    Files are processed in sorted name order so that the layout of the
    resulting stream is reproducible from run to run (``os.listdir`` makes
    no ordering guarantee). Each file's tokens are followed by the
    tokenizer's end-of-text token.

    Args:
        data_folder: Directory that is scanned (non-recursively) for
            ``.txt`` files.

    Returns:
        A tuple ``(all_ids, file_lengths, files)`` where ``all_ids`` is the
        concatenated list of token ids, ``file_lengths`` maps each file name
        to its token count (including the trailing end-of-text token) in
        processing order, and ``files`` is the list of processed file names.
    """
    files = sorted(
        name for name in os.listdir(data_folder)
        if name.endswith('.txt') and os.path.isfile(os.path.join(data_folder, name))
    )
    all_ids = []
    file_lengths = {}

    print(f"Found {len(files)} .txt files.")
    for file in files:
        file_path = os.path.join(data_folder, file)
        print(f"Processing {file}...")
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Tokenize after the file is closed; the handle is no longer needed.
        ids = enc.encode_ordinary(text)
        ids.append(enc.eot_token)  # marks the boundary between files
        all_ids.extend(ids)
        file_lengths[file] = len(ids)

    return all_ids, file_lengths, files

def restore_text_from_bin(bin_filename, file_lengths):
    """Decode the per-file texts stored back-to-back in a uint16 token file.

    Args:
        bin_filename: Path of the binary file holding ``uint16`` token ids.
        file_lengths: Mapping of file name to token count (each count
            includes one trailing end-of-text token), in the order the
            files were written.

    Returns:
        A dict mapping each file name to its decoded text, with the
        trailing end-of-text token dropped.

    Raises:
        ValueError: If the binary file holds fewer tokens than
            ``file_lengths`` requires (e.g. it is truncated or belongs to a
            different run); without this check the affected texts would be
            silently cut short.
    """
    arr = np.memmap(bin_filename, dtype=np.uint16, mode='r')

    expected_tokens = sum(file_lengths.values())
    if expected_tokens > len(arr):
        raise ValueError(
            f"{bin_filename} holds {len(arr)} tokens but {expected_tokens} are required"
        )

    restored_files = {}
    idx = 0
    for file, length in file_lengths.items():
        segment = arr[idx:idx + length]
        # Drop the trailing EOT and hand the tokenizer plain Python ints
        # rather than a NumPy view of the memory map.
        restored_files[file] = enc.decode(segment[:-1].tolist())
        idx += length
    return restored_files

if __name__ == '__main__':
    import gc

    # Tokenize every .txt file found in the data folder.
    all_token_ids, file_lengths, files = process_files(data_folder)

    output_dir = 'output/bin'
    output_filename = os.path.join(output_dir, 'dataset.bin')
    os.makedirs(output_dir, exist_ok=True)

    # Remove a binary left over from an earlier run. A memory map that is
    # still referenced keeps the file locked on Windows, so collect garbage
    # first; the map created below is explicitly released after writing, so
    # no sleep-and-retry is needed.
    if os.path.exists(output_filename):
        gc.collect()
        try:
            os.remove(output_filename)
        except PermissionError:
            print(f"Could not delete {output_filename}. It may still be in use.")

    dtype = np.uint16
    bytes_per_token = np.dtype(dtype).itemsize
    arr_len = len(all_token_ids)

    # Copy the tokens into the memory-mapped file in large chunks; writing a
    # handful of tokens per iteration makes the Python loop the bottleneck.
    write_chunk = 1 << 20
    arr = np.memmap(output_filename, dtype=dtype, mode='w+', shape=(arr_len,))
    for idx in tqdm(range(0, arr_len, write_chunk), desc=f'writing {output_filename}'):
        batch = all_token_ids[idx:idx + write_chunk]
        arr[idx:idx + len(batch)] = batch

    arr.flush()
    # Drop the reference so the mapping can be released and the file can be
    # deleted or rewritten when this cell is run again.
    del arr
    print(f"Dataset has been saved to {output_filename}")

    # Restore from binary
    restored_files = restore_text_from_bin(output_filename, file_lengths)

    # Save restored files to desired folder
    # NOTE(review): machine-specific absolute path; adjust for your setup.
    restored_output_dir = r'C:\Users\tahme\Desktop\server\output\restored_txt'
    os.makedirs(restored_output_dir, exist_ok=True)

    for file, content in restored_files.items():
        restored_file_path = os.path.join(restored_output_dir, file)
        with open(restored_file_path, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"Saved restored: {restored_file_path}")

    # Size comparison (all values in MB)
    mb = 1024 * 1024
    original_sizes = [os.path.getsize(os.path.join(data_folder, file)) / mb for file in files]
    total_original_size = sum(original_sizes)
    bin_size = os.path.getsize(output_filename) / mb
    # Per-file share of the binary: token count times bytes per token.
    bin_sizes = [file_lengths[file] * bytes_per_token / mb for file in files]
    # Size of the restored text as UTF-8 bytes, comparable to the originals.
    restored_sizes = [len(restored_files[file].encode('utf-8')) / mb for file in files]

    # Calculate size difference and percentage
    size_difference = total_original_size - bin_size
    percent_difference = (size_difference / total_original_size) * 100 if total_original_size > 0 else 0

    print("\n📊 Size Summary:")
    print(f"Total Original Size: {total_original_size:.2f} MB")
    print(f"Binary (.bin) Size:  {bin_size:.2f} MB")
    print(f"Difference:          {size_difference:.2f} MB")
    print(f"Compression:         {percent_difference:.2f}%")

    # Plotting histogram of per-file sizes
    plt.figure(figsize=(10, 6))
    plt.hist(original_sizes, bins=5, alpha=0.6, label='Original Text')
    plt.hist(bin_sizes, bins=5, alpha=0.6, label='Binary Size')
    plt.hist(restored_sizes, bins=5, alpha=0.6, label='Restored Text')
    plt.xlabel('Size (MB)')
    plt.ylabel('Count')
    plt.title('Comparison of File Sizes (Original, Binary, Restored)')
    plt.legend()
    plt.show()

    # Verify correctness: the decoded text must equal the original text.
    for file in files:
        with open(os.path.join(data_folder, file), 'r', encoding='utf-8') as f:
            original_text = f.read()
        if original_text == restored_files[file]:
            print(f"✅ Restoration of {file} was successful!")
        else:
            print(f"❌ Restoration of {file} failed!")


In [None]:
import os
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np

# Directories
data_dir = "data"
output_webp_dir = os.path.join("output", "webp")
output_restored_dir = os.path.join("output", "restored")
os.makedirs(output_webp_dir, exist_ok=True)
os.makedirs(output_restored_dir, exist_ok=True)

total_original_size = 0
total_webp_size = 0
first_original_path = None
first_webp_path = None
first_restored_path = None

image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif')

# Convert images to WebP (lossy mode). Names are sorted so the image picked
# for the later comparison is the same on every run.
# NOTE(review): inputs that share a stem (e.g. a.jpg and a.png) map to the
# same output names and overwrite each other.
for filename in sorted(os.listdir(data_dir)):
    if not filename.lower().endswith(image_extensions):
        continue

    file_path = os.path.join(data_dir, filename)
    total_original_size += os.path.getsize(file_path)
    stem = os.path.splitext(filename)[0]

    # Save as WebP (lossy, quality 80); the context manager closes the
    # source file once the converted copy has been written.
    webp_path = os.path.join(output_webp_dir, stem + ".webp")
    with Image.open(file_path) as source_image:
        source_image.convert("RGB").save(webp_path, format="WEBP", quality=80)
    total_webp_size += os.path.getsize(webp_path)

    # Restore from WebP back to a high-quality JPG
    restored_path = os.path.join(output_restored_dir, stem + "_restored.jpg")
    with Image.open(webp_path) as webp_image:
        webp_image.convert("RGB").save(restored_path, format="JPEG", quality=95)

    # Store first image paths for comparison
    if first_original_path is None:
        first_original_path = file_path
        first_webp_path = webp_path
        first_restored_path = restored_path

# Summary
print("\nSummary:")
print(f"Total Original Size: {total_original_size / 1024:.2f} KB")
print(f"Total WebP Size: {total_webp_size / 1024:.2f} KB")
if total_original_size > 0:
    size_ratio = (total_webp_size / total_original_size) * 100
    reduction = 100 - size_ratio
    print(f"Size Ratio (WebP/Original): {size_ratio:.2f}%")
    print(f"Size Reduction: {reduction:.2f}%")
else:
    print("No images found.")

# --- Data Visualization ---
# Plot the pixel-value histogram of the first original image next to its
# WebP and restored-JPG counterparts.
if first_original_path and first_webp_path and first_restored_path:
    # (image path, bar color, legend label, subplot title) per panel
    panels = [
        (first_original_path, 'blue', "Original",
         "Pixel Distribution - Original Image"),
        (first_webp_path, 'red', "WebP (Lossy)",
         "Pixel Distribution - WebP Image"),
        (first_restored_path, 'green', "Restored JPG",
         "Pixel Distribution - Restored JPG Image"),
    ]

    # Load every image before creating the figure so a read error does not
    # leave an empty plot behind; each file is closed as soon as its pixels
    # have been copied into an array.
    flattened = []
    for image_path, _, _, _ in panels:
        with Image.open(image_path) as img:
            flattened.append(np.array(img.convert("RGB")).flatten())

    plt.figure(figsize=(18, 5))
    for position, (pixels, (_, color, label, title)) in enumerate(zip(flattened, panels), start=1):
        plt.subplot(1, 3, position)
        plt.hist(pixels, bins=256, color=color, alpha=0.7, label=label)
        plt.xlabel("Pixel Value")
        plt.ylabel("Frequency")
        plt.title(title)
        plt.legend()

    plt.tight_layout()
    plt.show()


In [None]:
import zstandard as zstd 
import msgpack
import os
import subprocess
import fitz

# ======================== Configuration ========================
# NOTE(review): all paths below are absolute, machine-specific Windows paths;
# adjust them before running elsewhere.
# Source PDF that the main block compresses.
INPUT_PDF = r'C:\Users\tahme\Desktop\server\data\M.M. THAMEED THOKY, D.pdf'
# Destination of the msgpack container written by pdf_to_bin().
COMPRESSED_BIN = r'C:\Users\tahme\Desktop\server\output\bin\output.bin'
# Destination of the PDF written back by bin_to_pdf().
RESTORED_PDF = r'C:\Users\tahme\Desktop\server\output\restored_txt\restored.pdf'
# Passed as level= to zstd.ZstdCompressor in pdf_to_bin().
COMPRESSION_LEVEL = 22
# Ghostscript console executable invoked by optimize_with_ghostscript().
GHOSTSCRIPT_PATH = r'C:\Program Files\gs\gs10.05.0\bin\gswin64c.exe'
# ================================================================

# Extra switches appended verbatim to the Ghostscript command line built in
# optimize_with_ghostscript(), after the device/output switches and before
# the input file.
GHOSTSCRIPT_OPTIONS = [
    "-dPDFSETTINGS=/ebook",
    "-dColorImageResolution=72",
    "-dGrayImageResolution=72",
    "-dMonoImageResolution=72",
    "-dDownsampleColorImages=true",
    "-dDownsampleGrayImages=true",
    "-dDownsampleMonoImages=true",
    "-dAutoFilterColorImages=false",
    "-dAutoFilterGrayImages=false",
    "-dColorImageDownsampleType=/Bicubic",
    "-dGrayImageDownsampleType=/Bicubic",
    "-dMonoImageDownsampleType=/Bicubic"
]

def format_size(size_bytes: int) -> str:
    """Convert a byte count to a human-readable string.

    The value is divided by 1024 until its magnitude drops below 1024 or the
    TB unit is reached. Negative values (e.g. a size *increase* reported as
    a negative reduction) are scaled by magnitude and keep their sign.

    Args:
        size_bytes: Number of bytes; may be negative.

    Returns:
        The scaled value with two decimals and its unit, e.g. ``"1.50 KB"``.
    """
    value = float(size_bytes)
    for unit in ('B', 'KB', 'MB', 'GB'):
        # Compare on magnitude so negative sizes are scaled too.
        if abs(value) < 1024:
            return f"{value:.2f} {unit}"
        value /= 1024
    return f"{value:.2f} TB"

def optimize_with_ghostscript(input_path: str, output_path: str) -> None:
    """Rewrite a PDF with Ghostscript's ``pdfwrite`` device.

    The command is passed to ``subprocess.run`` as an argument list without
    a shell, so paths containing spaces, commas or quotes need no manual
    quoting and cannot be reinterpreted by the command interpreter.

    Args:
        input_path: PDF to optimize.
        output_path: Where the optimized PDF is written; its parent
            directory is created if missing.

    Raises:
        FileNotFoundError: If ``GHOSTSCRIPT_PATH`` does not exist.
        RuntimeError: If Ghostscript exits with a non-zero status or does
            not produce the output file.
    """
    # Work with absolute paths so the result does not depend on the
    # working directory Ghostscript is started in.
    input_path = os.path.abspath(input_path)
    output_path = os.path.abspath(output_path)

    # Verify GS executable exists
    if not os.path.exists(GHOSTSCRIPT_PATH):
        raise FileNotFoundError(f"Ghostscript not found at {GHOSTSCRIPT_PATH}")

    # Create output directory
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # One list element per argument; no embedded quotes are needed.
    cmd = [
        GHOSTSCRIPT_PATH,
        '-q',
        '-dNOPAUSE',
        '-dBATCH',
        '-sDEVICE=pdfwrite',
        f'-sOutputFile={output_path}',
        *GHOSTSCRIPT_OPTIONS,
        input_path,
    ]

    print("\n🔧 Running Ghostscript command:")
    print(" ".join(cmd))  # display only; the list itself is what gets executed

    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"\n❌ Ghostscript failed with error {e.returncode}")
        raise RuntimeError("PDF optimization failed") from e

    # Verify output
    if not os.path.exists(output_path):
        raise RuntimeError("Optimized PDF not created")


def pdf_to_bin(pdf_path: str, bin_path: str) -> int:
    """Compress a PDF in two stages and store the result in a msgpack file.

    Stage 1 rewrites the PDF with ``optimize_with_ghostscript``; stage 2
    compresses the rewritten bytes with Zstandard at ``COMPRESSION_LEVEL``.
    The output file is a msgpack map with the keys ``zstd_compressed``,
    ``original_size`` and ``optimized_size``.

    Args:
        pdf_path: Source PDF.
        bin_path: Destination of the msgpack container; its parent
            directory is created if missing.

    Returns:
        Size in bytes of the written container file.

    Raises:
        FileNotFoundError: If the optimized PDF is missing after stage 1.
        Any exception raised by ``optimize_with_ghostscript`` propagates.
    """
    import tempfile

    print("🔧 Stage 1: PDF optimization with Ghostscript")

    # Create output directory for compressed binary (a bare file name has
    # no directory part, and os.makedirs('') would fail).
    bin_dir = os.path.dirname(bin_path)
    if bin_dir:
        os.makedirs(bin_dir, exist_ok=True)

    # Use a private temporary directory so the intermediate file can never
    # clobber (or cause deletion of) an unrelated file in the working
    # directory; the directory is removed on exit, also on failure.
    with tempfile.TemporaryDirectory() as work_dir:
        temp_pdf = os.path.join(work_dir, "temp_optimized.pdf")
        optimize_with_ghostscript(pdf_path, temp_pdf)

        print("\n🔧 Stage 2: Zstandard compression")
        if not os.path.exists(temp_pdf):
            raise FileNotFoundError("Optimized PDF not created")

        with open(temp_pdf, "rb") as f:
            optimized_bytes = f.read()

    # Perform compression
    cctx = zstd.ZstdCompressor(level=COMPRESSION_LEVEL)
    compressed = cctx.compress(optimized_bytes)

    # Package data; the sizes let bin_to_pdf() validate the restored bytes.
    data = {
        "zstd_compressed": compressed,
        "original_size": os.path.getsize(pdf_path),
        "optimized_size": len(optimized_bytes)
    }

    # Write compressed data
    with open(bin_path, "wb") as f:
        f.write(msgpack.packb(data))

    return os.path.getsize(bin_path)

def bin_to_pdf(bin_path: str, output_path: str) -> None:
    """Restore the optimized PDF stored in a msgpack/Zstandard container.

    All validation happens before the output file is touched, so a failure
    while reading or checking the container never deletes or truncates an
    output file that already existed. Only a failure during the write
    itself removes the (then partial) output.

    Args:
        bin_path: Container file holding a msgpack map with the keys
            ``zstd_compressed`` and ``optimized_size``.
        output_path: Where the restored PDF is written; its parent
            directory is created if missing.

    Raises:
        FileNotFoundError: If *bin_path* does not exist.
        ValueError: If the container lacks the expected keys or the
            decompressed size differs from the recorded ``optimized_size``.
    """
    if not os.path.exists(bin_path):
        raise FileNotFoundError(f"Compressed file not found: {bin_path}")

    with open(bin_path, "rb") as f:
        data = msgpack.unpackb(f.read())

    # Validate the container layout before using it.
    if (not isinstance(data, dict)
            or 'zstd_compressed' not in data
            or 'optimized_size' not in data):
        raise ValueError("Invalid compressed data format")

    # Decompress
    dctx = zstd.ZstdDecompressor()
    restored = dctx.decompress(data["zstd_compressed"])

    # Size validation
    if len(restored) != data["optimized_size"]:
        raise ValueError("Decompressed size mismatch")

    # Create output directory if needed (a bare file name has no directory
    # part, and os.makedirs('') would fail).
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Write restored PDF; remove it again if the write fails part-way.
    try:
        with open(output_path, "wb") as f:
            f.write(restored)
    except Exception:
        if os.path.exists(output_path):
            os.remove(output_path)
        raise

def print_stats(original_path: str, bin_path: str, optimized_size: int):
    """Print the original, optimized and final sizes and the total reduction.

    Args:
        original_path: Path of the original PDF; its size is read from disk.
        bin_path: Path of the compressed container; its size is read from
            disk.
        optimized_size: Size in bytes reported on the "Ghostscript Output"
            line.

    A missing file is reported with an error line instead of raising. A
    zero-byte original yields a 0.0% reduction rather than a division error.
    """
    try:
        original_size = os.path.getsize(original_path)
        compressed_size = os.path.getsize(bin_path)
    except FileNotFoundError as e:
        print(f"\n❌ Error calculating stats: {str(e)}")
        return

    print("\n📊 Final Compression Stats:")
    print(f"• Original PDF Size:   {format_size(original_size)}")
    print(f"• Ghostscript Output:  {format_size(optimized_size)}")
    print(f"• Final .bin Size:     {format_size(compressed_size)}")

    total_reduction = original_size - compressed_size
    # Guard against an empty original file.
    percent = total_reduction / original_size * 100 if original_size else 0.0
    print(f"\n🔥 Total Reduction: {format_size(total_reduction)} ({percent:.1f}%)")

if __name__ == "__main__":
    # Outputs this run has started to write. Only these are removed on
    # failure, so an early error (e.g. a missing input PDF) does not delete
    # results from a previous successful run.
    written_paths = []
    try:
        # Verify input file exists
        if not os.path.exists(INPUT_PDF):
            raise FileNotFoundError(f"Input PDF not found: {INPUT_PDF}")

        # Create output directories
        os.makedirs(os.path.dirname(COMPRESSED_BIN), exist_ok=True)
        os.makedirs(os.path.dirname(RESTORED_PDF), exist_ok=True)

        # Compression workflow
        written_paths.append(COMPRESSED_BIN)
        final_size = pdf_to_bin(INPUT_PDF, COMPRESSED_BIN)

        # pdf_to_bin() returns the container size; the Ghostscript output
        # size that print_stats() reports is recorded inside the container.
        with open(COMPRESSED_BIN, "rb") as f:
            optimized_size = msgpack.unpackb(f.read())["optimized_size"]
        print_stats(INPUT_PDF, COMPRESSED_BIN, optimized_size)

        # Decompression workflow
        written_paths.append(RESTORED_PDF)
        bin_to_pdf(COMPRESSED_BIN, RESTORED_PDF)

        # Validation: the restored file must open as a PDF.
        with fitz.open(RESTORED_PDF) as doc:
            print(f"\n✅ Restoration successful! Pages: {len(doc)}")
            print("Metadata:", doc.metadata)

    except Exception as e:
        print(f"\n❌ Critical error: {str(e)}")
        # Cleanup partial files produced by this run
        for path in written_paths:
            if os.path.exists(path):
                os.remove(path)