<a href="https://colab.research.google.com/github/vaishnav221/File_Upload_View_Download/blob/main/File_Compression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import os
import zipfile
import tarfile
import bz2
import lzma
import json
import io

def get_folder_size(folder):
    """Return the total size and number of files found under *folder*.

    The folder is walked recursively with ``os.walk``; both values cover
    every directory level, not just the top one.

    Args:
        folder: Path of the directory to measure.

    Returns:
        A ``(total_size, num_files)`` tuple: the summed size in bytes of all
        files and how many files were counted. ``(0, 0)`` is returned when
        the walk yields nothing (for example, the path does not exist).
    """
    total_size = 0
    num_files = 0
    for dirpath, _dirnames, filenames in os.walk(folder):
        for name in filenames:
            total_size += os.path.getsize(os.path.join(dirpath, name))
        # Count per directory while walking; reading ``filenames`` after the
        # loop would only reflect the last directory visited.
        num_files += len(filenames)
    return total_size, num_files

def show_folder_contents(folder, description):
    """Print a one-line summary of *folder*: file count and total size in KB.

    Args:
        folder: Path of the directory to summarise.
        description: Label printed in front of the summary.

    If *folder* is not an existing directory, a "does not exist" message is
    printed instead and nothing is measured.
    """
    # isdir (rather than exists) so a path that points at a regular file is
    # reported as a missing folder instead of failing further down.
    if not os.path.isdir(folder):
        print(f"{description} ({folder}) does not exist.")
        return

    total_size, num_files = get_folder_size(folder)
    print(f"{description} ({folder}): {num_files} files, {total_size / 1024:.2f} KB")

def determine_compression_method(folder):
    """Pick a compression method name for *folder*.

    The rules are checked in order:

    1. total size above 100 MiB            -> ``"tar.gz"``
    2. any top-level entry ending in ``.txt``, ``.csv`` or ``.log``
                                            -> ``"zip"``
    3. total size below 10 MiB             -> ``"bz2"``
    4. anything else                       -> ``"xz"``

    Args:
        folder: Path of the directory that is going to be compressed.

    Returns:
        One of ``"tar.gz"``, ``"zip"``, ``"bz2"`` or ``"xz"``.
    """
    mebibyte = 1024 * 1024
    folder_bytes = get_folder_size(folder)[0]

    # Lower-cased extensions of the entries directly inside the folder.
    suffixes = set()
    for entry in os.listdir(folder):
        suffixes.add(os.path.splitext(entry)[1].lower())

    if folder_bytes > 100 * mebibyte:
        return "tar.gz"
    if suffixes & {'.txt', '.csv', '.log'}:
        return "zip"
    if folder_bytes < 10 * mebibyte:
        return "bz2"
    return "xz"

def _list_archive_files(input_folder):
    """Return names of regular files directly inside *input_folder*, reverse-sorted."""
    return [
        name
        for name in sorted(os.listdir(input_folder), reverse=True)
        if os.path.isfile(os.path.join(input_folder, name))
    ]

def _split_streams(data, new_decompressor):
    """Decompress back-to-back compressed streams, returning one payload per stream."""
    payloads = []
    while data:
        decompressor = new_decompressor()
        payloads.append(decompressor.decompress(data))
        if not decompressor.eof:
            raise ValueError("Compressed data ends in the middle of a stream.")
        # Whatever follows the finished stream belongs to the next file.
        data = decompressor.unused_data
    return payloads

def compress_folder(input_folder, compression_method):
    """Compress the files directly inside *input_folder* into one archive.

    Files are taken in reverse-sorted name order and each is assigned its
    position in that order as a string index. The name -> index mapping is
    written to ``index_map.json``. Subdirectories are skipped for every
    method, so the archive and the index map always describe the same files.

    Archive layout per method (all written to the current working directory):

    * ``"zip"``    -> ``compressed_files.zip``, one member per file, named by index.
    * ``"tar.gz"`` -> ``compressed_files.tar.gz``, one member per file, named by index.
    * ``"bz2"``    -> ``compressed_files.bz2``, one bz2 stream per file, in index order.
    * ``"xz"``     -> ``compressed_files.xz``, one xz stream per file, in index order.

    Writing one stream per file (instead of a single stream holding the
    concatenated bytes) is what keeps file boundaries recoverable on
    extraction.

    Args:
        input_folder: Directory whose files are compressed.
        compression_method: ``"zip"``, ``"tar.gz"``, ``"bz2"`` or ``"xz"``.

    Returns:
        ``(compressed_file, "index_map.json")``: the archive filename and the
        index-map filename.

    Raises:
        ValueError: If *compression_method* is not one of the supported names.
    """
    file_names = _list_archive_files(input_folder)
    index_map = {name: str(idx) for idx, name in enumerate(file_names)}
    # Measure exactly the files that go into the archive so the reported
    # ratio compares like with like.
    original_size = sum(
        os.path.getsize(os.path.join(input_folder, name)) for name in file_names
    )

    if compression_method == "zip":
        compressed_file = "compressed_files.zip"
        with zipfile.ZipFile(compressed_file, "w", zipfile.ZIP_DEFLATED) as zipf:
            for name in file_names:
                zipf.write(os.path.join(input_folder, name), arcname=index_map[name])

    elif compression_method == "tar.gz":
        compressed_file = "compressed_files.tar.gz"
        with tarfile.open(compressed_file, "w:gz") as tarf:
            for name in file_names:
                tarf.add(os.path.join(input_folder, name), arcname=index_map[name])

    elif compression_method in ("bz2", "xz"):
        compress = bz2.compress if compression_method == "bz2" else lzma.compress
        compressed_file = f"compressed_files.{compression_method}"
        with open(compressed_file, "wb") as archive:
            for name in file_names:
                with open(os.path.join(input_folder, name), "rb") as src:
                    # One complete stream per file; streams are simply
                    # appended, which is still a valid bz2/xz file.
                    archive.write(compress(src.read()))

    else:
        raise ValueError(f"Unsupported compression method: {compression_method!r}")

    compressed_size = os.path.getsize(compressed_file)
    if original_size:
        ratio = compressed_size / original_size
        compression_success = (1 - ratio) * 100
    else:
        # Nothing to compress: avoid dividing by zero and report neutral stats.
        ratio = 0.0
        compression_success = 0.0

    with open("index_map.json", "w") as json_file:
        json.dump(index_map, json_file)

    print(f"Files compressed into {compressed_file} using {compression_method}.")
    print(f"Original Size: {original_size / 1024:.2f} KB, Compressed Size: {compressed_size / 1024:.2f} KB")
    print(f"Compression Ratio: {ratio:.2%}")
    print(f"Compression Success: {compression_success:.2f}%")
    return compressed_file, "index_map.json"

def extract_folder(compressed_file, index_map_filename, output_folder):
    """Extract *compressed_file* into *output_folder*, restoring original names.

    The archive format is chosen from the filename suffix (``.zip``,
    ``.tar.gz``, ``.bz2`` or ``.xz``). Original filenames are recovered from
    the JSON index map (name -> index) written at compression time.

    For ``.bz2`` / ``.xz`` archives the file is read as a sequence of
    independent compressed streams; the N-th stream is written to the file
    whose index is N.

    Args:
        compressed_file: Path of the archive to extract.
        index_map_filename: Path of the JSON file mapping names to indices.
        output_folder: Destination directory; created if missing.

    Raises:
        KeyError: If a zip/tar member name has no entry in the index map.
        ValueError: If a ``.bz2`` / ``.xz`` archive is truncated, or its
            number of streams differs from the number of indexed files
            (file boundaries cannot be recovered in that case).
    """
    os.makedirs(output_folder, exist_ok=True)
    with open(index_map_filename, "r") as json_file:
        index_map = json.load(json_file)
    reverse_map = {str(v): k for k, v in index_map.items()}

    if compressed_file.endswith(".zip"):
        with zipfile.ZipFile(compressed_file, 'r') as zip_ref:
            for info in zip_ref.infolist():
                if info.is_dir():
                    continue  # directory entries carry no file content
                extracted_path = os.path.join(output_folder, reverse_map[info.filename])
                with open(extracted_path, "wb") as out:
                    out.write(zip_ref.read(info))

    elif compressed_file.endswith(".tar.gz"):
        with tarfile.open(compressed_file, 'r:gz') as tar_ref:
            for member in tar_ref.getmembers():
                if not member.isfile():
                    continue  # extractfile() returns None for non-regular members
                extracted_path = os.path.join(output_folder, reverse_map[member.name])
                with open(extracted_path, "wb") as out:
                    out.write(tar_ref.extractfile(member).read())

    elif compressed_file.endswith((".bz2", ".xz")):
        with open(compressed_file, "rb") as compressed_f:
            raw = compressed_f.read()
        if compressed_file.endswith(".bz2"):
            payloads = _split_streams(raw, bz2.BZ2Decompressor)
        else:
            payloads = _split_streams(raw, lzma.LZMADecompressor)

        ordered_names = sorted(index_map, key=lambda name: int(index_map[name]))
        if len(payloads) != len(ordered_names):
            raise ValueError(
                f"{compressed_file} holds {len(payloads)} compressed stream(s) but the "
                f"index map lists {len(ordered_names)} file(s); cannot split the data."
            )
        for name, payload in zip(ordered_names, payloads):
            with open(os.path.join(output_folder, name), "wb") as out:
                out.write(payload)

    extracted_size, num_files = get_folder_size(output_folder)
    print("Files extracted successfully using index mapping.")
    print(f"Extracted Folder Size: {extracted_size / 1024:.2f} KB, Number of Files: {num_files}")
    show_folder_contents(output_folder, "Extracted Folder Contents")

# Script entry point: prompt for a source and a destination folder, compress
# the source with an automatically chosen method, then extract the archive
# into the destination.
if __name__ == "__main__":
    input_folder = input("Enter the full path of the folder to compress: ")
    output_folder = input("Enter the full path of the folder to extract files: ")
    # Print a summary of the source folder before compressing it.
    show_folder_contents(input_folder, "Original Folder Contents")

    # The method is picked from the folder's total size and file extensions.
    compression_method = determine_compression_method(input_folder)
    # The archive and index_map.json are written to the current working directory.
    compressed_file, index_map_filename = compress_folder(input_folder, compression_method)
    extract_folder(compressed_file, index_map_filename, output_folder)


Enter the full path of the folder to compress: /content/Sample_folder
Enter the full path of the folder to extract files: /content/Copy_Folder1
Original Folder Contents (/content/Sample_folder): 10 files, 2534.27 KB
Files compressed into compressed_files.bz2 using bz2.
Original Size: 2534.27 KB, Compressed Size: 2442.91 KB
Compression Ratio: 96.40%
Compression Success: 3.60%
Files extracted successfully using index mapping.
Extracted Folder Size: 2534.27 KB, Number of Files: 10
Extracted Folder Contents (/content/Copy_Folder1): 10 files, 2534.27 KB
