In [8]:
additional_camera_directories = [
    "/space/hall5/sitestore/eccc/mrd/rpnarmp/snow000/CYYZ/WxCam",
    "/space/hall5/sitestore/eccc/mrd/rpnarmp/snow000/CYFB/WxCam",
    "/space/hall5/sitestore/eccc/mrd/rpnarmp/snow000/CYXY/WxCam",
    "/space/hall5/sitestore/eccc/mrd/rpnarmp/snow000/CWSK/WxCam",
]

In [9]:
import os

def find_tgz_files_os_walk_grouped(directory):
    tgz_files_grouped = {}
    # Walk through directory
    for root, dirs, files in os.walk(directory):
        # Filter for .tgz files
        tgz_files = [os.path.join(root, file) for file in files if file.endswith('.tgz')]
        if tgz_files:  # Only add if .tgz files are found in the current directory
            tgz_files_grouped[root] = tgz_files
    return tgz_files_grouped

In [14]:
for directory in additional_camera_directories:
    print("=" * 50)
    print(f"Directory: {directory}")
    tgz_files_grouped = find_tgz_files_os_walk_grouped(directory)

    total_num_files = 0
    # Iterating through all the files in the found structure
    for directory, files in tgz_files_grouped.items():
        total_num_files += len(files)

    print(f"Found {total_num_files} .tgz files in {len(tgz_files_grouped)} directories.")

    # Iterating through all the files in the found structure
    for directory, files in tgz_files_grouped.items():
        print(f"{directory} contains {len(files)} .tgz files.")
    print("=" * 50)

Directory: /space/hall5/sitestore/eccc/mrd/rpnarmp/snow000/CYYZ/WxCam
Found 8030 .tgz files in 75 directories.
/space/hall5/sitestore/eccc/mrd/rpnarmp/snow000/CYYZ/WxCam/2018/201809 contains 97 .tgz files.
/space/hall5/sitestore/eccc/mrd/rpnarmp/snow000/CYYZ/WxCam/2018/201805 contains 96 .tgz files.
/space/hall5/sitestore/eccc/mrd/rpnarmp/snow000/CYYZ/WxCam/2018/201801 contains 124 .tgz files.
/space/hall5/sitestore/eccc/mrd/rpnarmp/snow000/CYYZ/WxCam/2018/201812 contains 12 .tgz files.
/space/hall5/sitestore/eccc/mrd/rpnarmp/snow000/CYYZ/WxCam/2018/201803 contains 124 .tgz files.
/space/hall5/sitestore/eccc/mrd/rpnarmp/snow000/CYYZ/WxCam/2018/201808 contains 124 .tgz files.
/space/hall5/sitestore/eccc/mrd/rpnarmp/snow000/CYYZ/WxCam/2018/201806 contains 120 .tgz files.
/space/hall5/sitestore/eccc/mrd/rpnarmp/snow000/CYYZ/WxCam/2018/201802 contains 112 .tgz files.
/space/hall5/sitestore/eccc/mrd/rpnarmp/snow000/CYYZ/WxCam/2018/201804 contains 107 .tgz files.
/space/hall5/sitestore/eccc/

In [15]:
import random

def reproducible_random_sort(lst, seed=42):
    # Set the seed for the random number generator to make the shuffle reproducible
    random.seed(seed)
    # Shuffle the list in place in a reproducible manner
    random.shuffle(lst)
    return lst

In [16]:
sorted_directories = reproducible_random_sort(list(tgz_files_grouped.keys()))

In [18]:
# Define target range for preprocessing
output_path = "/space/hall5/sitestore/eccc/mrd/rpnarmp/hol002/NavCan_WxCams_Sample_v4"
start_dir_index = 0
end_dir_index = len(sorted_directories)

In [19]:
tgz_paths = []

for directory in sorted_directories[start_dir_index:end_dir_index]:
    tgz_paths += tgz_files_grouped[directory]
    
print(f"Preprocessing {len(tgz_paths)} .tgz files.")

Preprocessing 11369 .tgz files.


In [20]:
import tarfile
import os

def extract_tgz_to_folder(input_path, output_path, folder_name):
    """
    Extracts a .tgz file to a specified folder within the output path.

    Parameters:
    - input_path: The path to the .tgz file to be extracted.
    - output_path: The base path where the extracted files will be saved.
    - folder_name: The name of the folder to create at the output path for storing the extracted files.

    Returns:
    - None
    """
    # Create the full output directory path with the folder name
    full_output_path = os.path.join(output_path, folder_name)
    
    # Check if the .tgz file exists
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"The file {input_path} does not exist.")
    
    # Create the full output directory (including folder name) if it does not exist
    if not os.path.exists(full_output_path):
        os.makedirs(full_output_path, exist_ok=True)
    
    # Open the .tgz file
    with tarfile.open(input_path, "r:gz") as file:
        # Extract all the contents into the full output directory
        file.extractall(path=full_output_path)

In [22]:
def format_folder_name(tgz_path):
    """
    Takes a string of a specific format and returns a formatted string.
    
    Parameters:
    - tgz_path: A string with the format '/space/hall5/sitestore/eccc/mrd/rpnarmp/snow000/NavCan_WxCams/CYUX/2024/202401/NavCan-WxCams_CYUX-W_2024-01-29.tgz'
    
    Returns:
    - A formatted string 'CYUX_2024_202401_NavCan-WxCams_CYUX-W_2024-01-29'
    """
    # Split the string by '/'
    parts = tgz_path.split('/')
    
    # Extract the necessary parts
    station = parts[9]  # 'CYUX'
    year = parts[10]     # '2024'
    month = parts[11]    # '202401'
    filename = parts[-1].replace('.tgz', '')  # 'NavCan-WxCams_CYUX-W_2024-01-29'
    
    # Construct the formatted string
    formatted_str = f"{station}_{year}_{month}_{filename}"
    
    return formatted_str

In [None]:
from tqdm import tqdm

for tgz_path in tqdm(tgz_paths):
    try:
        folder_name = format_folder_name(tgz_path)
        extract_tgz_to_folder(tgz_path, output_path, folder_name)
    except Exception as e:
        print(e)
        continue

  5%|▍         | 522/11369 [01:41<1:40:51,  1.79it/s]

Compressed file ended before the end-of-stream marker was reached


 99%|█████████▊| 11210/11369 [50:33<00:58,  2.70it/s] 

In [25]:
import os

def list_subdirectories_to_file(directory, output_file):
    """
    List all subdirectories in the given directory and store their names in a .txt file.

    Args:
    directory (str): The path to the directory whose subdirectories you want to list.
    output_file (str): The path to the output .txt file where the subdirectory names will be stored.
    """
    with open(output_file, 'w') as file:
        for item in os.listdir(directory):
            if os.path.isdir(os.path.join(directory, item)):
                print(item, file=file)

In [26]:
output_path

'/space/hall5/sitestore/eccc/mrd/rpnarmp/hol002/NavCan_WxCams_Sample_v4'

In [27]:
output_file_path = "NavCan_WxCams_Sample_v4_subdirectories.txt"

In [28]:
list_subdirectories_to_file(output_path, output_file_path)