In [1]:
# List all files in data/raw_ping/ and for each one
# check if a parquet version already exists in the ping folder
# If not, decompress the bz2 file, parse it to parquet, and delete the decompressed version
# Process all files in parallel

import os
import subprocess
import json
import shutil
import pyarrow as pa
import dask.dataframe as dd
from dask.distributed import Client
import dask.bag as db
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import threading

in_progress = set()
in_progress_lock = threading.Lock()

client = Client()
print(client.dashboard_link)

# Define the schema for parquet files
parquet_schema = pa.schema([
    pa.field("prb_id", pa.int64(), nullable=False),
    pa.field("dst_addr", pa.string(), nullable=False),
    pa.field("ts", pa.int64(), nullable=False),
    pa.field("sent", pa.int64(), nullable=False),
    pa.field("rcvd", pa.int64(), nullable=False),
    pa.field("avg", pa.float64(), nullable=True),
    pa.field("result", pa.string(), nullable=True),
])

raw_dir = "data/raw_ping/"
decomp_dir = "data/decomp_ping/"
ping_dir = "data/ping/"

# Ensure output directories exist
for d in [decomp_dir, ping_dir]:
    os.makedirs(d, exist_ok=True)

# Get all files in raw_dir
raw_files = [f for f in os.listdir(raw_dir) if os.path.isfile(os.path.join(raw_dir, f))]

print(f"Raw files: {raw_files}")

# Collect files that need to be processed
to_process = []
for raw_file in raw_files:
    base_name = os.path.splitext(raw_file)[0]
    parquet_path = os.path.join(ping_dir, base_name + '.parquet')
    
    # Check if parquet file already exists
    if not os.path.exists(parquet_path) and raw_file.endswith('.bz2'):
        to_process.append(raw_file)

print(f"To process: {to_process}")

def process_single_file(raw_file):
    """
    Decompress a bz2 file, parse it to parquet, and clean up the decompressed file
    """
    base_name = os.path.splitext(raw_file)[0]
    raw_path = os.path.join(raw_dir, raw_file)
    decomp_path = os.path.join(decomp_dir, base_name + '.json')
    parquet_path = os.path.join(ping_dir, base_name + '.parquet')
    

    print(f"Starting processing: {raw_file} -> {parquet_path}")

    with in_progress_lock:
        in_progress.add(decomp_path)
        in_progress.add(parquet_path)
    try:
        # Step 1: Decompress the bz2 file (skip if decompressed file already exists)
        if not os.path.exists(decomp_path):
            print(f"  Decompressing: {raw_file}")
            cmd = ["bzip2", "-dc", raw_path]
            with open(decomp_path, "wb") as f_out:
                subprocess.run(cmd, stdout=f_out, check=True)
        else:
            print(f"  Using existing decompressed file: {decomp_path}")

        # Remove already decompressed file from in_progress set
        with in_progress_lock:
            in_progress.discard(decomp_path)
        
        # Step 2: Parse the decompressed JSON to parquet
        print(f"  Parsing to parquet: {base_name}.json")
        
        # Read the decompressed file as a bag of lines
        bag = db.read_text(decomp_path, blocksize="128MB")
        
        # Parse JSON and filter
        processed_bag = (bag
            .map(json.loads)  # Parse JSON
            .filter(lambda r: r.get("sent", -1) == 3)  # Filter for sent == 3
            .map(lambda r: {  # Select and transform fields
                "prb_id": r["prb_id"],
                "dst_addr": r["dst_addr"],
                "ts": r["timestamp"],
                "sent": r["sent"],
                "rcvd": r["rcvd"],
                "avg": r.get("avg"),
                "result": json.dumps(r.get("result"))
            })
        )
        
        # Convert to DataFrame and save as parquet
        df = processed_bag.to_dataframe()
        df.to_parquet(parquet_path, engine='pyarrow', schema=parquet_schema)
        
        if os.path.exists(parquet_path):
            # Step 3: Clean up the decompressed file
            print(f"  Cleaning up: {decomp_path}")
            os.remove(decomp_path)
        else:
            print(f"  Parquet file not created: {parquet_path}, skipping cleanup of {decomp_path}")
            
        print(f"Completed processing: {raw_file} -> {parquet_path}")

        with in_progress_lock:
            in_progress.discard(parquet_path)
        
        return parquet_path
        
    except Exception as e:
        print(f"Error processing {raw_file}: {e}")
        # Clean up decompressed file if it exists and there was an error
        if os.path.exists(decomp_path):
            try:
                os.remove(decomp_path)
                print(f"  Cleaned up decompressed file after error: {decomp_path}")
            except:
                pass
        
        with in_progress_lock:
            in_progress.discard(decomp_path)
            in_progress.discard(parquet_path)
        
        return None

try:
    if to_process:
        with ThreadPoolExecutor() as executor:
            results = list(executor.map(process_single_file, to_process))
        
        # Report results
        successful = [r for r in results if r is not None]
        failed = len(results) - len(successful)
        
        print(f"\nProcessing complete!")
        print(f"Successfully processed: {len(successful)} files")
        print(f"Failed: {failed} files")
        
        if successful:
            print("Successfully created parquet files:")
            for result in successful:
                print(f"  {result}")
    else:
        print("No files to process.")
except KeyboardInterrupt:
    print("\nKeyboardInterrupt detected! Cleaning up unfinished files...")
    with in_progress_lock:
        for f in list(in_progress):
            if os.path.exists(f):
                try:
                    if os.path.isdir(f):
                        shutil.rmtree(f)
                        print(f"  Deleted unfinished directory: {f}")
                    else:
                        os.remove(f)
                        print(f"  Deleted unfinished file: {f}")
                except Exception as e:
                    print(f"  Could not delete {f}: {e}")

http://127.0.0.1:8787/status
Raw files: ['ping-2025-06-24T0000.bz2', 'ping-2025-06-24T0100.bz2', 'ping-2025-06-24T0200.bz2', 'ping-2025-06-24T0300.bz2', 'ping-2025-06-24T0400.bz2', 'ping-2025-06-24T0500.bz2', 'ping-2025-06-24T0600.bz2', 'ping-2025-06-24T0700.bz2', 'ping-2025-06-24T0800.bz2', 'ping-2025-06-24T0900.bz2', 'ping-2025-06-24T1000.bz2', 'ping-2025-06-24T1100.bz2', 'ping-2025-06-24T1200.bz2', 'ping-2025-06-24T1300.bz2', 'ping-2025-06-24T1400.bz2', 'ping-2025-06-24T1500.bz2', 'ping-2025-06-24T1600.bz2', 'ping-2025-06-24T1700.bz2', 'ping-2025-06-24T1800.bz2', 'ping-2025-06-24T1900.bz2', 'ping-2025-06-24T2000.bz2', 'ping-2025-06-24T2100.bz2', 'ping-2025-06-24T2200.bz2', 'ping-2025-06-24T2300.bz2', 'ping-2025-06-25T0000.bz2', 'ping-2025-06-25T0100.bz2', 'ping-2025-06-25T0200.bz2', 'ping-2025-06-25T0300.bz2', 'ping-2025-06-25T0400.bz2', 'ping-2025-06-25T0500.bz2', 'ping-2025-06-25T0600.bz2', 'ping-2025-06-25T0700.bz2', 'ping-2025-06-25T0800.bz2', 'ping-2025-06-25T0900.bz2', 'ping-2

2025-07-27 14:58:44,205 - distributed.diskutils - ERROR - Failed to remove '/tmp/dask-scratch-space/worker-ta120b3i/shuffle-fce8f3b51244e073fa0ea1a48b36718f-3/428' (failed in <built-in function unlink>): [Errno 2] No such file or directory: '428'
2025-07-27 14:58:44,205 - distributed.diskutils - ERROR - Failed to remove '/tmp/dask-scratch-space/worker-ta120b3i/shuffle-fce8f3b51244e073fa0ea1a48b36718f-3/1058' (failed in <built-in function unlink>): [Errno 2] No such file or directory: '1058'
2025-07-27 14:58:44,205 - distributed.diskutils - ERROR - Failed to remove '/tmp/dask-scratch-space/worker-ta120b3i/shuffle-fce8f3b51244e073fa0ea1a48b36718f-3/1087' (failed in <built-in function unlink>): [Errno 2] No such file or directory: '1087'
2025-07-27 14:58:44,205 - distributed.diskutils - ERROR - Failed to remove '/tmp/dask-scratch-space/worker-ta120b3i/shuffle-fce8f3b51244e073fa0ea1a48b36718f-3/1851' (failed in <built-in function unlink>): [Errno 2] No such file or directory: '1851'
2025-0


KeyboardInterrupt detected! Cleaning up unfinished files...
  Deleted unfinished directory: data/ping/ping-2025-07-08T0500.parquet
  Could not delete data/ping/ping-2025-07-08T0700.parquet: [Errno 39] Directory not empty: 'data/ping/ping-2025-07-08T0700.parquet'
  Deleted unfinished directory: data/ping/ping-2025-07-08T0100.parquet
  Deleted unfinished directory: data/ping/ping-2025-07-08T0900.parquet
  Deleted unfinished directory: data/ping/ping-2025-07-08T0200.parquet
  Deleted unfinished directory: data/ping/ping-2025-07-07T2300.parquet
  Deleted unfinished directory: data/ping/ping-2025-07-08T2200.parquet
  Deleted unfinished directory: data/ping/ping-2025-07-07T2100.parquet
  Deleted unfinished directory: data/ping/ping-2025-07-07T2200.parquet
  Deleted unfinished directory: data/ping/ping-2025-07-08T1500.parquet
  Deleted unfinished directory: data/ping/ping-2025-07-07T2000.parquet
  Deleted unfinished directory: data/ping/ping-2025-07-08T0300.parquet
  Deleted unfinished direc

2025-07-27 15:24:36,470 - distributed.worker - ERROR - Compute Failed
Key:       ('toparquetdata-a07bb979b90a3621e3cde2aa1710f9df', 42)
State:     executing
Task:  <Task ('toparquetdata-a07bb979b90a3621e3cde2aa1710f9df', 42) to-parquet(...)>
Exception: "FileNotFoundError(2, 'No such file or directory')"
Traceback: '  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/core.py", line 158, in __call__\n    return self.engine.write_partition(\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/arrow.py", line 852, in write_partition\n    with fs.open(fs.sep.join([path, filename]), "wb") as fil:\n         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/fsspec/spec.py", line 1338, in open\n    f = self._ope

Error processing ping-2025-07-08T0500.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-08T0500.parquet/part.42.parquet'
Error processing ping-2025-07-07T1800.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-07T1800.parquet/part.42.parquet'
Error processing ping-2025-07-08T0800.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-08T0800.parquet/part.41.parquet'
Error processing ping-2025-07-07T2200.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-07T2200.parquet/part.41.parquet'
Error processing ping-2025-07-08T1800.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-08T1800.parquet/part.41.parquet'
Error processing ping-202

2025-07-27 15:24:38,288 - distributed.worker - ERROR - Compute Failed
Key:       ('toparquetdata-b23c68fa209f37336f56ea2ba5bc2512', 41)
State:     executing
Task:  <Task ('toparquetdata-b23c68fa209f37336f56ea2ba5bc2512', 41) to-parquet(...)>
Exception: "FileNotFoundError(2, 'No such file or directory')"
Traceback: '  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/core.py", line 158, in __call__\n    return self.engine.write_partition(\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/arrow.py", line 852, in write_partition\n    with fs.open(fs.sep.join([path, filename]), "wb") as fil:\n         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/fsspec/spec.py", line 1338, in open\n    f = self._ope

Error processing ping-2025-07-08T0600.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-08T0600.parquet/part.41.parquet'
Error processing ping-2025-07-08T0200.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-08T0200.parquet/part.41.parquet'
Error processing ping-2025-07-08T1500.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-08T1500.parquet/part.41.parquet'Error processing ping-2025-07-08T1400.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-08T1400.parquet/part.42.parquet'
Error processing ping-2025-07-08T1100.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-08T1100.parquet/part.41.parquet'

  Cleaned up decompressed

2025-07-27 15:24:42,358 - distributed.worker - ERROR - Compute Failed
Key:       ('toparquetdata-c162fd546a178cfac64f717884350900', 41)
State:     executing
Task:  <Task ('toparquetdata-c162fd546a178cfac64f717884350900', 41) to-parquet(...)>
Exception: "FileNotFoundError(2, 'No such file or directory')"
Traceback: '  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/core.py", line 158, in __call__\n    return self.engine.write_partition(\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/arrow.py", line 852, in write_partition\n    with fs.open(fs.sep.join([path, filename]), "wb") as fil:\n         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/fsspec/spec.py", line 1338, in open\n    f = self._ope

Error processing ping-2025-07-07T2100.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-07T2100.parquet/part.41.parquet'
Error processing ping-2025-07-08T1000.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-08T1000.parquet/part.41.parquet'
Error processing ping-2025-07-07T1700.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-07T1700.parquet/part.42.parquet'
  Cleaned up decompressed file after error: data/decomp_ping/ping-2025-07-08T1000.json
  Cleaned up decompressed file after error: data/decomp_ping/ping-2025-07-07T2100.json
  Cleaned up decompressed file after error: data/decomp_ping/ping-2025-07-07T1700.json


2025-07-27 15:24:53,351 - distributed.worker - ERROR - Compute Failed
Key:       ('toparquetdata-9fc727963ab558ebbaa3633867714ae4', 42)
State:     executing
Task:  <Task ('toparquetdata-9fc727963ab558ebbaa3633867714ae4', 42) to-parquet(...)>
Exception: "FileNotFoundError(2, 'No such file or directory')"
Traceback: '  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/core.py", line 158, in __call__\n    return self.engine.write_partition(\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/arrow.py", line 852, in write_partition\n    with fs.open(fs.sep.join([path, filename]), "wb") as fil:\n         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/fsspec/spec.py", line 1338, in open\n    f = self._ope

Error processing ping-2025-07-08T0300.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-08T0300.parquet/part.42.parquet'Error processing ping-2025-07-07T1900.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-07T1900.parquet/part.40.parquet'
Error processing ping-2025-07-08T0100.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-08T0100.parquet/part.42.parquet'

  Cleaned up decompressed file after error: data/decomp_ping/ping-2025-07-07T1900.json
  Cleaned up decompressed file after error: data/decomp_ping/ping-2025-07-08T0300.json
  Cleaned up decompressed file after error: data/decomp_ping/ping-2025-07-08T0100.json


2025-07-27 15:24:55,446 - distributed.worker - ERROR - Compute Failed
Key:       ('toparquetdata-a96f261e5eedc019beb09852ad65db0d', 42)
State:     executing
Task:  <Task ('toparquetdata-a96f261e5eedc019beb09852ad65db0d', 42) to-parquet(...)>
Exception: "FileNotFoundError(2, 'No such file or directory')"
Traceback: '  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/core.py", line 158, in __call__\n    return self.engine.write_partition(\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/arrow.py", line 852, in write_partition\n    with fs.open(fs.sep.join([path, filename]), "wb") as fil:\n         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/fsspec/spec.py", line 1338, in open\n    f = self._ope

Error processing ping-2025-07-08T2200.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-08T2200.parquet/part.42.parquet'Error processing ping-2025-07-08T1600.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-08T1600.parquet/part.42.parquet'
Error processing ping-2025-07-07T2300.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-07T2300.parquet/part.42.parquet'

Error processing ping-2025-07-08T0000.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-08T0000.parquet/part.39.parquet'


2025-07-27 15:24:55,712 - distributed.worker - ERROR - Compute Failed
Key:       ('toparquetdata-d0cd83da95dbd8f3cc2c8ce367324f1a', 40)
State:     executing
Task:  <Task ('toparquetdata-d0cd83da95dbd8f3cc2c8ce367324f1a', 40) to-parquet(...)>
Exception: "FileNotFoundError(2, 'No such file or directory')"
Traceback: '  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/core.py", line 158, in __call__\n    return self.engine.write_partition(\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/arrow.py", line 852, in write_partition\n    with fs.open(fs.sep.join([path, filename]), "wb") as fil:\n         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/fsspec/spec.py", line 1338, in open\n    f = self._ope

Error processing ping-2025-07-08T1200.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-08T1200.parquet/part.40.parquet'
  Cleaned up decompressed file after error: data/decomp_ping/ping-2025-07-07T2300.json
  Cleaned up decompressed file after error: data/decomp_ping/ping-2025-07-08T1600.json
  Cleaned up decompressed file after error: data/decomp_ping/ping-2025-07-08T2200.json
  Cleaned up decompressed file after error: data/decomp_ping/ping-2025-07-08T0000.json
  Cleaned up decompressed file after error: data/decomp_ping/ping-2025-07-08T1200.json


2025-07-27 15:24:57,251 - distributed.worker - ERROR - Compute Failed
Key:       ('toparquetdata-54dd6bf94d3a74d6dce32bd8179347ec', 41)
State:     executing
Task:  <Task ('toparquetdata-54dd6bf94d3a74d6dce32bd8179347ec', 41) to-parquet(...)>
Exception: "FileNotFoundError(2, 'No such file or directory')"
Traceback: '  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/core.py", line 158, in __call__\n    return self.engine.write_partition(\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/arrow.py", line 852, in write_partition\n    with fs.open(fs.sep.join([path, filename]), "wb") as fil:\n         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/fsspec/spec.py", line 1338, in open\n    f = self._ope

Error processing ping-2025-07-08T0900.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-08T0900.parquet/part.41.parquet'
  Cleaned up decompressed file after error: data/decomp_ping/ping-2025-07-08T0900.json


2025-07-27 15:25:13,855 - distributed.worker - ERROR - Compute Failed
Key:       ('toparquetdata-c614adcae1858b2f90fcaef2c0a87edd', 41)
State:     executing
Task:  <Task ('toparquetdata-c614adcae1858b2f90fcaef2c0a87edd', 41) to-parquet(...)>
Exception: "FileNotFoundError(2, 'No such file or directory')"
Traceback: '  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/core.py", line 158, in __call__\n    return self.engine.write_partition(\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/arrow.py", line 852, in write_partition\n    with fs.open(fs.sep.join([path, filename]), "wb") as fil:\n         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/fsspec/spec.py", line 1338, in open\n    f = self._ope

Error processing ping-2025-07-07T2000.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-07T2000.parquet/part.41.parquet'
Error processing ping-2025-07-08T1900.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-08T1900.parquet/part.37.parquet'
Error processing ping-2025-07-08T0400.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-08T0400.parquet/part.41.parquet'
  Cleaned up decompressed file after error: data/decomp_ping/ping-2025-07-08T1900.json
  Cleaned up decompressed file after error: data/decomp_ping/ping-2025-07-07T2000.json
  Cleaned up decompressed file after error: data/decomp_ping/ping-2025-07-08T0400.json


2025-07-27 15:25:14,897 - distributed.worker - ERROR - Compute Failed
Key:       ('toparquetdata-15bcf3db9e6117f10a805509bc8a2ff1', 40)
State:     executing
Task:  <Task ('toparquetdata-15bcf3db9e6117f10a805509bc8a2ff1', 40) to-parquet(...)>
Exception: "FileNotFoundError(2, 'No such file or directory')"
Traceback: '  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/core.py", line 158, in __call__\n    return self.engine.write_partition(\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/arrow.py", line 852, in write_partition\n    with fs.open(fs.sep.join([path, filename]), "wb") as fil:\n         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/fsspec/spec.py", line 1338, in open\n    f = self._ope

Error processing ping-2025-07-08T2300.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-08T2300.parquet/part.99.parquet'
  Cleaned up decompressed file after error: data/decomp_ping/ping-2025-07-08T2300.json


2025-07-27 15:26:44,039 - distributed.worker - ERROR - Compute Failed
Key:       ('toparquetdata-298de082cc44b195755ada1d06292598', 99)
State:     executing
Task:  <Task ('toparquetdata-298de082cc44b195755ada1d06292598', 99) to-parquet(...)>
Exception: "FileNotFoundError(2, 'No such file or directory')"
Traceback: '  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/core.py", line 158, in __call__\n    return self.engine.write_partition(\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/arrow.py", line 852, in write_partition\n    with fs.open(fs.sep.join([path, filename]), "wb") as fil:\n         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/fsspec/spec.py", line 1338, in open\n    f = self._ope

Error processing ping-2025-07-08T2100.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-08T2100.parquet/part.99.parquet'
  Cleaned up decompressed file after error: data/decomp_ping/ping-2025-07-08T2100.json


2025-07-27 15:26:54,003 - distributed.worker - ERROR - Compute Failed
Key:       ('toparquetdata-e580edcf86f718620d334f98a171ca29', 98)
State:     executing
Task:  <Task ('toparquetdata-e580edcf86f718620d334f98a171ca29', 98) to-parquet(...)>
Exception: "FileNotFoundError(2, 'No such file or directory')"
Traceback: '  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/core.py", line 158, in __call__\n    return self.engine.write_partition(\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/arrow.py", line 852, in write_partition\n    with fs.open(fs.sep.join([path, filename]), "wb") as fil:\n         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/fsspec/spec.py", line 1338, in open\n    f = self._ope

Error processing ping-2025-07-08T2000.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-08T2000.parquet/part.98.parquet'
  Cleaned up decompressed file after error: data/decomp_ping/ping-2025-07-08T2000.json
  Cleaning up: data/decomp_ping/ping-2025-07-08T0700.json
Completed processing: ping-2025-07-08T0700.bz2 -> data/ping/ping-2025-07-08T0700.parquet
  Cleaning up: data/decomp_ping/ping-2025-07-08T1700.json
Completed processing: ping-2025-07-08T1700.bz2 -> data/ping/ping-2025-07-08T1700.parquet


2025-07-27 15:27:30,515 - distributed.worker - ERROR - Compute Failed
Key:       ('toparquetdata-b4a4f9728062a7ea3005f481629eadc1', 89)
State:     executing
Task:  <Task ('toparquetdata-b4a4f9728062a7ea3005f481629eadc1', 89) to-parquet(...)>
Exception: "FileNotFoundError(2, 'No such file or directory')"
Traceback: '  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/core.py", line 158, in __call__\n    return self.engine.write_partition(\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/arrow.py", line 852, in write_partition\n    with fs.open(fs.sep.join([path, filename]), "wb") as fil:\n         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/.venv/lib/python3.12/site-packages/fsspec/spec.py", line 1338, in open\n    f = self._ope

Error processing ping-2025-07-09T0000.bz2: [Errno 2] No such file or directory: '/scratch/workspace/zevwilson_umass_edu-simple/ping-ingest/data/ping/ping-2025-07-09T0000.parquet/part.89.parquet'
  Cleaned up decompressed file after error: data/decomp_ping/ping-2025-07-09T0000.json
