In [2]:
import pyarrow as pa

# Arrow schema of the Parquet output, built from one (name, type, nullable)
# row per column. The five identifier/counter columns are declared
# non-nullable; "avg" and "result" may be null.
parquet_schema = pa.schema([
    pa.field(column_name, arrow_type, nullable=may_be_null)
    for column_name, arrow_type, may_be_null in (
        ("prb_id", pa.int64(), False),
        ("dst_addr", pa.string(), False),
        ("ts", pa.int64(), False),
        ("sent", pa.int64(), False),
        ("rcvd", pa.int64(), False),
        ("avg", pa.float64(), True),
        ("result", pa.string(), True),
    )
])


In [None]:
import json
import urllib.request
from pathlib import Path

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.parquetio import WriteToParquet
from apache_beam.options.pipeline_options import PipelineOptions

# 1. Pipeline options (adjust as needed for Dataflow/runner)
options = PipelineOptions([])

with beam.Pipeline(options=options) as p:
    (
        p
        | "ReadBZ2" >> ReadFromText(
            "https://data-store.ripe.net/datasets/atlas-daily-dumps/2025-06-07/ping-2025-06-07T0000.bz2",
            compression_type=CompressionTypes.BZIP2,   # tell Beam to decompress BZ2 :contentReference[oaicite:6]{index=6}
            strip_trailing_newlines=True
          )

        | "ParseJSON" >> beam.Map(json.loads)         # PCollection[dict] :contentReference[oaicite:7]{index=7}

        | "FilterSent" >> beam.Filter(
            lambda r: r.get("sent", -1) == 3
          )

        | "SelectFields" >> beam.Map(lambda r: {
            "prb_id":   r["prb_id"],
            "dst_addr": r["dst_addr"],
            "ts":       r["timestamp"],
            "sent":     r["sent"],
            "rcvd":     r["rcvd"],
            "avg":      r.get("avg"),
            # Keep result as JSON string or adjust to dict if desired
            "result":   json.dumps(r.get("result"))
          })

        | "WriteParquet" >> WriteToParquet(
            file_path_prefix="data/ping-2025-06-10T0000.parquet",
            schema=parquet_schema,
            file_name_suffix=".parquet",
            codec="snappy"
          )
    )
