In [1]:
from contextlib import ExitStack

import numpy as np
from pyhdf.HDF import HDF, HC
from pyhdf.SD import SD, SDC
from pyhdf.V import V
from pyhdf.VS import VS

In [8]:
# Input HDF file to read and the path of the subset file to write.
infile = (
    '/glade/work/swei/data/RawOBS/CALIPSO/CAL_LID_L2_05kmAPro-Standard-V4-20.2019-07-22T03-45-56ZN.hdf'
)
outfile = (
    '/glade/work/swei/data/RawOBS/CALIPSO/test_subset.hdf'
)

# Number of leading entries kept along the first dimension of each dataset.
N = 50

# Datasets to extract, listed by name; anything not named here is skipped.
var_names = (
    # geolocation and time
    ['Latitude', 'Longitude', 'Profile_Time', 'Profile_UTC_Time']
    # extinction coefficients and their uncertainties
    + ['Extinction_Coefficient_532', 'Extinction_Coefficient_1064']
    + ['Extinction_Coefficient_Uncertainty_532', 'Extinction_Coefficient_Uncertainty_1064']
    # quality information
    + ['Extinction_QC_Flag_532', 'Extinction_QC_Flag_1064', 'CAD_Score']
    # ancillary field
    + ['Pressure']
)

In [9]:
# Map a Python or numpy attribute value to the SDC type constant used to write it
def detect_sdc_type(value):
    """Return the ``SDC`` type constant that matches ``value``.

    Parameters
    ----------
    value : int, float, str, bytes, bytearray, numpy scalar, list, tuple or np.ndarray
        The attribute value whose storage type is needed.

    Returns
    -------
    ``SDC.INT32`` for Python or numpy integers, ``SDC.FLOAT64`` for Python or
    numpy floats, ``SDC.CHAR`` for str / bytes / bytearray. For a list, tuple
    or ndarray the elements must all be integers, all be floats, or all be
    str, and the matching constant is returned. An empty sequence yields
    ``SDC.INT32`` because the integer check passes vacuously.

    Raises
    ------
    TypeError
        If a sequence mixes element kinds, or ``value`` is of any other type.
    """
    # numpy scalars such as np.int32 / np.float32 are not subclasses of the
    # Python built-ins, so they have to be listed explicitly.
    int_types = (int, np.integer)
    float_types = (float, np.floating)

    if isinstance(value, int_types):
        return SDC.INT32
    if isinstance(value, float_types):
        return SDC.FLOAT64
    if isinstance(value, (str, bytes, bytearray)):
        return SDC.CHAR
    if isinstance(value, (list, tuple, np.ndarray)):
        # Flatten arrays so the element check sees scalars whatever the shape.
        items = value.ravel() if isinstance(value, np.ndarray) else value
        # Check if homogeneous
        if all(isinstance(v, int_types) for v in items):
            return SDC.INT32
        if all(isinstance(v, float_types) for v in items):
            return SDC.FLOAT64
        if all(isinstance(v, str) for v in items):
            return SDC.CHAR
        raise TypeError("Mixed types in list, can't determine SDC type")
    raise TypeError(f"Unsupported type: {type(value)}")

In [10]:
# -------- Step 1: Read SDS and subset --------
# Copy the first N entries along the leading dimension of every dataset named
# in var_names, together with its attributes, from infile into outfile.
# Callbacks registered on an ExitStack run in reverse order when its block
# exits, so every handle is released even if a step raises.
with ExitStack() as files:
    sd_in = SD(infile, SDC.READ)
    files.callback(sd_in.end)
    # SDC.TRUNC starts from an empty file, so re-running this cell does not
    # append a second copy of every dataset to an existing subset file.
    sd_out = SD(outfile, SDC.WRITE | SDC.CREATE | SDC.TRUNC)
    files.callback(sd_out.end)

    for sds_name in sd_in.datasets().keys():
        if sds_name not in var_names:
            continue
        print(sds_name)

        with ExitStack() as handles:
            sds = sd_in.select(sds_name)
            handles.callback(sds.endaccess)

            # Query once; item 1 is the rank and item 3 the SDC data type.
            sds_info = sds.info()
            ndim = sds_info[1]
            dtype = sds_info[3]

            # Subset the leading dimension only; the index is spelled out with
            # one slice per dimension.
            if ndim == 1:
                data = sds[:N]
            elif ndim == 2:
                data = sds[:N, :]
            elif ndim == 3:
                data = sds[:N, :, :]
            else:
                data = sds[:]  # Keep entire dataset if unknown shape

            # Create output SDS
            out_sds = sd_out.create(sds_name, dtype, data.shape)
            handles.callback(out_sds.endaccess)
            out_sds[:] = data

            # Write each attribute with the type it has in the input file
            # instead of guessing one from the Python value, which would e.g.
            # turn a float32 attribute into float64.
            # attr_info is (value, index, type, length).
            for attr_name, attr_info in sds.attributes(full=1).items():
                out_sds.attr(attr_name).set(attr_info[2], attr_info[0])

    # Copy global attributes, again keeping their stored types
    for attr_name, attr_info in sd_in.attributes(full=1).items():
        sd_out.attr(attr_name).set(attr_info[2], attr_info[0])

# -------- Step 2: Copy Vgroups and Vdata (metadata) --------
# Only the vdata named 'metadata' is copied from infile into the existing
# outfile; its field layout is recreated and all of its records are written.
print('Working on Vgroups')

# Callbacks registered on the stack run in reverse order when the block exits
# (normally or through an exception): detach the vdatas, end both VS
# interfaces, then close both files.
with ExitStack() as cleanup:
    hdf_in = HDF(infile, HC.READ)
    cleanup.callback(hdf_in.close)
    hdf_out = HDF(outfile, HC.WRITE)
    cleanup.callback(hdf_out.close)

    # Start each VS interface exactly once and pair it with end().
    vs_in = hdf_in.vstart()
    cleanup.callback(vs_in.end)
    vs_out = hdf_out.vstart()
    cleanup.callback(vs_out.end)

    # Copy metadata Vdata
    ref = vs_in.find('metadata')
    vdata = vs_in.attach(ref)
    cleanup.callback(vdata.detach)
    nrecs = vdata._nrecs
    vdname = vdata._name
    data = vdata.read(nrecs)

    # The new vdata is defined by (name, type, order) per field, which are
    # the first three items of each fieldinfo() entry.
    fields_str = [tuple(info[:3]) for info in vdata.fieldinfo()]

    vdata_new = vs_out.create(vdname, fields_str)
    cleanup.callback(vdata_new.detach)
    vdata_new.write(data)


Latitude
Longitude
Profile_Time
Profile_UTC_Time
Pressure
Extinction_QC_Flag_532
Extinction_QC_Flag_1064
CAD_Score
Extinction_Coefficient_532
Extinction_Coefficient_Uncertainty_532
Extinction_Coefficient_1064
Extinction_Coefficient_Uncertainty_1064
Working on Vgroups
