In [105]:
import numpy as np
import h5py
import os

from datasets import load_dataset
import polars as pl

In [106]:
# Record the library versions this notebook was run with (one per line).
print(np.__version__, pl.__version__, sep="\n")

1.24.2
1.18.0


## Get list of files

In [82]:
# Root directory that is searched for the raw ".hdf5" shards and handed to
# `load_dataset`. Set the GAIA_RAW_DIR environment variable to point at a
# different location; the default is the original hard-coded local path.
raw_dir = os.environ.get("GAIA_RAW_DIR", "/home/vikas/Desktop/Globus/gaia")

In [83]:
def get_files(raw_dir, ext):
    """Recursively list every file under ``raw_dir`` whose name ends with ``ext``.

    Parameters
    ----------
    raw_dir : str
        Directory to walk (all sub-directories are visited).
    ext : str
        Suffix passed to ``str.endswith`` for each file name, e.g. ``".hdf5"``.

    Returns
    -------
    list of str
        Each matching file name joined onto the directory it was found in,
        in the order ``os.walk`` yields them. Empty when nothing matches.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(raw_dir)
        for name in names
        if name.endswith(ext)
    ]

In [80]:
def healpix_sort_key(path):
    """Sort key: the integer that follows ``healpix=`` in ``path``.

    The number is read from the text right after the first ``healpix=``
    marker up to the next ``/``.

    Raises
    ------
    IndexError
        If ``path`` contains no ``healpix=`` marker.
    ValueError
        If the text after the marker is not a valid integer.
    """
    after_marker = path.split('healpix=')[1]
    pixel_text, _, _ = after_marker.partition('/')
    return int(pixel_text)

In [81]:
def sorted_files(start, end):
    """Return the ``[start:end]`` slice of all ``.hdf5`` files under ``raw_dir``.

    Files are ordered by their HEALPix number (see ``healpix_sort_key``) and,
    for files sharing the same number, by their full path.

    Parameters
    ----------
    start, end : int
        Slice bounds into the sorted file list (``end`` is exclusive).

    Returns
    -------
    list of str
        The selected file paths.
    """
    files_hdf = get_files(raw_dir, ".hdf5")

    # The path is used as a tie-break: `get_files` returns entries in
    # os.walk's arbitrary order, and a stable sort on the HEALPix number alone
    # would keep that arbitrary order among files of the same pixel. The
    # batches selected by [start:end] must be the same on every run.
    files_hdf_sorted = sorted(
        files_hdf,
        key=lambda path: (healpix_sort_key(path), path),
    )

    return files_hdf_sorted[start:end]

## Load dataset

In [168]:
def load_data(start, end, num_proc=24):
    """Load a batch of HDF5 shards as the ``train`` split of a dataset.

    Parameters
    ----------
    start, end : int
        Slice bounds into the file list returned by ``sorted_files``
        (author's note: the total number of files/shards is 1882).
    num_proc : int, default 24
        Number of worker processes passed to ``load_dataset``. Previously
        fixed at 24; lower it on machines with fewer cores.

    Returns
    -------
    The object returned by ``load_dataset`` for ``split='train'`` with
    ``streaming=False``.
    """
    data_files = {"train": sorted_files(start, end)}

    hdf_dset = load_dataset(
        raw_dir,
        split="train",
        num_proc=num_proc,
        streaming=False,
        data_files=data_files,
    )

    return hdf_dset

## Convert to Polars DataFrame

In [129]:
def get_object_id(hdf_dset):
    """Return the dataset's ``object_id`` column as a Polars Series.

    Parameters
    ----------
    hdf_dset
        Dataset as returned by ``load_data``; must have an ``object_id`` column.

    Returns
    -------
    pl.Series
        Series named ``"object_id"`` holding the value for every row.
    """
    # Narrow to the single column before slicing: `hdf_dset[:]` on the full
    # dataset converts every column of every row to Python objects, only for
    # all but one of them to be thrown away.
    object_id_all = hdf_dset.select_columns(["object_id"])[:]["object_id"]

    return pl.Series("object_id", object_id_all)

In [176]:
def dset_to_polars(hdf_dset, feature, object_series):
    """Build a Polars DataFrame from one feature column, keyed by ``object_id``.

    Parameters
    ----------
    hdf_dset
        Dataset as returned by ``load_data``.
    feature : str
        Name of the dataset column to extract (e.g. ``"photometry"``).
    object_series : pl.Series
        Series inserted as the first column; expected to have one entry per
        dataset row (see ``get_object_id``).

    Returns
    -------
    pl.DataFrame
        ``pl.DataFrame`` built from all rows of ``feature``, with
        ``object_series`` as column 0.
    """
    # Narrow to the requested column before slicing: `hdf_dset[:]` on the full
    # dataset would materialise every column on each call, and this function
    # is called once per feature.
    feature_all = hdf_dset.select_columns([feature])[:][feature]

    # Convert to Polars DataFrame
    df_feature_all = pl.DataFrame(feature_all)

    # Add object_id as the first column (insert_column works in place)
    df_feature_all.insert_column(0, object_series)

    return df_feature_all

In [177]:
# hdf_dset = load_data(10, 58)

# object_series = get_object_id(hdf_dset)

# df_feature = dset_to_polars(hdf_dset, "gspphot", object_series)

## Convert to parquet

In [138]:
# Root directory under which `hdf_to_parquet` writes its Parquet files, one
# sub-directory per feature. Set the GAIA_PARQUET_DIR environment variable to
# write elsewhere; the default is the original hard-coded local path.
dest_dir = os.environ.get("GAIA_PARQUET_DIR", "/home/vikas/Desktop/Globus/gaia_parquet")

In [178]:
def hdf_to_parquet(start, end):
    """Convert one batch of HDF5 shards into per-feature Parquet files.

    Loads shards ``[start:end]`` with ``load_data`` and, for each of the
    features ``photometry``, ``astrometry``, ``radial_velocity`` and
    ``gspphot``, writes
    ``<dest_dir>/<feature>/gaia_<feature>_<start>_<end>.parquet``.
    Every file has ``object_id`` as its first column.

    Parameters
    ----------
    start, end : int
        Slice bounds into the sorted shard list (``end`` is exclusive); they
        also appear in the output file names.

    Returns
    -------
    None
    """
    hdf_dset = load_data(start, end)

    object_series = get_object_id(hdf_dset)

    features_list = [
        "photometry",
        "astrometry",
        "radial_velocity",
        "gspphot",
    ]

    for feature in features_list:
        df_feature = dset_to_polars(hdf_dset, feature, object_series)

        # Make sure the per-feature output directory exists; on a fresh
        # destination the write below would otherwise fail because the
        # parent directory is missing.
        feature_dir = os.path.join(dest_dir, feature)
        os.makedirs(feature_dir, exist_ok=True)

        dest_parquet = os.path.join(
            feature_dir,
            f"gaia_{feature}_{start}_{end}.parquet",
        )

        df_feature.write_parquet(
            dest_parquet,
            compression="zstd",
            compression_level=22,  # maximum zstd level: smallest files, slowest writes
        )

In [179]:
# Batch log -- (start, end) bounds already converted:
# (0, 10) --> Done

# Convert the next batch: files 10..57 of the HEALPix-sorted list (end is exclusive).
hdf_to_parquet(10, 58)

## Test exported parquet files

In [152]:
# test_dir = "/home/vikas/Desktop/Globus/gaia_parquet/photometry/*.parquet"

# query = (pl
#          .scan_parquet(test_dir)
#          .filter(pl.col("phot_g_mean_flux") > 200000)
#         )

# query.collect(streaming=True)