In [105]:
import numpy as np
import h5py
import os

from datasets import load_dataset
import polars as pl

In [106]:
# Record the numpy and polars versions in use, one per line.
print(np.__version__, pl.__version__, sep="\n")

1.24.2
1.18.0


## Get list of files

In [82]:
# Root directory of the raw data: it is walked for ".hdf5" shard files and is
# also the path handed to `load_dataset`.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
raw_dir = "/home/vikas/Desktop/Globus/gaia"

In [83]:
def get_files(raw_dir, ext):
    """Recursively collect files below ``raw_dir`` whose names end with ``ext``.

    Parameters
    ----------
    raw_dir : str
        Directory tree to traverse with ``os.walk``.
    ext : str
        Suffix tested with ``str.endswith`` against each file name
        (for example ``".hdf5"``).

    Returns
    -------
    list of str
        ``os.path.join(directory, name)`` for every matching file, in the
        order in which ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(raw_dir)
        for name in names
        if name.endswith(ext)
    ]

In [80]:
# Sorting key: order shard paths numerically by their healpix index
def healpix_sort_key(path):
    """Return the integer healpix index encoded in a shard path.

    The path must contain a component of the form ``healpix=<int>`` that is
    followed by a path separator, e.g. ``.../healpix=12/file.hdf5`` -> ``12``.
    Returning an ``int`` makes ``sorted`` order shards numerically
    (2 before 10) rather than lexicographically.

    Parameters
    ----------
    path : str
        File path containing a ``healpix=<int>`` directory component.

    Returns
    -------
    int
        The number following ``healpix=``.

    Raises
    ------
    IndexError
        If ``path`` contains no ``healpix=`` marker.
    ValueError
        If the text between the marker and the next separator is not an
        integer.
    """
    # Everything after the first 'healpix=' marker
    healpix_part = path.split('healpix=')[1]

    # Paths built by os.path.join use '\\' on Windows, so normalise the
    # separator before cutting off the remainder of the path.
    healpix_dir = healpix_part.replace('\\', '/').split('/')[0]

    return int(healpix_dir)

In [81]:
def sorted_files(start, end):
    """Return the slice ``[start:end]`` of the healpix-sorted ``.hdf5`` files.

    Every call re-scans ``raw_dir`` with ``get_files``, sorts the paths by
    their healpix index and slices the sorted list.

    Parameters
    ----------
    start, end : int or None
        Slice bounds applied to the sorted list of paths.

    Returns
    -------
    list of str
        The selected file paths.
    """
    files_hdf = get_files(raw_dir, ".hdf5")

    # os.walk yields entries in arbitrary order and sorted() is stable, so
    # files sharing a healpix index would otherwise keep that arbitrary order.
    # The path itself is used as a tie-breaker so that a given [start:end]
    # always selects the same files.
    files_hdf_sorted = sorted(
        files_hdf,
        key=lambda path: (healpix_sort_key(path), path),
    )

    return files_hdf_sorted[start:end]

## Load dataset

In [128]:
def load_data(start, end):
    """Load the healpix-sorted shards ``[start:end]`` as a single dataset.

    The shard paths come from ``sorted_files(start, end)`` and are passed to
    ``load_dataset`` as the ``"train"`` split, together with ``raw_dir`` as
    the dataset path.

    Parameters
    ----------
    start, end : int
        Slice bounds into the sorted shard list
        (total number of files/shards is 1882).

    Returns
    -------
    The object returned by ``load_dataset`` for ``split='train'`` with
    ``streaming=False`` and ``num_proc=12``.
    """
    shard_paths = sorted_files(start, end)

    load_options = dict(
        split='train',
        num_proc=12,
        streaming=False,
        data_files={"train": shard_paths},
    )

    # Conversion with `.with_format('numpy')` is intentionally left disabled.
    return load_dataset(raw_dir, **load_options)

## Convert to Polars DataFrame

In [129]:
def get_object_id(hdf_dset):
    """Return the dataset's ``"object_id"`` column as a Polars Series.

    Parameters
    ----------
    hdf_dset
        Dataset supporting ``hdf_dset[:]`` (all rows) with an
        ``"object_id"`` column.

    Returns
    -------
    pl.Series
        Series named ``"object_id"`` holding one id per row.
    """
    # `hdf_dset[:]` collects all rows; only the id column is kept.
    all_rows = hdf_dset[:]
    return pl.Series("object_id", all_rows["object_id"])

In [141]:
def dset_to_polars(hdf_dset, feature, object_series=None):
    """Build a Polars DataFrame for one feature group, with ids in column 0.

    Parameters
    ----------
    hdf_dset
        Dataset supporting ``hdf_dset[:]`` (all rows) that contains
        ``feature`` and, unless ``object_series`` is given, ``"object_id"``.
    feature : str
        Name of the column/feature group to convert (e.g. ``"photometry"``).
    object_series : pl.Series, optional
        Id column to insert. When omitted, it is built from the
        ``"object_id"`` column of ``hdf_dset`` itself, so the ids always
        belong to the same rows as the feature values. (Earlier this function
        read a module-level ``object_series``, which broke on a fresh kernel
        and silently paired the data with ids from whichever dataset was
        loaded last.)

    Returns
    -------
    pl.DataFrame
        The feature columns preceded by the id column.
    """
    # Collect all rows once; both the feature and the ids come from this batch
    all_rows = hdf_dset[:]

    # Convert to Polars DataFrame
    df_feature_all = pl.DataFrame(all_rows[feature])

    if object_series is None:
        object_series = pl.Series("object_id", all_rows["object_id"])

    # Add object_id as the first column (insert_column works in place)
    df_feature_all.insert_column(0, object_series)

    return df_feature_all

In [146]:
# Load the first 10 healpix-sorted shards.
hdf_dset = load_data(0, 10)

# Collect the object_id column of the loaded rows.
object_series = get_object_id(hdf_dset)

# Build the photometry table for the same rows.
df_photometry = dset_to_polars(hdf_dset, "photometry") 

In [147]:
# Display the photometry table (object_id followed by the photometry columns).
df_photometry

object_id,phot_g_mean_mag,phot_g_mean_flux,phot_g_mean_flux_error,phot_bp_mean_mag,phot_bp_mean_flux,phot_bp_mean_flux_error,phot_rp_mean_mag,phot_rp_mean_flux,phot_rp_mean_flux_error,phot_bp_rp_excess_factor,bp_rp,bp_g,g_rp
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
4295806720,17.641426,1653.394775,2.075764,18.080235,800.429565,12.601409,17.061232,1187.588013,15.823832,1.202385,1.019003,0.438808,0.580194
38655544960,14.128453,42030.601562,11.392837,14.70305,17955.478516,26.03932,13.410816,34263.488281,36.75135,1.242404,1.292233,0.574596,0.717637
1275606125952,16.236271,6031.68457,3.778775,16.662441,2954.204102,15.563785,15.644692,4378.056152,17.483109,1.215624,1.017749,0.42617,0.591578
1653563247744,16.148806,6537.695801,3.721863,16.436014,3639.244629,19.61796,15.681716,4231.28125,13.291231,1.203869,0.754298,0.287209,0.46709
2851858288640,12.356248,215004.46875,66.965332,12.676144,116134.0625,101.780373,11.865775,142183.140625,67.716812,1.20145,0.810369,0.319896,0.490473
…,…,…,…,…,…,…,…,…,…,…,…,…,…
22516250085811840,14.284503,36403.6875,20.802837,14.977914,13939.628906,17.328167,13.465993,32565.744141,24.999876,1.277491,1.511921,0.693411,0.81851
22517074719532288,15.481333,12089.646484,5.258407,15.926439,5818.901367,14.095147,14.869446,8940.875,20.823158,1.220861,1.056993,0.445107,0.611887
22517143439009152,12.846614,136867.765625,35.41399,13.39869,59695.386719,48.034939,12.137379,110715.039062,106.337708,1.245073,1.261312,0.552076,0.709235
22517899353252480,17.28809,2289.34375,4.978139,17.963089,891.622803,20.064796,16.476665,2034.659058,14.773705,1.278219,1.486423,0.674999,0.811424


## Convert to Parquet

In [138]:
# Output root for the exported files; `hdf_to_parquet` writes each feature to
# <dest_dir>/<feature>/gaia_<feature>_<start>_<end>.parquet.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
dest_dir = "/home/vikas/Desktop/Globus/gaia_parquet"

In [142]:
def hdf_to_parquet(start, end):
    """Export shards ``[start:end]`` to one Parquet file per feature group.

    For each of ``"photometry"``, ``"astrometry"`` and ``"radial_velocity"``
    a DataFrame (``object_id`` first, then the feature columns) is written to
    ``<dest_dir>/<feature>/gaia_<feature>_<start>_<end>.parquet`` using zstd
    compression at level 22. The per-feature directory is created if needed.

    A failure for one feature is reported on stdout, including the cause, and
    the remaining features are still processed. Only ``Exception`` subclasses
    are handled this way, so a kernel interrupt stops the export instead of
    being swallowed.

    Parameters
    ----------
    start, end : int
        Slice bounds into the healpix-sorted shard list; also embedded in the
        output file names.

    Returns
    -------
    None
    """
    hdf_dset = load_data(start, end)

    # Materialise the batch once. The id column is built from this batch and
    # reused for every feature table, so ids and feature rows always come
    # from the same shards.
    all_rows = hdf_dset[:]
    object_series = pl.Series("object_id", all_rows["object_id"])

    features_list = [
                     "photometry",
                     "astrometry",
                     "radial_velocity"
                    ]

    for feature in features_list:
        try:
            df_feature = pl.DataFrame(all_rows[feature])

            # Add object_id as the first column (insert_column works in place)
            df_feature.insert_column(0, object_series)

            dest_parquet = os.path.join(
                                        dest_dir,
                                        feature,
                                        f"gaia_{feature}_{start}_{end}.parquet"
                                       )

            # write_parquet does not create missing parent directories
            os.makedirs(os.path.dirname(dest_parquet), exist_ok = True)

            df_feature.write_parquet(
                dest_parquet,
                compression = "zstd",
                compression_level = 22,
            )

        except Exception as exc:
            # Best effort per feature: report the cause and move on
            print(
                f"Unable to create parquet output for {feature}! "
                f"({type(exc).__name__}: {exc})"
            )
            continue

    return None

In [143]:
# Export the first 10 healpix-sorted shards to Parquet.
hdf_to_parquet(0, 10)

## Test exported Parquet files

In [152]:
# Disabled check of the exported photometry files: lazily scan them, keep rows
# with phot_g_mean_flux > 200000 and collect the result.

# test_dir = "/home/vikas/Desktop/Globus/gaia_parquet/photometry/*.parquet"

# query = (pl
#          .scan_parquet(test_dir)
#          .filter(pl.col("phot_g_mean_flux") > 200000)
#         )

# query.collect(streaming=True)