In [2]:
import polars as pl
print(pl.__version__)

import os

1.18.0


## Get list of files

In [3]:
# Root folder holding one sub-folder of parquet files per table. It can be
# overridden through the GAIA_PARQUET_DIR environment variable so the notebook
# is not tied to a single machine; the original location is the fallback.
parquet_folder = os.environ.get(
    "GAIA_PARQUET_DIR",
    "/home/vikas/Desktop/Globus/Gaia/gaia_parquet",
)


def get_files(table_name):
    """Return a glob pattern matching every file of one table.

    Parameters
    ----------
    table_name : str
        Name of the sub-folder of ``parquet_folder`` that holds the table.

    Returns
    -------
    str
        The path ``<parquet_folder>/<table_name>/*``.
    """
    return os.path.join(parquet_folder, table_name, "*")

## Run query

#### Eager

In [4]:
# df_photometry = (
#     pl.read_parquet(get_files("photometry"))
#     .filter(pl.col("phot_bp_mean_flux") > 5)
# )

#### Lazy

##### Number of unique objects

In [5]:
def stream_query_1():
    """Return the distinct ``object_id`` values of the photometry table.

    The query is built lazily over the files matched by
    ``get_files("photometry")`` and collected with ``streaming = True``.
    """
    photometry = pl.scan_parquet(get_files("photometry"))
    distinct_ids = photometry.select("object_id").unique()

    # Uncomment to inspect the streaming plan before running it:
    # print(distinct_ids.explain(streaming = True))

    return distinct_ids.collect(streaming = True)

In [6]:
#stream_query_1().shape

##### Highest and lowest brightness

In [7]:
def stream_query_2():
    """Return the minimum and maximum ``phot_g_mean_mag`` for each ``healpix``.

    The photometry files are scanned lazily, grouped by ``healpix``, and the
    result is sorted by ``healpix`` before being collected with
    ``streaming = True``.
    """
    g_mag = pl.col("phot_g_mean_mag")

    per_healpix = (
        pl.scan_parquet(get_files("photometry"))
        .group_by("healpix")
        .agg(
            g_mag.min().alias("min_phot_g_mean_mag"),
            g_mag.max().alias("max_phot_g_mean_mag"),
        )
        .sort("healpix")
    )

    # Uncomment to inspect the streaming plan before running it:
    # print(per_healpix.explain(streaming = True))

    return per_healpix.collect(streaming = True)

In [8]:
# Run the per-healpix magnitude summary; the returned frame is the cell output.
stream_query_2()

healpix,min_phot_g_mean_mag,max_phot_g_mean_mag
i32,f32,f32
0,6.855546,19.842781
1,7.355467,19.706682
2,7.312822,19.992142
3,6.964732,19.499153
4,3.382374,20.126875
…,…,…
2685,6.030447,20.092436
2686,4.124845,20.017384
2687,4.425611,20.595608
2688,4.03991,19.728607


##### Joins

In [11]:
def stream_query_3(max_healpix):
    """Join photometry with radial velocities and summarise per ``healpix``.

    Both tables are restricted to rows whose ``healpix`` is strictly below
    ``max_healpix``, inner-joined on ``object_id`` and ``healpix``, and then
    aggregated per ``healpix`` into the min/max of ``phot_g_mean_mag`` and the
    max/min of ``radial_velocity``. The result is sorted by ``healpix`` and
    collected with ``streaming = True``.

    Parameters
    ----------
    max_healpix
        Exclusive upper bound applied to the ``healpix`` column of both tables.
    """
    below_cutoff = pl.col("healpix") < max_healpix
    join_keys = ["object_id", "healpix"]
    g_mag = pl.col("phot_g_mean_mag")
    velocity = pl.col("radial_velocity")

    velocities = (
        pl.scan_parquet(get_files("radial_velocity"))
        .select("radial_velocity", "object_id", "healpix")
        .filter(below_cutoff)
    )

    photometry = (
        pl.scan_parquet(get_files("photometry"))
        .filter(below_cutoff)
        .select("object_id", "healpix", "phot_g_mean_mag")
    )

    summary = (
        photometry
        .join(velocities, on = join_keys, how = "inner")
        .group_by("healpix", maintain_order = False)
        .agg(
            g_mag.min().alias("min_phot_g_mean_mag"),
            g_mag.max().alias("max_phot_g_mean_mag"),
            velocity.max().alias("max_radial_velocity"),
            velocity.min().alias("min_radial_velocity"),
        )
        .sort("healpix")
    )

    # Uncomment to inspect the streaming plan before running it:
    # print(summary.explain(streaming = True))

    return summary.collect(streaming = True)

In [16]:
%%time

# Only rows with healpix < 1500 take part in the join and the summary.
stream_query_3(1500)

CPU times: user 31.3 s, sys: 3.5 s, total: 34.8 s
Wall time: 11.6 s


healpix,min_phot_g_mean_mag,max_phot_g_mean_mag,max_radial_velocity,min_radial_velocity
i32,f32,f32,f32,f32
0,6.855546,19.842781,244.173599,-185.370361
1,7.355467,19.706682,243.473114,-224.071869
2,7.312822,19.992142,241.080765,-545.814758
3,6.964732,19.499153,235.114029,-295.097992
4,3.382374,20.126875,256.130188,-258.272003
…,…,…,…,…
1495,7.374996,19.269976,226.814987,-110.846519
1496,6.559766,19.765856,312.435913,-295.014282
1497,6.820708,20.04957,308.593109,-275.263733
1498,5.724092,19.317711,319.57486,-148.072952
