# Compute Cl on galaxy overdensities on CosmoDC2

- author : Sylvie Dagoret-Campagne
- affiliation : IJCLab/in2p3/CNRS
- creation date : July 29th 2020


In [2]:
%matplotlib inline

import os

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
import pyarrow

In [4]:
pyarrow.__version__

'0.15.1'

In [5]:
from pyspark.sql.functions import col, pandas_udf 
from pyspark.sql.types import LongType

from pyspark.sql.types import IntegerType,FloatType
from pyspark.sql.functions import pandas_udf, PandasUDFType

In [6]:
# here is how we create a function ('Ang2Pix') that can be called by dataframes
# it takes as input the "ra" and "dec" values (which are not very different from theta/phi)
# and returns the pixel number (but as pandas series for efficiency)
import numpy as np
import pandas as pd
import healpy as hp

# HEALPix resolution parameter handed to the healpy calls (nside2npix, ang2pix) below
nside = 512

In [7]:
# Number of pixels of a HEALPix map at resolution nside
npix = hp.nside2npix(nside)
# Three times nside, stored as lmax (not referenced again in the cells shown here)
lmax = nside * 3

In [8]:
def Ang2Pix_func(ra: pd.Series, dec: pd.Series) -> pd.Series:
    """Return the HEALPix pixel index for each (ra, dec) pair.

    Both inputs are treated as degrees: ``dec`` is first turned into
    ``90 - dec``, then both angles go through ``np.radians`` before being
    handed to ``hp.ang2pix``. The map resolution comes from the
    module-level ``nside``.

    Parameters
    ----------
    ra : pd.Series
        First angular coordinate, in degrees.
    dec : pd.Series
        Second angular coordinate, in degrees.

    Returns
    -------
    pd.Series
        Result of ``hp.ang2pix`` wrapped in a new Series.
    """
    theta = np.radians(90 - dec)
    phi = np.radians(ra)
    return pd.Series(hp.ang2pix(nside, theta, phi))

In [9]:
# Wrap Ang2Pix_func as a Spark pandas UDF whose result column is declared as IntegerType
pd_ang2pix = pandas_udf(Ang2Pix_func, returnType=IntegerType())
# Usage example on a DataFrame named gal with "RA"/"DEC" columns:
#gal = gal.withColumn("ihealpix",pd_ang2pix(gal["RA"],gal["DEC"]))

## Get parquet files


### Scan all available files

In [10]:
import os,re

In [11]:
the_input_dir="/global/cfs/cdirs/lsst/shared/DC2-prod/Run2.2i/truth/galtruth"

In [12]:
the_list_of_files=os.listdir(the_input_dir)

In [19]:
the_list_of_files[:5]

['truth_summary_hp9304.sqlite3',
 'truth_summary_hp10194.sqlite3',
 'truth_summary_hp9945.parquet',
 'truth_summary_hp9939.parquet',
 'truth_summary_hp9944.sqlite3']

### sort all files

In [20]:
the_sorted_list_of_files=sorted(the_list_of_files)

### filter the good parquet files

In [21]:
# Keep only the parquet catalogues from the sorted directory listing.
# The dot before "parquet" is escaped so that it matches a literal "." ;
# unescaped it matched any character, so a name such as
# "truth_summary_hp1_parquet" would also have been accepted.
selected_files=[]
for filename in the_sorted_list_of_files:
    if re.fullmatch(r"truth_summary_hp.*\.parquet", filename):
        selected_files.append(filename)

In [22]:
selected_files[:5]

['truth_summary_hp10066.parquet',
 'truth_summary_hp10067.parquet',
 'truth_summary_hp10068.parquet',
 'truth_summary_hp10069.parquet',
 'truth_summary_hp10070.parquet']

In [23]:
# Full path of every selected parquet file inside the input directory
InputPath = [f"{the_input_dir}/{name}" for name in selected_files]

### Read files with spark

In [24]:
from pyspark.sql import SparkSession

# Initialise our Spark session
spark = SparkSession.builder.getOrCreate()

# Read the data as DataFrame
#df = spark.read.format("parquet").load(datafile)

In [25]:
# Earlier example of an explicit path list, kept for reference only:
#InputPath = [base_dir + "/dc2_object_run2.2i_dr6b_tract2897.parquet",
#             base_dir + "/dc2_object_run2.2i_dr6b_tract2898.parquet"]

# Load all selected parquet files into one Spark DataFrame
# (each entry of InputPath is passed as a separate path argument).
df = spark.read.parquet(*InputPath)

In [26]:
# Repartition into the same number of partitions the DataFrame already has.
# NOTE(review): presumably meant to even out partition sizes — confirm.
df = df.repartition(df.rdd.getNumPartitions())

### DC2 truth catalog schema

In [27]:
# Check what we have in the file
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- host_galaxy: long (nullable = true)
 |-- ra: double (nullable = true)
 |-- dec: double (nullable = true)
 |-- redshift: float (nullable = true)
 |-- is_variable: integer (nullable = true)
 |-- is_pointsource: integer (nullable = true)
 |-- flux_u: float (nullable = true)
 |-- flux_g: float (nullable = true)
 |-- flux_r: float (nullable = true)
 |-- flux_i: float (nullable = true)
 |-- flux_z: float (nullable = true)
 |-- flux_y: float (nullable = true)
 |-- flux_u_noMW: float (nullable = true)
 |-- flux_g_noMW: float (nullable = true)
 |-- flux_r_noMW: float (nullable = true)
 |-- flux_i_noMW: float (nullable = true)
 |-- flux_z_noMW: float (nullable = true)
 |-- flux_y_noMW: float (nullable = true)



In [29]:
# Keep only the rows whose is_pointsource flag is false.
# NOTE(review): the schema lists is_pointsource as an integer column, so this
# relies on Spark comparing it with a boolean literal — confirm, or compare with 0.
df_gal=df.filter('is_pointsource == false')

In [30]:
# pd_ang2pix is the pandas UDF already built from Ang2Pix_func in an earlier
# cell; it is reused here instead of being defined a second time.
# Add the pixel index of each row as the column "ihealpix".
df_gal_healpix = df_gal.withColumn("ihealpix",pd_ang2pix(df_gal["ra"],df_gal["dec"]))

In [31]:
df_gal_healpix.printSchema()

root
 |-- id: long (nullable = true)
 |-- host_galaxy: long (nullable = true)
 |-- ra: double (nullable = true)
 |-- dec: double (nullable = true)
 |-- redshift: float (nullable = true)
 |-- is_variable: integer (nullable = true)
 |-- is_pointsource: integer (nullable = true)
 |-- flux_u: float (nullable = true)
 |-- flux_g: float (nullable = true)
 |-- flux_r: float (nullable = true)
 |-- flux_i: float (nullable = true)
 |-- flux_z: float (nullable = true)
 |-- flux_y: float (nullable = true)
 |-- flux_u_noMW: float (nullable = true)
 |-- flux_g_noMW: float (nullable = true)
 |-- flux_r_noMW: float (nullable = true)
 |-- flux_i_noMW: float (nullable = true)
 |-- flux_z_noMW: float (nullable = true)
 |-- flux_y_noMW: float (nullable = true)
 |-- ihealpix: integer (nullable = true)



In [32]:
df_gal_healpix_small=df_gal_healpix.select('ra','dec','ihealpix')

In [None]:
df_gal_healpix_small.describe().show()

In [None]:
# Count the rows falling in each pixel: one output row per distinct ihealpix value
m=df_gal_healpix_small.groupBy('ihealpix').count()
# Preview the first five (ihealpix, count) rows
m.show(5)

In [None]:
#m.orderBy('ihealpix', ascending=True).show(10)

In [None]:
# Move to the Python world: convert the per-pixel counts to a pandas DataFrame.
# Spark evaluates lazily; toPandas() is an action, so the aggregation is
# executed here and its result is collected.
p=m.toPandas()
# Summary of the resulting pandas DataFrame (columns, dtypes, memory)
p.info()

In [None]:
p

In [None]:
# the following has nothing to do with Spark (only HEALPix)
# start from an all-zero map: pixels absent from p keep the value 0
hpMap = np.zeros(hp.nside2npix(nside))
# fill the map from the pandas object
hpMap[p['ihealpix'].values]=p['count'].values
# plot using standard healpy functions
hp.mollview(hpMap,cmap="jet")
hp.graticule(color='white')

In [None]:
# Angular power spectrum of the map, computed with anafast's default settings.
# NOTE(review): hpMap holds raw counts per pixel, while the notebook title
# mentions overdensities — confirm whether the map should be converted first.
cl = hp.anafast(hpMap)
# One integer index per entry of cl, starting at 0
ell = np.arange(len(cl))