# Cross-Referencing Datasets

This notebook cross-references the HSC SSP catalogue of objects with spectroscopic redshift estimates against a catalogue of objects that have been classified as stars, QSOs, galaxies, or unknown.

We will use these cross-referenced datasets as the basis for validating our Masked Image Modelling approach to developing meaningful embeddings of HSC images.

By creating `.csv` files with the RA, Dec, and redshift measurements of each object, we can then use this information to index into the HSC image data to create datasets of $64\times64$ cutouts around each object.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import h5py
import os
import sys
from scipy.spatial import cKDTree
import time

# from plotting_fns import normalize_images, display_images

### Load the classification data.

`cspec: {0:unknown, 1:star, 2:galaxy, 3:qso}`

In [2]:
# Classification catalogue; its `cspec` column carries the class code.
class_catalogue_path = '/home/nick/astro/sky_embeddings/data/redshifts-2024-05-07.parquet'
class_labels = pd.read_parquet(class_catalogue_path)

# Class name -> `cspec` code. The 'unkown' spelling is deliberate here: the
# same key is looked up by the export loop and ends up in its output file names.
class_indices = dict(zip(('unkown', 'star', 'galaxy', 'qso'), range(4)))
class_labels

Unnamed: 0,ra,dec,cspec,zspec,zspec_err
0,24.837864,31.987288,2.0,0.373586,0.0
1,24.834875,32.031507,2.0,0.783066,0.0
2,24.291261,31.900061,1.0,0.000045,0.0
3,24.347372,31.805734,2.0,1.071963,0.0
4,24.270095,31.874742,2.0,0.812852,0.0
...,...,...,...,...,...
55225810,320.101000,-63.718700,,0.021837,0.0
55225811,13.091700,31.442220,,0.015593,0.0
55225812,118.893000,29.173060,,0.021234,0.0
55225813,133.058000,1.460280,,0.204486,0.0


### Load the HSC SSP catalogue

In [3]:
# HSC SSP catalogue of objects with spectroscopic redshifts.
hsc_catalogue_path = '/home/nick/astro/sky_embeddings/data/hsc_ssp.parquet'
hsc_labels = pd.read_parquet(hsc_catalogue_path)
hsc_labels

Unnamed: 0,ra,dec,zspec,zspec_err,cspec,origin
36353,215.096588,52.918053,0.010000,0.006500,0,HSC-SSP
36412,215.224091,53.002052,0.010000,0.006500,0,HSC-SSP
36422,215.190826,52.979145,0.010000,0.006500,0,HSC-SSP
36526,215.039642,52.872917,0.015400,0.010000,0,HSC-SSP
36555,215.223282,53.005901,0.010000,0.006500,0,HSC-SSP
...,...,...,...,...,...,...
3958938,353.040080,33.849074,1.646680,0.000450,0,HSC-SSP
3958939,353.970210,33.985932,0.540920,0.000182,0,HSC-SSP
3958940,357.021430,34.957792,2.215620,0.000388,0,HSC-SSP
3958941,354.444530,35.443101,1.946709,0.000522,0,HSC-SSP


In [13]:
# Number of HSC entries whose spectroscopic redshift is strictly positive.
positive_zspec = hsc_labels['zspec'].values > 0.
np.count_nonzero(positive_zspec)

3533577

In [4]:
def deg_to_cartesian(ra, dec):
    """Map sky positions given in degrees onto the unit sphere.

    Args:
        ra: Right ascension in degrees (scalar or array-like).
        dec: Declination in degrees (scalar or array-like).

    Returns:
        Tuple ``(x, y, z)`` with the Cartesian components of the unit
        vector(s) pointing at the given position(s).
    """
    ra_rad = np.radians(ra)
    dec_rad = np.radians(dec)

    # cos(dec) scales both in-plane components.
    cos_dec = np.cos(dec_rad)
    x = np.cos(ra_rad) * cos_dec
    y = np.sin(ra_rad) * cos_dec
    z = np.sin(dec_rad)
    return x, y, z

def create_kdtree(ra, dec):
    '''Build a cKDTree over the unit-sphere positions of the given coordinates.

    Args:
        ra: Right ascension values in degrees.
        dec: Declination values in degrees.

    Returns:
        A cKDTree whose points are the (x, y, z) rows produced by
        deg_to_cartesian, one row per input position.
    '''
    # Stack the three component arrays, then transpose to one row per object.
    xyz = np.vstack(deg_to_cartesian(ra, dec))
    return cKDTree(xyz.T)

# Matching radius of one arcsecond. The KD-trees store unit-sphere Cartesian
# points and the radian value is passed directly as the search distance; for
# an angle this small the chord length and the angle are practically equal.
tolerance = 1 / 3600  # degrees
tolerance_rad = np.deg2rad(tolerance)  # radians

### Remove duplicates in HSC catalogue.

In [5]:
# Create HSC KDTree to remove duplicates
hsc_kdtree = create_kdtree(hsc_labels['ra'].values,
                           hsc_labels['dec'].values)

# Collect RA and Dec of HSC SSP data and convert to Cartesian for the search;
# the rows of `hsc_coords` are the query points, one per catalogue entry.
X, Y, Z = deg_to_cartesian(hsc_labels['ra'].values, hsc_labels['dec'].values)
hsc_coords = np.column_stack((X, Y, Z))

# Count the neighbours of every object in a single vectorised query rather
# than one Python-level call per object.
neighbour_counts = hsc_kdtree.query_ball_point(hsc_coords, r=tolerance_rad,
                                               return_length=True)

# Every object matches itself, so a count below 2 means no *other* object lies
# within the tolerance. Note that all members of a close group are dropped,
# not just the surplus copies.
good_indices = np.flatnonzero(neighbour_counts < 2)

print(f'Removed {(len(hsc_labels)-len(good_indices))} duplicates.')
hsc_labels = hsc_labels.iloc[good_indices]

Removed 4834 duplicates.


### Create class .csv files

In [6]:
# For each class, find the HSC objects that have a classified counterpart
# within the matching tolerance and write them to that class's .csv file.
# Classes are matched independently of one another.

# The HSC positions do not change between classes, so convert them to
# Cartesian query points once, outside the loop.
X, Y, Z = deg_to_cartesian(hsc_labels['ra'].values, hsc_labels['dec'].values)
hsc_coords = np.column_stack((X, Y, Z))

for class_name in ['unkown','star','galaxy','qso']:
    class_index = class_indices[class_name]

    # Create KDTree for this class (select the class rows once)
    class_subset = class_labels[class_labels['cspec']==class_index]
    class_kdtree = create_kdtree(class_subset['ra'].values,
                                 class_subset['dec'].values)

    # Number of class members within tolerance of each HSC object, computed
    # in one vectorised query; any non-zero count is a match.
    n_class_matches = class_kdtree.query_ball_point(hsc_coords, r=tolerance_rad,
                                                    return_length=True)
    matching_indices = np.flatnonzero(n_class_matches > 0)

    print(f'Found {len(matching_indices)} objects with the {class_name} class')
    # Write the DataFrame to a CSV file, including only the specified columns
    hsc_labels.iloc[matching_indices].to_csv(f'../data/HSC_{class_name}.csv',
                                             columns=['ra','dec','zspec','zspec_err'], index=False)

Found 70623 objects with the unkown class
Found 509 objects with the star class
Found 2740520 objects with the galaxy class
Found 741174 objects with the qso class


In [7]:
# HDF5 input files and the CSV file each one is exported to; the two lists
# are paired by position.
h5_dir = '/arc/projects/ots/HSC_h5/'
h5_fns = [
    'HSC_galaxies_GRIZY_64_val_new.h5',
    'HSC_qso_GRIZY_64_new.h5',
    'HSC_stars_GRIZY_64_new.h5',
    'HSC_unkown_GRIZY_64_new.h5',
]
csv_dir = '../data/'
csv_fns = [
    'HSC_galaxy_dud.csv',
    'HSC_qso_dud.csv',
    'HSC_star_dud.csv',
    'HSC_unknown_dud.csv',
]

for h5_fn, csv_fn in zip(h5_fns, csv_fns):
    with h5py.File(os.path.join(h5_dir, h5_fn), "r") as f:
        # RA and Dec are always exported; the redshift column is added only
        # when the file actually provides a 'zspec' dataset.
        data = {'ra': f['ra'][:], 'dec': f['dec'][:]}
        if 'zspec' in f.keys():
            data['zspec'] = f['zspec'][:]
        df = pd.DataFrame(data)

        # Write the table to disk without the index column
        df.to_csv(os.path.join(csv_dir, csv_fn), index=False)

Update the unknown-class file so that it contains no duplicates and no objects that also appear in the other datasets.

In [None]:
# Reload the HSC SSP catalogue from the /arc copy; this replaces whatever
# `hsc_labels` frame is currently in memory.
hsc_catalogue_arc_path = '/arc/projects/unions/catalogues/redshifts/hsc_ssp.parquet'
hsc_labels = pd.read_parquet(hsc_catalogue_arc_path)


In [18]:
# HDF5 files of the galaxy zspec train/val/test sets and the CSV file each
# one is exported to; the two lists are paired by position.
h5_dir = '/arc/projects/ots/HSC_h5/'
h5_fns = [
    'HSC_galaxies_zspec_64_train.h5',
    'HSC_galaxies_zspec_64_val.h5',
    'HSC_galaxies_zspec_64_test.h5',
]
csv_dir = '../data/'
csv_fns = [
    'HSC_galaxy_dud_zspec_train.csv',
    'HSC_galaxy_dud_zspec_val.csv',
    'HSC_galaxy_dud_zspec_test.csv',
]

for h5_fn, csv_fn in zip(h5_fns, csv_fns):
    with h5py.File(os.path.join(h5_dir, h5_fn), "r") as f:
        # RA and Dec are always exported; the redshift column is added only
        # when the file actually provides a 'zspec' dataset.
        data = {'ra': f['ra'][:], 'dec': f['dec'][:]}
        if 'zspec' in f.keys():
            data['zspec'] = f['zspec'][:]
        df = pd.DataFrame(data)

        # Write the table to disk without the index column
        df.to_csv(os.path.join(csv_dir, csv_fn), index=False)

In [19]:
# Report the number of rows in each CSV file listed in `csv_fns`.
for csv_fn in csv_fns:
    exported = pd.read_csv(os.path.join(csv_dir, csv_fn))
    print(len(exported))

15446
1921
1934
