# Notebook #2

## Dataset Preprocessing Notebook

Opens the FITS files using `DataLoader`, standardizes and filters the catalogs using `CatalogStandardizer`, then saves the cleaned datasets as CSV files.

### Import Statements and Load config.yaml file

In [1]:
import os
import sys
sys.path.append("./utils")

import pandas             as pd
import numpy              as np
import matplotlib.pyplot  as plt

from catalog_standardizer import CatalogStandardizer
from config_loader        import ConfigLoader
from data_loader          import DataLoader

In [2]:
# --- Configuration -----------------------------------------------------------
# Location of the project's config.yaml. Set the H20_CONFIG_PATH environment
# variable to run this notebook on another machine or checkout; when it is
# unset, the original local path is used so existing setups keep working.
CONFIG_PATH = os.environ.get(
    "H20_CONFIG_PATH",
    "/Users/trevin/Documents/Git/h20_xray_catalog_matching/src/configs/config.yaml",
)
config = ConfigLoader(CONFIG_PATH)
data = DataLoader(verbose=0)

# --- Plot styling ------------------------------------------------------------
# Global matplotlib defaults for every figure in this notebook. Each key is set
# exactly once ('axes.titleweight' used to be assigned twice with the same value).
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Verdana'],
    'font.size': 16,
    'axes.titlesize': 20,
    'axes.titleweight': 'bold',
    'axes.titlepad': 20,
    'axes.labelsize': 16,
    'axes.labelweight': 'bold',
    'axes.labelpad': 20,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
    'legend.fontsize': 16,
})

# --- Load raw catalogs -------------------------------------------------------
# One table per catalog, read from the paths declared in the config file.
# NOTE(review): get_dataframe presumably returns a pandas DataFrame (the next
# cell indexes these objects by column name) — confirm in utils/data_loader.
df_erosita = data.get_dataframe(config.EROSITA_PATH)
df_edff = data.get_dataframe(config.EDFF_PATH)
df_cdfs = data.get_dataframe(config.CDFS_PATH)



## Catalog Standardization

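Standardize the eROSITA, EDFF, and CDFS catalogs, then restrict the eROSITA and EDFF catalogs to the RA/Dec bounding box in which the two overlap, so that only objects in the common region are kept.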

In [3]:
# Wrap each raw catalog in a CatalogStandardizer. The five positional arguments
# are, in order: the RA column, the Dec column, two positional-error columns,
# and one flux/magnitude column.
# NOTE(review): the exact meaning of each parameter is defined in
# utils/catalog_standardizer — confirm there.
eROSITA_standardizer = CatalogStandardizer(
    df_erosita['RA'],
    df_erosita['DEC'],
    df_erosita["POS_ERR"],  # the same column is supplied for both error arguments
    df_erosita["POS_ERR"],
    df_erosita["ML_FLUX_1"],
)

edff_standardizer = CatalogStandardizer(
    df_edff['ALPHA_J2000'],
    df_edff['DELTA_J2000'],
    df_edff['ERRX_MODEL'],
    df_edff['ERRY_MODEL'],
    df_edff['CFHT_u_MAG'],
)

cdfs_standardizer = CatalogStandardizer(
    df_cdfs['RAJ2000'],
    df_cdfs['DEJ2000'],
    df_cdfs['errPos'],  # the same column is supplied for both error arguments
    df_cdfs['errPos'],
    df_cdfs['FB'],
)

edff_standardized = edff_standardizer.standardize()
erosita_standardized = eROSITA_standardizer.standardize()
cdfs_standardized = cdfs_standardizer.standardize()

# Common RA/Dec bounding box of the EDFF and eROSITA catalogs: the larger of the
# two minima and the smaller of the two maxima on each axis.
min_RA = max(edff_standardized['ra'].min(), erosita_standardized['ra'].min())
max_RA = min(edff_standardized['ra'].max(), erosita_standardized['ra'].max())
min_DEC = max(edff_standardized['dec'].min(), erosita_standardized['dec'].min())
max_DEC = min(edff_standardized['dec'].max(), erosita_standardized['dec'].max())

# If the footprints do not overlap, no row can satisfy the filter below and both
# catalogs would silently become empty; fail loudly instead.
if not (min_RA <= max_RA and min_DEC <= max_DEC):
    raise ValueError(
        "EDFF and eROSITA catalogs have no overlapping RA/Dec bounding box: "
        f"RA [{min_RA}, {max_RA}], Dec [{min_DEC}, {max_DEC}]"
    )


def _clip_to_box(catalog, ra_range, dec_range):
    """Return the rows of `catalog` inside an inclusive RA/Dec box, re-indexed from 0.

    Parameters
    ----------
    catalog : table with 'ra' and 'dec' columns, as returned by `standardize()`.
    ra_range, dec_range : (low, high) pairs; both ends are inclusive.
    """
    inside = catalog['ra'].between(*ra_range) & catalog['dec'].between(*dec_range)
    return catalog[inside].reset_index(drop=True)


# Keep only the objects inside the common box. The CDFS catalog is standardized
# above but is not clipped.
edff_standardized = _clip_to_box(edff_standardized, (min_RA, max_RA), (min_DEC, max_DEC))
erosita_standardized = _clip_to_box(erosita_standardized, (min_RA, max_RA), (min_DEC, max_DEC))

In [4]:
# Write each standardized catalog to the input-catalog directory as a
# comma-separated file without the index column. Insertion order of the
# mapping fixes the write order: eROSITA, CDFS, EDFF.
catalogs_to_save = {
    "erosita_LYR.csv": erosita_standardized,
    "cdfs_LYR.csv": cdfs_standardized,
    "edff_LYR.csv": edff_standardized,
}
for csv_name, catalog in catalogs_to_save.items():
    catalog.to_csv(f"{config.INPUT_CATALOG_PATH}/{csv_name}", sep=",", index=False)