# Анализ рентгеновских транзиентов СРГ

In [None]:
from enum import Enum
from pathlib import Path
import re
import warnings

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearnex import patch_sklearn
from sklearn.decomposition import PCA, SparsePCA
from tqdm.auto import tqdm


# Charts configurations
np.set_printoptions(precision=3)
sns.set('talk', 'whitegrid', 'deep', font_scale=1.0,
        rc={"lines.linewidth": 2, 'grid.linestyle': '--'})
pd.set_option('display.max_rows', 400, 'display.max_columns', None, 'display.max_colwidth', 100)
plt.rcParams.update({'font.size': 24})
warnings.filterwarnings('ignore')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'


# Sklearn acceleration
patch_sklearn()


# Enums
TRANSIENT_SURVEY = Enum("TRANSIENT_SURVEY",
                        ["NONE", "eXVAR", "TDEs2", "FDS4", "TDEs4", "QPE", "eXVAGN3", "TDEs5", "TDEs2r7"])
TRANSIENT_CLASS = Enum("TRANSIENT_CLASS", ["NONE", "TDE", "QSO"])


# Paths
ASSEMBLED_DATA_PATH = Path("assembled_data/")
SRG_DATA_PATH = Path("srg_data/")
TRITON_DATA_PATH = Path("triton_data/")

## Подготовка данных

Чтение данных из системы разметки

In [None]:
# read data and rename columns

def get_column_name_mapper(prefix: str):
    """Build a function that prepends ``prefix`` and an underscore to a column name.

    The returned callable is meant to be passed to ``DataFrame.rename``.

    Args:
        prefix: Text placed in front of every column name. It is used
            verbatim, so braces in it are not treated as format fields.

    Returns:
        A one-argument callable mapping ``column_name`` to
        ``"<prefix>_<column_name>"``.
    """

    def add_prefix(column_name):
        return f"{prefix}_{column_name}"

    return add_prefix


# The meta-object table keeps its original column names.
meta_object_data = pd.read_parquet(SRG_DATA_PATH / "surveys_metaobject.parquet")

# Every survey table gets its columns prefixed with a short survey tag
# ("ero", "ls", "ps", "sdss", "gaia").
erosita_data = pd.read_parquet(SRG_DATA_PATH / "surveys_erosita.parquet")
erosita_data = erosita_data.rename(columns=get_column_name_mapper("ero"))

ls_data = pd.read_parquet(SRG_DATA_PATH / "surveys_ls.parquet")
ls_data = ls_data.rename(columns=get_column_name_mapper("ls"))

ps_data = pd.read_parquet(SRG_DATA_PATH / "surveys_ps.parquet")
ps_data = ps_data.rename(columns=get_column_name_mapper("ps"))

sdss_data = pd.read_parquet(SRG_DATA_PATH / "surveys_sdss.parquet")
sdss_data = sdss_data.rename(columns=get_column_name_mapper("sdss"))

gaia_data = pd.read_parquet(SRG_DATA_PATH / "surveys_gaia.parquet")
gaia_data = gaia_data.rename(columns=get_column_name_mapper("gaia"))

In [None]:
# Display the (rows, columns) shape of the meta-object table as the cell output.
meta_object_data.shape

In [None]:
# Merge tables.
# Left-join the ls, ps, sdss and gaia tables onto the eROSITA table through the
# "ero_<survey>_dup" -> "<survey>_id" columns. The shape is printed before and
# after the joins.
print(erosita_data.shape)
for counterpart_prefix, counterpart_table in (
    ("ls", ls_data),
    ("ps", ps_data),
    ("sdss", sdss_data),
    ("gaia", gaia_data),
):
    erosita_data = erosita_data.merge(
        counterpart_table,
        how="left",
        left_on=f"ero_{counterpart_prefix}_dup",
        right_on=f"{counterpart_prefix}_id",
    )
print(erosita_data.shape)


# Link meta objects to their eROSITA sources (both joins are inner joins):
# meta object -> relation table -> eROSITA source with its attached columns.
relation_path = SRG_DATA_PATH / "surveys_erosita_meta_objects.parquet"
meta_object_to_erosita_relation = pd.read_parquet(relation_path)
print(meta_object_to_erosita_relation.shape)
meta_object_to_erosita_relation = (
    meta_object_data
    .merge(meta_object_to_erosita_relation, left_on="id", right_on="metaobject_id")
    .merge(erosita_data, left_on="erosita_id", right_on="ero_id")
)
print(meta_object_to_erosita_relation.shape)

In [None]:
# As we do not have a master survey connection, keep for every meta object the
# source(s) with the maximum "ero_flux_05_20" value and save the result.
#
# The per-group maximum is broadcast back onto the rows with transform(), which
# avoids a Python-level loop over the groups and also works on an empty table.
flux_column = "ero_flux_05_20"
max_flux_per_meta_object = (
    meta_object_to_erosita_relation
    .groupby("metaobject_id")[flux_column]
    .transform("max")
)
# Rows tied for the maximum are all kept; rows with a missing flux never match.
has_max_flux = meta_object_to_erosita_relation[flux_column] == max_flux_per_meta_object

assmbled_srg_data = (
    meta_object_to_erosita_relation.loc[has_max_flux]
    # A stable sort keeps rows of one meta object together, ordered by
    # "metaobject_id", without reshuffling rows inside a meta object.
    .sort_values("metaobject_id", kind="stable")
)

# Make sure the output directory exists before writing.
ASSEMBLED_DATA_PATH.mkdir(parents=True, exist_ok=True)
assmbled_srg_data.to_parquet(ASSEMBLED_DATA_PATH / "srg_data.parquet", compression="GZIP")
assmbled_srg_data.shape

# Подготовка данных спектральных наблюдений

## Чтение данных с признаками и подготовка разметки

В этих данных содержатся комментарии наблюдателей; класс источника нужно выделить из них в отдельный столбец.

В первом приближении в качестве класса возьмем программу наблюдений.

In [None]:
def mag_ab_from_flux_nanomagies(flux):
    """Convert a flux into a magnitude as ``22.5 - 2.5 * log10(flux)``.

    Args:
        flux: A scalar or array-like accepted by ``np.log10``.

    Returns:
        The magnitude, with the same shape as ``flux``. Non-positive fluxes
        yield ``inf``/``nan`` because the logarithm is applied unchanged.
    """
    log_flux = np.log10(flux)
    return 22.5 - 2.5 * log_flux

# NOTE(review): unpickling can execute arbitrary code; only load this file
# when it comes from a trusted source.
triton_with_photometry_path = TRITON_DATA_PATH / "x1a" / "part-00000.features.gz_pkl"
triton_data = pd.read_pickle(triton_with_photometry_path, compression="gzip")

# Keep only rows whose programme name is a member of TRANSIENT_SURVEY.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the unpickled table.
known_programs = list(TRANSIENT_SURVEY.__members__)
triton_data = triton_data.loc[triton_data["Prog"].isin(known_programs)].copy()

# Derive one "ls_mag_<band>_ab" column from every "ls_flux_<band>" column.
for band in ("g", "r", "z", "w1", "w2"):
    triton_data[f"ls_mag_{band}_ab"] = mag_ab_from_flux_nanomagies(triton_data[f"ls_flux_{band}"])

triton_data.to_parquet(ASSEMBLED_DATA_PATH / "triton_data.parquet", compression="GZIP")

triton_data.shape

In [None]:
def _parse_redshift(raw_value):
    """Parse a redshift value that may use ',' as the decimal separator.

    Returns -1.0 for missing or unparseable values, so that they fail the
    "redshift > 0" selection below instead of raising.
    """
    if isinstance(raw_value, str):
        raw_value = raw_value.replace(",", ".")
    try:
        return float(raw_value)
    except (TypeError, ValueError):
        return -1.0


# Print the observer notes of every row that has a positive redshift.
parsed_redshifts = triton_data["Redshift_str"].apply(_parse_redshift)
for index, observer_notes in triton_data.loc[parsed_redshifts > 0, "ObserverNotes"].items():
    print(f"{index}\t{observer_notes}\n\n")

# Анализ с кластеризацией TBD

In [None]:
def calc_ls_colors(features):
    """Add every pairwise "ls_<a>-<b>_color" column to ``features``.

    For each pair of bands (a, b) taken in the order g, r, z, w1, w2 with a
    before b, the new column is ``ls_mag_<a>_ab - ls_mag_<b>_ab``.

    Args:
        features: Table containing the five "ls_mag_<band>_ab" columns.
            It is modified in place.

    Returns:
        The same ``features`` object, with ten colour columns added.
    """
    bands = ("g", "r", "z", "w1", "w2")
    for position, first_band in enumerate(bands):
        for second_band in bands[position + 1:]:
            color_column = f"ls_{first_band}-{second_band}_color"
            features[color_column] = (
                features[f"ls_mag_{first_band}_ab"] - features[f"ls_mag_{second_band}_ab"]
            )
    return features

In [None]:
# Build the feature table from the LS magnitudes of the assembled SRG data.
# .copy() gives an independent frame, so the columns added below do not write
# into a slice of assmbled_srg_data.
ls_magnitude_columns = ["ls_mag_g_ab", "ls_mag_r_ab", "ls_mag_z_ab", "ls_mag_w1_ab", "ls_mag_w2_ab"]
features = assmbled_srg_data[ls_magnitude_columns].copy()
# NOTE: despite its name, "ln_x" holds the base-10 logarithm of the flux.
features["ln_x"] = np.log10(assmbled_srg_data["ero_flux_05_20"])
features = calc_ls_colors(features)
# Keep only rows where every feature is present and finite. Infinities are
# treated as missing, matching the filtering used in the other feature cells;
# log10 of a zero flux would otherwise leave -inf in the table.
features = features.loc[features.replace([-np.inf, np.inf], np.nan).notna().all(axis=1)]


In [None]:
triton_data = pd.read_parquet(ASSEMBLED_DATA_PATH / "triton_data.parquet")

# Programmes whose sources are labelled "TDE"; every other programme is labelled "QSO".
tde_surveys = [TRANSIENT_SURVEY.TDEs2, TRANSIENT_SURVEY.TDEs4, TRANSIENT_SURVEY.TDEs5, TRANSIENT_SURVEY.TDEs2r7]
tde_program_names = [survey.name for survey in tde_surveys]

# Vectorised labelling: no per-row apply, and it also works on an empty table.
triton_data["Class"] = np.where(triton_data["Prog"].isin(tde_program_names), "TDE", "QSO")

ls_magnitude_columns = ["ls_mag_g_ab", "ls_mag_r_ab", "ls_mag_z_ab", "ls_mag_w1_ab", "ls_mag_w2_ab"]
features = triton_data[["Class", *ls_magnitude_columns]]
# Keep rows where every value is present and finite. .copy() gives an
# independent frame because calc_ls_colors() adds columns in place.
is_complete_row = features.replace([-np.inf, np.inf], np.nan).notna().all(axis=1)
features = features.loc[is_complete_row].copy()
features = calc_ls_colors(features)

features_to_show = features[["Class", "ls_g-r_color", "ls_g-w1_color", "ls_r-z_color", "ls_r-w1_color", "ls_z-w1_color", "ls_w1-w2_color"]]

sns.pairplot(features_to_show, hue="Class")

In [None]:
# Column groups of the feature table, listed in the order they are selected.
ps_kron_columns = ["psdr2_g_kron", "psdr2_r_kron", "psdr2_i_kron", "psdr2_z_kron", "psdr2_y_kron"]
ps_psf_columns = ["psdr2_g_psf", "psdr2_r_psf", "psdr2_i_psf", "psdr2_z_psf", "psdr2_y_psf"]
ps_color_columns = [
    "psdr2_g-r_psf", "psdr2_g-i_psf", "psdr2_g-z_psf", "psdr2_g-y_psf", "psdr2_g_psf-kron",
    "psdr2_r-i_psf", "psdr2_r-z_psf", "psdr2_r-y_psf", "psdr2_r_psf-kron",
    "psdr2_i-z_psf", "psdr2_i-y_psf", "psdr2_i_psf-kron",
    "psdr2_z-y_psf", "psdr2_z_psf-kron",
    "psdr2_y_psf-kron",
]
ls_mag_columns = ["ls_mag_g_ab", "ls_mag_r_ab", "ls_mag_z_ab", "ls_mag_w1_ab", "ls_mag_w2_ab"]

features = triton_data[["Class"] + ps_kron_columns + ps_psf_columns + ps_color_columns + ls_mag_columns]

# Keep only the rows in which every selected column is present and finite.
without_infinities = features.replace([-np.inf, np.inf], np.nan)
row_is_complete = without_infinities.notna().all(axis=1)
features = features.loc[row_is_complete]

# Plot a subset of the columns, coloured by class.
columns_to_show = [
    "Class",
    "psdr2_i-z_psf",
    "psdr2_i-y_psf",
    "psdr2_i_psf-kron",
    "psdr2_z-y_psf",
    "psdr2_z_psf-kron",
    "psdr2_y_psf-kron",
]
features_to_show = features[columns_to_show]

sns.pairplot(features_to_show, hue="Class", markers=["x", "+"])

In [None]:
pca = PCA(n_components=5)

features = triton_data[["Class", "psdr2_g_kron", "psdr2_r_kron", "psdr2_i_kron", "psdr2_z_kron", "psdr2_y_kron", "psdr2_g_psf",
                        "psdr2_r_psf", "psdr2_i_psf", "psdr2_z_psf", "psdr2_y_psf", "psdr2_g-r_psf", "psdr2_g-i_psf", "psdr2_g-z_psf",
                        "psdr2_g-y_psf", "psdr2_g_psf-kron", "psdr2_r-i_psf", "psdr2_r-z_psf", "psdr2_r-y_psf", "psdr2_r_psf-kron",
                        "psdr2_i-z_psf", "psdr2_i-y_psf", "psdr2_i_psf-kron", "psdr2_z-y_psf", "psdr2_z_psf-kron", "psdr2_y_psf-kron",
                        ]]
# Keep only the rows in which every selected column is present and finite.
features = features.loc[features.replace([-np.inf, np.inf], np.nan).notna().all(axis=1)]

# Fit the PCA on the numeric columns only; "Class" is the label, not a feature.
features_pca = pca.fit_transform(features.drop(columns="Class").to_numpy())

# Reuse the original index so that the class labels align with the projected rows.
data = pd.DataFrame(features_pca, index=features.index)
data["Class"] = features["Class"]

# sns.pairplot() creates its own figure, so a preceding plt.figure() would only
# leave an empty extra figure behind. The intended 20x20 inch size is requested
# through the per-facet height instead (5 components x 4 inches).
sns.pairplot(data, hue="Class", markers=["x", "+"], height=4)
plt.show()