In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import seaborn as sns

In [None]:
# Load the visual-inspection (VI) classifications, one file per inspector.
# Each inspector becomes a dict with a 'class' entry (the VI class code) and,
# when the file carries a real/mock guess, a boolean 'mock_real' entry that is
# True where the inspector answered 'r'.
dirname = '/home/alberto/almacen/Selected_LAEs/obs_mock_vi/'

# Sid: headerless, space-separated file; column 1 = class, column 2 = real/mock flag
vi_path_name = f'{dirname}/ALTEROS_VI_250.txt'
data = pd.read_csv(vi_path_name, sep=' ', header=None)
is_real_sgl = data[2] == 'r'
sgl = {
    'class': data[1],
    'mock_real': is_real_sgl
}

# Daniele: headerless, space-separated file; only the class column is used
vi_path_name = f'{dirname}/20230222_classification_dani_first.txt'
data = pd.read_csv(vi_path_name, sep=' ', header=None)
ds = {
    'class': data[1]
}

# Pablo: CSV with a header row
vi_path_name = f'{dirname}/visual_inspection_cat20230222_pablo.csv'
data = pd.read_csv(vi_path_name)
is_real_pam = data['Real/Mock'] == 'r'
pam = {
    'class': data['CLASS'],
    'mock_real': is_real_pam
}

# AlbertoFS: CSV with a header row
vi_path_name = f'{dirname}/afs_class.csv'
data = pd.read_csv(vi_path_name)
is_real_afs = data['Mock?'] == 'r'
afs = {
    'class': data['Key'],
    # Fix: this entry previously stored Pablo's guesses (is_real_pam), so the
    # AlbertoFS real/mock score was actually Pablo's.
    'mock_real': is_real_afs
}

# AlbertoTT: headerless CSV; only the class column is used
vi_path_name = f'{dirname}/att_class.csv'
data = pd.read_csv(vi_path_name, header=None)
att = {
    'class': data[1],
}

# Number of inspected candidates, taken from the length of Pablo's table
N_cand = len(is_real_pam)

# Consensus class per candidate. A candidate is assigned class `cl` (1, 2 or 3)
# when at least 3 inspectors voted `cl` and every other inspector voted 4;
# candidates that never satisfy this keep the initial value 4.
# NOTE(review): the previous heading of this section read "4 out of 5", but the
# threshold applied by the code is 3 votes -- confirm which one is intended.
person_list = [sgl, ds, afs, att, pam]
all_class = {'class': np.ones(N_cand) * 4}
for cl in (1, 2, 3):
    # Per-candidate vote tallies, rebuilt for every class under test
    this_cl_count = np.zeros(N_cand)
    idk_count = np.zeros(N_cand)
    for person in person_list:
        votes = person['class']
        this_cl_count = this_cl_count + (votes == cl)
        idk_count = idk_count + (votes == 4)

    enough_votes = this_cl_count >= 3
    nobody_disagrees = (this_cl_count + idk_count == 5)
    mask_vi_confidence = enough_votes & nobody_disagrees
    all_class['class'][mask_vi_confidence] = cl

In [None]:
# Ground truth for the mock sources mixed into the inspected sample
id_arr_to_plot = np.load(f'{dirname}/id_arr_to_plot.npy')
shuffle_indices = np.load(f'{dirname}/shuffle_indices.npy')
with open(f'{dirname}/mock_sel_dict.pkl', 'rb') as f:
    mock_sel_dict = pickle.load(f)

# Mock class label -> integer code
class_dict = {'LAE_QSO': 1, 'CONT_QSO': 2, 'GAL': 3, 'SFG': 5}

# -1 means real source; only the first 128 shuffled positions get a mock code
mock_class = np.full(N_cand, -1, dtype=int)
for i in range(128):
    src = shuffle_indices[i]
    # Find where this candidate sits in the shuffle, then map that position
    # to its entry in the mock catalogue
    position = np.where(shuffle_indices == src)
    mock_ids = id_arr_to_plot[position]
    cl = mock_sel_dict['class'][mock_ids][0]
    mock_class[src] = class_dict[cl]

# Number of candidates whose true class code is 1 (LAE_QSO)
sum(mock_class == 1)

In [None]:
# Catalogue columns for the non-mock candidates, scattered into arrays of
# length N_cand. Entries not written here keep the placeholder value -1.
selection = pd.read_csv('csv/selection.csv')
real_positions = shuffle_indices[128:]

starprob_sel = selection['starprob'].to_numpy()
starprob = np.full(N_cand, -1.0)
starprob[real_positions] = starprob_sel

zspec_sel = selection['SDSS_zspec'].to_numpy()
zspec = np.full(N_cand, -1.0)
zspec[real_positions] = zspec_sel

src_sel = selection['src'].to_numpy()
src = np.full(N_cand, -1.0)
src[real_positions] = src_sel

In [None]:
# True everywhere except at the first 128 shuffled positions (the mocks)
is_real = np.full(N_cand, True)
is_real[shuffle_indices[:128]] = False

In [None]:
def plot_conf_mat_vi(class_dict, name):
    """Plot the confusion matrix of one set of VI classes against the mock truth.

    Parameters
    ----------
    class_dict : dict
        Must contain 'class' (VI class code per candidate). If it also contains
        'mock_real', the number of entries equal to the notebook-level
        ``is_real`` array is printed first.
    name : str
        Label used in the x-axis title.

    Rows count candidates with ``mock_class`` equal to 1, 2, 3 and columns
    count VI classes 1, 2, 3, 4. Reads ``is_real``, ``N_cand`` and
    ``mock_class`` from the notebook namespace.
    """
    if 'mock_real' in class_dict:
        n_agree = sum(class_dict["mock_real"] == is_real)
        print(f'Mock or real guess: {n_agree}/{N_cand}')

    # Confusion matrix: one row per true class, one column per guessed class
    vi_class = class_dict['class']
    cf_mat = np.array(
        [
            [sum((vi_class == guess) & (mock_class == reality))
             for guess in (1, 2, 3, 4)]
            for reality in (1, 2, 3)
        ],
        dtype=float,
    )

    fig, ax = plt.subplots()
    sns.heatmap(cf_mat, ax=ax, annot=True, cbar=False)

    ax.set_ylabel('REAL class', fontsize=18)
    ax.set_xlabel(f'{name}\'s VI class', fontsize=18)

    true_labels = ['LAE_QSO', 'CONT_QSO', 'GAL']
    ax.set_yticklabels(true_labels)
    ax.set_xticklabels(true_labels + ['IDK'])

    plt.show()

# One confusion matrix per inspector, then one for the consensus class
vi_sets = (
    (sgl, 'Sid'),
    (pam, 'Pablo'),
    (ds, 'Daniele'),
    (afs, 'AlbertoFS'),
    (att, 'AlbertoTT'),
    (all_class, 'All'),
)
for vi_set, vi_name in vi_sets:
    plot_conf_mat_vi(vi_set, vi_name)

In [None]:
# Consensus-class counts restricted to the non-mock candidates
print('REAL OBS. DATA')
mask_real = mock_class == -1
summary_rows = (
    ('QSO LAEs ', 1),
    ('QSO Cont.', 2),
    ('Gal Cont.', 3),
    ('No class ', 4),
)
for row_label, class_code in summary_rows:
    n_in_class = sum(mask_real & (all_class["class"] == class_code))
    print(f'{row_label}: {n_in_class}')

In [None]:
# Scratch arithmetic; the cell only displays the sum of these three literals.
# NOTE(review): presumably these are counts read off an earlier output --
# confirm where the numbers come from, or delete the cell.
36 + 10 + 15

In [None]:
# starprob distribution per consensus class. The bin edges of the second and
# third histograms are shifted slightly so the step outlines do not overlap.
fig, ax = plt.subplots()

bins = np.linspace(-0.1, 1, 30)
hist_specs = (
    (3, 0.0, 'CLASS=GAL'),
    (2, 0.02, 'CLASS=QSO_CONT'),
    (4, 0.01, 'CLASS=IDK'),
)
for class_code, bin_shift, hist_label in hist_specs:
    in_class = all_class['class'] == class_code
    ax.hist(starprob[in_class], bins + bin_shift,
            histtype='step', lw=2, label=hist_label)

ax.legend(loc=9)
ax.set_xlabel('starprob')

plt.show()

In [None]:
# 'src' values of the non-mock candidates whose consensus class is 3
print(list(src[mask_real & (all_class['class'] == 3)].astype(int)))

## count removed sources
print()
print(f'QSOs: {sum((mask_real & (all_class["class"] == 2)) | ((zspec > 0) & (zspec < 1.9)))}')
print(f'GALs: {sum((mask_real & (all_class["class"] == 3)) | (mask_real & (starprob < 0.1)))}')
print(f'No class, no LAE: {sum(mask_real & (all_class["class"] == 4) & np.isnan(zspec) & (starprob >= 0.1))}')

# Work on a copy: the overrides below used to be written straight into
# all_class['class'], which changed the consensus labels used by the earlier
# cells and made this cell give different counts when re-run.
all_class_Arr = all_class['class'].copy()
all_class_Arr[((zspec > 0) & (zspec < 1.9))] = 2
# Restrict the starprob override to non-mock candidates: mocks carry the
# placeholder starprob == -1 and would otherwise all be relabelled as class 3.
# This override runs last, so it wins where both conditions hold.
# NOTE(review): the count above uses `starprob < 0.1` while this assignment
# uses `<= 0.1`; they differ for values exactly equal to 0.1 -- confirm which
# threshold is intended.
all_class_Arr[mask_real & (starprob <= 0.1)] = 3

# save VI class (entries after the first 128 shuffled positions, i.e. non-mocks)
a = all_class_Arr[shuffle_indices][128:]
np.save('npy/class_vi_all.npy', a)