In [None]:
import os, sys, time
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from astropy.io import fits
from astropy.table import Table
# Notebook-wide matplotlib defaults: serif font family at size 12.
plt.rcParams.update({'font.family': 'serif', 'font.size': 12})

import dask
import dask.dataframe as dd

import warnings
# Hide DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Turn off pandas' chained-assignment check (no warning, no error).
pd.options.mode.chained_assignment = None

# Text style dictionary (serif, dark red, normal weight, size 16).
font = dict(family='serif', color='darkred', weight='normal', size=16)

In [None]:
# Load the two pickled input tables from the working directory.
# NOTE: unpickling executes arbitrary code -- only read files from a trusted source.
table_lya = pd.read_pickle('SC4K_COSMOS')
table_no_lya = pd.read_pickle('NEW_COSMOS')

### Extracting 5 subsamples from the Non-LAE Sample that have the same (i) size, (ii) I-band magnitude distribution and (iii) redshift distribution as the LAE Sample.

In [None]:
# --- Matching configuration ---------------------------------------------------
z_bins = [2, 3, 4, 5, 6]
m_bins = [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
z_column_lya = 'Redshift'
z_column_no_lya = 'lp_zBEST'
mag_column = 'HSC_i_MAG_APER3'
id_column = 'ID'
n_samples = 5
padding_step = 0.1   # amount by which a bin window grows on each retry
random_seed = 42     # fixed so that Restart & Run All reproduces the same subsamples

# Number of LAEs in every (redshift, magnitude) cell. pd.cut builds right-closed
# intervals (left, right]; rows outside the outermost edges or with NaN values are
# not counted. observed=False keeps the empty cells (skipped below).
pdz_bins_lya = pd.cut(table_lya[z_column_lya], bins=z_bins)
pdm_bins_lya = pd.cut(table_lya[mag_column], bins=m_bins)
bin_counts = table_lya.groupby([pdz_bins_lya, pdm_bins_lya], observed=False).size()


def draw_matched_samples(pool, bin_counts, n_samples, z_column, mag_column,
                         id_column, padding_step=0.1, seed=None):
    """Draw ``n_samples`` disjoint subsamples of ``pool`` that follow ``bin_counts``.

    For every (redshift interval, magnitude interval) entry of ``bin_counts`` the
    requested number of rows is drawn at random, without replacement, from the
    rows of ``pool`` that fall inside that cell and have not been drawn before.
    If the cell holds too few rows, the window is widened by ``padding_step`` on
    every side until enough rows are found or no unused row is left outside it.

    Parameters
    ----------
    pool : pandas.DataFrame
        Table to draw from.
    bin_counts : pandas.Series
        Counts indexed by ``(z_interval, mag_interval)`` pairs of ``pd.Interval``.
    n_samples : int
        Number of subsamples to build.
    z_column, mag_column, id_column : str
        Names of the redshift, magnitude and identifier columns of ``pool``.
    padding_step : float, default 0.1
        Increment applied to the window padding on each retry; must be positive.
    seed : int or None
        Seed for ``numpy.random.default_rng``.

    Returns
    -------
    samples : list of pandas.DataFrame
        One frame per subsample, rows ordered bin by bin.
    taken : numpy.ndarray of bool
        Mask over the rows of ``pool`` marking everything that was drawn (plus
        any other row sharing an identifier with a drawn row).

    Raises
    ------
    ValueError
        If ``padding_step`` is not positive.
    """
    if padding_step <= 0:
        raise ValueError("padding_step must be positive")

    rng = np.random.default_rng(seed)
    z_values = pool[z_column].to_numpy(dtype=float)
    mag_values = pool[mag_column].to_numpy(dtype=float)
    id_values = pool[id_column].to_numpy()

    # Rows with a NaN redshift or magnitude can never fall inside a window.
    usable = np.isfinite(z_values) & np.isfinite(mag_values)
    taken = np.zeros(len(pool), dtype=bool)

    drawn = []
    for _ in range(n_samples):
        picked = []
        for (z_bin, m_bin), count in bin_counts.items():
            if count == 0:
                continue

            free = usable & ~taken
            n_free = int(free.sum())
            padding = 0.0
            while True:
                # Same (left, right] convention as the pd.cut bins of the reference.
                in_window = (free
                             & (z_values > z_bin.left - padding)
                             & (z_values <= z_bin.right + padding)
                             & (mag_values > m_bin.left - padding)
                             & (mag_values <= m_bin.right + padding))
                n_found = int(in_window.sum())
                # Stop when the cell is filled, or when the window already holds
                # every unused row (widening further could never help).
                if n_found >= count or n_found == n_free:
                    break
                padding += padding_step

            candidates = np.flatnonzero(in_window)
            if candidates.size < count:
                warnings.warn(
                    f"Only {candidates.size} of {count} rows available for "
                    f"z in {z_bin}, mag in {m_bin}; the subsample will be short."
                )
            chosen = rng.choice(candidates, size=min(int(count), candidates.size),
                                replace=False)
            # Mark immediately so a later (padded) window of this same subsample
            # cannot pick the row a second time.
            taken[chosen] = True
            picked.append(chosen)

        rows = np.concatenate(picked) if picked else np.empty(0, dtype=int)
        drawn.append(pool.iloc[rows])
        # Also exclude by identifier between subsamples.
        taken |= np.isin(id_values, id_values[rows])

    return drawn, taken


samples, _taken_rows = draw_matched_samples(
    table_no_lya, bin_counts, n_samples,
    z_column=z_column_no_lya, mag_column=mag_column, id_column=id_column,
    padding_step=padding_step, seed=random_seed,
)

# Non-LAE rows that were not assigned to any subsample.
table_for_selection = table_no_lya[~_taken_rows]

### Example of the redshift and I-band distributions for one subsample

In [None]:
# I-band magnitude histogram: LAE sample versus one matched non-LAE subsample.
# The index and the title are tied together so the label always names the data shown
# (index 0 is the subsample stored as "Sample 1").
example_index = 0
example_sample = samples[example_index]

fig, ax = plt.subplots()
ax.hist(x=np.array(table_lya["HSC_i_MAG_APER3"]), bins=m_bins, ec='black', color='r',
        alpha=0.6, label='LAE')
ax.hist(x=np.array(example_sample["HSC_i_MAG_APER3"]), bins=m_bins, ec='black', color='b',
        alpha=0.3, label='NON-LAE')
ax.grid(axis='y', alpha=0.75)
ax.set_xlabel('Magnitude')
ax.set_ylabel('Frequency')
ax.set_title(f'I Band Distribution (Sample {example_index + 1})')
ax.set_ylim([0, 3000])
ax.set_xlim([20, 30])
ax.legend()
plt.show()

In [None]:
# Redshift histogram: LAE sample versus one matched non-LAE subsample.
# The index and the title are tied together so the label always names the data shown
# (index 0 is the subsample stored as "Sample 1").
example_index = 0
example_sample = samples[example_index]

fig, ax = plt.subplots()
ax.hist(x=np.array(table_lya["Redshift"]), bins=z_bins, ec='black', color='r',
        alpha=0.6, label='LAE')
ax.hist(x=np.array(example_sample["lp_zBEST"]), bins=z_bins, ec='black', color='b',
        alpha=0.3, label='NON-LAE')
ax.grid(axis='y', alpha=0.75)
ax.set_xlabel('Redshift')
ax.set_ylabel('Frequency')
ax.set_title(f'Redshift Distribution (Sample {example_index + 1})')
ax.set_xlim([0, 8])
ax.legend()
plt.show()

In [None]:
# Give the five non-LAE subsamples their own names.
(sample_nolya1, sample_nolya2, sample_nolya3,
 sample_nolya4, sample_nolya5) = (samples[k] for k in range(5))

# Class label: 0 for every non-LAE subsample, 1 for the LAE table.
for non_lae in (sample_nolya1, sample_nolya2, sample_nolya3,
                sample_nolya4, sample_nolya5):
    non_lae['Class'] = 0
table_lya['Class'] = 1

# One data set per subsample: the LAE rows followed by that subsample's rows,
# plus two 0/1 indicator columns derived from 'Class'.
combined = []
for non_lae in (sample_nolya1, sample_nolya2, sample_nolya3,
                sample_nolya4, sample_nolya5):
    frame = pd.concat([table_lya, non_lae])
    frame['isLya'] = np.where(frame['Class'] == 1, 1, 0)
    frame['noLya'] = np.where((frame['Class'] == 0), 1, 0)
    combined.append(frame)

df1, df2, df3, df4, df5 = combined

# Write each data set to a pickle file named Sample1 ... Sample5.
for number, frame in enumerate(combined, start=1):
    frame.to_pickle(f'Sample{number}')

In [None]:
# All non-LAE subsamples stacked into a single table.
filtered_table = pd.concat(samples)

colors = {'LAE': '#4daf4a', 'Non-LAE': '#e41a1c'}
markers = {'LAE': 'o', 'Non-LAE': 's'}

# Bin edges applied to 'lp_mass_best'; the midpoints are kept for reference.
bin_edges = np.array([7,8,8.5,9,9.5,10,10.5,11.5])
bin_midpoints = 0.5 * (bin_edges[:-1] + bin_edges[1:])


def weighted_avg(group):
    """Return the mean of 'lp_SFR_best' weighted by 'lp_zPDF' for one group."""
    return np.average(group['lp_SFR_best'], weights=group['lp_zPDF'])


def get_errors(group):
    """Summarise one mass bin.

    Returns a Series with the median of 'lp_SFR_med' and 'lp_mass_med' and, for
    each, the distance from that median to the median of the matching
    '*_min68' / '*_max68' column.
    """
    sfr_median = group['lp_SFR_med'].median()
    mass_median = group['lp_mass_med'].median()
    return pd.Series({
        'sfr_median': sfr_median,
        'sfr_min': sfr_median - group['lp_SFR_med_min68'].median(),
        'sfr_max': group['lp_SFR_med_max68'].median() - sfr_median,
        'mass_median': mass_median,
        'mass_min': mass_median - group['lp_mass_med_min68'].median(),
        'mass_max': group['lp_mass_med_max68'].median() - mass_median
    })


def plot_binned_sfr(ax, summary, key, label):
    """Draw one error-bar series from a per-bin summary produced by get_errors."""
    # observed=True drops empty mass bins, so the x positions are taken from the
    # bins that are actually present instead of the fixed bin_midpoints array
    # (which would have a different length and make errorbar raise).
    centres = np.array([interval.mid for interval in summary.index])
    # NOTE(review): the x error bars are measured around the per-bin median mass
    # but drawn around the bin midpoint -- confirm this is intended.
    ax.errorbar(centres, summary['sfr_median'],   # signed median: no abs() here,
                # otherwise a negative median is mirrored away from its error bars
                xerr=[summary['mass_min'].abs(), summary['mass_max'].abs()],
                yerr=[summary['sfr_min'].abs(), summary['sfr_max'].abs()],
                fmt=markers[key], color=colors[key], linestyle='-',
                linewidth=2, markersize=8, alpha=0.8, label=label)


table_lya['lp_mass_bin'] = pd.cut(table_lya['lp_mass_best'], bins=bin_edges)
aggregate_lya = table_lya.groupby('lp_mass_bin', observed=True).apply(weighted_avg)

filtered_table['lp_mass_bin'] = pd.cut(filtered_table['lp_mass_best'], bins=bin_edges)
aggregate_filtered = filtered_table.groupby('lp_mass_bin', observed=True).apply(weighted_avg)

errors_lya = table_lya.groupby('lp_mass_bin', observed=True).apply(get_errors)
errors_filtered = filtered_table.groupby('lp_mass_bin', observed=True).apply(get_errors)

fig, ax = plt.subplots(figsize=(7, 5))

plot_binned_sfr(ax, errors_lya, 'LAE', 'LAE Sample')
plot_binned_sfr(ax, errors_filtered, 'Non-LAE', 'Non-LAE Sample')

ax.set_xlabel(r'$\log_{10}(M_{\ast}/M_{\odot})$')
# NOTE(review): the y range below includes negative values, so the plotted
# quantity is presumably a logarithm -- confirm and adjust the label if so.
ax.set_ylabel(r'SFR (M$_{\odot}$/yr)')
ax.set_xlim([7, 12])
ax.set_ylim([-1, 2.5])
ax.grid(True, linestyle='--', alpha=0.3)
ax.legend()
plt.tight_layout()
#plt.savefig('Mass_vs_SFR_median.png', dpi=1000)
plt.show()

In [None]:
plt.rcParams.update({'font.family': 'serif', 'font.size': 12})

# Magnitude columns shown in the 3x3 grid, one panel per column.
columns_to_plot = ["CFHT_ustar_MAG_APER3", "HSC_g_MAG_APER3",
                   "HSC_r_MAG_APER3", "HSC_i_MAG_APER3", "HSC_z_MAG_APER3","UVISTA_Y_MAG_APER3", "UVISTA_J_MAG_APER3",
                   "UVISTA_H_MAG_APER3", "UVISTA_Ks_MAG_APER3"]

# (legend label, table, colour, vertical position of the stats box in axes units)
populations = [
    ('LAE', table_lya, '#4daf4a', 0.95),
    ('nLAE', filtered_table, '#e41a1c', 0.68),
]


def describe_column(values):
    """Return the multi-line max/min/median/std/NaN-count text for one column."""
    return (f"Max: {values.max():.2f}\nMin: {values.min():.2f}\n"
            f"Median: {values.median():.2f}\nStd: {values.std():.2f}\n"
            f"NaNs: {values.isna().sum()}")


fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(12, 12))

axes = axes.flatten()

for ax, column in zip(axes, columns_to_plot):
    for label, table, colour, _ in populations:
        sns.histplot(table[column], ax=ax, kde=True, label=label, color=colour,
                     stat='density', common_norm=False)

    ax.set_xlabel(column + ' ' + '[AB]')
    # The histograms are drawn with stat='density', so the axis shows a density.
    ax.set_ylabel('Density')

    # Colour-coded summary box for each population (same colours as the histograms).
    for _, table, colour, box_top in populations:
        ax.text(0.65, box_top, describe_column(table[column]), transform=ax.transAxes,
                fontsize=10, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor=colour, alpha=0.5))

plt.tight_layout()
#plt.savefig('Magnitudes_distribution(lae and non-lae).png', dpi=1000)

plt.show()