In [1]:
# Importing modules
import numpy as np
import pandas as pd
import numba
import pyarrow.dataset as ds

In [2]:
# Notebook display preferences: show every column, but truncate long frames
# to 10 rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 10

In [3]:
# Read the G09 parquet file into a pandas dataframe indexed by uberID.
# The pyarrow dataset and table only exist as temporaries of this expression,
# so nothing apart from `df` is left behind in the namespace.
df = (
    ds.dataset("../data/G09_1p1_Z22_unmasked.parquet", format="parquet")
    .to_table()
    .to_pandas()
    .set_index('uberID')
)

# Store the uberID index as 64-bit integers
df.index = df.index.astype('int64')

In [4]:
# Getting rid of the padding spaces in the class column (values arrive as
# e.g. 'star      ').  Stripping through the .str accessor handles any amount
# of padding on any label, instead of relying on a hand-written list of the
# exact padded strings, which would silently miss a label padded differently.
# Missing values are left untouched.  assign() returns a new frame, so `df`
# is rebound rather than modified in place.
df = df.assign(**{'class': df['class'].str.strip()})

In [5]:
# Sample selection: keep rows with mag_Zt below 21.2, with both the mask and
# starmask flags equal to 0, and whose class is not 'artefact'
df = df.loc[
    df['mag_Zt'].lt(21.2)
    & df['mask'].eq(0)
    & df['starmask'].eq(0)
    & df['class'].ne('artefact')
]

In [6]:
# Getting a subset of the data (for the purpose of this tutorial): every 100th
# row is kept, i.e. a 1/100 subset.
# NOTE(review): this comment used to say 1/10 while the step was already 100.
# The step is left unchanged; confirm which was intended (presumably the label
# files read below were produced with the same subsetting).
# .iloc makes the positional slicing explicit, and .copy() turns the subset
# into an independent frame so that adding columns to it in the following
# cells does not trigger SettingWithCopyWarning.
df = df.iloc[::100].copy()

In [7]:
# Load the u and no u label files written by notebooks 1 and 2 into their own
# dataframes, using the uberID column as the index
df_u = pd.read_csv('../data/G09_u_labels.csv', index_col='uberID')
df_no_u = pd.read_csv('../data/G09_no_u_labels.csv', index_col='uberID')

In [8]:
# Creating new columns in the main dataframe for the u and no u labels.
# The placeholder value is still 0.0, but the columns are created with object
# dtype: string labels are written into them afterwards, and writing strings
# into a float64 column is an incompatible-dtype assignment (deprecated by
# pandas under PDEP-6 and slated to become an error).
df['cluster_label'] = np.full(len(df), 0.0, dtype=object)
df['cluster_label_no_u'] = np.full(len(df), 0.0, dtype=object)

In [9]:
# Copy the labels, converted to strings, into the rows of the main dataframe
# that share an uberID with each label file: u labels go to 'cluster_label',
# no u labels go to 'cluster_label_no_u'
df.loc[df_u.index, 'cluster_label'] = df_u['cluster_label'].map(str)
df.loc[df_no_u.index, 'cluster_label_no_u'] = df_no_u['cluster_label'].map(str)

In [10]:
# Label values that are copied from the cluster label columns into the final label
labels = [
    'star',
    'galaxy',
]

In [11]:
# Creating a 'final label' column full of 1's to combine the two labels.
# The column is created with object dtype so that the string labels assigned
# to it afterwards are not written into a float64 column (an incompatible-dtype
# assignment that pandas has deprecated).  The placeholder stays the float 1.0
# so that it still turns into '1.0' on string conversion, which the later
# comparisons against '1.0' rely on.
df['final_label'] = np.full(len(df), 1.0, dtype=object)

In [12]:
# First applying the no u labels to the final label.
# The boolean mask is handed straight to .loc, so the matching rows themselves
# are updated: no intermediate filtered frame is built and no second lookup by
# index value is needed (which would also touch non-matching rows if index
# values were ever repeated).
for label in labels:
    df.loc[df['cluster_label_no_u'] == label, 'final_label'] = label

In [13]:
# Then applying the u labels to the final label, overwriting any existing no u labels.
# The boolean mask is handed straight to .loc, so the matching rows themselves
# are updated: no intermediate filtered frame is built and no second lookup by
# index value is needed (which would also touch non-matching rows if index
# values were ever repeated).
for label in labels:
    df.loc[df['cluster_label'] == label, 'final_label'] = label

In [14]:
# Converting every entry of the final label column to a string
df['final_label'] = df['final_label'].map(str)

In [15]:
# Creating a new column to indicate whether a source has any missing bands.
# If the final label is still '1.0' (the stringified placeholder), no label
# has been added, which is taken to mean the source has a missing band.
# The comparison yields the boolean column directly, instead of filling it
# with True and then resetting rows through a filtered-frame index lookup.
df['missing_bands'] = df['final_label'] == '1.0'

In [16]:
# For the sources with missing labels (final label still the '1.0'
# placeholder) we then overwrite the final label with the 'class' value.
# The mask is computed once and used for both sides of the assignment, rather
# than filtering the frame twice and looking the rows up again by index value.
unlabelled = df['final_label'] == '1.0'
df.loc[unlabelled, 'final_label'] = df.loc[unlabelled, 'class']

In [17]:
# Fraction of the sample flagged as having missing bands (only about 1.5%).
# Summing the boolean column counts the flagged rows; int() keeps the result
# a plain Python float.
int(df['missing_bands'].sum()) / len(df)

0.01547957284216714

In [18]:
# Write the class, missing-band flag and final label (plus the uberID index)
# to CSV
df.to_csv('../data/G09_labels.csv', columns=['class', 'missing_bands', 'final_label'])