# Explanation

In `get_fluxes.ipynb` I used a SQL query to download the positions and fluxes of every HSC object in the COSMOS field (pdr1, median seeing, forced table).  From there, I built a basic classifier that gives a rough probability that a particular galaxy is low-mass and low-redshift.  I saved the ids of the best 100 candidates as the "best" sample, the worst 100 candidates as the "worst" sample, and another 100 candidates chosen at random (from the training set) as the "random" sample.

Unfortunately, I have no information about the sizes of these galaxies, so I don't know how big their postage stamps should be. This notebook takes the ids of each sample, queries the remote database for their shapes, and saves the results locally.

# Code
 **Remember to set your credentials within `hsc_credentials.py` !**

In [None]:
from hsc_credentials import credential

In [None]:
from hscReleaseQuery import query_wrapper

In [None]:
import numpy as np
import glob
import os

In [None]:
# Collect the id lists and key each path by its sample label, i.e. the
# file's base name with ".csv" and "ids_" removed.
filenames = glob.glob("../quick_sample/*.csv")
filenames_dict = dict(
    (os.path.basename(path).replace(".csv", "").replace("ids_", ""), path)
    for path in filenames
)

# Build the queries

In [None]:
def build_query_from_ids_filename(ids_filename):
    """Build the SQL query that fetches shape measurements for a list of ids.

    Parameters
    ----------
    ids_filename : str
        Path to a text file readable by ``np.loadtxt`` that holds integer
        object ids.

    Returns
    -------
    str
        A ``SELECT`` statement against ``pdr1_cosmos_widedepth_median.forced``
        whose ``IN (...)`` clause lists the ids in ascending order, one per
        line.

    Raises
    ------
    ValueError
        If the file contains no ids; the resulting ``IN ()`` clause would
        not be valid SQL.
    """
    # ndmin=1 keeps a file with a single id from collapsing to a 0-d array,
    # which cannot be iterated or sorted.  int64 is requested explicitly
    # because the platform default `int` can be 32 bits wide.
    ids = np.loadtxt(ids_filename, dtype=np.int64, ndmin=1)
    if ids.size == 0:
        raise ValueError("no object ids found in {!r}".format(ids_filename))

    # One id per line, comma-separated, with no trailing comma.
    ids_str = ",\n".join(
        "    {:d}".format(object_id) for object_id in np.sort(ids).tolist()
    )

    shapes_sql = """
SELECT 
    object_id, 
    ra, dec,
    gshape_sdss_11, gshape_sdss_22, gshape_sdss_12, gshape_sdss_flags,
    rshape_sdss_11, rshape_sdss_22, rshape_sdss_12, rshape_sdss_flags,
    ishape_sdss_11, ishape_sdss_22, ishape_sdss_12, ishape_sdss_flags,
    zshape_sdss_11, zshape_sdss_22, zshape_sdss_12, zshape_sdss_flags,
    yshape_sdss_11, yshape_sdss_22, yshape_sdss_12, yshape_sdss_flags
FROM
    pdr1_cosmos_widedepth_median.forced
WHERE
    object_id IN (
{}
    ) """.format(ids_str)

    return shapes_sql

# Make the queries

In [None]:
# These settings are identical for every sample, so they are set once.
# They are forwarded positionally to `query_wrapper`; their exact meaning is
# defined by hscReleaseQuery (NOTE(review): confirm against that module).
preview_results = False
delete_job = True
out_format = "sqlite3"

# Run one remote query per sample and write each result to
# "shapes_<label>.sqlite3" in the working directory.
for label, ids_filename in filenames_dict.items():
    shapes_sql = build_query_from_ids_filename(ids_filename)

    output_filename = "shapes_{}.sqlite3".format(label)
    print(output_filename)

    try:
        with open(output_filename, mode="wb") as output_file:
            query_wrapper(credential, shapes_sql, preview_results, delete_job,
                          out_format, output_file,
                          nomail=True,
                         )
    except BaseException:
        # The file is created before the query runs, so a failed or
        # interrupted query would otherwise leave an empty/truncated
        # database that later cells would try to read.
        if os.path.exists(output_filename):
            os.remove(output_filename)
        raise

# Check if it worked

In [None]:
# List the shapes_*.sqlite3 files in the working directory with their sizes.
!ls -lh shapes_*.sqlite3

In [None]:
import pandas as pd

In [None]:
# Read "table_1" from each sample's sqlite3 file, indexed by object_id.
shape_tables = {
    sample: pd.read_sql_table(
        "table_1",
        "sqlite:///shapes_{}.sqlite3".format(sample),
        index_col="object_id",
    )
    for sample in ("best", "worst", "random")
}

df_best = shape_tables["best"]
df_worst = shape_tables["worst"]
df_random = shape_tables["random"]

# Tag every row with the sample it came from so the combined table can
# still be split by sample.
for sample, table in shape_tables.items():
    table["type"] = sample

df_all = pd.concat([df_best, df_worst, df_random])

In [None]:
# Preview the first few rows of the combined table.
df_all.head()

## Check that ids are distinct

In [None]:
# Object ids that appear in both the "best" and the "worst" sample.
set(df_best.index).intersection(df_worst.index)

In [None]:
# Object ids that appear in both the "best" and the "random" sample.
set(df_best.index).intersection(df_random.index)

In [None]:
# Object ids that appear in both the "worst" and the "random" sample.
set(df_worst.index).intersection(df_random.index)

# What fraction of objects have bad shapes?

In [None]:
# Row-wise maximum over the five per-band shape flags.  Assuming the flags
# are boolean / 0-1 valued (TODO confirm dtype), this marks an object as soon
# as any single band is flagged.
bad_shapes = df_all.loc[
    :,
    [
        "gshape_sdss_flags", "rshape_sdss_flags", "ishape_sdss_flags",
        "zshape_sdss_flags", "yshape_sdss_flags",
    ],
].max(axis="columns")

In [None]:
# Mean of the row-wise maximum shape flag in the "best" sample; this reads as
# the fraction of flagged objects if the flags are boolean / 0-1 valued
# (assumed -- confirm dtype).
df_best.loc[
    :,
    [
        "gshape_sdss_flags", "rshape_sdss_flags", "ishape_sdss_flags",
        "zshape_sdss_flags", "yshape_sdss_flags",
    ],
].max(axis="columns").mean()

In [None]:
# Mean of the row-wise maximum shape flag in the "worst" sample; this reads
# as the fraction of flagged objects if the flags are boolean / 0-1 valued
# (assumed -- confirm dtype).
df_worst.loc[
    :,
    [
        "gshape_sdss_flags", "rshape_sdss_flags", "ishape_sdss_flags",
        "zshape_sdss_flags", "yshape_sdss_flags",
    ],
].max(axis="columns").mean()

In [None]:
# Mean of the row-wise maximum shape flag in the "random" sample; this reads
# as the fraction of flagged objects if the flags are boolean / 0-1 valued
# (assumed -- confirm dtype).
df_random.loc[
    :,
    [
        "gshape_sdss_flags", "rshape_sdss_flags", "ishape_sdss_flags",
        "zshape_sdss_flags", "yshape_sdss_flags",
    ],
].max(axis="columns").mean()

# Find largest galaxy (to set size)
**remember:** shape is in units of `arcsec`$^2$

In [None]:
# For every object not marked in `bad_shapes`, take the largest of its
# second-moment components (11, 12, 22) across all five bands.
df_all.loc[
    ~bad_shapes,
    [
        "gshape_sdss_11", "gshape_sdss_12", "gshape_sdss_22",
        "rshape_sdss_11", "rshape_sdss_12", "rshape_sdss_22",
        "ishape_sdss_11", "ishape_sdss_12", "ishape_sdss_22",
        "zshape_sdss_11", "zshape_sdss_12", "zshape_sdss_22",
        "yshape_sdss_11", "yshape_sdss_12", "yshape_sdss_22",
    ],
].max(axis="columns")

## Get information on the largest galaxy
[Note: this is only for galaxies with valid shapes in *every* band]

In [None]:
# Among objects not marked in `bad_shapes`, find the one whose largest
# second-moment component (over all bands) is the greatest.
#
# `idxmax` returns the index *label* of that row, which is what the
# label-based lookup `df_all.loc[id_largest]` needs.  `Series.argmax` returns
# an integer *position* in current pandas, so it would select the wrong row
# or raise a KeyError.
id_largest = df_all[~bad_shapes][[
    "gshape_sdss_11", "gshape_sdss_12", "gshape_sdss_22",
    "rshape_sdss_11", "rshape_sdss_12", "rshape_sdss_22",
    "ishape_sdss_11", "ishape_sdss_12", "ishape_sdss_22",
    "zshape_sdss_11", "zshape_sdss_12", "zshape_sdss_22",
    "yshape_sdss_11", "yshape_sdss_12", "yshape_sdss_22",
]].max(axis=1).idxmax()

id_largest

In [None]:
# Show every stored column for the row whose index label is `id_largest`.
df_all.loc[id_largest]