# What do I want?

Previously in `HSC_COSMOS_filtering.ipynb` I tested out some basic classifiers to get a smaller sample set, while still keeping completeness high.  I tested two basic classifiers: a RandomForest classifier and a Logistic Regression classifier.

For my training data, I started by getting objects and labels from COSMOS. For input features, I then matched those COSMOS galaxies to their nearest HSC counterpart. As features I used the HSC i-band brightness (in this notebook, the log of the i-band cmodel flux, as a stand-in for the magnitude), along with the HSC g-r, r-i, i-z, and z-y colors.

Choosing some arbitrary thresholds, I got similar results for the Random Forest and the Logistic Regression classifiers. In this notebook I'll look at the full ROC curves for both classifiers, in hopes of better understanding my results.

In [None]:
import numpy as np

import datasets
from matching import Matches

In [None]:
COSMOS = datasets.COSMOS("COSMOS_reference.sqlite")

In [None]:
HSC = datasets.HSC("HSC_COSMOS_median_forced.sqlite3")

In [None]:
COSMOS.df.shape

In [None]:
HSC.df.shape

In [None]:
# Downsampling factors for each catalog; a factor of 1 keeps every row.
downsample_factor_COSMOS = 1
downsample_factor_HSC = 1

# Keep only the rows whose index label is a multiple of the catalog's factor.
mask_COSMOS_downsample = (COSMOS.df.index % downsample_factor_COSMOS) == 0
mask_HSC_downsample = (HSC.df.index % downsample_factor_HSC) == 0

# Match the (masked) COSMOS catalog against the (masked) HSC catalog.
matches = Matches(
    COSMOS,
    HSC,
    mask_catalog_1=mask_COSMOS_downsample,
    mask_catalog_2=mask_HSC_downsample,
)

In [None]:
# COSMOS rows that pass the catalog-1 mask handed to the matcher.
cosmos_selected = matches.catalog_1.df[matches.mask_catalog_1]

# Per-object cuts that define the target population.
low_z = cosmos_selected.photo_z < .15
low_mass = cosmos_selected.mass_med < 9

# Create classification labels

Class A: matched **and** (low redshift + low mass)

Class B: matched **but not** (low redshift + low mass)

In [None]:
# An object belongs to the target population only if it passes both cuts.
low_z_and_low_mass = low_z & low_mass

# Class A: matched and in the target population; class B: matched but not.
class_a = matches.mask_match & low_z_and_low_mass
class_b = matches.mask_match & ~low_z_and_low_mass

In [None]:
idxs = matches.idx[class_a.values]

In [None]:
class_a.mean()

In [None]:
class_b.mean()

In [None]:
class_a.sum() / (class_a.sum() + class_b.sum())

In [None]:
class_b.sum() / (class_a.sum() + class_b.sum())

In [None]:
hsc_df = matches.catalog_2.df

# Every HSC row starts out unflagged.
hsc_df["low_z_low_mass"] = False

# only change the flag if the object has been matched to, and if it is in class_a
flagged_labels = hsc_df.index[matches.mask_catalog_2][idxs]
hsc_df.loc[flagged_labels, ["low_z_low_mass"]] = True

In [None]:
# This is different from `class_a.mean()`:
# `class_a` is defined per COSMOS galaxy, whereas this flag is defined per
# HSC object, and the two catalogs don't have a 1-to-1 map, even where they match
# (we expect ~2 HSC galaxies to map to a given COSMOS galaxy, on average).

# Why this is lower, rather than higher, I don't know.
# NOTE(review): the flag was initialised to False for *every* HSC row and only
# set to True on the rows selected through `idxs`, so this mean is taken over
# all HSC objects (matched or not), while `class_a.mean()` is taken over COSMOS
# objects. The different denominators may explain the difference -- confirm.
matches.catalog_2.df["low_z_low_mass"].mean()

In [None]:
matches.catalog_2.df.low_z_low_mass.describe()

# Create a dataframe of just the matched galaxies

In [None]:
# HSC rows that COSMOS objects were matched to: first apply the catalog-2 mask,
# then pick rows positionally with the match indices of the matched objects.
# The explicit copy makes `matched_hsc` an independent frame, so the columns
# added to it later do not trigger pandas' SettingWithCopyWarning.
matched_hsc = (
    matches.catalog_2.df[matches.mask_catalog_2]
    .iloc[matches.idx[matches.mask_match]]
    .copy()
)

In [None]:
matched_hsc.describe()

In [None]:
matched_hsc.head()

## Turn fluxes into rough colors
Yes, I know these aren't exactly the right colors since I'm not including zero-points, but that shouldn't affect the results.

(When I get a chance, I'll re-download the dataset so that it includes magnitudes not just fluxes)

In [None]:
# Colors from flux ratios: m1 - m2 = -2.5 * log10(f1 / f2).
# (The factor is -2.5, not -0.4; 0.4 is the factor for the inverse relation,
# f1 / f2 = 10**(-0.4 * (m1 - m2)).)
# Non-positive flux ratios give NaN/inf here; they are filtered out afterwards.
matched_hsc["g_minus_r"] = -2.5 * np.log10(matched_hsc["gcmodel_flux"] / matched_hsc["rcmodel_flux"])
matched_hsc["r_minus_i"] = -2.5 * np.log10(matched_hsc["rcmodel_flux"] / matched_hsc["icmodel_flux"])
matched_hsc["i_minus_z"] = -2.5 * np.log10(matched_hsc["icmodel_flux"] / matched_hsc["zcmodel_flux"])
matched_hsc["z_minus_y"] = -2.5 * np.log10(matched_hsc["zcmodel_flux"] / matched_hsc["ycmodel_flux"])

For now, filter out objects with bad photometry. Later I could consider keeping them and passing the bad measurements into the classifier as imputed/sentinel values.

In [None]:
# Keep only objects with usable photometry in all five bands:
#   * every color is finite (rules out zero flux and flux ratios <= 0),
#   * the i-band flux is finite AND strictly positive -- finite colors alone
#     still allow all five fluxes to be negative, which would make
#     log10(icmodel_flux) NaN further down,
#   * none of the per-band cmodel flux flags is set.
# NOTE(review): `~flag` assumes the flag columns have boolean dtype -- confirm.
mask = (
    np.isfinite(matched_hsc["g_minus_r"]) & np.isfinite(matched_hsc["r_minus_i"])
    & np.isfinite(matched_hsc["i_minus_z"]) & np.isfinite(matched_hsc["z_minus_y"])
    & np.isfinite(matched_hsc["icmodel_flux"])
    & (matched_hsc["icmodel_flux"] > 0)
    & (~matched_hsc.gcmodel_flux_flags) & (~matched_hsc.rcmodel_flux_flags)
    & (~matched_hsc.icmodel_flux_flags) & (~matched_hsc.zcmodel_flux_flags)
    & (~matched_hsc.ycmodel_flux_flags)
)

# Copy so that adding a column below writes to an independent frame
# (avoids pandas' SettingWithCopyWarning).
matched_hsc = matched_hsc[mask].copy()

matched_hsc["log_icmodel_flux"] = np.log10(matched_hsc["icmodel_flux"])

In [None]:
matched_hsc.shape

In [None]:
# Classifier inputs: four colors plus the log of the i-band flux.
feature_columns = [
    "g_minus_r",
    "r_minus_i",
    "i_minus_z",
    "z_minus_y",
    "log_icmodel_flux",
]
features = matched_hsc.loc[:, feature_columns]

# Classifier output: the boolean class-A flag, kept as a one-column frame.
target = matched_hsc.loc[:, ["low_z_low_mass"]]

In [None]:
target.mean()

# Build Classifiers

## Partition training and testing sets

In [None]:
testing_fraction = .1

# Draw the held-out row labels, without replacement, from the labels of `target`.
test_set_indices = np.random.choice(
    target.index.values,
    size=int(testing_fraction * target.size),
    replace=False,
)

# Every label that was not drawn for testing is used for training.
training_set_indices = np.array(list(set(target.index.values) - set(test_set_indices)))

features_train, features_test = features.loc[training_set_indices], features.loc[test_set_indices]
target_train, target_test = target.loc[training_set_indices], target.loc[test_set_indices]

# Flat truth arrays for the test set: class A and its complement.
true_a = target_test.values.flatten()
true_b = ~true_a

In [None]:
def get_classification_characteristics(target_prob, threshold_prob, verbose=False,
                                       true_labels=None):
    """Summarise how well thresholded probabilities recover class A.

    An object is predicted to be class A when its probability is strictly
    greater than `threshold_prob`.

    Parameters
    ----------
    target_prob : array-like of float
        Predicted class-A probability for each object.
    threshold_prob : float
        Probability threshold above which an object is predicted as class A.
    verbose : bool, optional
        If True, print the computed metrics.
    true_labels : array-like of bool, optional
        True class-A membership for each object, in the same order as
        `target_prob`. If omitted, the notebook-level `true_a` / `true_b`
        arrays are used.

    Returns
    -------
    dict
        Keys: "completeness", "purity", "sample_size_reduction",
        "threshold_prob", "true_positive_rate", "false_positive_rate".
        Ratios with a zero denominator are returned as NaN (0/0) or
        inf (n/0) without emitting a RuntimeWarning.
    """
    if true_labels is None:
        # Fall back to the test-set labels defined at notebook level.
        actual_a, actual_b = true_a, true_b
    else:
        actual_a = np.asarray(true_labels, dtype=bool).ravel()
        actual_b = ~actual_a

    prediction_a = np.asarray(target_prob) > threshold_prob

    true_positives = np.sum(actual_a & prediction_a)
    false_positives = np.sum(actual_b & prediction_a)
    n_predicted_a = prediction_a.sum()

    # High thresholds can select no objects at all; keep the resulting
    # NaN / inf values but do not spam RuntimeWarnings while scanning.
    with np.errstate(divide="ignore", invalid="ignore"):
        true_positive_rate = true_positives / actual_a.sum()
        false_positive_rate = false_positives / actual_b.sum()
        purity = true_positives / n_predicted_a
        sample_size_reduction = prediction_a.size / n_predicted_a

    # Completeness is the true positive rate under another name.
    completeness = true_positive_rate

    if verbose:
        print("completeness:          ", completeness)
        print("purity:                ", purity)
        print("sample_size_reduction: ", sample_size_reduction)
        print("true  positive rate:   ", true_positive_rate)
        print("false positive rate:   ", false_positive_rate)

    return {
        "completeness": completeness,
        "purity": purity,
        "sample_size_reduction": sample_size_reduction,
        "threshold_prob": threshold_prob,
        "true_positive_rate": true_positive_rate,
        "false_positive_rate": false_positive_rate,
           }

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a default-configured random forest on the training set.
classifier_RF = RandomForestClassifier().fit(features_train,
                                             target_train.values.flatten())

# Keep the second probability column (index 1) for every test object,
# and report the range of probabilities it spans.
target_prob_RF = classifier_RF.predict_proba(features_test)[:, 1]
print("min prob: ", target_prob_RF.min())
print("max prob: ", target_prob_RF.max())


In [None]:
get_classification_characteristics(target_prob_RF, .01, verbose=True)

In [None]:
# Scan the interior points of a 100-point grid on [0, 1] (endpoints dropped).
threshold_probs = np.linspace(0, 1, num=100)[1:-1]
results_RF = [
    get_classification_characteristics(target_prob_RF, prob_cut)
    for prob_cut in threshold_probs
]

In [None]:
# Transpose the list of per-threshold result dicts into one list per metric.
(completenesses_RF,
 purities_RF,
 sample_size_reductions_RF,
 true_positive_rates_RF,
 false_positive_rates_RF) = (
    [result[key] for result in results_RF]
    for key in ("completeness",
                "purity",
                "sample_size_reduction",
                "true_positive_rate",
                "false_positive_rate")
)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with class weights balanced; labels are passed as integers.
labels_train_LR = target_train.values.flatten().astype(int)
classifier_LR = LogisticRegression(class_weight="balanced").fit(features_train,
                                                                labels_train_LR)

# Keep the second probability column (index 1) for every test object,
# and report the range of probabilities it spans.
target_prob_LR = classifier_LR.predict_proba(features_test)[:, 1]
print("min prob: ", target_prob_LR.min())
print("max prob: ", target_prob_LR.max())

In [None]:
get_classification_characteristics(target_prob_LR, .01, verbose=True)

In [None]:
# Use the same threshold grid as the Random Forest scan (interior points of a
# 100-point grid on [0, 1]). `np.linspace(0, 1)` defaults to 50 points, which
# would sample the two classifiers differently and leave `threshold_probs`
# with a different length than `results_RF`.
threshold_probs = np.linspace(0, 1, num=100)[1:-1]
results_LR = [get_classification_characteristics(target_prob_LR, threshold_prob)
              for threshold_prob in threshold_probs]

In [None]:
# Transpose the list of per-threshold result dicts into one list per metric.
(completenesses_LR,
 purities_LR,
 sample_size_reductions_LR,
 true_positive_rates_LR,
 false_positive_rates_LR) = (
    [result[key] for result in results_LR]
    for key in ("completeness",
                "purity",
                "sample_size_reduction",
                "true_positive_rate",
                "false_positive_rate")
)


# Get specific galaxies

In [None]:
# Positions (within the test set) of the 100 highest and 100 lowest
# Random Forest probabilities; argpartition avoids a full sort.
best_dwarfs_args = np.argpartition(target_prob_RF, target_prob_RF.size - 100)[-100:]
worst_dwarfs_args = np.argpartition(target_prob_RF, 100)[:100]

# Translate test-set positions back into row labels.
best_dwarfs_ids = target_test.index[best_dwarfs_args]
worst_dwarfs_ids = target_test.index[worst_dwarfs_args]

In [None]:
# Baseline sample: 100 row labels drawn uniformly, without replacement.
# NOTE(review): these are drawn from the *training* set, whereas the best/worst
# samples above are taken from the *test* set -- confirm that this is intended.
random_ids = np.random.choice(training_set_indices,
                              replace=False,
                              size=100,
                             )

### Do they give reasonable dwarf fractions?

In [None]:
matched_hsc.loc[best_dwarfs_ids].low_z_low_mass.mean()

In [None]:
matched_hsc.loc[worst_dwarfs_ids].low_z_low_mass.mean()

In [None]:
matched_hsc.loc[random_ids].low_z_low_mass.mean()

### Save the indices to disk

In [None]:
import os

# np.savetxt does not create missing directories, so make sure the output
# directory exists before writing into it.
os.makedirs("quick_sample", exist_ok=True)

# One integer id per line.
np.savetxt("quick_sample/ids_best.csv",   best_dwarfs_ids,  fmt="%d")
np.savetxt("quick_sample/ids_worst.csv",  worst_dwarfs_ids, fmt="%d")
np.savetxt("quick_sample/ids_random.csv", random_ids,       fmt="%d")

# What image size do I need?

For this, you'll need to use `data.get_shapes.ipynb` to query + store the object shapes from the remote database.