# What do I want?

Previously in `HSC_COSMOS_filtering.ipynb` I tested out some basic classifiers to get a smaller sample set, while still keeping completeness high.  I tested two basic classifiers: a RandomForest classifier and a Logistic Regression classifier.

For my training data, I started by getting objects and labels from COSMOS. For input features, I then matched those COSMOS galaxies to their nearest HSC counterpart. I then used HSC i-band magnitude, along with HSC g-r, r-i, i-z, z-y colors.

Choosing some arbitrary thresholds, I got similar results for the Random Forest and the Logistic Regression classifiers. In this notebook I'll look at the full ROC curves for both classifiers, in hopes of better understanding my results.

In [1]:
import numpy as np

import datasets
from matching import Matches

In [2]:
COSMOS = datasets.COSMOS("COSMOS_reference.sqlite")

In [3]:
HSC = datasets.HSC("HSC_COSMOS_median_forced.sqlite3")

In [4]:
COSMOS.df.shape

(518331, 13)

In [5]:
HSC.df.shape

(928534, 20)

In [6]:
matches_df = Matches.load_from_filename("data/matches.sqlite3")

In [7]:
# Keep only the rows flagged as matches; copy so the new columns never touch matches_df.
combined = matches_df[matches_df.match].copy()

# One row lookup into the COSMOS table; column assignment aligns on the shared index.
cosmos_rows = COSMOS.df.loc[combined.index]
for new_name, cosmos_name in [("ra", "ra"),
                              ("dec", "dec"),
                              ("photo_z", "photo_z"),
                              ("log_mass", "mass_med")]:
    combined[new_name] = cosmos_rows[cosmos_name]

# flux, flux error and flux flag for each of the g, r, i, z, y bands (in that order)
photometry_cols = [
    band + "cmodel_flux" + suffix
    for band in "grizy"
    for suffix in ("", "_err", "_flags")
]

# One row lookup into the HSC table, keyed by the matched ids. `.values` drops the
# HSC index so the data is assigned positionally, row for row.
hsc_rows = HSC.df.loc[combined.catalog_2_ids]
for col in photometry_cols:
    combined[col] = hsc_rows[col].values

In [8]:
low_z    = (combined.photo_z  < .15)
low_mass = (combined.log_mass < 9)

# Create classification labels

Class A: matched **and** (low redshift + low mass)

Class B: matched **but not** (low redshift + low mass)

In [9]:
# Class A: low redshift AND low mass. Class B: every other matched object.
class_a = low_z & low_mass
class_b = ~class_a

In [10]:
combined["low_z_low_mass"] = class_a
combined.head()

Unnamed: 0_level_0,catalog_2_ids,sep,match,error,ra,dec,photo_z,log_mass,gcmodel_flux,gcmodel_flux_err,...,icmodel_flux,icmodel_flux_err,icmodel_flux_flags,zcmodel_flux,zcmodel_flux_err,zcmodel_flux_flags,ycmodel_flux,ycmodel_flux_err,ycmodel_flux_flags,low_z_low_mass
catalog_1_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
218604,43158996781122114,0.114389,True,False,149.749393,1.618068,0.3797,11.0761,7.760156e-29,4.177573e-31,...,6.303301e-28,1.4154499999999999e-30,False,8.553727e-28,1.656983e-30,False,1.004911e-27,3.037335e-30,False,False
219656,43158447025298860,0.471546,True,False,150.388349,1.614538,2.3343,8.99275,5.800470000000001e-31,2.102453e-31,...,2.3484539999999998e-30,5.105159000000001e-31,False,1.967417e-30,4.0421610000000005e-31,False,4.328439e-31,1.392682e-30,False,False
219741,43158447025298862,0.202378,True,False,150.402935,1.614631,2.1991,9.71373,8.775548000000001e-31,2.110055e-31,...,1.666095e-30,3.575105e-31,False,1.105359e-30,4.225365e-31,False,2.3476309999999998e-30,1.4324569999999999e-30,False,False
219743,43158584464246387,0.207967,True,False,150.295083,1.614662,2.4407,9.77811,1.312252e-30,3.044583e-31,...,3.989077e-30,5.39698e-31,False,4.8179319999999995e-30,7.052368e-31,False,3.960493e-30,2.091224e-30,False,False
219744,43158584464253383,0.295316,True,False,150.239919,1.614675,0.2079,7.04224,8.597155e-31,1.9996840000000002e-31,...,1.373718e-30,3.604136e-31,False,2.152825e-30,5.385489e-31,False,6.61298e-31,1.594763e-30,False,False


## Turn fluxes into rough colors
Yes, I know these aren't exactly the right colors since I'm not including zero-points, but that shouldn't affect the results.

(When I get a chance, I'll re-download the dataset so that it includes magnitudes not just fluxes)

In [11]:
# A color is a magnitude difference and m = -2.5*log10(flux) + zero-point, so
# (band1 - band2) = -2.5*log10(flux1 / flux2) when both bands share a zero-point
# (assumed here -- TODO confirm the flux units are the same in every band).
# Non-positive flux ratios produce NaN/inf (with a RuntimeWarning); those rows
# have to be filtered out before the colors are used.
for blue, red in zip("griz", "rizy"):
    combined[blue + "_minus_" + red] = -2.5 * np.log10(
        combined[blue + "cmodel_flux"] / combined[red + "cmodel_flux"]
    )

  if __name__ == '__main__':
  if __name__ == '__main__':
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  app.launch_new_instance()
  app.launch_new_instance()


For now, filter out objects with bad photometry (non-finite colors or flagged fluxes). Later I could consider passing them into the classifier with an imputed/sentinel value instead.

In [12]:
color_cols = ["g_minus_r", "r_minus_i", "i_minus_z", "z_minus_y"]
flag_cols  = [band + "cmodel_flux_flags" for band in "grizy"]

# Keep a row only if every color is finite, the i-band flux is finite and
# strictly positive, and none of the per-band flux flags is set.
# The positivity test matters: two negative fluxes still give a finite "color",
# but log10 of the i-band flux would then be NaN. With i > 0 and all colors
# finite, every other band's flux is necessarily positive as well.
mask = (
    np.isfinite(combined[color_cols]).all(axis=1)
    & np.isfinite(combined["icmodel_flux"])
    & (combined["icmodel_flux"] > 0)
    & ~combined[flag_cols].any(axis=1)
)

# Copy so that adding a column below writes to an independent frame
# rather than to a slice of the unfiltered one.
combined = combined[mask].copy()

combined["log_icmodel_flux"] = np.log10(combined["icmodel_flux"])

In [13]:
combined.shape

(332712, 29)

In [14]:
# Classifier inputs: the four colors plus the log of the i-band flux.
feature_names = ["g_minus_r", "r_minus_i", "i_minus_z", "z_minus_y",
                 "log_icmodel_flux"]
features = combined[feature_names]

# Kept as a one-column DataFrame (not a Series); later cells flatten its values.
target = combined[["low_z_low_mass"]]

In [15]:
target.mean()

low_z_low_mass    0.022124
dtype: float64

# Build Classifiers

## Partition training and testing sets

In [16]:
testing_fraction = .1

# Fixed seed so the train/test split (and everything derived from it) is the
# same on every Restart & Run All. The seed value itself is arbitrary.
rng = np.random.RandomState(0)

all_indices = target.index.values

# Hold out a random `testing_fraction` of the rows, sampled without replacement.
test_set_indices = rng.choice(all_indices,
                              replace=False,
                              size=int(testing_fraction * len(all_indices)),
                              )

# Everything that was not held out; setdiff1d returns the ids sorted and unique.
training_set_indices = np.setdiff1d(all_indices, test_set_indices)

features_train = features.loc[training_set_indices]
features_test  = features.loc[test_set_indices]

target_train   = target.loc[training_set_indices]
target_test    = target.loc[test_set_indices]

# Flat boolean arrays of the true test-set classes, in test_set_indices order.
true_a =  target_test.values.flatten()
true_b = ~true_a

In [17]:
def get_classification_characteristics(target_prob, threshold_prob, verbose=False,
                                       true_labels=None):
    """Score one probability threshold against the known class-A labels.

    An object is predicted to be class A when its probability is strictly
    greater than `threshold_prob`.

    Parameters
    ----------
    target_prob : array-like of float
        Predicted class-A probability for each object.
    threshold_prob : float
        Probability cut applied to `target_prob`.
    verbose : bool, optional
        If True, print the computed metrics.
    true_labels : array-like of bool, optional
        True class-A membership, in the same order as `target_prob`.
        Defaults to the notebook-level `true_a` array (the test-set labels).

    Returns
    -------
    dict
        Keys: "completeness", "purity", "sample_size_reduction",
        "threshold_prob", "true_positive_rate", "false_positive_rate".
        "completeness" and "true_positive_rate" are the same quantity.
        If nothing passes the threshold, "purity" is nan and
        "sample_size_reduction" is inf.
    """
    if true_labels is None:
        # Backward-compatible default: the test-set labels defined in the notebook.
        true_labels = true_a

    actual_a = np.asarray(true_labels, dtype=bool)
    actual_b = ~actual_a

    prediction_a = np.asarray(target_prob) > threshold_prob

    true_positives  = np.sum(actual_a & prediction_a)
    false_positives = np.sum(actual_b & prediction_a)
    n_selected      = prediction_a.sum()

    # Zero denominators (nothing selected, or a class absent from the labels)
    # intentionally yield nan/inf; silence the RuntimeWarning numpy would emit.
    with np.errstate(divide="ignore", invalid="ignore"):
        true_positive_rate  = true_positives / actual_a.sum()
        false_positive_rate = false_positives / actual_b.sum()
        purity = true_positives / n_selected
        # factor by which the selection shrinks the sample: total / selected
        sample_size_reduction = prediction_a.size / n_selected

    # completeness is, by definition, the true positive rate
    completeness = true_positive_rate

    if verbose:
        print("completeness:          ", completeness)
        print("purity:                ", purity)
        print("sample_size_reduction: ", sample_size_reduction)
        print("true  positive rate:   ", true_positive_rate)
        print("false positive rate:   ", false_positive_rate)

    return {
        "completeness": completeness,
        "purity": purity,
        "sample_size_reduction": sample_size_reduction,
        "threshold_prob": threshold_prob,
        "true_positive_rate": true_positive_rate,
        "false_positive_rate": false_positive_rate,
    }

## Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Fit a default random forest on the flattened training labels.
labels_train_RF = target_train.values.flatten()
classifier_RF = RandomForestClassifier().fit(features_train, labels_train_RF)

# Column 1 of predict_proba is taken as the class-A probability
# (presumably the True class, per classifier_RF.classes_ ordering -- verify).
class_probabilities_RF = classifier_RF.predict_proba(features_test)
target_prob_RF = class_probabilities_RF[:, 1]

for label, value in (("min prob: ", target_prob_RF.min()),
                     ("max prob: ", target_prob_RF.max())):
    print(label, value)


min prob:  0.0
max prob:  1.0


In [19]:
get_classification_characteristics(target_prob_RF, .01, verbose=True)

completeness:           0.444134078212
purity:                 0.0570301291248
sample_size_reduction:  5.96682209469
true  positive rate:    0.444134078212
false positive rate:    0.161511288589


{'completeness': 0.44413407821229051,
 'false_positive_rate': 0.16151128858854247,
 'purity': 0.05703012912482066,
 'sample_size_reduction': 5.9668220946915351,
 'threshold_prob': 0.01,
 'true_positive_rate': 0.44413407821229051}

In [20]:
threshold_probs = np.linspace(0, 1, num=100)[1:-1]
results_RF = [get_classification_characteristics(target_prob_RF, threshold_prob)
              for threshold_prob in threshold_probs]

In [21]:
completenesses_RF         = [result["completeness"] for result in results_RF]
purities_RF               = [result["purity"] for result in results_RF]
sample_size_reductions_RF = [result["sample_size_reduction"] for result in results_RF]
true_positive_rates_RF    = [result["true_positive_rate"] for result in results_RF]
false_positive_rates_RF   = [result["false_positive_rate"] for result in results_RF]

## Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with class weights balanced; labels are cast from bool to int.
labels_train_LR = target_train.values.flatten().astype(int)
classifier_LR = LogisticRegression(class_weight="balanced").fit(features_train, labels_train_LR)

# Column 1 of predict_proba is taken as the class-A probability
# (presumably label 1, per classifier_LR.classes_ ordering -- verify).
class_probabilities_LR = classifier_LR.predict_proba(features_test)
target_prob_LR = class_probabilities_LR[:, 1]

for label, value in (("min prob: ", target_prob_LR.min()),
                     ("max prob: ", target_prob_LR.max())):
    print(label, value)

min prob:  4.54654787459e-05
max prob:  0.999505817399


In [23]:
get_classification_characteristics(target_prob_LR, .01, verbose=True)

completeness:           0.998603351955
purity:                 0.0215063466282
sample_size_reduction:  1.00075197016
true  positive rate:    0.998603351955
false positive rate:    0.999262786054


{'completeness': 0.99860335195530725,
 'false_positive_rate': 0.99926278605436958,
 'purity': 0.021506346628165793,
 'sample_size_reduction': 1.0007519701618239,
 'threshold_prob': 0.01,
 'true_positive_rate': 0.99860335195530725}

In [24]:
# Use the same 98-point grid as the random-forest scan. np.linspace defaults to
# num=50, which would silently replace `threshold_probs` with a 48-point grid
# that no longer lines up with the random-forest result lists.
# The end points 0 and 1 are dropped.
threshold_probs = np.linspace(0, 1, num=100)[1:-1]
results_LR = [get_classification_characteristics(target_prob_LR, threshold_prob)
              for threshold_prob in threshold_probs]

In [25]:
completenesses_LR         = [result["completeness"] for result in results_LR]
purities_LR               = [result["purity"] for result in results_LR]
sample_size_reductions_LR = [result["sample_size_reduction"] for result in results_LR]
true_positive_rates_LR    = [result["true_positive_rate"] for result in results_LR]
false_positive_rates_LR   = [result["false_positive_rate"] for result in results_LR]


# Get specific galaxies

In [26]:
# Positions (into target_prob_RF) of the 100 highest and 100 lowest random-forest
# probabilities. argpartition only guarantees which elements land on each side of
# the pivot; the order inside each group of 100 is unspecified, and ties in the
# probabilities are broken arbitrarily.
best_dwarfs_args  = np.argpartition(target_prob_RF, target_prob_RF.size-100)[-100:]
worst_dwarfs_args = np.argpartition(target_prob_RF, 100)[:100]

# target_test has the same row order as target_prob_RF (both follow
# test_set_indices), so positional .iloc lookups recover the index labels,
# i.e. the ids that are also used to index COSMOS.df.
best_dwarfs_ids_cosmos  = target_test.iloc[best_dwarfs_args].index
worst_dwarfs_ids_cosmos = target_test.iloc[worst_dwarfs_args].index

# Translate to the matched catalog_2 ids (the ids used to index HSC.df).
best_dwarf_ids_hsc = combined.loc[best_dwarfs_ids_cosmos].catalog_2_ids
worst_dwarf_ids_hsc = combined.loc[worst_dwarfs_ids_cosmos].catalog_2_ids

In [27]:
random_ids_cosmos = np.random.choice(training_set_indices,
                              replace=False,
                              size=100,
                             )

random_ids_hsc = combined.loc[random_ids_cosmos].catalog_2_ids

## Check: do any HSC ids overlap?
By design the COSMOS ids shouldn't overlap, but the COSMOS id -> HSC id mapping isn't necessarily unique.

In [28]:
set(best_dwarf_ids_hsc.values) & set(worst_dwarf_ids_hsc.values)

set()

In [29]:
set(best_dwarf_ids_hsc.values) & set(random_ids_hsc.values)

set()

In [30]:
set(worst_dwarf_ids_hsc.values) & set(random_ids_hsc.values)

set()

### Do they give reasonable dwarf fractions?

In [31]:
combined.loc[best_dwarfs_ids_cosmos].low_z_low_mass.mean()

0.62

In [32]:
combined.loc[worst_dwarfs_ids_cosmos].low_z_low_mass.mean()

0.02

In [33]:
combined.loc[random_ids_cosmos].low_z_low_mass.mean()

0.029999999999999999

### Save the indices to disk

In [34]:
np.savetxt("quick_sample/ids_best.csv",   best_dwarf_ids_hsc.values,  fmt="%d")
np.savetxt("quick_sample/ids_worst.csv",  worst_dwarf_ids_hsc.values, fmt="%d")
np.savetxt("quick_sample/ids_random.csv", random_ids_hsc.values,      fmt="%d")

# What image size do I need?

For this, you'll need to use `data.get_shapes.ipynb` to query + store the object shapes from the remote database.

# Build a list to send to Song

In [35]:
ids_best   = np.loadtxt("quick_sample/ids_best.csv",   dtype=int)
ids_worst  = np.loadtxt("quick_sample/ids_worst.csv",  dtype=int)
ids_random = np.loadtxt("quick_sample/ids_random.csv", dtype=int)

## Check if any HSC ids are duplicated

In [36]:
assert( len(set(best_dwarf_ids_hsc)) == 100 )

In [37]:
assert( len(set(worst_dwarf_ids_hsc)) == 100 )

In [38]:
assert( len(set(random_ids_hsc)) == 100 )

### But are multiple COSMOS galaxies matched to any of those HSC ids?
This will be a problem because there will be two masses / redshifts attached to a given HSC example.

In [39]:
df_best = combined[combined.catalog_2_ids.isin(best_dwarf_ids_hsc)]
df_best.shape

(100, 29)

In [40]:
df_worst = combined[combined.catalog_2_ids.isin(worst_dwarf_ids_hsc)]
df_worst.shape

(100, 29)

In [41]:
df_random = combined[combined.catalog_2_ids.isin(random_ids_hsc)]
df_random.shape

(100, 29)

# Disambiguate matches
Simply select the closest of the COSMOS galaxies, and discard all others

In [42]:
from collections import Counter

def copy_and_filter_duplicates(df_old, verbose=False):
    """Return a copy of `df_old` with at most one row per `catalog_2_ids` value.

    When several rows share the same `catalog_2_ids`, only the row with the
    smallest `sep` is kept; rows whose `catalog_2_ids` is unique are untouched.
    The input frame is not modified and the original row order is preserved.

    Parameters
    ----------
    df_old : pandas.DataFrame
        Must contain the columns `catalog_2_ids` and `sep`. Rows are identified
        by their index labels.
    verbose : bool, optional
        If True, print each ambiguous `catalog_2_ids` value and its row count.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated copy.
    """
    df = df_old.copy()
    counts = Counter(df["catalog_2_ids"])

    worse_match_ids = set()
    for hsc_id, count in counts.items():
        if count <= 1:
            continue
        if verbose:
            print(hsc_id, count)

        ambiguous_matches = df[df["catalog_2_ids"] == hsc_id]
        # idxmin returns the index *label* of the closest match. argmin would
        # return a position, which is not comparable with the index labels below.
        better_match_cosmos_id = ambiguous_matches["sep"].idxmin()
        worse_match_ids.update(set(ambiguous_matches.index) - {better_match_cosmos_id})

    # Drop every losing row in a single pass.
    return df[~df.index.isin(worse_match_ids)]

In [43]:
df_best = copy_and_filter_duplicates(df_best, verbose=True)

In [44]:
df_worst = copy_and_filter_duplicates(df_worst, verbose=True)

In [45]:
df_random = copy_and_filter_duplicates(df_random, verbose=True)

## Create a csv file for Song

In [46]:
# Write the best, worst and random samples, one after another, into a single
# headerless CSV preceded by one comment line naming the columns.
with open("data/galaxies_for_song.csv", mode="w") as f:
    f.write("# object_id,ra,dec,z\n")

    kwargs = dict(
        header=False,   # the column names are already in the comment line above
        index=False,
        float_format="%lf",
    )

    columns = ["catalog_2_ids", "ra", "dec", "photo_z"]

    # All three samples are written: best, then worst, then random.
    for df_sample in (df_best, df_worst, df_random):
        df_sample[columns].to_csv(f, **kwargs)


In [47]:
!head data/galaxies_for_song.csv

# object_id,ra,dec,z
43158447025292832,150.404404,1.624559,0.070100
43158447025314177,150.363379,1.644248,0.181100
43158721903223995,149.995748,1.669957,0.097300
43158859342178879,149.931247,1.729403,0.150000
43158447025318498,150.459376,1.741018,0.085000
43158447025318544,150.432815,1.742775,0.007800
43158996781127776,149.724603,1.747211,0.065200
43158447025319231,150.396595,1.756353,0.220000
43158447025297655,150.415714,1.759544,0.190000


# Look into the duplicates (WIP)

In [48]:
# stop "Run All" from going below here
assert(False)

AssertionError: 

In [None]:
combined[combined.catalog_2_ids == ambiguous_galaxy_hsc_ids[0]]

In [None]:
combined[combined.catalog_2_ids == ambiguous_galaxy_hsc_ids[1]]

In [None]:
COSMOS.df.loc[[801103, 801106]]

# Old material?

In [None]:
# stop "Run All" from going below here
assert(False)

In [None]:
combined_tmp = matches.catalog_2.df[matches.mask_catalog_2].iloc[matches.idx[matches.mask_match]]

In [None]:
ids_best   = np.loadtxt("quick_sample/ids_best.csv",   dtype=int)
ids_worst  = np.loadtxt("quick_sample/ids_worst.csv",  dtype=int)
ids_random = np.loadtxt("quick_sample/ids_random.csv", dtype=int)




In [None]:
for i, photo_z in enumerate(matches.catalog_1.df[matches.mask_catalog_1].photo_z):
    if not matches.mask_match[i]:
        continue
    hsc_id = matches.catalog_2.df[matches.mask_catalog_2].index[matches.idx[i]]
#     if hsc_id not in combined_tmp.index:
#         print("bad id: ", hsc_id)
#         break
    combined_tmp.loc[hsc_id, "photo_z"] = photo_z
#     print(hsc_id, photo_z)
    
#     break
# combined["photo_z"] = 

In [None]:
hsc_id in combined_tmp.index

In [None]:
hsc_id in matches.catalog_2.df[matches.mask_catalog_2].iloc[matches.idx[matches.mask_match]].index

In [None]:
hsc_id in combined_tmp.index

In [None]:
combined_tmp.loc[hsc_id]

In [None]:
id_tmp = 43158996781122114
combined_tmp.loc[id_tmp, "ra"]

In [None]:
combined_tmp.photo_z