In [2]:
import os

import pandas as pd
from PIL import Image
from tqdm.auto import tqdm

# GZ2 data filtering

In [2]:
# Load the Galaxy Zoo 2 table and reduce it to an (Id, Class) frame.
gz2_data = pd.read_csv("../data/gz2_hart16.csv")

# Drop every row whose gz2_class is exactly 'A'.
gz2_no_artifacts = gz2_data[gz2_data['gz2_class'] != 'A']

# Boolean masks for the two task-01 flags, computed once and reused below.
features_or_disk = gz2_no_artifacts['t01_smooth_or_features_a02_features_or_disk_flag'] == 1
smooth = gz2_no_artifacts['t01_smooth_or_features_a01_smooth_flag'] == 1

# Rows where at least one of the two flags is set.
gz2_clean = gz2_no_artifacts[features_or_disk | smooth]

# Rows where exactly one flag is set (XOR) -- i.e. "at least one" minus "both".
gz2_clean_unambiguous = (
    gz2_no_artifacts[features_or_disk ^ smooth]
    .reset_index(drop=True)
    .rename(columns={"dr7objid": "Id"})
)

# Vectorised label: a gz2_class starting with "S" is "Spiral", anything else
# is "Elliptical". This replaces a row-wise DataFrame.apply over the whole table.
gz2_clean_unambiguous["Class"] = (
    gz2_clean_unambiguous["gz2_class"].str[0].eq("S").map({True: "Spiral", False: "Elliptical"})
)

gz2_transformed = gz2_clean_unambiguous[["Id", "Class"]]
gz2_transformed

Unnamed: 0,Id,Class
0,587732591714893851,Spiral
1,588009368545984617,Spiral
2,587741723357282317,Spiral
3,587729751132209314,Elliptical
4,587735742617616406,Spiral
...,...,...
154882,588297864173256986,Elliptical
154883,587741602566897791,Elliptical
154884,587734841742459081,Elliptical
154885,587741531712782408,Elliptical


# GZ1 data filtering

In [3]:
# from astropy.coordinates import SkyCoord
# from astropy import units as unit

# def map_ra_dec(ra, dec):
#     coords = SkyCoord(ra, dec, unit=(unit.deg, unit.deg)).transform_to("fk5")
#     print(coords)
#     return (coords.ra, coords.dec)


# Load the Galaxy Zoo 1 table and reduce it to an (Id, Class) frame.
gz1_data = pd.read_csv("../data/GalaxyZoo1_DR_table2.csv")

# Keep only rows flagged as SPIRAL or ELLIPTICAL.
flagged_spiral = gz1_data["SPIRAL"] == 1
flagged_elliptical = gz1_data["ELLIPTICAL"] == 1
gz1_clean = gz1_data[flagged_spiral | flagged_elliptical].reset_index(drop=True)

# Vectorised label: SPIRAL == 1 wins, every other remaining row is "Elliptical".
# This replaces a row-wise DataFrame.apply over the whole table.
gz1_clean["Class"] = (gz1_clean["SPIRAL"] == 1).map({True: "Spiral", False: "Elliptical"})
# gz1_clean["ra"], gz1_clean["dec"] = gz1_clean.apply(lambda r: map_ra_dec(r["RA"], r["DEC"]), axis=1)
gz1_clean = gz1_clean.rename(columns={"OBJID": "Id"})
gz1_transformed = gz1_clean[["Id", "Class"]]
gz1_transformed

Unnamed: 0,Id,Class
0,587727227300741210,Spiral
1,587730774962536596,Elliptical
2,587727223024189605,Spiral
3,587727221950382424,Spiral
4,587727178449485858,Spiral
...,...,...
252410,587727225153257594,Spiral
252411,587727227837612104,Spiral
252412,587730774962536585,Spiral
252413,587727226763870322,Spiral


# Merge datasets, remove galaxies with non-matching predictions

In [4]:
# Outer join on Id so galaxies present in only one of the two tables are kept;
# the per-table labels end up in Class_gz1 / Class_gz2 (NaN where absent).
combined_dataset = pd.merge(gz1_transformed, gz2_transformed, on="Id", how="outer", suffixes=("_gz1", "_gz2"))

# Sanity check: the merged row count should equal |GZ1| + |GZ2| - |shared Ids|.
shared_id_count = gz2_transformed["Id"].isin(gz1_transformed["Id"]).sum()
print("GZ1 + GZ2 - intersection:", len(gz1_transformed) + len(gz2_transformed) - shared_id_count)
print("Merged dataset:", len(combined_dataset))

# Galaxies labelled by both tables, and the subset where the labels disagree.
both_predictions_present = combined_dataset[
    combined_dataset["Class_gz1"].notna() & combined_dataset["Class_gz2"].notna()
]
predictions_not_matching = both_predictions_present[
    both_predictions_present["Class_gz1"] != both_predictions_present["Class_gz2"]
]
print("Predictions that don't match:", len(predictions_not_matching))

# Remove the disagreeing galaxies entirely.
final_combined_dataset = combined_dataset.drop(predictions_not_matching.index).reset_index(drop=True)

class_gz1 = final_combined_dataset["Class_gz1"]
class_gz2 = final_combined_dataset["Class_gz2"]

# Final label: the GZ1 label when present, otherwise the GZ2 label.
final_combined_dataset["Class"] = class_gz1.fillna(class_gz2)

# Provenance: "Both" when the two labels are equal (NaN never compares equal),
# "GZ2" when the GZ1 label is missing, "GZ1" otherwise.
final_combined_dataset["InitialDataset"] = (
    pd.Series("GZ1", index=final_combined_dataset.index)
    .mask(class_gz1.isna(), "GZ2")
    .mask(class_gz1 == class_gz2, "Both")
)
final_combined_dataset.to_csv('../data/final_dataset_classes.csv', header=True, index=False)

final_combined_dataset

GZ1 + GZ2 - intersection: 319991
Merged dataset: 319991
Predictions that don't match: 4049


Unnamed: 0,Id,Class_gz1,Class_gz2,Class,InitialDataset
0,587727227300741210,Spiral,,Spiral,GZ1
1,587730774962536596,Elliptical,,Elliptical,GZ1
2,587727223024189605,Spiral,,Spiral,GZ1
3,587727221950382424,Spiral,,Spiral,GZ1
4,587727178449485858,Spiral,,Spiral,GZ1
...,...,...,...,...,...
315937,587734621636460661,,Elliptical,Elliptical,GZ2
315938,587741532251422867,,Elliptical,Elliptical,GZ2
315939,588297864173256986,,Elliptical,Elliptical,GZ2
315940,587734841742459081,,Elliptical,Elliptical,GZ2


# CasJobs query to select spectroscopic data

```sql
-- For every row of the uploaded CombinedGalaxyDataset table, fetch redshift
-- columns (z, zConf, zErr) from SpecObj and petroMag_r / extinction_r from PhotoObj.
SELECT c.*, s.z, s.zConf, s.zErr, p.petroMag_r, p.extinction_r
-- The result is written to a new table in MyDB.
into mydb.GalaxyPhotometricData
from DR7..SpecObj as s
  -- Link each spectrum to its photometric object, then to the uploaded dataset by Id.
  JOIN DR7..PhotoObj AS p ON s.bestObjID = p.objID
  JOIN MyDb.CombinedGalaxyDataset as c 
  ON c.Id = p.objID
-- Keep spectra classified as GALAXY whose primTarget has the TARGET_GALAXY bit set,
-- restricted to redshift strictly below 0.35.
WHERE (s.specClass = dbo.fSpecClass('GALAXY')) AND (p.primTarget & (dbo.fPrimTarget('TARGET_GALAXY')) > 0)
  AND s.z < 0.35
```

# Prepare dataset splits based on the spectroscopic data

In [8]:
# Inspect the redshift distribution of the unfiltered CasJobs export.
final_data_with_redshift = pd.read_csv("../data/GalaxyPhotometricData_ALL.csv")
redshift = final_data_with_redshift["z"]

# Number of galaxies in each redshift bin: [.., 0.15), [0.15, 0.25), [0.25, 0.35), [0.35, ..)
for in_bin in (
    redshift < 0.15,
    (redshift >= 0.15) & (redshift < 0.25),
    (redshift >= 0.25) & (redshift < 0.35),
    redshift >= 0.35,
):
    print(len(final_data_with_redshift[in_bin]))

# Smallest redshift, then the quartiles and the 1.5 * IQR outlier fences.
print(redshift.min())
Q1, Q3 = redshift.quantile(0.25), redshift.quantile(0.75)
IQR = Q3 - Q1
IQR, Q1, Q3, Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

263317
47147
3374
47
0.0100081


(0.0666249, 0.0648641, 0.131489, -0.035073250000000014, 0.23142635)

In [3]:
# Restrict to the redshift range of interest. The CSV is the output of the
# CasJobs query from the previous step, so it only contains galaxies for which
# the queried data was available.
final_data_with_redshift = pd.read_csv("../data/GalaxyPhotometricData.csv")
within_threshold = final_data_with_redshift["z"] <= 0.35
final_data_below_threshold = final_data_with_redshift[within_threshold]

print("Final dataset size:", len(final_data_below_threshold), end='\n\n')

# Shuffle all rows with a fixed seed and renumber them 0..n-1.
all_data = final_data_below_threshold.sample(frac=1, random_state=777).reset_index(drop=True)
redshift_column = all_data["z"]
print(redshift_column.min(), redshift_column.max())

# 80 % of the rows go to training; the remainder is halved into validation and test.
training_data = all_data.sample(frac=0.8, replace=False, random_state=777).sort_index()
validation_and_test_data = all_data.drop(index=training_data.index)

validation_data = validation_and_test_data.sample(frac=0.5, replace=False, random_state=777)
test_data = validation_and_test_data.drop(index=validation_data.index)

# Output file for each split, in the order training, validation, test.
split_files = {
    "../data/training_galaxy_data.csv": training_data,
    "../data/validation_galaxy_data.csv": validation_data,
    "../data/test_galaxy_data.csv": test_data,
}

# Class balance of every split.
for split_frame in split_files.values():
    print(split_frame["Class"].value_counts())

# Persist the splits (the DataFrame index is written as the first column).
for split_path, split_frame in split_files.items():
    split_frame.to_csv(split_path, header=True)

Final dataset size: 313838

0.0100081 0.349972
Spiral        178674
Elliptical     72396
Name: Class, dtype: int64
Spiral        22265
Elliptical     9119
Name: Class, dtype: int64
Spiral        22231
Elliptical     9153
Name: Class, dtype: int64


In [94]:
BATCH_SIZE = 1024

def pad_dataset_with_existing_values(dataset, batch_size=None):
    """Pad `dataset` with copies of its own rows up to a multiple of the batch size.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to pad. It is not modified.
    batch_size : int, optional
        Batch size to align to. Defaults to the module-level ``BATCH_SIZE``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index: the original rows followed by
        the sampled padding rows. If the length is already a multiple of the
        batch size (including an empty frame) no rows are added.

    Raises
    ------
    ValueError
        If the batch size is not positive.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Rows missing to reach the next multiple; 0 when already aligned
    # (the previous formula asked for a whole extra batch in that case).
    remaining = -len(dataset) % batch_size
    if remaining == 0:
        return dataset.reset_index(drop=True)

    # Sample without replacement whenever possible; only fall back to sampling
    # with replacement when more padding is needed than there are rows.
    padding_data = dataset.sample(
        n=remaining,
        replace=remaining > len(dataset),
        random_state=777,
    )
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([dataset, padding_data], ignore_index=True)

# Pad the training split, then report the number of batches and the leftover
# row count (the leftover is 0 when the padded length is a multiple of BATCH_SIZE).
padded_training_dataset = pad_dataset_with_existing_values(training_data)
padded_row_count = len(padded_training_dataset)
print(padded_row_count / BATCH_SIZE, padded_row_count % BATCH_SIZE)

246.0 0


In [95]:
import os
from tqdm import tqdm_notebook

def map_frame(frame):
    """Reduce every row of `frame` to an ``(Id, Class)`` tuple.

    The result is whatever ``DataFrame.apply(..., axis=1)`` yields for a
    tuple-returning function: one tuple per row, aligned with `frame`'s index.
    """
    def to_id_class_pair(record):
        return (record['Id'], record['Class'])

    return frame.apply(to_id_class_pair, axis=1)

def create_dataset(name, dataset):
    """Write the images of `dataset` into ``<name>/Spiral`` and ``<name>/Elliptical``.

    Each image is read from ``../data/resized_images/<Id>.jpg`` and saved again
    under the directory of its class.
    NOTE(review): saving through PIL re-encodes the JPEG rather than copying the
    file byte-for-byte -- confirm this is intended.

    Parameters
    ----------
    name : str
        Root directory of the split. Created if missing; an existing directory
        is reused, so the cell can be re-run.
    dataset : pandas.DataFrame
        Frame with at least the ``Id`` and ``Class`` columns.

    Returns
    -------
    list
        Ids of the galaxies whose image could not be read or written.
    """
    for class_name in ("Spiral", "Elliptical"):
        os.makedirs(f"{name}/{class_name}", exist_ok=True)

    galaxies = map_frame(dataset)

    # Number of copies already written per (Id, Class) during THIS call.
    # Tracking duplicates in memory instead of probing the file system keeps
    # the file names stable when the target directory already contains files
    # from a previous run.
    written_copies = {}
    failed_galaxies = []
    for galaxy_id, galaxy_class in tqdm(galaxies):
        # Due to padding some Ids occur more than once; give every copy a
        # unique name (the second copy keeps the historical "_repeated" name).
        copies = written_copies.get((galaxy_id, galaxy_class), 0)
        if copies == 0:
            filename = f"{galaxy_id}.jpg"
        elif copies == 1:
            filename = f"{galaxy_id}_repeated.jpg"
        else:
            filename = f"{galaxy_id}_repeated_{copies}.jpg"

        try:
            # The context manager closes the source file handle deterministically.
            with Image.open(f"../data/resized_images/{galaxy_id}.jpg") as image:
                image.save(f"{name}/{galaxy_class}/{filename}")
        except Exception:
            # Deliberate best effort: remember the Id and carry on with the rest.
            failed_galaxies.append(galaxy_id)
        else:
            written_copies[(galaxy_id, galaxy_class)] = copies + 1

    print("Failed to process:", len(failed_galaxies))
    return failed_galaxies

In [96]:
# Materialise the image folders for the three splits, in the order
# training (padded), validation, test.
split_targets = [
    ("../data/Dataset/Training", padded_training_dataset),
    ("../data/Dataset/Validation", validation_data),
    ("../data/Dataset/Test", test_data),
]
for target_directory, split_frame in split_targets:
    last_failed_galaxies = create_dataset(target_directory, split_frame)

# Cell output: the failures reported for the last split processed (test).
last_failed_galaxies

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for galaxy in tqdm_notebook(galaxies):


  0%|          | 0/251904 [00:00<?, ?it/s]

Failed to process: 0


  0%|          | 0/31384 [00:00<?, ?it/s]

Failed to process: 0


  0%|          | 0/31384 [00:00<?, ?it/s]

Failed to process: 0


[]