In [3]:
import os

import pandas as pd
from PIL import Image
from tqdm.notebook import tqdm

# GZ2 data filtering

In [4]:
# Names of the two GZ2 task-01 flag columns used to select galaxies below.
GZ2_FEATURES_FLAG = 't01_smooth_or_features_a02_features_or_disk_flag'
GZ2_SMOOTH_FLAG = 't01_smooth_or_features_a01_smooth_flag'

gz2_data = pd.read_csv("../data/gz2_hart16.csv")

# Drop rows whose gz2_class is exactly 'A' (presumably artifacts, going by the variable name).
gz2_no_artifacts = gz2_data[gz2_data['gz2_class'] != 'A']

# Keep galaxies with at least one of the two task-01 flags set ...
gz2_clean = gz2_no_artifacts[
    (gz2_no_artifacts[GZ2_FEATURES_FLAG] == 1) | (gz2_no_artifacts[GZ2_SMOOTH_FLAG] == 1)
]
# ... then discard the ones with both flags set, so every remaining galaxy carries exactly one.
gz2_clean_unambiguous = gz2_clean[
    ~((gz2_clean[GZ2_FEATURES_FLAG] == 1) & (gz2_clean[GZ2_SMOOTH_FLAG] == 1))
].reset_index(drop=True)

# The label comes from the first letter of gz2_class: "S..." -> Spiral, anything else -> Elliptical.
# A vectorised string test replaces the previous row-wise apply over the whole (wide) frame.
gz2_clean_unambiguous["Class"] = (
    gz2_clean_unambiguous["gz2_class"]
    .str.startswith("S")
    .map({True: "Spiral", False: "Elliptical"})
)
# Fail loudly instead of letting a missing gz2_class turn into a NaN label.
assert gz2_clean_unambiguous["Class"].notna().all(), "gz2_class contains missing values"

gz2_clean_unambiguous = gz2_clean_unambiguous.rename(columns={"dr7objid": "Id"})
gz2_transformed = gz2_clean_unambiguous[["Id", "Class"]]
gz2_transformed

Unnamed: 0,Id,Class
0,587732591714893851,Spiral
1,588009368545984617,Spiral
2,587741723357282317,Spiral
3,587729751132209314,Elliptical
4,587735742617616406,Spiral
...,...,...
154882,588297864173256986,Elliptical
154883,587741602566897791,Elliptical
154884,587734841742459081,Elliptical
154885,587741531712782408,Elliptical


# GZ1 data filtering

In [5]:
# from astropy.coordinates import SkyCoord
# from astropy import units as unit

# def map_ra_dec(ra, dec):
#     coords = SkyCoord(ra, dec, unit=(unit.deg, unit.deg)).transform_to("fk5")
#     print(coords)
#     return (coords.ra, coords.dec)


gz1_data = pd.read_csv("../data/GalaxyZoo1_DR_table2.csv")

# Keep only galaxies that have the SPIRAL or the ELLIPTICAL flag set.
gz1_clean = gz1_data[(gz1_data["SPIRAL"] == 1) | (gz1_data["ELLIPTICAL"] == 1)].reset_index(drop=True)

# SPIRAL == 1 -> "Spiral"; every other remaining row -> "Elliptical".
# Vectorised comparison instead of a row-wise apply; the resulting labels are the same.
# NOTE(review): a row with both flags set would be labelled Spiral — confirm the flags are exclusive.
gz1_clean["Class"] = (gz1_clean["SPIRAL"] == 1).map({True: "Spiral", False: "Elliptical"})
# gz1_clean["ra"], gz1_clean["dec"] = gz1_clean.apply(lambda r: map_ra_dec(r["RA"], r["DEC"]), axis=1)
gz1_clean = gz1_clean.rename(columns={"OBJID": "Id"})
gz1_transformed = gz1_clean[["Id", "Class"]]
gz1_transformed

Unnamed: 0,Id,Class
0,587727227300741210,Spiral
1,587730774962536596,Elliptical
2,587727223024189605,Spiral
3,587727221950382424,Spiral
4,587727178449485858,Spiral
...,...,...
252410,587727225153257594,Spiral
252411,587727227837612104,Spiral
252412,587730774962536585,Spiral
252413,587727226763870322,Spiral


# Merge datasets, remove galaxies with non-matching predictions

In [None]:
# Outer merge keeps galaxies that appear in only one catalogue; the label columns become
# Class_gz1 / Class_gz2 and are NaN where a catalogue has no entry for that Id.
combined_dataset = pd.merge(gz1_transformed, gz2_transformed, on="Id", how="outer", suffixes=("_gz1", "_gz2"))

# Sanity check: the merged row count should equal |GZ1| + |GZ2| - |GZ1 ∩ GZ2|.
shared_id_count = int(gz2_transformed["Id"].isin(gz1_transformed["Id"]).sum())
print("GZ1 + GZ2 - intersection:", len(gz1_transformed) + len(gz2_transformed) - shared_id_count)
print("Merged dataset:", len(combined_dataset))

# Galaxies labelled by both catalogues, and the subset where the two labels disagree.
both_predictions_present = combined_dataset[
    combined_dataset["Class_gz1"].notna() & combined_dataset["Class_gz2"].notna()
]
predictions_not_matching = both_predictions_present[
    both_predictions_present["Class_gz1"] != both_predictions_present["Class_gz2"]
]
print("Predictions that don't match:", len(predictions_not_matching))

# Conflicting galaxies are removed entirely rather than trusting either catalogue.
final_combined_dataset = combined_dataset.drop(predictions_not_matching.index).reset_index(drop=True)

gz1_label = final_combined_dataset["Class_gz1"]
gz2_label = final_combined_dataset["Class_gz2"]

# Prefer the GZ1 label and fall back to GZ2 where GZ1 has none.
final_combined_dataset["Class"] = gz1_label.fillna(gz2_label)

# Provenance of the label: "Both" when the catalogues agree, "GZ2" when GZ1 is missing,
# otherwise "GZ1". A NaN never compares equal, so the two masks cannot overlap.
final_combined_dataset["InitialDataset"] = (
    pd.Series("GZ1", index=final_combined_dataset.index)
    .mask(gz1_label.isna(), "GZ2")
    .mask(gz1_label == gz2_label, "Both")
)
final_combined_dataset.to_csv('../data/final_dataset_classes.csv', header=True, index=False)

# Spot-check one galaxy. The previous form indexed the frame with the Python expression
# `"Id" == "..."` (i.e. False) and raised KeyError; compare the Id column to the integer id instead.
final_combined_dataset[final_combined_dataset["Id"] == 587729781738307895]

In [14]:
# Spot-check: show the merged row(s) for one specific galaxy id.
is_target_galaxy = final_combined_dataset["Id"] == 587729781738307895
final_combined_dataset.loc[is_target_galaxy]

Unnamed: 0,Id,Class_gz1,Class_gz2,Class,InitialDataset
304191,587729781738307895,,Elliptical,Elliptical,GZ2


In [72]:
final_combined_dataset_with_redshift = pd.read_csv("../data/CombinedGalaxyDatasetWithRedshift_tomasmuzas_1.csv")

# Redshift values of the galaxies with z below 0.4, selected in a single .loc call.
below_redshift_cut = final_combined_dataset_with_redshift["z"] < 0.4
final_combined_dataset_with_redshift.loc[below_redshift_cut, "z"]

0         0.077923
1         0.071558
2         0.071975
3         0.054841
4         0.080338
            ...   
315236    0.046335
315237    0.077674
315238    0.112736
315239    0.118850
315240    0.025774
Name: z, Length: 315168, dtype: float64

# Prepare dataset splits

In [7]:
# One seed is reused for the shuffle and for both sampling steps.
SPLIT_SEED = 777

# Shuffle the data
all_data = final_combined_dataset.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

# 80% of the shuffled rows form the training split (kept in index order) ...
training_data = all_data.sample(frac=0.8, replace=False, random_state=SPLIT_SEED).sort_index()
# ... and whatever was not drawn is shared between validation and test.
validation_and_test_data = all_data.drop(training_data.index)

# Half of the held-out rows go to validation, the other half to test.
validation_data = validation_and_test_data.sample(frac=0.5, replace=False, random_state=SPLIT_SEED)
test_data = validation_and_test_data.drop(validation_data.index)

# Class balance of each split, in the order training / validation / test.
for split in (training_data, validation_data, test_data):
    print(split["Class"].value_counts(normalize=True))

Spiral        0.711134
Elliptical    0.288866
Name: Class, dtype: float64
Spiral        0.710673
Elliptical    0.289327
Name: Class, dtype: float64
Spiral        0.714693
Elliptical    0.285307
Name: Class, dtype: float64


In [34]:
BATCH_SIZE = 1024


def pad_dataset_with_existing_values(dataset, batch_size=None):
    """Pad ``dataset`` with re-sampled copies of its own rows up to a whole number of batches.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to pad. It is not modified.
    batch_size : int, optional
        Batch size to pad to. Defaults to the module-level ``BATCH_SIZE`` (looked up at call time).

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index: the original rows first, then the padding rows.
        Its length is the smallest multiple of ``batch_size`` that is >= ``len(dataset)``.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Rows missing to reach the next multiple. This is 0 when the length is already a multiple
    # (or the frame is empty), where the old formula added a whole extra batch of duplicates.
    remaining = (-len(dataset)) % batch_size
    if remaining == 0:
        return dataset.reset_index(drop=True)

    # Sample without replacement whenever possible (same rows as before for the same seed);
    # only fall back to replacement when the frame is smaller than the padding required.
    padding_data = dataset.sample(n=remaining, replace=remaining > len(dataset), random_state=777)

    # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent.
    return pd.concat([dataset, padding_data], ignore_index=True)

padded_training_dataset = pad_dataset_with_existing_values(training_data)
padded_validation_dataset = pad_dataset_with_existing_values(validation_data)
padded_test_dataset = pad_dataset_with_existing_values(test_data)

# For each padded split print the number of batches and the leftover row count.
for padded_split in (padded_training_dataset, padded_validation_dataset, padded_test_dataset):
    row_count = len(padded_split)
    print(row_count / BATCH_SIZE, row_count % BATCH_SIZE)

247.0 0
31.0 0
31.0 0


In [8]:
import os
from tqdm import tqdm_notebook

def map_frame(frame):
    """Return a Series of ``(Id, Class)`` tuples, one per row of ``frame``, on the same index."""
    id_class_pairs = list(zip(frame['Id'], frame['Class']))
    return pd.Series(id_class_pairs, index=frame.index, dtype=object)

def create_dataset(name, dataset):
    """Write the images of ``dataset`` into a ``<name>/<Class>/<Id>.jpg`` folder tree.

    Each image is opened from ``../data/resized_images/<Id>.jpg`` and saved again under the
    sub-folder named after its class. Processing is best-effort: a galaxy whose image cannot
    be opened or saved is recorded and skipped instead of aborting the run.

    Parameters
    ----------
    name : str
        Root directory of the dataset to create.
    dataset : pandas.DataFrame
        Frame with at least the ``Id`` and ``Class`` columns.

    Returns
    -------
    list
        Ids of the galaxies that could not be processed.
    """
    # exist_ok=True keeps the cell re-runnable; makedirs also creates `name` itself.
    # NOTE(review): files from an earlier run are left in place — clear the folder first
    # if the split may have changed.
    for class_name in ("Spiral", "Elliptical"):
        os.makedirs(os.path.join(name, class_name), exist_ok=True)

    galaxies = map_frame(dataset)

    failed_galaxies = []
    first_error = None
    for galaxy_id, galaxy_class in tqdm(galaxies):
        filename = str(galaxy_id) + ".jpg"
        try:
            # The context manager closes the source file even when save() fails.
            # NOTE(review): open + save re-encodes the JPEG; a plain file copy would be
            # lossless if no re-encoding is intended — confirm.
            with Image.open("../data/resized_images/" + filename) as image:
                image.save(f"{name}/{galaxy_class}/{filename}")
        except Exception as error:
            # Deliberately broad: keep going, but remember what went wrong.
            failed_galaxies.append(galaxy_id)
            if first_error is None:
                first_error = error

    print("Failed to process:", len(failed_galaxies))
    if first_error is not None:
        # Surface one representative cause so failures are diagnosable.
        print("First failure:", repr(first_error))
    return failed_galaxies

In [9]:
# Write each split to its own folder tree; every call returns the ids that could not be processed.
# NOTE(review): the unpadded splits are written here, not the padded_* frames built earlier —
# confirm that this is intended.
failed_training_galaxies = create_dataset("../data/Dataset/CombinedDataset_Training", training_data)
failed_validation_galaxies =  create_dataset("../data/Dataset/CombinedDataset_Validation", validation_data)
failed_test_galaxies = create_dataset("../data/Dataset/CombinedDataset_Test", test_data)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for galaxy in tqdm_notebook(galaxies):


  0%|          | 0/252754 [00:00<?, ?it/s]

Failed to process: 0


  0%|          | 0/31594 [00:00<?, ?it/s]

Failed to process: 0


  0%|          | 0/31594 [00:00<?, ?it/s]

Failed to process: 0
