In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import csv

from hyperopt import fmin
from hyperopt import hp
from hyperopt import tpe
from hyperopt import Trials
from hyperopt import STATUS_OK
from hyperopt.pyll.stochastic import sample

from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelBinarizer

import matplotlib.pyplot as plt

from timeit import default_timer as timer

In [None]:
def generate_features(bands=['u','g','i','r','z'], use_stokes=True, use_averages=False):
    """Build the ordered list of feature column names.

    For every band, in the order given, the list contains the per-band base
    features, then (when ``use_stokes`` is true) the per-band Stokes features,
    then any colour index that starts at this band together with its
    ``petro_`` counterpart. Band-independent averages are appended last.

    Args:
        bands: Iterable of band suffixes.
        use_stokes: Include the per-band Stokes features and, when
            ``use_averages`` is also true, the averaged Stokes features.
        use_averages: Append the band-independent average features.

    Returns:
        A new list of column-name strings.
    """
    per_band_prefixes = [
        'dered',
        'petroRad',
        'petroR50',
        'petroR90',
        'petro_R50_R90_ratio',
        'petroMag',
    ]
    if use_stokes:
        per_band_prefixes = per_band_prefixes + ['stokes_q', 'stokes_u', 'stokes_p']

    # Only these band pairs produce a colour-index feature.
    allowed_colour_indexes = (
        'u_g_colour_index',
        'g_r_colour_index',
        'r_i_colour_index',
        'i_z_colour_index',
    )

    names = []
    for first_band in bands:
        names.extend('{}_{}'.format(prefix, first_band) for prefix in per_band_prefixes)

        for second_band in bands:
            colour_index = '{}_{}_colour_index'.format(first_band, second_band)
            if colour_index in allowed_colour_indexes:
                names.append(colour_index)
                names.append('petro_{}'.format(colour_index))

    if use_averages:
        names += [
            'avg_petro_rad',
            'avg_petro_R50',
            'avg_petro_R90',
            'avg_petro_R50_R90_ratio',
        ]
        if use_stokes:
            names += ['avg_stokes_q', 'avg_stokes_u']

    return names

In [None]:
# Integer class labels written to the `galaxy_type` column.
# NOTE(review): "SPIRIAL" is a misspelling of "SPIRAL"; the name is kept because other cells reference it.
SPIRIAL_GALAXY_TYPE    = 0
ELLIPTICAL_GALAXY_TYPE = 1
UNKNOWN_GALAXY_TYPE    = 2

# Feature column names for the default configuration of generate_features().
features = generate_features()

# Name of the label column.
target_column = 'galaxy_type'

# A galaxy is only labelled elliptical/spiral when the corresponding debiased
# value is strictly greater than this threshold.
CONFIDENCE_LEVEL = 0.8

In [None]:
# Load the raw catalogue; the path is relative to the notebook's working directory.
input_data = pd.read_csv('data/input.csv')

In [None]:
# Work on a copy so `input_data` stays untouched and this cell can be re-run.
data = input_data.copy()
combined_spiral = data.spiralclock + data.spiralanticlock + data.edgeon
data['galaxy_type'] = UNKNOWN_GALAXY_TYPE
data['combined_spiral'] = combined_spiral
# The spiral assignment runs last, so it wins if a row exceeds both thresholds.
data.loc[data.debiased_elliptical > CONFIDENCE_LEVEL, 'galaxy_type'] = ELLIPTICAL_GALAXY_TYPE
data.loc[data.debiased_spiral > CONFIDENCE_LEVEL, 'galaxy_type'] = SPIRIAL_GALAXY_TYPE

# Count rows (galaxies). DataFrame.size is rows * columns, which made these
# variables hold cell counts rather than galaxy counts.
num_of_elliptical = int((data.galaxy_type == ELLIPTICAL_GALAXY_TYPE).sum())
num_of_spirial = int((data.galaxy_type == SPIRIAL_GALAXY_TYPE).sum())
num_of_unknown = int((data.galaxy_type == UNKNOWN_GALAXY_TYPE).sum())
total_count = len(data)

# Class fractions, then the spiral share among the labelled galaxies.
print(num_of_elliptical / total_count)
print(num_of_spirial / total_count)
print(num_of_unknown / total_count)
print(num_of_spirial / (num_of_elliptical + num_of_spirial))

In [None]:
# http://skyserver.sdss.org/dr12/SkyserverWS/ImgCutout/getjpeg?ra=224.5941&dec=-1.09&width=512
from urllib.request import urlopen
from PIL import Image

In [None]:
# Reference cutout edge length in pixels; also the default `image_size` of download_image().
# NOTE(review): presumably the Galaxy Zoo image size — confirm.
GZ_IMAGE_SIZE = 424
# Base factor for the cutout service's `scale` parameter; download_image() multiplies it
# by the Petrosian radius, the padding factor and GZ_IMAGE_SIZE/image_size.
BASE_CUTOUT_SCALE = 0.008

def download_image(row, image_size=GZ_IMAGE_SIZE, padding_scale=1.0):
    """Download a square JPEG cutout for one catalogue row from the SDSS SkyServer.

    Args:
        row: Mapping/Series providing 'petroRad_r', 'ra' and 'dec'.
        image_size: Width and height of the requested image in pixels.
        padding_scale: Extra multiplier on the cutout scale, so that a larger
            sky area is fetched (e.g. to leave room for cropping).

    Returns:
        A fully loaded PIL image.
    """
    petroRad = row['petroRad_r']
    ra = row['ra']
    dec = row['dec']
    # The scale grows with the galaxy's Petrosian radius and shrinks as more pixels are requested.
    scale = BASE_CUTOUT_SCALE * GZ_IMAGE_SIZE/image_size * petroRad * padding_scale

    url = f'http://skyserver.sdss.org/dr15/SkyserverWS/ImgCutout/getjpeg?ra={ra}&dec={dec}&width={image_size}&height={image_size}&scale={scale}'
    # Image.open is lazy: decode the pixels while the HTTP response is still open,
    # then let the context manager close the connection instead of leaking it.
    with urlopen(url) as response:
        img = Image.open(response)
        img.load()
    return img


In [None]:
# Fetch the cutout for the row labelled 0 at the default size and display it.
# NOTE(review): matplotlib only applies `cmap` to single-channel images — confirm it has any effect here.
img = download_image(data.loc[0])
plt.imshow(img, cmap=plt.get_cmap('gray'))

In [None]:
# Same galaxy requested at 224 px; this rebinds `img`, which the following cells operate on.
img = download_image(data.loc[0], image_size=224)
plt.imshow(img, cmap=plt.get_cmap('gray'))

In [None]:
# Read the edge length from the image itself: `img` is whatever the previous
# cell downloaded, so a hard-coded 424 goes stale when that request changes.
orig_size = img.size[0]
small_size = 64

scale = small_size/float(orig_size)
# Image.LANCZOS is the filter formerly aliased as Image.ANTIALIAS (alias removed in Pillow 10).
small_img = img.resize((small_size, small_size), Image.LANCZOS)
plt.imshow(small_img, cmap=plt.get_cmap('gray'))

In [None]:
# Resize by a random factor drawn uniformly from [0.9, 1.1).
rand_scale = np.random.uniform(0.9, 1.1)
new_size = int(rand_scale * orig_size)
# Image.LANCZOS is the filter formerly aliased as Image.ANTIALIAS (alias removed in Pillow 10).
resized_img = img.resize((new_size, new_size), Image.LANCZOS)
plt.imshow(resized_img, cmap=plt.get_cmap('gray'))

In [None]:
# Centre crop with an edge of 212 px: the box is symmetric about the middle
# of an `orig_size` x `orig_size` image, so top == left and bottom == right.
left = top = (orig_size - 212)/2
right = bottom = (orig_size + 212)/2

cropped_image = img.crop(
    (left, top, right, bottom)
)
plt.imshow(cropped_image, cmap=plt.get_cmap('gray'))

In [None]:
def crop_dimensions(curr_size, new_size, top_offset=0, left_offset=0):
    """Compute the crop box for a square window inside a square image.

    The window of edge ``new_size`` is centred in an image of edge
    ``curr_size`` and then shifted by the given offsets. The start coordinates
    are truncated towards zero with ``int``.

    Returns:
        Tuple ``(left, top, right, bottom)``.
    """
    # Distance from the image edge to a perfectly centred window.
    margin = (curr_size - new_size)/2

    y_start = int(margin + top_offset)
    x_start = int(margin + left_offset)

    return (x_start, y_start, x_start + new_size, y_start + new_size)

def centre_crop(img):
    """Return the central ``crop_size`` x ``crop_size`` window of an image array.

    Uses the notebook-level ``image_size`` and ``crop_size`` values, so ``img``
    is expected to be an ``image_size`` x ``image_size`` array indexed as
    [row, column, channel].

    Returns:
        A slice (view) of ``img``; no pixel data is copied.
    """
    (left, top, right, bottom) = crop_dimensions(image_size, crop_size)
    # Axis 0 is the vertical axis (rows), so it takes top:bottom; columns take left:right.
    return img[top:bottom, left:right, :]

def create_crops(img, size=224):
    """Return one centred crop plus four randomly offset crops of a PIL image.

    The four extra crops are shifted towards the four quadrants (up-left,
    up-right, down-left, down-right, in that order). Each offset magnitude is
    drawn uniformly between 1/6 and 1/3 of the spare width ``width - size``.
    Only the image width is used, so a square image is assumed.

    Args:
        img: PIL image to crop.
        size: Edge length in pixels of every crop.

    Returns:
        List of five PIL images, centred crop first.
    """
    width, _ = img.size
    imgs = [img.crop(crop_dimensions(width, size))]

    # do the middle third range in the quadrant
    max_offset = (width - size)/3
    min_offset = max_offset / 2
    for idx in range(4):
        # offset[0] is the vertical shift, offset[1] the horizontal shift.
        offset = np.random.uniform(min_offset, max_offset, 2)
        if idx < 2:
            offset[0] = -offset[0]
        if idx % 2 == 0:
            offset[1] = -offset[1]

        # No per-crop .show(): it launched an external viewer for every crop.
        imgs.append(img.crop(crop_dimensions(width, size, top_offset=int(offset[0]), left_offset=int(offset[1]))))

    return imgs

In [None]:
def plot_images(imgs, size=None):
    """Plot the first ``size`` images in a grid with three columns.

    Each image is min-max rescaled to [0, 1] for display only; the input
    arrays are not modified.

    Args:
        imgs: Sequence/array of image arrays.
        size: Number of images to plot. Defaults to ``len(imgs)``. It is
            resolved at call time, so the function can be defined before the
            notebook's ``batch_size`` exists.
    """
    if size is None:
        size = len(imgs)

    num_rows = int(np.ceil(size/3.0))
    figsize_y = 5 * num_rows

    fig = plt.figure(figsize=(20,figsize_y))
    for idx in range(size):
        img = imgs[idx]
        # make scale between 0 and 1.0 plotting
        img_min = img.min()
        img_range = img.max() - img_min
        if img_range > 0:
            img = (img - img_min) / img_range
        else:
            # A constant image would divide by zero; show it as uniform black.
            img = np.zeros_like(img, dtype=float)

        fig.add_subplot(num_rows, 3, idx + 1)
        plt.imshow(img, cmap=plt.get_cmap('gray'))

    plt.show()

In [None]:
from keras.preprocessing.image import ImageDataGenerator
from keras import backend as K
# 'channels_last' is the (height, width, channels) layout that the legacy call
# K.set_image_dim_ordering('tf') selected; that older function is no longer
# available in later Keras releases.
K.set_image_data_format('channels_last')

def augment_images(datagen, X_train, y_train):
    """Augment a batch (optionally), centre-crop every image and plot the result.

    Args:
        datagen: Generator object passed to apply_augmentation(), or None to
            skip augmentation and only crop.
        X_train: Image array indexed as [image, row, column, channel].
        y_train: Labels, forwarded to apply_augmentation().

    Returns:
        Float array of shape (n_images, crop_size, crop_size, 3) holding the
        centre crops.
    """
    # No defensive copy is needed: the crops are copied into `result_imgs` below.
    if datagen is None:
        imgs = X_train
    else:
        imgs = apply_augmentation(datagen, X_train, y_train)

    result_imgs = np.empty((imgs.shape[0], crop_size, crop_size, 3))
    for idx, img in enumerate(imgs):
        result_imgs[idx] = centre_crop(img)

    # Pass the count explicitly so exactly the images produced here are plotted.
    plot_images(result_imgs, size=len(result_imgs))
    return result_imgs
    
def apply_augmentation(datagen, X_train, y_train):
    """Fit ``datagen`` on the images and return its first augmented batch.

    The images are cast to float32 before fitting. The batch is produced
    without shuffling and with the notebook-level ``batch_size``. Only the
    image part of the batch is returned; None is returned if the generator
    yields nothing.
    """
    # Convert to float32 in here
    images = X_train.astype('float32')
    datagen.fit(images)

    batches = datagen.flow(images, y_train, shuffle=False, batch_size=batch_size)
    first_batch = next(iter(batches), None)
    if first_batch is None:
        return None

    augmented, _labels = first_batch
    return augmented


## Load Image Cutouts

In [None]:
batch_size = 9
image_size = 350
crop_size = 224
# Request extra sky around each galaxy so a later centre crop has room to work with.
padding_scale = image_size / crop_size

X_train = np.empty((batch_size, image_size, image_size, 3), dtype=int)
y_train = []

# One network request per galaxy, for the rows labelled 0 .. batch_size - 1.
for idx in range(batch_size):
    img = download_image(
        data.loc[idx],
        image_size=image_size,
        padding_scale=padding_scale,
    )
    X_train[idx] = np.asarray(img)
    y_train.append(data.loc[idx, 'galaxy_type'])



# Data Augmentation

In [None]:
# One slot per augmentation variant (8 in total); slot 0 holds the plain centre crops
# produced without any generator.
augmented_imgs = np.empty((8, batch_size, crop_size, crop_size, 3))
augmented_imgs[0] = augment_images(None, X_train, y_train)

## Normalise Features

In [None]:
# `channels` is X_train with the channel axis moved to the front. np.mean is taken over
# the whole array, so `fill` is the overall mean pixel value truncated to an int; later
# cells pass it as the constant fill value (`cval`).
channels = np.moveaxis(X_train, 3, 0)
fill = int(np.mean(channels))
fill

In [None]:
# Dataset-wide centring and std normalisation; the statistics come from the
# datagen.fit() call inside apply_augmentation().
datagen = ImageDataGenerator(
    featurewise_center=True,
    featurewise_std_normalization=True,
)
augmented_imgs[1] = augment_images(datagen, X_train, y_train)

## Random Rotations

In [None]:
# Rotation range of 180 degrees; pixels exposed by the rotation are filled
# with the constant `fill` value.
datagen = ImageDataGenerator(
    rotation_range=180,
    fill_mode='constant',
    cval=fill,
)
augmented_imgs[2] = augment_images(datagen, X_train, y_train)

## Random Shifts

In [None]:
# The same shift range is used horizontally and vertically; exposed pixels
# are filled with the constant `fill` value.
shift = 0.1
datagen = ImageDataGenerator(
    width_shift_range=shift,
    height_shift_range=shift,
    fill_mode='constant',
    cval=fill,
)

augmented_imgs[3] = augment_images(datagen, X_train, y_train)

## Random Flips

In [None]:
# Horizontal and vertical flips are both enabled.
datagen = ImageDataGenerator(
    horizontal_flip=True,
    vertical_flip=True,
)
augmented_imgs[4] = augment_images(datagen, X_train, y_train)

## Random Scaling

In [None]:
# zoom_range=0.1 draws a random zoom factor in [0.9, 1.1] for each image.
# The previous `rescale=0.1` only multiplied the pixel values by 0.1, which is
# not a geometric scaling and is invisible after plot_images() min-max rescales.
# Border pixels exposed by zooming out are filled with `fill`, as in the
# rotation and shift cells.
datagen = ImageDataGenerator(
    zoom_range=0.1,
    fill_mode='constant',
    cval=fill,
)

augmented_imgs[5] = augment_images(datagen, X_train, y_train)

## Samplewise normalisation

In [None]:
# Per-image centring and std normalisation (no dataset-wide statistics involved).
datagen = ImageDataGenerator(
    samplewise_center=True,
    samplewise_std_normalization=True,
)
augmented_imgs[6] = augment_images(datagen, X_train, y_train)

## Multiple Augmentations

In [None]:
shift = 0.1
datagen = ImageDataGenerator(featurewise_center=True,
                             featurewise_std_normalization=True,
#                              samplewise_center=True,
#                              samplewise_std_normalization=True,
#                              width_shift_range=shift,
#                              height_shift_range=shift,
                             horizontal_flip=True,
                             vertical_flip=True,
                             fill_mode='constant',
                             rotation_range=180,
                             rescale=0.1,
                             brightness_range=(0.9,1.1),
                             cval=fill
                            )

augmented_imgs[7] = augment_images(datagen, X_train, y_train)

In [None]:
# Swap the first two axes so each iteration yields, for one galaxy, its 8 variants
# (one per augmentation slot), and plot them as one figure per galaxy.
for augmentations in np.moveaxis(augmented_imgs, 0, 1):
    plot_images(augmentations, size=8)

In [None]:
# Slot 0 (un-augmented crops) divided by 255 — presumably to map 8-bit pixel values to
# [0, 1]; confirm. Slot 7 is the combined-augmentation output, taken as is.
base_img = augmented_imgs[0]/255
mod_img = augmented_imgs[7]

In [None]:
# Smallest value across all images in the combined-augmentation batch.
np.min(mod_img)