In [None]:
from __future__ import print_function

import lightgbm as lgb
import numpy as np
import pandas as pd
import csv
import os

from astropy.io import fits

from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer

import matplotlib.pyplot as plt

from timeit import default_timer as timer

import keras
from keras.layers import Dense, Conv2D, BatchNormalization, Activation
from keras.layers import AveragePooling2D, Input, GlobalAveragePooling2D, Flatten, Dropout
from keras.optimizers import Adam, SGD, Adagrad, Adamax, RMSprop
#from keras.callbacks import ModelCheckpoint, LearningRateScheduler
#from keras.callbacks import ReduceLROnPlateau
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l1, l2
from keras import backend as K
from keras.models import Model
from keras.datasets import cifar10


In [None]:
def generate_features(bands=['u','g','i','r','z'], use_stokes=False, use_averages=False, use_normal_colour_index=False):
    """Return the ordered list of feature column names for the given bands.

    Bands are processed in the order supplied.  Each band contributes its
    per-band base columns, then (optionally) its Stokes columns, then the
    colour-index column(s) that start at that band.  Cross-band averages are
    appended after all bands.

    Parameters
    ----------
    bands : list of str
        Band suffixes used to build the per-band column names.
    use_stokes : bool
        Add ``stokes_q``/``stokes_u``/``stokes_p`` per band and, together with
        ``use_averages``, the two averaged Stokes columns.
    use_averages : bool
        Append the ``avg_petro_*`` columns.
    use_normal_colour_index : bool
        Add the plain ``<a>_<b>_colour_index`` column right after each
        ``petro_<a>_<b>_colour_index`` column.

    Returns
    -------
    list of str
        Column names, possibly empty.
    """
    per_band_prefixes = (
        'dered',
        'petroRad',
        'petroR50',
        'petroR90',
        'petro_R90_R50_ratio',
        'petroMag',
    )
    stokes_prefixes = ('stokes_q', 'stokes_u', 'stokes_p')
    averaged_columns = (
        'avg_petro_rad',
        'avg_petro_R50',
        'avg_petro_R90',
        'avg_petro_R90_R50_ratio',
    )
    averaged_stokes_columns = ('avg_stokes_q', 'avg_stokes_u')
    # Only these band pairs have a colour index.
    recognised_colour_indexes = (
        'u_g_colour_index',
        'g_r_colour_index',
        'r_i_colour_index',
        'i_z_colour_index',
    )

    columns = []
    for band in bands:
        prefixes = (per_band_prefixes + stokes_prefixes) if use_stokes else per_band_prefixes
        columns += ['{}_{}'.format(prefix, band) for prefix in prefixes]

        for other_band in bands:
            index_name = '{}_{}_colour_index'.format(band, other_band)
            if index_name not in recognised_colour_indexes:
                continue
            columns.append('petro_{}'.format(index_name))
            if use_normal_colour_index:
                columns.append(index_name)

    if use_averages:
        columns += list(averaged_columns)
        if use_stokes:
            columns += list(averaged_stokes_columns)

    return columns

In [None]:
# Integer codes written to the `galaxy_type` column.
# ("SPIRIAL" is a misspelling of "SPIRAL"; the name is left as-is because
# other cells reference it.)
SPIRIAL_GALAXY_TYPE    = 0
ELLIPTICAL_GALAXY_TYPE = 1
UNKNOWN_GALAXY_TYPE    = 2

# Model input columns; the plain (non-petro) colour indexes are excluded.
features = generate_features(use_normal_colour_index=False)

# Name of the target column (presumably redshift - confirm against the data dictionary).
target_column = 'z'

# Threshold that `debiased_elliptical` / `debiased_spiral` must exceed for a
# row to be given a known galaxy type.
CONFIDENCE_LEVEL = 0.8

In [None]:
# Display the generated feature column names.
features

In [None]:
# Load the raw table; the path is relative to the kernel's working directory.
input_data = pd.read_csv('data/input.csv')

In [None]:
# Keep rows with 0 <= z <= 0.4.  The explicit .copy() gives `data` its own
# storage, so the column assignments below cannot trigger pandas'
# SettingWithCopyWarning and `input_data` itself is never modified.
data = input_data[(input_data.z >= 0) & (input_data.z <= 0.4)].copy()

combined_spiral = data.spiralclock + data.spiralanticlock + data.edgeon
data['galaxy_type'] = UNKNOWN_GALAXY_TYPE
data['combined_spiral'] = combined_spiral
# Elliptical is assigned first, so a row exceeding both thresholds ends up spiral.
data.loc[data.debiased_elliptical > CONFIDENCE_LEVEL, 'galaxy_type'] = ELLIPTICAL_GALAXY_TYPE
data.loc[data.debiased_spiral > CONFIDENCE_LEVEL, 'galaxy_type'] = SPIRIAL_GALAXY_TYPE

photometric_bands = ['u', 'g', 'r', 'i', 'z']
adjacent_band_pairs = [('u', 'g'), ('g', 'r'), ('r', 'i'), ('i', 'z')]


def _mean_over_bands(prefix):
    """Return the unweighted mean of the `<prefix>_<band>` columns of `data`."""
    band_columns = [data['{}_{}'.format(prefix, band)] for band in photometric_bands]
    return sum(band_columns) / len(band_columns)


# Add petroR90/petroR50 per band
for band in photometric_bands:
    data['petro_R90_R50_ratio_' + band] = data['petroR90_' + band] / data['petroR50_' + band]

# Cross-band averages (`avg_petro_rad` used to be computed twice in this cell).
data['avg_petro_rad'] = _mean_over_bands('petroRad')
data['avg_petro_R50'] = _mean_over_bands('petroR50')
data['avg_petro_R90'] = _mean_over_bands('petroR90')
data['avg_petro_R90_R50_ratio'] = data.avg_petro_R90 / data.avg_petro_R50

# Colour indexes between adjacent bands, from the `dered_*` columns.
for first_band, second_band in adjacent_band_pairs:
    data['{}_{}_colour_index'.format(first_band, second_band)] = (
        data['dered_' + first_band] - data['dered_' + second_band])

# does average of stokes in different bands really matter?
data['avg_stokes_u'] = _mean_over_bands('stokes_u')
data['avg_stokes_q'] = _mean_over_bands('stokes_q')

# Petro Mag colour index
for first_band, second_band in adjacent_band_pairs:
    data['petro_{}_{}_colour_index'.format(first_band, second_band)] = (
        data['petroMag_' + first_band] - data['petroMag_' + second_band])

# Stokes P: sqrt(q^2 + u^2) per band
for band in photometric_bands:
    data['stokes_p_' + band] = np.sqrt(
        np.power(data['stokes_q_' + band], 2) + np.power(data['stokes_u_' + band], 2))

# Class balance.  Rows are counted here; the previous `.size` counted cells
# (rows x columns), which gave the same ratios but misleading "num_of_*" values.
num_of_elliptical = int((data.galaxy_type == ELLIPTICAL_GALAXY_TYPE).sum())
num_of_spirial = int((data.galaxy_type == SPIRIAL_GALAXY_TYPE).sum())
num_of_unknown = int((data.galaxy_type == UNKNOWN_GALAXY_TYPE).sum())
total_count = len(data)

print(num_of_elliptical / total_count)
print(num_of_spirial / total_count)
print(num_of_unknown / total_count)
print(num_of_spirial / (num_of_elliptical + num_of_spirial))

# Rows that received a known galaxy type; modelling below uses only these.
known_data = data[data.galaxy_type != UNKNOWN_GALAXY_TYPE]

In [None]:
def split_train(X, y, random_state=None, num_bins=24, normalise=True, min_y=-3, max_y=3, test_size=0.2):
    """Split X/y into two parts, stratified on a binned version of y.

    y is continuous, so it is discretised into bins first and the bin labels
    are passed to ``train_test_split`` as the stratification key.  The
    returned y values are the original, un-binned and un-normalised ones.

    Parameters
    ----------
    X : array-like
        Feature rows; passed to ``train_test_split`` unchanged.
    y : array-like
        Target values (ndarray, pandas Series or list).
    random_state : int or None
        Forwarded to ``train_test_split``.
    num_bins : int
        Number of bin *edges* generated between ``min_y`` and ``max_y``.
    normalise : bool
        If true, y is power-transformed before binning so that the default
        ``min_y``/``max_y`` range is meaningful.  Pass False when y is
        already normalised.
    min_y, max_y : float
        Lowest and highest bin edge.  Values outside fall into the two
        open-ended outer bins.
    test_size : float
        Forwarded to ``train_test_split``.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split``.

    Raises
    ------
    ValueError
        Propagated from ``train_test_split`` when an occupied bin holds too
        few samples to be stratified.
    """
    # np.asarray lets callers pass a Series or list, which have no usable
    # .reshape; an ndarray is passed through without copying.
    y_values = np.asarray(y)
    if normalise:
        # PowerTransformer requires a 2-D (n_samples, 1) column.
        y_values = PowerTransformer().fit_transform(y_values.reshape(-1, 1))

    bin_edges = np.linspace(min_y, max_y, num_bins)
    # The label array keeps the shape of y_values (no ravel) so that splits
    # produced with a fixed random_state stay the same as before.
    y_binned = np.digitize(y_values, bin_edges)

    return train_test_split(X, y, test_size=test_size, stratify=y_binned, random_state=random_state)

In [None]:
# num_bins = 4
# bins = np.linspace(z_min, z_max, num_bins)
# y_binned = np.digitize(known_data, bins)
# for bin_num in range(num_bins):
#     print(f'bin {bin_num} has {len(y_binned[y_binned == bin_num])} records')

# split = StratifiedShuffleSplit(n_splits=num_bins, test_size=0.2)

# for train_index, test_index in split.split(X, y_binned):
#     print("TRAIN:", train_index, "TEST:", test_index)
# Feature matrix and target for the galaxies with a known type.
X = known_data[features]
# Use the configured target column rather than repeating the literal name.
y = known_data[target_column].values
# Hold out a test set (split_train's default test_size), stratified on the
# binned target.  Seeded so the held-out rows are the same on every run.
X_train, X_test, y_train, y_test = split_train(X, y, random_state=1138)

In [None]:
# Histogram of the un-normalised training target, 24 bins.
plt.hist(y_train, bins=24)

In [None]:
# Separate power transformers for the features and the target, both fitted
# on the training split.
x_scaler = PowerTransformer()
y_scaler = PowerTransformer()
X_train_norm = x_scaler.fit_transform(X_train)
# fit_transform needs a 2-D array, so the target is reshaped to one column;
# y_train_norm therefore has shape (n_samples, 1).
y_train_norm = y_scaler.fit_transform(y_train.reshape(-1, 1))

# X_t, X_v, y_t, y_v = train_test_split(X_train_norm, y_train_norm, test_size=0.2)

In [None]:
# Histogram of the power-transformed training target, 24 bins.
plt.hist(y_train_norm, bins=24)

In [None]:
# Fitted power-transform parameter(s) of the target scaler.
y_scaler.lambdas_

In [None]:
def create_nn(input_shape, output_shape, dense_units=1024, dropout_rate_1=0.1, dropout_rate_2=0.3, l1_reg=0.1, l2_reg=0.01, lr=0.00001):
    """Build and compile a fully connected regression network.

    Architecture: input -> 4 x [Dense(dense_units, relu, no bias) ->
    BatchNormalization] -> Dense(output_shape, linear, no bias).  All Dense
    kernels use the 'random_normal' initialiser.  The model is compiled with
    MSE loss, an MSE metric and the Adam optimiser.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, e.g. ``(n_features,)``.
    output_shape : int
        Number of units in the output layer.
    dense_units : int
        Width of each of the four hidden layers.
    dropout_rate_1, dropout_rate_2, l1_reg, l2_reg : float
        Accepted but currently unused: no dropout layer or kernel
        regulariser is part of the network.
    lr : float
        Learning rate passed to Adam.

    Returns
    -------
    keras.models.Model
        The compiled model.
    """
    # Named `inputs` rather than `input` so the builtin is not shadowed.
    inputs = Input(shape=input_shape, name='input_1')

    hidden = inputs
    for layer_number in range(1, 5):
        hidden = Dense(dense_units,
                       kernel_initializer='random_normal',
                       name='hidden_layer_{}'.format(layer_number),
                       use_bias=False,
                       activation='relu')(hidden)
        hidden = BatchNormalization()(hidden)

    outputs = Dense(output_shape,
                    kernel_initializer='random_normal',
                    use_bias=False,
                    name='output')(hidden)

    network = Model(inputs=inputs, outputs=outputs)
    network.compile(loss='mse', optimizer=Adam(lr=lr), metrics=['mse'])
    return network

# Instantiate the network for the current number of feature columns with a
# single output unit, then print its layer summary.
model = create_nn((X.shape[1],), 1, dense_units=2048, lr=0.00003)
model.summary()


In [None]:
# def generate_train_validate(X, y, test_size=0.2, bins=24, random_state=1138):
#     X_train, X_valid, y_train, y_valid = split_train(X, y, normalise=False, bins=bins, test_size=test_size, random_state=random_state)

# Carve a validation set out of the normalised training data.  The target is
# already power-transformed, hence normalise=False.
# NOTE: this rebinds X_train / y_train to the normalised training subset.
X_train, X_valid, y_train, y_valid = split_train(X_train_norm, y_train_norm, normalise=False, random_state=1138)

# Stop after 2 epochs without val_loss improvement and keep the best weights on disk.
callbacks = [EarlyStopping(monitor='val_loss', patience=2),
             ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)]

# NOTE(review): reset_states() only clears the state of stateful layers; it
# does not re-initialise weights, so re-running this cell continues training
# the existing model.  Re-run the create_nn cell for a fresh start.
model.reset_states()
# Validate on the held-out split (previously the training data itself was
# passed as validation_data, so val_loss could not reveal over-fitting) and
# actually hand the callbacks to fit() - they were built but never used.
model.fit(X_train, y_train, epochs=100, validation_data=(X_valid, y_valid), batch_size=32, shuffle=True,
          callbacks=callbacks)

In [None]:
# The test split was never normalised, so X_test_norm / y_test_norm did not
# exist on a fresh kernel.  Apply the scalers fitted on the training data
# (transform only - no refit) before comparing predictions with the truth.
X_test_norm = x_scaler.transform(X_test)
y_test_norm = y_scaler.transform(y_test.reshape(-1, 1))
model.predict(X_test_norm), y_test_norm

## Hyperparameter Search

In [None]:
from hyperopt import fmin
from hyperopt import hp
from hyperopt import tpe
from hyperopt import Trials
from hyperopt import STATUS_OK
from hyperopt.pyll.stochastic import sample

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

from keras.wrappers.scikit_learn import KerasRegressor

In [None]:
N_FOLDS = 5
ITERATION = 0

hp_out_file = 'gbm_hp_trials.csv'

# Start a fresh results file that contains only the header row.  The context
# manager closes the handle even if the write raises.
with open(hp_out_file, 'w') as of_connection:
    writer = csv.writer(of_connection)
    writer.writerow(['loss', 'params', 'iteration', 'estimators', 'train_time'])


def objective(x_train, y_train, random_state=42, stratified=True):
    """Create a hyperopt objective that cross-validates ``create_nn``.

    Parameters
    ----------
    x_train, y_train : array-like
        Data passed to ``cross_val_score`` on every trial.
    random_state : int
        Seed for the ``ShuffleSplit`` folds, so all trials share the same folds.
    stratified : bool
        Currently unused; kept so existing calls remain valid.

    Returns
    -------
    callable
        ``_objective(params, n_folds=N_FOLDS)``.  It returns a dict with the
        keys ``loss``, ``params``, ``iteration``, ``estimators``,
        ``train_time`` and ``status``, and appends one row per call to
        ``hp_out_file``.
    """
    def _objective(params, n_folds=N_FOLDS):
        # Keep track of evals
        global ITERATION

        print(params)

        ITERATION += 1

        # 1. Create + compile model
        # Work on a copy so the dict sampled by hyperopt is not mutated.
        params = dict(params)
        params['dense_units'] = int(params['dense_units'])
        params['batch_size'] = int(params['batch_size'])
        params['input_shape'] = (x_train.shape[1],)
        params['output_shape'] = 1
        # No validation data is available inside cross_val_score, so early
        # stopping watches the training loss.  'loss' is always logged and
        # create_nn compiles with loss='mse', whereas the log key of the
        # extra MSE metric differs between Keras versions.
        callbacks = [EarlyStopping(monitor='loss', patience=2)]

        model = KerasRegressor(build_fn=create_nn, verbose=0, epochs=100, **params)

        # 2. Do Cross Validation
        start = timer()
        cv = ShuffleSplit(n_splits=n_folds, test_size=0.2, random_state=random_state)
        scores = cross_val_score(model, x_train, y_train, cv=cv, fit_params={'callbacks': callbacks})
        run_time = timer() - start

        # KerasRegressor.score returns the *negated* loss (higher is better),
        # while hyperopt minimises 'loss'.  Negate the mean fold score so that
        # a lower value means a better model; the previous max(scores) made
        # the search favour the worst configurations.
        loss = float(-np.mean(scores))

        # 1-based index of the best-scoring fold, recorded in the
        # 'estimators' column of the results file.
        n_estimators = int(np.argmax(scores) + 1)

        if ITERATION % 10 == 0:
            # Display the information
            print('Iteration {}: {} Fold CV Loss {:.5f}'.format(ITERATION, N_FOLDS, loss))

        with open(hp_out_file, 'a') as of_connection:
            csv.writer(of_connection).writerow([loss, params, ITERATION, n_estimators, run_time])

        # Dictionary with information for evaluation
        return {'loss': loss, 'params': params, 'iteration': ITERATION,
                'estimators': n_estimators,
                'train_time': run_time, 'status': STATUS_OK}

    return _objective

In [None]:
# Search space handed to hyperopt.  Each hyperopt label (first argument)
# matches its dictionary key, so trial records and the dict returned by fmin
# use the same names as create_nn's parameters.
# NOTE(review): create_nn currently accepts `dropout_rate_1` and `l2_reg` but
# does not use them, so these two dimensions do not influence the model.
space = {
    # NOTE(review): 2028 is probably a typo for 2048 - confirm before changing,
    # as recorded trial results refer to this value.
    'dense_units': hp.choice('dense_units', [128, 256, 512, 1024, 2028]),
    'lr': hp.loguniform('lr', np.log(0.00001), np.log(0.1)),
    'dropout_rate_1': hp.choice('dropout_rate_1', [0.1, 0.2, 0.3, 0.4, 0.5]),
    # The label used to be 'l1_reg' although the value feeds `l2_reg`.
    'l2_reg': hp.uniform('l2_reg', 0.0, 1.0),
    'batch_size': hp.quniform('batch_size', 10, 100, 20),
}


In [None]:
# Number of hyperparameter configurations to evaluate.
MAX_EVALS = 100

# NOTE(review): `tpe_algorithm` is assigned but the call below passes
# `tpe.suggest` directly.
tpe_algorithm = tpe.suggest
# Collects the result dictionary of every trial.
bayes_trials = Trials()

# Run the search on the normalised training data; `rstate` seeds hyperopt's sampling.
best = fmin(fn = objective(X_train_norm, y_train_norm), space = space, algo = tpe.suggest, 
            max_evals = MAX_EVALS, trials = bayes_trials, rstate=np.random.RandomState(1138))

In [None]:
# Result dictionary returned by the objective for the best recorded trial.
bayes_trials.best_trial['result']

In [None]:
# Rebuild the network with hand-entered hyperparameters (presumably copied
# from a hyperparameter-search run - confirm).
# NOTE(review): create_nn does not currently use l2_reg or dropout_rate_1, and
# dense_units=2028 may be a typo for 2048.
model = create_nn((X.shape[1],), 1, dense_units=2028, l2_reg=0.964981502445198, dropout_rate_1=0.2, lr=0.00001)
model.summary()

In [None]:
# Stop after 2 epochs without val_loss improvement; save the best model seen
# so far to best_model.h5.
callbacks = [EarlyStopping(monitor='val_loss', patience=2),
             ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)]

# NOTE(review): reset_states() does not re-initialise weights; the model is
# fresh here only because the previous cell rebuilt it.
model.reset_states()
# Train on the full normalised training set; validation_split=0.2 has Keras
# hold out 20% of the supplied samples for validation.
model.fit(X_train_norm, y_train_norm, epochs=100, batch_size=100, callbacks=callbacks, validation_split=0.2)

In [None]:
# Scratch random input of shape (1, 4).  It is stored under its own name so
# the feature frame `X` built earlier is not overwritten - later cells still
# pass `X` to StratifiedKFold.split and call X.reindex() on it.
X_toy = np.random.uniform(size=(1,4))
X_toy

In [None]:
# Scratch target value.  It is stored under its own name so the target array
# `y` is not replaced by a list - a later cell calls y.reshape(-1, 1).
y_toy = [0.5]

In [None]:
# Minimal sanity-check network: one Dense unit connected directly to the input.
# NOTE(review): this rebinds `model`, replacing the network trained above.
input_layer = Input(shape=(X_train_norm.shape[1],), name='input_1')
output_layer = Dense(1)(input_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='mse', optimizer='adagrad')
model.summary()

# Fit on the first training sample only, for 100 epochs.
model.fit(X_train_norm[0:1], y_train_norm[0], epochs=100)

In [None]:
# Normalised target of the first training sample (the value fitted above).
y_train_norm[0]

In [None]:
# Prediction of the current `model` for the first training sample.
model.predict(X_train_norm[0:1])

In [None]:
# Show the Keras backend's epsilon value.
K.epsilon()

In [None]:
# Power-transform the target of *all* known galaxies for the binning
# experiments below.  A dedicated transformer is used so that `y_scaler`,
# fitted on the training split, keeps its parameters.  The target is read
# from `known_data` directly so the cell does not depend on whatever `y`
# happens to be bound to at this point.
y_all_scaler = PowerTransformer()
y_norm = y_all_scaler.fit_transform(known_data[target_column].values.reshape(-1, 1))
y_all_scaler.lambdas_

In [None]:
# Range of the normalised target; compare against the bin limits used below.
np.min(y_norm), np.max(y_norm)

In [None]:
# 11 equally spaced bin edges from -3 to 3.
bins = np.linspace(-3, 3, 11) 
bins

In [None]:
# Bin index of every normalised target value; the result has the same shape as y_norm.
y_binned = np.digitize(y_norm, bins)
y_binned

In [None]:
# Shape check of the bin-label array.
y_binned.shape

In [None]:
# Number of samples per bin index; ravel() flattens the labels because
# np.bincount requires a 1-D input.
bincount = np.bincount(y_binned.ravel())
bincount

In [None]:
# Plot the per-bin sample counts.
plt.plot(bincount)

In [None]:
# 5-fold stratified split keyed on the binned target; only the fold sizes are printed.
# NOTE(review): shuffle=True without random_state gives different folds on each run.
# NOTE(review): X must have as many rows as y_binned - confirm `X` is still
# the feature frame when this cell runs.
skf = StratifiedKFold(n_splits = 5, shuffle = True) 
for train_idx, test_idx in skf.split(X, y_binned):
#     X_train = X.iloc[train_idx,:]
#     y_train = y[train_idx]
    print(train_idx.shape, test_idx.shape)


In [None]:
# NOTE(review): reindex() without arguments leaves the index unchanged. If the
# goal was a fresh 0..n-1 index for positional lookups, reset_index(drop=True)
# is probably what was meant - confirm.
X = X.reindex()