In [14]:
import h5py
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D

In [3]:
# Load every dataset into memory, then release the HDF5 file handle.
# The context manager closes the file even if one of the reads fails; the
# arrays below stay valid because `[:]` copies each dataset into NumPy memory.
with h5py.File("../dataset/data_sample.hdf5", "r") as data:
    print("data keys: "+str(list(data.keys())))

    images = data["images"][:]
    labels = data["labels"][:]
    obs_days = data["observation_days"][:]
print("image shape: (sample, x_size, y_size, epoch) = "+str(images.shape))

data keys: ['images', 'labels', 'observation_days']
image shape: (sample, x_size, y_size, epoch) = (72000, 21, 21, 48)


In [8]:
def subsample(n, test_size=0.2, random_state=42):
    """Draw ``n`` images, flatten every epoch frame, and split into train/test.

    Reads the module-level ``images`` array (sample, x_size, y_size, epoch)
    and the matching ``labels`` array. Each selected image contributes one
    row per epoch, so the returned arrays hold ``n * n_epochs`` rows of
    ``x_size * y_size`` pixels, each carrying its parent image's label.

    Parameters
    ----------
    n : int
        Number of distinct images to draw.
    test_size : float, optional
        Fraction of rows held out for testing (default 0.2).
    random_state : int, optional
        Seed passed to ``train_test_split`` (default 42).

    Returns
    -------
    x_train, x_test, y_train, y_test
        The split feature rows and labels.

    Raises
    ------
    ValueError
        If ``n`` exceeds the number of available images (sampling is done
        without replacement).
    """
    # Sample WITHOUT replacement: np.random.choice defaults to replace=True,
    # which can pick the same image twice and put identical rows on both
    # sides of the train/test split.
    sampled_indices = np.random.choice(images.shape[0], n, replace=False)

    # (n, x, y, epoch) -> (n, epoch, x, y) so each epoch frame is contiguous.
    sampled_image = np.transpose(images[sampled_indices], [0, 3, 1, 2])
    n_epochs = sampled_image.shape[1]
    frame_size = sampled_image.shape[2] * sampled_image.shape[3]

    # One row per (image, epoch) frame; repeat the label once per epoch.
    sampled_x = sampled_image.reshape(-1, frame_size)
    sampled_y = np.repeat(labels[sampled_indices], n_epochs)

    # NOTE(review): frames from the same image can still fall on both sides
    # of this row-level split; consider a group-aware split if that matters.
    x_train, x_test, y_train, y_test = train_test_split(
        sampled_x, sampled_y, test_size=test_size, random_state=random_state)
    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
    return x_train, x_test, y_train, y_test

In [None]:
# kNN (500*48 samples)
# Fit a 3-nearest-neighbour classifier on a fresh 500-image subsample.
x_train, x_test, y_train, y_test = subsample(n=500)
mod_knn_500 = KNeighborsClassifier(n_neighbors=3)
mod_knn_500.fit(x_train, y_train)
print('Training accuracy: ', mod_knn_500.score(x_train, y_train))
# Score on the held-out split; this line previously re-scored the training data.
print('Testing accuracy: ', mod_knn_500.score(x_test, y_test))

In [12]:
# kNN (5000*48 samples)
# Fit a 3-nearest-neighbour classifier on a fresh 5000-image subsample.
x_train, x_test, y_train, y_test = subsample(n=5000)
mod_knn_5000 = KNeighborsClassifier(n_neighbors=3)
mod_knn_5000.fit(x_train, y_train)
print('Training accuracy: ', mod_knn_5000.score(x_train, y_train))
# Score on the held-out split; this line previously re-scored the training data.
print('Testing accuracy: ', mod_knn_5000.score(x_test, y_test))

0.72946875

In [None]:
# SVM (500*48 samples)
# Fit a support-vector classifier with default settings on a 500-image subsample.
x_train, x_test, y_train, y_test = subsample(n=500)
mod_svm = SVC()
mod_svm.fit(x_train, y_train)
print('Training accuracy: ', mod_svm.score(x_train, y_train))
# Score on the held-out split; this line previously re-scored the training data.
print('Testing accuracy: ', mod_svm.score(x_test, y_test))

In [None]:
# Random Forest (500*48 samples)
# Fit a random forest with default settings on a 500-image subsample.
x_train, x_test, y_train, y_test = subsample(n=500)
mod_rf_500 = RandomForestClassifier()
mod_rf_500.fit(x_train, y_train)
print('Training accuracy: ', mod_rf_500.score(x_train, y_train))
# Score on the held-out split; this line previously re-scored the training data.
print('Testing accuracy: ', mod_rf_500.score(x_test, y_test))

In [None]:
# Random Forest (5000*48 samples)
# Fit a random forest with default settings on a 5000-image subsample.
x_train, x_test, y_train, y_test = subsample(n=5000)
mod_rf_5000 = RandomForestClassifier()
mod_rf_5000.fit(x_train, y_train)
print('Training accuracy: ', mod_rf_5000.score(x_train, y_train))
# Score on the held-out split; this line previously re-scored the training data.
print('Testing accuracy: ', mod_rf_5000.score(x_test, y_test))

In [11]:
# CNN (up to 100000 single-epoch frames drawn from all samples*epochs)
# Integer class id for each label string stored in the dataset.
dic = {'Asteroids':0, 'Constant':1, 'EmptyLigh':2, 'M33Cephei':3, 'RRLyrae':4, 'Supernova':5}
# Depending on the h5py version, stored strings may come back as bytes;
# decode them so the lookup against the str keys of `dic` cannot KeyError.
labels_digit = [dic[i.decode() if isinstance(i, bytes) else i] for i in labels]

# Take the dimensions from the data instead of hard-coding 72000/21/21/48.
n_samples, x_size, y_size, n_epochs = images.shape
# One label per (sample, epoch) frame, in the same order as flatten_data below.
labels_flatten = np.repeat(labels_digit, n_epochs)
enc = OneHotEncoder()
labels_flatten_onehot = enc.fit_transform(labels_flatten.reshape(-1, 1))
# (sample, x, y, epoch) -> (sample, epoch, x, y) -> one single-channel image per frame.
flatten_data = images.transpose(0, 3, 1, 2).reshape((n_samples*n_epochs, x_size, y_size, 1))

# Sample frames WITHOUT replacement so no frame can be duplicated across the
# train/test split; cap the draw at the number of frames available.
sample_idx = np.random.choice(len(flatten_data), min(100000, len(flatten_data)), replace=False)
sample_data = flatten_data[sample_idx]
# OneHotEncoder yields a SciPy sparse matrix by default; densify only the
# sampled rows so the targets are a plain ndarray for Keras.
sample_labels = labels_flatten_onehot[sample_idx].toarray()

x_train, x_test, y_train, y_test = train_test_split(sample_data, sample_labels)

def init_model():
    """Assemble and return the uncompiled CNN classifier.

    The network takes 21x21 single-channel images and stacks three
    convolutional stages (1, 2 and 3 conv layers, each stage followed by a
    3x3 stride-1 max-pool), then two dropout-regularised dense layers and a
    6-way softmax output.
    """
    def conv(filters, kernel, **extra):
        # Every convolution shares padding, activation and initializer.
        return Conv2D(filters, kernel, padding='same', activation='relu',
                      kernel_initializer='uniform', **extra)

    def pool():
        # Stride 1 keeps the pooling windows overlapping.
        return MaxPooling2D((3,3), strides=(1,1))

    # Layers are listed (and therefore created) in network order.
    stack = [
        # Stage 1: the first layer also declares the input shape.
        conv(32, (4,4), input_shape=(21,21,1)),
        pool(),
        # Stage 2
        conv(64, (3,3)),
        conv(64, (3,3)),
        pool(),
        # Stage 3
        conv(32, (3,3)),
        conv(32, (3,3)),
        conv(32, (3,3)),
        pool(),
        # Classifier head
        Flatten(),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(6, activation='softmax'),
    ]

    model = Sequential()
    for layer in stack:
        model.add(layer)
    return model

In [16]:
# Build the CNN and compile it for multi-class classification on one-hot targets.
mod_cnn = init_model()
adam = keras.optimizers.Adam(lr=0.0001)
mod_cnn.compile(
    optimizer=adam,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 20 epochs, reporting metrics on the test split after each epoch.
# NOTE(review): x_train/x_test must be the (N, 21, 21, 1) arrays from the CNN
# data-prep cell; other cells rebind the same names to different arrays.
mod_cnn.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=20,
    shuffle=True,
    validation_data=(x_test, y_test),
)

Train on 75000 samples, validate on 25000 samples
Epoch 1/20
 1024/75000 [..............................] - ETA: 18:00 - loss: 1.9475 - acc: 0.1768

KeyboardInterrupt: 