# LightGBM for Galaxy Image Classification
This notebook primarily establishes a baseline that more sophisticated models, such as deep neural networks, should beat.

In [34]:
import lightgbm as lgb
import numpy as np
import h5py
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import os

In [10]:
import warnings
# Install a global "ignore" filter so that no warning of any category is
# shown in subsequent cell output.
# NOTE(review): this also hides deprecation warnings from the libraries used
# here — consider narrowing the filter to specific categories or modules.
warnings.filterwarnings("ignore")

## Loading and Preprocessing Data

In [3]:
# Pull the complete image stack and the matching class labels out of the
# HDF5 archive; `[()]` reads each dataset fully into an in-memory ndarray.
with h5py.File('Galaxy10_DECals.h5', 'r') as F:
    images = F['images'][()]
    labels = F['ans'][()]

# Cast both arrays to 32-bit floats for the downstream steps
images = images.astype(np.float32)
labels = labels.astype(np.float32)

In [19]:
# Cache locations for the binarised LightGBM datasets
TRAIN_PATH = 'data/train.bin'
TEST_PATH = 'data/test.bin'

# Fixed seed so the split (and therefore X_test / y_test) is identical on
# every run and stays aligned with the cached binary datasets.
# NOTE: delete any cached .bin files that were produced before the seed was
# fixed — they were built from a different random split.
SPLIT_SEED = 42

# Always materialise the split: X_test / y_test are needed later for
# prediction and scoring even when the Datasets themselves come from cache.
X_train, X_test, y_train, y_test = train_test_split(
    images, labels, test_size=.1, random_state=SPLIT_SEED
)

# Flatten every image into a 1-D feature vector, because LightGBM expects
# tabular (n_samples, n_features) input.
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

if os.path.isfile(TRAIN_PATH) and os.path.isfile(TEST_PATH):
    # Reuse the previously binarised datasets
    train_data = lgb.Dataset(TRAIN_PATH)
    test_data = lgb.Dataset(TEST_PATH)
else:
    # Build the datasets from the in-memory arrays; the test set references
    # the training set so both share the same feature binning.
    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

    # Make sure the cache directories exist before writing the binaries
    os.makedirs(os.path.dirname(TRAIN_PATH) or '.', exist_ok=True)
    os.makedirs(os.path.dirname(TEST_PATH) or '.', exist_ok=True)
    train_data.save_binary(TRAIN_PATH)
    test_data.save_binary(TEST_PATH)

KeyboardInterrupt: 

## Training Simple LightGBM Model
Without hyperparameter tuning.

In [24]:
# Directory that holds serialised model files
MODEL_PATH = 'models'

In [32]:
# LightGBM settings for the untuned multiclass baseline
params = dict(
    boosting_type='gbdt',
    objective='multiclass',
    metric={'multi_logloss'},
    num_class=10,
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=1,
)

In [47]:
# Location of the serialised model
SAVE_PATH = os.path.join(MODEL_PATH, 'simple_lgb_galaxy_classifier.txt')

if os.path.isfile(SAVE_PATH):
    # Reuse a previously trained model instead of retraining
    print('Loading model...')
    lgb_clf = lgb.Booster(model_file=SAVE_PATH)
else:
    print('Starting training...')
    # NOTE: test_data doubles as the early-stopping validation set, so the
    # accuracy reported below is an optimistic estimate; a separate
    # validation split would give an unbiased test score.
    lgb_clf = lgb.train(
        params,
        train_data,
        num_boost_round=20,
        valid_sets=[test_data],
        callbacks=[lgb.early_stopping(stopping_rounds=5)],
    )

    print('Saving model...')
    # The target directory may not exist yet on a fresh checkout
    os.makedirs(MODEL_PATH, exist_ok=True)
    lgb_clf.save_model(SAVE_PATH)

# Predict class probabilities for the held-out samples
print('Starting predicting...')
y_proba = lgb_clf.predict(X_test, num_iteration=lgb_clf.best_iteration)
# Each row holds one probability per class; the predicted label is the
# index of the largest entry.
y_pred = np.argmax(y_proba, axis=1)

# Score the predictions against the true labels
acc_sc = accuracy_score(y_test, y_pred)
print(f'The accuracy of the simple lightgbm model prediction is: {acc_sc}')

Starting predicting...
The accuracy of the simple lightgbm model prediction is: 0.560879368658399
