## BSS Identification Using Machine Learning

Yuvraj Sahu, Andrew Harvey, Elijah Flores

The University of Texas at Austin

In [1]:
# Import statements
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [2]:
# Global parameters used for running the experiments - note that these can be
# changed and tuned for running different experiments
#
# g_data                 - the full data set, read from a CSV file whose path
#                          is relative to the working directory
# g_features             - columns of g_data that may be used as model inputs;
#                          every non-empty subset of them is evaluated
# g_models               - model classes (not instances); a fresh instance is
#                          created for every test case
# g_result_column        - column of g_data that holds the class label
# g_result_nss_indicator - label value that marks a row as NSS
# g_result_bss_indicator - label value that marks a row as BSS; rows with any
#                          other label are not used
# g_num_iterations       - number of random test cases whose accuracies are
#                          averaged for each feature combination
# g_initial_seed         - seed passed to np.random.seed before the
#                          experiments start, for reproducibility
g_data                 = pd.read_csv('LiStars.csv')
g_features             = ['Gmag', 'BPmag', 'RPmag', 'Gmagcor', '(G-RP)cor']
g_models               = [DecisionTreeClassifier, SVC, LogisticRegression]
g_result_column        = 'Note'
g_result_nss_indicator = 'N'
g_result_bss_indicator = 'BS'
g_num_iterations       = 500
g_initial_seed         = 737_132

In [3]:
# Decodes an integer combination ID (comb_id) into the list of features it
# selects. Bit i of comb_id belongs to features[i]; note that the encoding is
# inverted: a CLEAR bit means the feature is selected and a SET bit means it
# is left out, so comb_id 0 selects every feature and the all-ones ID selects
# none. The selected features are returned in their original order.
def translate_comb(features, comb_id):
    return [
        feature
        for position, feature in enumerate(features)
        if not comb_id & (1 << position)
    ]

In [4]:
# Computes the combination ID for a subset of selected features, given the
# full feature list; this is the inverse of translate_comb. A feature that is
# selected has its bit CLEARED in the ID, so the ID starts as all ones (nothing
# selected) and one bit is cleared per selected feature. Raises ValueError if
# a selected feature does not appear in the feature list. Note that this
# function is not used in this program, but is meant to allow other
# researchers to more easily tailor this program to their needs
def get_comb_id(features, selected_features):
    comb_id = (1 << len(features)) - 1
    for name in selected_features:
        comb_id &= ~(1 << features.index(name))
    return comb_id

In [5]:
# Selects a random subset of the data that contains at most new_size rows.
#
# Parameters:
#   data     - a NumPy array or a pandas DataFrame/Series; rows lie along the
#              first axis
#   new_size - the maximum number of rows to keep
#
# Returns the input object itself (not a copy) when it already has new_size
# rows or fewer; otherwise returns new_size distinct rows drawn at random
# without replacement. The draw uses NumPy's global random state, so calling
# np.random.seed beforehand makes the selection reproducible.
def sample_data(data, new_size):
    data_size = data.shape[0]
    if new_size >= data_size:
        return data
    sample = np.random.choice(data_size, size = new_size, replace = False)
    # Plain [] indexing on a pandas object looks up labels (columns for a
    # DataFrame) instead of row positions, so pandas inputs must go through
    # .iloc; NumPy arrays are indexed directly along their first axis
    rows = data.iloc if hasattr(data, 'iloc') else data
    return rows[sample]

In [6]:
# Builds one test case from the NSS and BSS feature arrays currently in use.
# Both classes are cut down to the size of the smaller one so the test case is
# balanced, NSS rows are labelled 0 and BSS rows are labelled 1, and the
# combined data is split 70% / 30% into training and testing portions.
# Returns the 4-tuple (x_train, x_test, y_train, y_test)
def generate_testcase(nss_sub, bss_sub):
    per_class = min(nss_sub.shape[0], bss_sub.shape[0])
    # NSS is sampled before BSS so that the random stream is always consumed
    # in the same order
    balanced_nss = sample_data(nss_sub, per_class)
    balanced_bss = sample_data(bss_sub, per_class)
    samples = np.concatenate((balanced_nss, balanced_bss))
    # per_class zeros followed by per_class ones, matching the row order above
    labels = np.repeat(np.array([0, 1], dtype=int), per_class)
    return train_test_split(samples, labels, test_size = 0.3)

In [7]:
# Instantiates the given model class with its default arguments, trains it on
# the training portion of the test case, and scores it on the testing portion.
# Returns the accuracy, i.e. the fraction of test labels predicted correctly
def run_instance(model, testcase):
    train_x, test_x, train_y, test_y = testcase
    classifier = model()
    classifier.fit(train_x, train_y)
    predictions = classifier.predict(test_x)
    # Element-wise comparison; summing the booleans counts the correct ones
    num_correct = np.sum(test_y == predictions)
    return num_correct / predictions.size

In [8]:
# Evaluates every model on every combination of features except the empty
# set. For each combination, num_iterations random test cases are generated
# and each test case is shared by all of the models. Returns a dictionary that
# maps each model's class name to a list of mean accuracies indexed by
# combination ID
def evaluate_models(nss, bss, features, models, num_iterations):
    # The all-ones combination ID stands for the empty feature set and is
    # excluded, leaving 2^n - 1 combinations
    total_combs = 2 ** len(features) - 1
    mean_accuracy = {}
    for model in models:
        mean_accuracy[model.__name__] = [0.0] * total_combs
    for comb_id in range(total_combs):
        columns = translate_comb(features, comb_id)
        nss_values = nss[columns].to_numpy()
        bss_values = bss[columns].to_numpy()
        for _ in range(num_iterations):
            testcase = generate_testcase(nss_values, bss_values)
            for model in models:
                # Adding accuracy / num_iterations each time builds up the
                # mean over all iterations
                contribution = run_instance(model, testcase) / num_iterations
                mean_accuracy[model.__name__][comb_id] += contribution
    return mean_accuracy

In [9]:
# Prints the dictionary returned by evaluate_models as one section per model.
# Within a section, each line shows a feature combination followed by its
# accuracy as a percentage, with the most accurate combination first
def print_results(results, features):
    num_combs = (1 << len(features)) - 1
    # Wide enough for the longest label (all features joined by ', ') plus a
    # small gap before the accuracy column
    column_width = sum(len(name) for name in features) + 2 * len(features)
    labels = []
    for comb_id in range(num_combs):
        label = ', '.join(translate_comb(features, comb_id))
        labels.append(label.ljust(column_width))

    for model_name, accuracies in results.items():
        print(f'### {model_name} Results ###')
        # Sorting on the negated accuracy gives descending order while ties
        # keep their combination ID order
        ranking = sorted(range(num_combs), key = lambda idx: -accuracies[idx])
        for comb_id in ranking:
            print(f'{labels[comb_id]}{accuracies[comb_id] * 100:.3f}%')
        print()

In [10]:
# Runs the whole experiment: seeds NumPy's global random state (for
# reproducibility), splits the data into NSS and BSS rows according to the
# value in the result column, evaluates every model on every feature
# combination, and prints the results
def run_all_tests(data, features, models, result_column, result_nss_indicator,
                  result_bss_indicator, num_iterations, initial_seed):
    np.random.seed(initial_seed)
    labels = data[result_column]
    # Rows whose label matches neither indicator end up in neither group
    is_nss = labels == result_nss_indicator
    is_bss = labels == result_bss_indicator
    results = evaluate_models(data[is_nss], data[is_bss], features, models,
                              num_iterations)
    print_results(results, features)

In [11]:
# Runs the full experiment with the global parameters defined above; each
# argument is passed by name so the mapping to run_all_tests is explicit
run_all_tests(
    data = g_data,
    features = g_features,
    models = g_models,
    result_column = g_result_column,
    result_nss_indicator = g_result_nss_indicator,
    result_bss_indicator = g_result_bss_indicator,
    num_iterations = g_num_iterations,
    initial_seed = g_initial_seed,
)

### DecisionTreeClassifier Results ###
Gmagcor, (G-RP)cor                      76.731%
BPmag, (G-RP)cor                        76.065%
Gmag, BPmag, Gmagcor, (G-RP)cor         75.955%
BPmag, RPmag, (G-RP)cor                 75.873%
RPmag, Gmagcor, (G-RP)cor               75.841%
BPmag, Gmagcor, (G-RP)cor               75.816%
Gmag, RPmag, Gmagcor, (G-RP)cor         75.771%
Gmag, (G-RP)cor                         75.673%
Gmag, RPmag, (G-RP)cor                  75.673%
Gmag, BPmag, (G-RP)cor                  75.641%
Gmag, Gmagcor, (G-RP)cor                75.604%
Gmag, BPmag, RPmag, Gmagcor, (G-RP)cor  75.522%
RPmag, (G-RP)cor                        75.424%
BPmag, RPmag, Gmagcor, (G-RP)cor        75.269%
Gmag, BPmag, RPmag, (G-RP)cor           75.110%
Gmagcor                                 74.384%
Gmag, BPmag, Gmagcor                    74.114%
Gmag, RPmag, Gmagcor                    74.073%
RPmag, Gmagcor                          74.004%
BPmag, RPmag                            73.865%
B