# Setup

In [1]:
import sklearn
assert sklearn.__version__ >= "0.20"
import numpy as np

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Load dataset into variable

In [2]:
import pandas as pd

def load_asteroid_data():
    #csv_path = "shuffled_asteroids_head.csv"
    csv_path = "combined.csv"
    return pd.read_csv(csv_path, low_memory=False)

# Removing unneeded features and null features

In [3]:
asteroids = load_asteroid_data()
#asteroids.head()
asteroids = asteroids.dropna(subset=['pha'])
asteroids = asteroids.dropna(subset=['sigma_e'])
asteroids = asteroids.dropna(subset=['neo'])
asteroids = asteroids.dropna(subset=['H'])
asteroids = asteroids.dropna(subset=['ma'])
asteroids = asteroids.drop(columns=['spkid', 'full_name', 'pdes', 'prefix', 'name', 'orbit_id', 'equinox', 'diameter', "diameter_sigma", "albedo"])
asteroids.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10027 entries, 0 to 10026
Data columns (total 35 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         10027 non-null  object 
 1   neo        10027 non-null  object 
 2   pha        10027 non-null  object 
 3   H          10027 non-null  float64
 4   epoch      10027 non-null  float64
 5   epoch_mjd  10027 non-null  int64  
 6   epoch_cal  10027 non-null  float64
 7   e          10027 non-null  float64
 8   a          10027 non-null  float64
 9   q          10027 non-null  float64
 10  i          10027 non-null  float64
 11  om         10027 non-null  float64
 12  w          10027 non-null  float64
 13  ma         10027 non-null  float64
 14  ad         10027 non-null  float64
 15  n          10027 non-null  float64
 16  tp         10027 non-null  float64
 17  tp_cal     10027 non-null  float64
 18  per        10027 non-null  float64
 19  per_y      10027 non-null  float64
 20  moid  

# Converting object datatypes to numerical and standardizing data

In [4]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
ordinal_encoder = OrdinalEncoder()

#splitting features into numerical and categorical
asteroids_num = asteroids.drop(columns=["id", "neo", "pha", "class"])
asteroids_id = asteroids[["id"]]
asteroids_cat = asteroids[["neo", "pha", "class"]]

#encode categorical features
asteroid_cat_encoded = ordinal_encoder.fit_transform(asteroids_cat)

#Normalizing numerical features
std_scaler = StandardScaler()
asteroid_num_scaled = std_scaler.fit_transform(asteroids_num)

    

# Creating features and labels 

In [5]:
X = np.concatenate((asteroid_cat_encoded, asteroid_num_scaled), axis=1)
X = np.delete(X, 2, 1)
y = asteroid_cat_encoded[:,2]

print(X.shape)
print(y.shape)

(10027, 33)
(10027,)


# Training and test split

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression Classification

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

def logistic_regression():
    # Initialization and fitting logisitic regression
    log_reg = LogisticRegression(solver="newton-cg", random_state=42)
    log_reg.fit(X_train, y_train)

    #Make predictions
    y_pred = log_reg.predict(X_test)

    #Model metrics
    log_reg_accuracy = accuracy_score(y_test, y_pred)
    log_reg_microf1 = f1_score(y_test, y_pred, average="micro")
    log_reg_macrof1 = f1_score(y_test, y_pred, average="macro")

    return log_reg, log_reg_accuracy, log_reg_microf1, log_reg_macrof1

log_reg, log_reg_accuracy, log_reg_microf1, log_reg_macrof1 = logistic_regression()

print("Accuracy: ", log_reg_accuracy)
print("Micro F1 scorre: ", log_reg_microf1)
print("Macro F1 score: ", log_reg_macrof1)

Accuracy:  0.9506480558325024
Micro F1 scorre:  0.9506480558325024
Macro F1 score:  0.6539133467541507


# Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier
def randomForest():
    rndFr = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, random_state=42)
    rndFr.fit(X_train, y_train)
    y_pred_rf = rndFr.predict(X_test)
    rndFr_accuracy = accuracy_score(y_test, y_pred_rf)
    rndFr_microf1 = f1_score(y_test, y_pred_rf, average="micro")
    rndFr_macrof1 = f1_score(y_test, y_pred_rf, average="macro")
    return rndFr, rndFr_accuracy, rndFr_microf1, rndFr_macrof1

rndFr, rndFr_accuracy, rndFr_microf1, rndFr_macrof1 = randomForest()

print("Accuracy: ", rndFr_accuracy)
print("Micro F1 scorre: ", rndFr_microf1)
print("Macro F1 score: ", rndFr_macrof1)

Accuracy:  0.9835493519441675
Micro F1 scorre:  0.9835493519441675
Macro F1 score:  0.8244815821560008


# Multi Layer Perceptron

In [15]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam

def MLP():

    learning_rate = 0.01
    optimizer = Adam(learning_rate=learning_rate)
    
    # Define the MLP
    model = keras.models.Sequential([
        keras.layers.Dense(128, activation="relu", input_shape=X_train.shape[1:]),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(128, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(64, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(32, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(1, activation="sigmoid")
    ])

    # Compile the model
    model.compile(loss="binary_crossentropy",
                  optimizer=optimizer,
                  metrics=["accuracy"])

    # Train the model
    history = model.fit(X_train, y_train, epochs=30, batch_size=32,
                        validation_data=(X_test, y_test))

    # Evaluate the model
    model.evaluate(X_test, y_test)
    y_pred = model.predict(X_test)
    y_pred.round(2)

    # Calculate scores
    mlp_accuracy = accuracy_score(y_test, y_pred)
    mlp_microf1 = f1_score(y_test, y_pred, average="micro")
    mlp_macrof1 = f1_score(y_test, y_pred, average="macro")

    return model, history, mlp_accuracy, mlp_microf1, mlp_macrof1

mlp_model, mlp_history, mlp_accuracy, mlp_microf1, mlp_macrof1 = MLP()

print("MLP Test Accuracy:", mlp_accuracy)
print("MLP Micro F1 Score:", mlp_microf1)
print("MLP Macro F1 Score:", mlp_macrof1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
MLP Test Accuracy: 0.1794616151545364
MLP Micro F1 Score: 0.17946161515453637
MLP Macro F1 Score: 0.0338123415046492


# XGBOOST

In [13]:
import xgboost as xgb

def xgboost_clf():
    # Initialize XGBoost
    xgb_clf = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

    # Train the model
    xgb_clf.fit(X_train, y_train)

    # Make predictions
    y_pred = xgb_clf.predict(X_test)

    # Model metrics
    xgb_accuracy = accuracy_score(y_test, y_pred)
    xgb_microf1 = f1_score(y_test, y_pred, average="micro")
    xgb_macrof1 = f1_score(y_test, y_pred, average="macro")

    return xgb_clf, xgb_accuracy, xgb_microf1, xgb_macrof1

xgb_model, xgb_accuracy, xgb_microf1, xgb_macrof1 = xgboost_clf()

print("XGBoost Accuracy:", xgb_accuracy)
print("XGBoost Micro F1 Score:", xgb_microf1)
print("XGBoost Macro F1 Score:", xgb_macrof1)


XGBoost Accuracy: 0.9925224327018943
XGBoost Micro F1 Score: 0.9925224327018943
XGBoost Macro F1 Score: 0.8677746762929767
