In [7]:
# https://www.kaggle.com/datasets/karnikakapoor/satellite-orbital-catalog

In [8]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OrdinalEncoder
from sklearn.utils import class_weight
import tensorflow as tf

In [9]:
# download kaggle dataset from google drive, and import as a pandas dataframe
df = pd.read_csv('https://drive.google.com/uc?export=download&id=1i4FdBT71ale29-1ido9Q0HNeNzOZ6lFN')

# display some info about the dataframe
display(df.head())
df.info()

Unnamed: 0,norad_id,name,object_type,satellite_constellation,altitude_km,altitude_category,orbital_band,congestion_risk,inclination,eccentricity,launch_year_estimate,days_in_orbit_estimate,orbit_lifetime_category,mean_motion,epoch,data_source,snapshot_date,country,last_seen
0,900,CALSPHERE 1,PAYLOAD,Other,976.868247,Low LEO,LEO-Polar,LOW,90.2215,0.00271,2023,0,<1yr,13.763481,2025-12-03 11:44:40.165728,celestrak,2025-12-03,US,2025-12-03
1,902,CALSPHERE 2,PAYLOAD,Other,1061.675587,Mid LEO,LEO-Polar,LOW,90.2363,0.002044,2023,0,<1yr,13.528815,2025-12-03 06:12:53.330976,celestrak,2025-12-03,US,2025-12-03
2,1361,LCS 1,PAYLOAD,Other,2787.874819,High LEO,MEO,LOW,32.1427,0.001343,2023,0,<1yr,9.893094,2025-12-03 11:26:30.164064,celestrak,2025-12-03,US,2025-12-03
3,1512,TEMPSAT 1,PAYLOAD,Other,1133.286101,Mid LEO,LEO-Polar,HIGH,89.9888,0.007142,2023,0,<1yr,13.335811,2025-12-03 09:48:38.369088,celestrak,2025-12-03,US,2025-12-03
4,1520,CALSPHERE 4A,PAYLOAD,Other,1123.330697,Mid LEO,LEO-Polar,HIGH,89.9092,0.006823,2023,0,<1yr,13.362367,2025-12-03 09:46:39.199296,celestrak,2025-12-03,US,2025-12-03


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13610 entries, 0 to 13609
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   norad_id                 13610 non-null  int64  
 1   name                     13610 non-null  object 
 2   object_type              13610 non-null  object 
 3   satellite_constellation  13610 non-null  object 
 4   altitude_km              13610 non-null  float64
 5   altitude_category        13610 non-null  object 
 6   orbital_band             13610 non-null  object 
 7   congestion_risk          13610 non-null  object 
 8   inclination              13610 non-null  float64
 9   eccentricity             13610 non-null  float64
 10  launch_year_estimate     13610 non-null  int64  
 11  days_in_orbit_estimate   13610 non-null  int64  
 12  orbit_lifetime_category  13610 non-null  object 
 13  mean_motion              13610 non-null  float64
 14  epoch                 

In [10]:
# separate features (x) and target (y)
x = df.drop('congestion_risk', axis=1)
y = df['congestion_risk']

# exclude descriptive columns not used for training data
exclude_columns = ['norad_id', 'name', 'epoch', 'data_source', 'snapshot_date', 'last_seen']
categorical_cols = ['object_type', 'satellite_constellation', 'altitude_category', 'orbital_band', 'orbit_lifetime_category', 'country']

# drop excluded columns
x_processed = x.drop(columns=exclude_columns, errors='ignore')

# use ordinal encoding to change categorical data to numerical for training
encoder = OrdinalEncoder()
x_processed[categorical_cols] = encoder.fit_transform(x_processed[categorical_cols])

# fit the encoder on the entire target variable 'y' to ensure all possible labels are learned
y_encoded_full = encoder.fit_transform(y.values.reshape(-1, 1))
num_classes = len(encoder.categories_[0]) # get the number of unique classes

# split data into training and testing sets with the processed data
x_train, x_test, y_train, y_test = train_test_split(x_processed, y, test_size=0.2, random_state=42)

# now transform y_train and y_test using the fitted encoder
y_train_encoded = encoder.transform(y_train.values.reshape(-1, 1)).flatten()
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1)).flatten()

In [11]:
# define a fully connected (dense) neural network model
cnn_model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(x_train.shape[1],)), # input layer with number of features
    tf.keras.layers.Dropout(0.5), # dropout layer to prevent overfitting
    tf.keras.layers.Dense(64, activation='relu'), # hidden layer
    tf.keras.layers.Dropout(0.3), # dropout layer to prevent overfitting
    tf.keras.layers.Dense(num_classes, activation='softmax') # output layer with number of classes
])

# compile the model
cnn_model.compile(optimizer='adam', # popular and efficient algorithm
              loss='sparse_categorical_crossentropy', # sparse categorical cross-entropy is appropriate for multi-class classification problems
              metrics=['accuracy'])

# fit the model
cnn_model.fit(x_train, y_train_encoded, epochs=10, batch_size=32, validation_split=0.2)

# evaluate the model
lossCNN, accuracyCNN = cnn_model.evaluate(x_test, y_test_encoded)
print('CNN Test accuracy:', accuracyCNN)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - accuracy: 0.6967 - loss: 170.0155 - val_accuracy: 0.8246 - val_loss: 15.7885
Epoch 2/10
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.6907 - loss: 35.1784 - val_accuracy: 0.8251 - val_loss: 0.7830
Epoch 3/10
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7800 - loss: 6.5387 - val_accuracy: 0.8251 - val_loss: 0.6160
Epoch 4/10
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8109 - loss: 2.3573 - val_accuracy: 0.8251 - val_loss: 0.5904
Epoch 5/10
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8165 - loss: 1.7016 - val_accuracy: 0.8251 - val_loss: 0.5419
Epoch 6/10
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8190 - loss: 1.3572 - val_accuracy: 0.8251 - val_loss: 0.5328
Epoch 7/10
[1m273/273

In [12]:
# I picked a fully connection neural network model for my problem as they are suitable for tabular data and effective with multi-class classification problems
# The neural network still performs well, with a 0.817 accuracy, but is significantly less accurate than the Random Forest Classifier used in the traditional machine learning approach
# Due to the nature of the data, the Random Forest Classifier is highly effective as decision trees excel on data that splits cleanly on features in this way, so it is more effective than the neural network approach that is less suited to this specific task