In [1]:
# https://www.kaggle.com/datasets/karnikakapoor/satellite-orbital-catalog

In [2]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OrdinalEncoder
import tensorflow as tf

In [3]:
# download dataset from kaggle, and import as a pandas dataframe
df = pd.read_csv('https://storage.googleapis.com/kagglesdsdata/datasets/8617423/13886558/current_catalog.csv?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20251127%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20251127T143930Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=a8a3a54593be9274dc49333cc6a2c7dfd359a496aac094ceaac1bf2dd637bbe2421bf5af805a8a4b65e16af91d568442701719fcb7f32e8c933451397c9c0519ae8498532eadd14c1683f133f24d612456c992596152c5853975862fe8339d551b8a77aa78540cb5ec7bdd8d32593c10c189d912c902d0cece92ff7cf2efb1233211c04405f3476814c577846a3985acd360c5cda740592a801314d8ebc68acd343b7693a64da49f3d12fb1a4662f33bda3ffc52e06420fcb7156eadfaabecdec543f58e0467dd337bc8207ae92813bb0f04b8829652a67f52d554316f6bc1b297d16c80ff0ebfdc2d48a70b8a6dffaeb052f77b573a9251895def1f6df4edf6')

# display some info about the dataframe
display(df.head())
df.info()

Unnamed: 0,norad_id,name,object_type,satellite_constellation,altitude_km,altitude_category,orbital_band,congestion_risk,inclination,eccentricity,launch_year_estimate,days_in_orbit_estimate,orbit_lifetime_category,mean_motion,epoch,data_source,snapshot_date,country,last_seen
0,900,CALSPHERE 1,PAYLOAD,Other,976.915793,Low LEO,LEO-Polar,LOW,90.2213,0.002653,2023,0,<1yr,13.763347,2025-11-26 06:44:57.351840,celestrak,2025-11-26,US,2025-11-26
1,902,CALSPHERE 2,PAYLOAD,Other,1061.677512,Mid LEO,LEO-Polar,LOW,90.2361,0.002055,2023,0,<1yr,13.52881,2025-11-26 12:27:23.935392,celestrak,2025-11-26,US,2025-11-26
2,1361,LCS 1,PAYLOAD,Other,2787.875122,High LEO,MEO,LOW,32.144,0.001339,2023,0,<1yr,9.893094,2025-11-26 07:22:32.521440,celestrak,2025-11-26,US,2025-11-26
3,1512,TEMPSAT 1,PAYLOAD,Other,1133.289158,Mid LEO,LEO-Polar,HIGH,89.9893,0.007141,2023,0,<1yr,13.335803,2025-11-26 04:38:37.453920,celestrak,2025-11-26,US,2025-11-26
4,1520,CALSPHERE 4A,PAYLOAD,Other,1123.33786,Mid LEO,LEO-Polar,HIGH,89.9086,0.006851,2023,0,<1yr,13.362348,2025-11-26 09:00:18.881856,celestrak,2025-11-26,US,2025-11-26


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13507 entries, 0 to 13506
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   norad_id                 13507 non-null  int64  
 1   name                     13507 non-null  object 
 2   object_type              13507 non-null  object 
 3   satellite_constellation  13507 non-null  object 
 4   altitude_km              13507 non-null  float64
 5   altitude_category        13507 non-null  object 
 6   orbital_band             13507 non-null  object 
 7   congestion_risk          13507 non-null  object 
 8   inclination              13507 non-null  float64
 9   eccentricity             13507 non-null  float64
 10  launch_year_estimate     13507 non-null  int64  
 11  days_in_orbit_estimate   13507 non-null  int64  
 12  orbit_lifetime_category  13507 non-null  object 
 13  mean_motion              13507 non-null  float64
 14  epoch                 

In [13]:
# separate features (x) and target (y)
x = df.drop('congestion_risk', axis=1)
y = df['congestion_risk']

# exclude descriptive columns not used for training data
exclude_columns = ['norad_id', 'name', 'epoch', 'data_source', 'snapshot_date', 'last_seen']
categorical_cols = ['object_type', 'satellite_constellation', 'altitude_category', 'orbital_band', 'orbit_lifetime_category', 'country']

# drop excluded columns
x_processed = x.drop(columns=exclude_columns, errors='ignore')

# use ordinal encoding to change categorical data to numerical for training
encoder = OrdinalEncoder()
x_processed[categorical_cols] = encoder.fit_transform(x_processed[categorical_cols])

# fit the encoder on the entire target variable 'y' to ensure all possible labels are learned
y_encoded_full = encoder.fit_transform(y.values.reshape(-1, 1))
num_classes = len(encoder.categories_[0]) # get the number of unique classes

# split data into training and testing sets with the processed data
x_train, x_test, y_train, y_test = train_test_split(x_processed, y, test_size=0.2, random_state=42)

# now transform y_train and y_test using the fitted encoder
y_train_encoded = encoder.transform(y_train.values.reshape(-1, 1)).flatten()
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1)).flatten()

In [20]:
# define a fully connected (dense) neural network model
cnn_model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(x_train.shape[1],)), # input layer with number of features
    tf.keras.layers.Dropout(0.5), # dropout layer to prevent overfitting
    tf.keras.layers.Dense(64, activation='relu'), # hidden layer
    tf.keras.layers.Dropout(0.3), # dropout layer to prevent overfitting
    tf.keras.layers.Dense(num_classes, activation='softmax') # output layer with number of classes
])

# compile the model
cnn_model.compile(optimizer='adam', # popular and efficient algorithm
              loss='sparse_categorical_crossentropy', # sparse categorical cross-entropy is appropriate for multi-class classification problems
              metrics=['accuracy'])

# fit the model
cnn_model.fit(x_train, y_train_encoded, epochs=10, batch_size=32, validation_split=0.2)

# evaluate the model
lossCNN, accuracyCNN = cnn_model.evaluate(x_test, y_test_encoded)
print('CNN Test accuracy:', accuracyCNN)

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.6295 - loss: 243.8904 - val_accuracy: 0.8251 - val_loss: 13.7763
Epoch 2/10
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6935 - loss: 34.2627 - val_accuracy: 0.8251 - val_loss: 1.9782
Epoch 3/10
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7641 - loss: 8.4307 - val_accuracy: 0.8251 - val_loss: 0.8475
Epoch 4/10
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7948 - loss: 3.9513 - val_accuracy: 0.8251 - val_loss: 0.6337
Epoch 5/10
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8200 - loss: 2.1954 - val_accuracy: 0.8251 - val_loss: 0.5514
Epoch 6/10
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8132 - loss: 1.4281 - val_accuracy: 0.8251 - val_loss: 0.5194
Epoch 7/10
[1m271/271[0m [32m━━━

In [19]:
# I picked a fully connection neural network model for my problem as they are suitable for tabular data and effective with multi-class classification problems
# The neural network still performs well, with a 0.817 accuracy, but is significantly less accurate than the Random Forest Classifier used in the traditional machine learning approach
# Due to the nature of the data, the Random Forest Classifier is highly effective as decision trees excel on data that splits cleanly on features in this way, so it is more effective than the neural network approach that is less suited to this specific task