In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import zscore

In [11]:
# Location of the raw ESA parquet extract. Set the ESA_DATA_PATH environment
# variable to point at your own copy of the file; when it is unset we fall back
# to the original author's local path so existing runs behave exactly as before.
ESA_DATA_PATH = os.environ.get(
    "ESA_DATA_PATH",
    "c:\\Users\\tkbar\\OneDrive\\Desktop\\ADS508 Final Project\\ADS508-GroupProject\\Data to use\\full_esa_data.parquet",
)

df = pd.read_parquet(ESA_DATA_PATH)

# Quick sanity check of what was loaded: (rows, columns) and the column names.
print(df.shape)
print(df.columns)

(81941, 21)
Index(['cosparId', 'vimpelId', 'satno', 'name', 'objectClass', 'mass', 'shape',
       'width', 'height', 'depth', 'diameter', 'span', 'xSectMax', 'xSectMin',
       'xSectAvg', 'firstEpoch', 'mission', 'predDecayDate', 'active',
       'cataloguedFragments', 'onOrbitCataloguedFragments'],
      dtype='object')


In [7]:
# Show each column's dtype and non-null count to see where data is missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81941 entries, 0 to 81940
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   cosparId                    67666 non-null  object 
 1   vimpelId                    14933 non-null  float64
 2   satno                       63052 non-null  float64
 3   name                        67390 non-null  object 
 4   objectClass                 81941 non-null  object 
 5   mass                        34050 non-null  float64
 6   shape                       33235 non-null  object 
 7   width                       18430 non-null  float64
 8   height                      32680 non-null  float64
 9   depth                       18403 non-null  float64
 10  diameter                    14809 non-null  float64
 11  span                        32380 non-null  float64
 12  xSectMax                    32980 non-null  float64
 13  xSectMin                    329

In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,vimpelId,satno,mass,width,height,depth,diameter,span,xSectMax,xSectMin,xSectAvg,cataloguedFragments,onOrbitCataloguedFragments
count,14933.0,63052.0,34050.0,18430.0,32680.0,18403.0,14809.0,32380.0,32980.0,32980.0,32920.0,81941.0,81941.0
mean,95640.43,31543.718455,1807.384763,2.231862,3.278075,1.888098,2.30913,9.730365,28.481166,4.637487,17.281073,0.354926,0.175224
std,45331.95,18220.71731,7269.503234,2.070336,5.173099,2.977385,1.43835,152.02679,101.742724,37.008288,77.054413,17.771147,11.462052
min,9600.0,1.0,0.0015,0.0,0.001,0.012,0.001,0.02,0.00013,0.0,0.000102,0.0,0.0
25%,69211.0,15766.75,130.82,0.4,0.3,0.5,1.4,1.5,1.375159,0.2311,1.1,0.0,0.0
50%,79106.0,31534.5,500.0,2.2,1.2,1.5,2.4,6.5,16.219661,0.81,11.087467,0.0,0.0
75%,141218.0,47302.25,1600.0,3.7,5.5,2.7,3.0,8.99,25.626695,4.523893,17.070629,0.0,0.0
max,1346400.0,63155.0,450000.0,72.8,60.0,108.5,41.14,19200.0,8319.848402,1521.0,5762.4,3536.0,2562.0


**There are a lot of null values, so we need to decide what to do with them. We'll start by removing the columns that we won't be using.**

In [9]:
# Columns that are not used in the rest of this notebook.
columns_to_drop = [
    'vimpelId',
    'satno',
    'predDecayDate',
    'active',
    'mission',
    'shape',
    'mass',
    'width',
    'height',
    'depth',
    'xSectMax',
    'xSectMin',
]

In [10]:
# Drop the unused columns by reassignment instead of inplace=True so this cell
# can be re-run safely: errors="ignore" skips columns that a previous run has
# already removed instead of raising KeyError.
df = df.drop(columns=columns_to_drop, errors="ignore")

# Missing-value count per remaining column.
df.isna().sum()

cosparId                      14275
name                          14551
objectClass                       0
diameter                      67132
span                          49561
xSectAvg                      49021
firstEpoch                     4104
cataloguedFragments               0
onOrbitCataloguedFragments        0
dtype: int64

In [None]:
# Convert the firstEpoch column to pandas datetime values.

df['firstEpoch'] = df['firstEpoch'].pipe(pd.to_datetime)

**If every value in a row is NA or 0 (ignoring `firstEpoch` and `objectClass`), drop the row and reassess**

In [11]:
# Many rows carry nothing but NA/0 values apart from firstEpoch (and the
# always-populated objectClass), so those rows are removed here.

exclude_col = ['firstEpoch', 'objectClass']

# Look only at the columns that are not excluded, and flag rows in which every
# one of those values is either missing or exactly 0.
informative_values = df.drop(columns=exclude_col)
row_is_empty = (informative_values.isna() | (informative_values == 0)).all(axis=1)

# Keep the rows that have at least one real value.
df = df[~row_is_empty]

df.isna().sum()

cosparId                        458
name                            734
objectClass                       0
diameter                      53315
span                          35744
xSectAvg                      35204
firstEpoch                     4073
cataloguedFragments               0
onOrbitCataloguedFragments        0
dtype: int64

**An identifier (`cosparId` or `name`) is still necessary, so rows where both are null are removed; otherwise keep the row and deal with the other missing data as needed**

In [12]:
# Flag rows that have neither a cosparId nor a name, and keep everything else.
lacks_identifier = df[['cosparId', 'name']].isna().all(axis=1)
df_cleaned = df[~lacks_identifier]

df_cleaned.isna().sum()

cosparId                        458
name                            734
objectClass                       0
diameter                      53315
span                          35744
xSectAvg                      35204
firstEpoch                     4073
cataloguedFragments               0
onOrbitCataloguedFragments        0
dtype: int64

In [13]:
# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists before writing (this is a no-op when it is already there).
os.makedirs("Cleaned_Data", exist_ok=True)
df_cleaned.to_csv("Cleaned_Data/cleaned_esa.csv", index = False)

# Use the data at this point to plot growth of objects with full objectClass differentiation before collapsing them into 4 classes

In [14]:
# Number of rows per objectClass value in the cleaned data.
df_cleaned['objectClass'].value_counts()

objectClass
Payload                           22635
Payload Fragmentation Debris      15867
Rocket Fragmentation Debris       11853
Rocket Body                        8162
Payload Mission Related Object     4145
Rocket Mission Related Object      4083
Unknown                             719
Payload Debris                      341
Rocket Debris                       222
Other Debris                         52
Other Mission Related Object         45
Name: count, dtype: int64

In [15]:
# Work on a copy so df_cleaned keeps the full objectClass labels.
df_leo_predict = df_cleaned.copy()

# Any class whose label mentions "Debris" or "Mission" is relabelled as plain
# 'Debris'; every other label is left as it is.
debris_like = df_leo_predict['objectClass'].str.contains('Debris|Mission')
df_leo_predict.loc[debris_like, 'objectClass'] = 'Debris'

In [41]:
# Columns that get z-scored, and the subset of them that is median-imputed first.
standard_cols = ['diameter', 'span', 'xSectAvg', 'cataloguedFragments', 'onOrbitCataloguedFragments']
size_cols = ['diameter', 'span', 'xSectAvg']

df_standardized = df_leo_predict.copy()

# Replace missing size measurements with the median of their own column.
size_medians = df_standardized[size_cols].median()
df_standardized[size_cols] = df_standardized[size_cols].fillna(size_medians)

# Standardize every feature column with scipy's zscore, one column at a time.
df_standardized[standard_cols] = df_standardized[standard_cols].apply(zscore)

In [43]:
# Preview the first 10 rows of the imputed and z-scored frame.
df_standardized.head(10)

Unnamed: 0,cosparId,name,objectClass,diameter,span,xSectAvg,firstEpoch,cataloguedFragments,onOrbitCataloguedFragments
0,1964-063C,Calsphere 1,Payload,-3.007907,-0.073223,-0.260538,1964-10-06,-0.021905,-0.016767
1,1964-063E,Calsphere 2,Payload,0.029411,-0.073223,-0.260538,1964-10-06,-0.021905,-0.016767
2,1965-034C,LCS 1,Payload,0.029411,-0.065878,-0.243743,1965-05-06,-0.021905,-0.016767
3,1965-065E,Tempsat 1,Payload,0.029411,-0.073223,-0.260538,1965-08-13,-0.021905,-0.016767
4,1965-065H,Calsphere 4,Payload,0.029411,-0.072842,-0.260092,1965-08-13,-0.021905,-0.016767
5,1967-053A,OPS 5712 payload 160 (Calsphere IV),Payload,0.029411,-0.071792,-0.258627,1967-05-31,-0.021905,-0.016767
6,1967-066E,LES 5,Payload,-1.757247,-0.06044,-0.222033,1967-07-01,-0.021905,-0.016767
7,1967-053F,Surcal 159 (Calsphere III),Payload,0.029411,-0.072842,-0.260092,1967-05-31,-0.021905,-0.016767
8,1967-053H,OPS 5712 payload 153 (NRL PL153),Payload,0.029411,0.095062,-0.23636,1967-05-31,-0.021905,-0.016767
9,1967-053J,Surcal 150B (NRL PL150B),Payload,0.029411,-0.07475,-0.261317,1967-05-31,-0.021905,-0.016767
