In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt   
import seaborn as sns

In [4]:
df = pd.read_parquet("Data to use/full_esa_data.parquet")

print(df.shape)
print(df.columns)

(81941, 21)
Index(['cosparId', 'vimpelId', 'satno', 'name', 'objectClass', 'mass', 'shape',
       'width', 'height', 'depth', 'diameter', 'span', 'xSectMax', 'xSectMin',
       'xSectAvg', 'firstEpoch', 'mission', 'predDecayDate', 'active',
       'cataloguedFragments', 'onOrbitCataloguedFragments'],
      dtype='object')


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81941 entries, 0 to 81940
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   cosparId                    67666 non-null  object 
 1   vimpelId                    14933 non-null  float64
 2   satno                       63052 non-null  float64
 3   name                        67390 non-null  object 
 4   objectClass                 81941 non-null  object 
 5   mass                        34050 non-null  float64
 6   shape                       33235 non-null  object 
 7   width                       18430 non-null  float64
 8   height                      32680 non-null  float64
 9   depth                       18403 non-null  float64
 10  diameter                    14809 non-null  float64
 11  span                        32380 non-null  float64
 12  xSectMax                    32980 non-null  float64
 13  xSectMin                    329

In [8]:
df.describe()

Unnamed: 0,vimpelId,satno,mass,width,height,depth,diameter,span,xSectMax,xSectMin,xSectAvg,cataloguedFragments,onOrbitCataloguedFragments
count,14933.0,63052.0,34050.0,18430.0,32680.0,18403.0,14809.0,32380.0,32980.0,32980.0,32920.0,81941.0,81941.0
mean,95640.43,31543.718455,1807.384763,2.231862,3.278075,1.888098,2.30913,9.730365,28.481166,4.637487,17.281073,0.354926,0.175224
std,45331.95,18220.71731,7269.503234,2.070336,5.173099,2.977385,1.43835,152.02679,101.742724,37.008288,77.054413,17.771147,11.462052
min,9600.0,1.0,0.0015,0.0,0.001,0.012,0.001,0.02,0.00013,0.0,0.000102,0.0,0.0
25%,69211.0,15766.75,130.82,0.4,0.3,0.5,1.4,1.5,1.375159,0.2311,1.1,0.0,0.0
50%,79106.0,31534.5,500.0,2.2,1.2,1.5,2.4,6.5,16.219661,0.81,11.087467,0.0,0.0
75%,141218.0,47302.25,1600.0,3.7,5.5,2.7,3.0,8.99,25.626695,4.523893,17.070629,0.0,0.0
max,1346400.0,63155.0,450000.0,72.8,60.0,108.5,41.14,19200.0,8319.848402,1521.0,5762.4,3536.0,2562.0


**Lot of null values so need to decide what to do with them.  We'll start be removing columns that we won't be using**

In [34]:
columns_to_drop = ['vimpelId', 'satno', 'predDecayDate', 'active', 'mission']

In [None]:
df.drop(columns = columns_to_drop, inplace = True)

df.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns = 'mission', inplace = True)


cosparId                        458
name                            734
objectClass                       0
mass                          34074
shape                         34889
width                         49694
height                        35444
depth                         49721
diameter                      53315
span                          35744
xSectMax                      35144
xSectMin                      35144
xSectAvg                      35204
firstEpoch                     4073
cataloguedFragments               0
onOrbitCataloguedFragments        0
dtype: int64

**If all rows are NA values or 0, drop them and reassess**

In [31]:
# Lot of rows are all NA or 0, but have positive values for first Epoch, so removing those

exclude_col = ['firstEpoch', 'objectClass']

# Drop rows where all values (except the exclude_col) are NA or 0
df = df[~((df.drop(columns=exclude_col).isna()) | (df.drop(columns=exclude_col) == 0)).all(axis=1)]

df.isna().sum()

cosparId                        458
name                            734
objectClass                       0
mass                          34074
shape                         34889
width                         49694
height                        35444
depth                         49721
diameter                      53315
span                          35744
xSectMax                      35144
xSectMin                      35144
xSectAvg                      35204
firstEpoch                     4073
mission                       45409
cataloguedFragments               0
onOrbitCataloguedFragments        0
dtype: int64

**Name and object class are still necessary, so if both are null then remove, otherwise keep and deal with the other missing data as needed**

In [39]:
df_cleaned = df[~df[['cosparId', 'name']].isna().all(axis = 1)]

df_cleaned.isna().sum()

cosparId                        458
name                            734
objectClass                       0
mass                          34074
shape                         34889
width                         49694
height                        35444
depth                         49721
diameter                      53315
span                          35744
xSectMax                      35144
xSectMin                      35144
xSectAvg                      35204
firstEpoch                     4073
cataloguedFragments               0
onOrbitCataloguedFragments        0
dtype: int64

In [42]:
df_cleaned.to_csv("Cleaned_Data/cleaned_esa.csv", index = False)