# Setup

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization
import warnings
import matplotlib.pyplot as plt # data visualization

warnings.filterwarnings('ignore', category=FutureWarning)

In [5]:
# Loading The Dataset
df_asteroids = pd.read_csv('../Data/dataset.csv', low_memory=False)
df_asteroids.head()

Unnamed: 0,id,spkid,full_name,pdes,name,prefix,neo,pha,H,diameter,...,sigma_i,sigma_om,sigma_w,sigma_ma,sigma_ad,sigma_n,sigma_tp,sigma_per,class,rms
0,a0000001,2000001,1 Ceres,1,Ceres,,N,N,3.4,939.4,...,4.6089e-09,6.1688e-08,6.6248e-08,7.8207e-09,1.1113e-11,1.1965e-12,3.7829e-08,9.4159e-09,MBA,0.43301
1,a0000002,2000002,2 Pallas,2,Pallas,,N,N,4.2,545.0,...,3.4694e-06,6.2724e-06,9.1282e-06,8.8591e-06,4.9613e-09,4.6536e-10,4.0787e-05,3.6807e-06,MBA,0.35936
2,a0000003,2000003,3 Juno,3,Juno,,N,N,5.33,246.596,...,3.2231e-06,1.6646e-05,1.7721e-05,8.1104e-06,4.3639e-09,4.4134e-10,3.5288e-05,3.1072e-06,MBA,0.33848
3,a0000004,2000004,4 Vesta,4,Vesta,,N,N,3.0,525.4,...,2.1706e-07,3.8808e-07,1.7893e-07,1.2068e-06,1.6486e-09,2.6125e-10,4.1037e-06,1.2749e-06,MBA,0.3998
4,a0000005,2000005,5 Astraea,5,Astraea,,N,N,6.9,106.699,...,2.7408e-06,2.8949e-05,2.9842e-05,8.3038e-06,4.729e-09,5.5227e-10,3.4743e-05,3.4905e-06,MBA,0.52191


# **Data Exploration**

Column Descriptions provided with the dataset:

> * SPK-ID: Object primary SPK-ID
> * Object ID: Object internal database ID
> * Object fullname: Object full name/designation
> * pdes: Object primary designation
> * name: Object IAU name
> * NEO: Near-Earth Object (NEO) flag
> * PHA: Potentially Hazardous Asteroid (PHA) flag
> * H: Absolute magnitude parameter
> * Diameter: object diameter (from equivalent sphere) km Unit
> * Albedo: Geometric albedo
> * Diameter_sigma: 1-sigma uncertainty in object diameter km Unit
> * Orbit_id: Orbit solution ID
> * Epoch: Epoch of osculation in modified Julian day form
> * Equinox: Equinox of reference frame
> * e: Eccentricity
> * a: Semi-major axis au Unit
> * q: perihelion distance au Unit
> * i: inclination; angle with respect to x-y ecliptic plane
> * tp: Time of perihelion passage TDB Unit
> * moid_ld: Earth Minimum Orbit Intersection Distance au Unit

In [6]:
df_asteroids.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958524 entries, 0 to 958523
Data columns (total 45 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              958524 non-null  object 
 1   spkid           958524 non-null  int64  
 2   full_name       958524 non-null  object 
 3   pdes            958524 non-null  object 
 4   name            22064 non-null   object 
 5   prefix          18 non-null      object 
 6   neo             958520 non-null  object 
 7   pha             938603 non-null  object 
 8   H               952261 non-null  float64
 9   diameter        136209 non-null  float64
 10  albedo          135103 non-null  float64
 11  diameter_sigma  136081 non-null  float64
 12  orbit_id        958524 non-null  object 
 13  epoch           958524 non-null  float64
 14  epoch_mjd       958524 non-null  int64  
 15  epoch_cal       958524 non-null  float64
 16  equinox         958524 non-null  object 
 17  e         

In [7]:
df_asteroids.describe()

Unnamed: 0,spkid,H,diameter,albedo,diameter_sigma,epoch,epoch_mjd,epoch_cal,e,a,...,sigma_q,sigma_i,sigma_om,sigma_w,sigma_ma,sigma_ad,sigma_n,sigma_tp,sigma_per,rms
count,958524.0,952261.0,136209.0,135103.0,136081.0,958524.0,958524.0,958524.0,958524.0,958524.0,...,938602.0,938602.0,938602.0,938602.0,938602.0,938598.0,938602.0,938602.0,938598.0,958522.0
mean,3810114.0,16.906411,5.506429,0.130627,0.479184,2458869.0,58868.78195,20196930.0,0.156116,2.902143,...,19.82929,1.168449,5.310234,1370062.0,1369977.0,21.31453,0.05060221,431278000.0,85258.15,0.561153
std,6831541.0,1.790405,9.425164,0.110323,0.782895,701.6716,701.671573,19303.54,0.092643,39.719503,...,2903.785,128.2231,1333.381,915899600.0,915899100.0,7197.034,9.814953,295304600000.0,27676810.0,2.7457
min,2000001.0,-1.1,0.0025,0.001,0.0005,2425052.0,25051.0,19270620.0,0.0,-14702.447872,...,1.9569e-11,4.6089e-09,6.1688e-08,6.6248e-08,7.8207e-09,1.1113e-11,1.1965e-12,3.7829e-08,9.4159e-09,0.0
25%,2239632.0,16.1,2.78,0.053,0.18,2459000.0,59000.0,20200530.0,0.092193,2.387835,...,1.462e-07,6.0959e-06,3.6194e-05,5.755e-05,2.5737e-05,2.3409e-08,2.7688e-09,0.00011109,1.7945e-05,0.51804
50%,2479262.0,16.9,3.972,0.079,0.332,2459000.0,59000.0,20200530.0,0.145002,2.646969,...,2.2719e-07,8.6888e-06,6.64255e-05,0.00010471,4.9001e-05,4.359e-08,4.638e-09,0.00022308,3.5017e-05,0.56628
75%,3752518.0,17.714,5.765,0.19,0.62,2459000.0,59000.0,20200530.0,0.20065,3.001932,...,6.5832e-07,1.5915e-05,0.0001609775,0.00031144,0.00017189,1.1966e-07,1.124e-08,0.00081396,9.775475e-05,0.613927
max,54017230.0,33.2,939.4,1.0,140.0,2459000.0,59000.0,20200530.0,1.855356,33488.895955,...,1015000.0,55330.0,1199100.0,884510000000.0,884510000000.0,5509700.0,7698.8,285310000000000.0,19107000000.0,2686.6


In [8]:
df_asteroids.shape

(958524, 45)

In [9]:
df_asteroids.head()

Unnamed: 0,id,spkid,full_name,pdes,name,prefix,neo,pha,H,diameter,...,sigma_i,sigma_om,sigma_w,sigma_ma,sigma_ad,sigma_n,sigma_tp,sigma_per,class,rms
0,a0000001,2000001,1 Ceres,1,Ceres,,N,N,3.4,939.4,...,4.6089e-09,6.1688e-08,6.6248e-08,7.8207e-09,1.1113e-11,1.1965e-12,3.7829e-08,9.4159e-09,MBA,0.43301
1,a0000002,2000002,2 Pallas,2,Pallas,,N,N,4.2,545.0,...,3.4694e-06,6.2724e-06,9.1282e-06,8.8591e-06,4.9613e-09,4.6536e-10,4.0787e-05,3.6807e-06,MBA,0.35936
2,a0000003,2000003,3 Juno,3,Juno,,N,N,5.33,246.596,...,3.2231e-06,1.6646e-05,1.7721e-05,8.1104e-06,4.3639e-09,4.4134e-10,3.5288e-05,3.1072e-06,MBA,0.33848
3,a0000004,2000004,4 Vesta,4,Vesta,,N,N,3.0,525.4,...,2.1706e-07,3.8808e-07,1.7893e-07,1.2068e-06,1.6486e-09,2.6125e-10,4.1037e-06,1.2749e-06,MBA,0.3998
4,a0000005,2000005,5 Astraea,5,Astraea,,N,N,6.9,106.699,...,2.7408e-06,2.8949e-05,2.9842e-05,8.3038e-06,4.729e-09,5.5227e-10,3.4743e-05,3.4905e-06,MBA,0.52191


In [13]:
df_asteroids['prefix'].unique()

array([nan, 'A'], dtype=object)

In [15]:
df_asteroids_without_ids = df_asteroids.drop(['id','spkid','full_name','pdes','name','prefix','equinox'],axis=1)

In [16]:
df_asteroids_without_ids.head()

Unnamed: 0,neo,pha,H,diameter,albedo,diameter_sigma,orbit_id,epoch,epoch_mjd,epoch_cal,...,sigma_i,sigma_om,sigma_w,sigma_ma,sigma_ad,sigma_n,sigma_tp,sigma_per,class,rms
0,N,N,3.4,939.4,0.09,0.2,JPL 47,2458600.5,58600,20190427.0,...,4.6089e-09,6.1688e-08,6.6248e-08,7.8207e-09,1.1113e-11,1.1965e-12,3.7829e-08,9.4159e-09,MBA,0.43301
1,N,N,4.2,545.0,0.101,18.0,JPL 37,2459000.5,59000,20200531.0,...,3.4694e-06,6.2724e-06,9.1282e-06,8.8591e-06,4.9613e-09,4.6536e-10,4.0787e-05,3.6807e-06,MBA,0.35936
2,N,N,5.33,246.596,0.214,10.594,JPL 112,2459000.5,59000,20200531.0,...,3.2231e-06,1.6646e-05,1.7721e-05,8.1104e-06,4.3639e-09,4.4134e-10,3.5288e-05,3.1072e-06,MBA,0.33848
3,N,N,3.0,525.4,0.4228,0.2,JPL 35,2458600.5,58600,20190427.0,...,2.1706e-07,3.8808e-07,1.7893e-07,1.2068e-06,1.6486e-09,2.6125e-10,4.1037e-06,1.2749e-06,MBA,0.3998
4,N,N,6.9,106.699,0.274,3.14,JPL 114,2459000.5,59000,20200531.0,...,2.7408e-06,2.8949e-05,2.9842e-05,8.3038e-06,4.729e-09,5.5227e-10,3.4743e-05,3.4905e-06,MBA,0.52191


In [17]:
df_asteroids_without_ids.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958524 entries, 0 to 958523
Data columns (total 38 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   neo             958520 non-null  object 
 1   pha             938603 non-null  object 
 2   H               952261 non-null  float64
 3   diameter        136209 non-null  float64
 4   albedo          135103 non-null  float64
 5   diameter_sigma  136081 non-null  float64
 6   orbit_id        958524 non-null  object 
 7   epoch           958524 non-null  float64
 8   epoch_mjd       958524 non-null  int64  
 9   epoch_cal       958524 non-null  float64
 10  e               958524 non-null  float64
 11  a               958524 non-null  float64
 12  q               958524 non-null  float64
 13  i               958524 non-null  float64
 14  om              958524 non-null  float64
 15  w               958524 non-null  float64
 16  ma              958523 non-null  float64
 17  ad        

In [18]:
df_asteroids_dropna = df_asteroids_without_ids.dropna()

In [22]:
df_asteroids_dropna.isna().sum()

neo               0
pha               0
H                 0
diameter          0
albedo            0
diameter_sigma    0
orbit_id          0
epoch             0
epoch_mjd         0
epoch_cal         0
e                 0
a                 0
q                 0
i                 0
om                0
w                 0
ma                0
ad                0
n                 0
tp                0
tp_cal            0
per               0
per_y             0
moid              0
moid_ld           0
sigma_e           0
sigma_a           0
sigma_q           0
sigma_i           0
sigma_om          0
sigma_w           0
sigma_ma          0
sigma_ad          0
sigma_n           0
sigma_tp          0
sigma_per         0
class             0
rms               0
dtype: int64

In [23]:
df_asteroids_dropna.loc[:, ['neo','pha','H','class']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 131142 entries, 0 to 909489
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   neo     131142 non-null  object 
 1   pha     131142 non-null  object 
 2   H       131142 non-null  float64
 3   class   131142 non-null  object 
dtypes: float64(1), object(3)
memory usage: 5.0+ MB


In [24]:
#The cols 'neo' and 'pha' are boolean but use Y/N characters.
#Lets change those to binary integers 1/0
df_asteroids_bool = df_asteroids_dropna

#replace Y with 1:True and N with 0:False in the 'neo' col
df_asteroids_bool.loc[:,'neo'] = df_asteroids_dropna.loc[:,'neo'].replace({'Y': 1,'N': 0})

#replace Y with 1:True and N with 0:False in the 'pha' col
df_asteroids_bool.loc[:,'pha'] = df_asteroids_dropna.loc[:,'pha'].replace({'Y': 1,'N': 0})

#verify
df_asteroids_bool.loc[:,['neo','pha']].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_asteroids_bool.loc[:,'neo'] = df_asteroids_dropna.loc[:,'neo'].replace({'Y': 1,'N': 0})
  df_asteroids_bool.loc[:,'neo'] = df_asteroids_dropna.loc[:,'neo'].replace({'Y': 1,'N': 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_asteroids_bool.loc[:,'pha'] = df_asteroids_dropna.loc[:,'pha'].replace({'Y': 1,'N': 0})
  df_asteroids_bool.loc[:,'pha'] = df_asteroids_dropna.loc[:,'pha'].replace({'Y': 1,'N': 0})


neo  pha
0    0      130403
1    0         558
     1         181
dtype: int64