In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import plotly.graph_objects as go
import plotly.io as pio

pio.templates.default = 'plotly_white'
df=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Asteroid.csv')
print(df.head(3))

  exec(code_obj, self.user_global_ns, self.user_ns)


       full_name         a         e     G          i          om           w  \
0        1 Ceres  2.769165  0.076009  0.12  10.594067   80.305532   73.597694   
1       2 Pallas  2.772466  0.230337  0.11  34.836234  173.080063  310.048857   
2         3 Juno  2.669150  0.256942  0.32  12.988919  169.852760  248.138626   

          q        ad     per_y  ...  rot_per       GM     BV     UB  IR  \
0  2.558684  2.979647  4.608202  ...  9.07417  62.6284  0.713  0.426 NaN   
1  2.133865  3.411067  4.616444  ...  7.81320  14.3000  0.635  0.284 NaN   
2  1.983332  3.354967  4.360814  ...  7.21000      NaN  0.824  0.433 NaN   

  spec_B  spec_T  neo  pha     moid  
0      C       G    N    N  1.59478  
1      B       B    N    N  1.23324  
2     Sk       S    N    N  1.03454  

[3 rows x 27 columns]


**VARIABLE NAME	DESCRIPTION**

a	semi-major axis[au]

e	eccentricity

i	inclination wrt x-y ecliptic plane [deg]

om	longitude of the ascending node

w	argument of perihelion

q	perihelion distance [au]

ad	aphelion distance [au]

per_y	orbital period [years]

data_arc	data arc-span [d]

condition_code	orbit condition code

n_obs_use	number of observations used

H	absolute magnitude parameter

diameter	diameter of asteroid [km]

extent	object bi or tri-axial ellipsoid dimensions [km]

albedo	geometric albedo

rot_per	standard gravitational parameter [ 𝑚×𝐺 ]

bv	color index B-V magnitude difference

ub	color index U-B magnitude difference

IR	color index I-R magnitude difference

spec_B	spectral taxonomic type (SMASSII)

spec_T	spectral taxonomic type (Tholen)

neo	near earth object

pha	physically hazardous asteroid

moid	earth minimum orbit intersection distance [au]

In [14]:
df = df.drop(columns=['full_name', 'H', 'albedo', 'G', 'extent', 'rot_per', 'GM', 'BV', 'UB', 'IR', 'spec_B', 'spec_T'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 839736 entries, 0 to 839735
Data columns (total 15 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   a               839734 non-null  float64
 1   e               839736 non-null  float64
 2   i               839736 non-null  float64
 3   om              839736 non-null  float64
 4   w               839736 non-null  float64
 5   q               839736 non-null  float64
 6   ad              839730 non-null  float64
 7   per_y           839735 non-null  float64
 8   data_arc        823947 non-null  float64
 9   condition_code  838743 non-null  object 
 10  n_obs_used      839736 non-null  int64  
 11  diameter        137681 non-null  object 
 12  neo             839730 non-null  object 
 13  pha             822814 non-null  object 
 14  moid            822814 non-null  float64
dtypes: float64(10), int64(1), object(4)
memory usage: 96.1+ MB


In [15]:

df.diameter = pd.to_numeric(df.diameter, errors='coerce')
df = df.dropna(subset=['diameter'])
df.info()
df = df.dropna(subset=['data_arc'])
df.info()
df.condition_code.value_counts()
df.condition_code = pd.to_numeric(df.condition_code, errors='coerce')
df.neo = df.neo.replace({'N':0, 'Y':1})
df.pha = df.pha.replace({'N':0, 'Y':1})

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137680 entries, 0 to 810411
Data columns (total 15 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   a               137680 non-null  float64
 1   e               137680 non-null  float64
 2   i               137680 non-null  float64
 3   om              137680 non-null  float64
 4   w               137680 non-null  float64
 5   q               137680 non-null  float64
 6   ad              137680 non-null  float64
 7   per_y           137680 non-null  float64
 8   data_arc        137540 non-null  float64
 9   condition_code  137680 non-null  object 
 10  n_obs_used      137680 non-null  int64  
 11  diameter        137680 non-null  float64
 12  neo             137680 non-null  object 
 13  pha             137680 non-null  object 
 14  moid            137680 non-null  float64
dtypes: float64(11), int64(1), object(3)
memory usage: 16.8+ MB
<class 'pandas.core.frame.DataFrame'>
Int64

In [16]:
for column in df.columns:
    if df[column].dtype == float:
        df[column] = df[column].astype('float32')

In [17]:
features = df.drop('diameter', axis=1)
target = df.diameter

In [18]:

X_train, X_test, y_train, y_test = train_test_split(features,
                                                   target,
                                                   test_size=0.2,
                                                   random_state=28)

In [19]:
forest = RandomForestRegressor(max_depth=32,
                              n_estimators=50)

In [20]:

for column in df.columns:
    print(column, np.sum(df[column].isna()))

a 0
e 0
i 0
om 0
w 0
q 0
ad 0
per_y 0
data_arc 0
condition_code 0
n_obs_used 0
diameter 0
neo 0
pha 0
moid 0


In [21]:
forest.fit(X_train,
          np.ravel(y_train))

RandomForestRegressor(max_depth=32, n_estimators=50)

In [22]:
y_pred = forest.predict(X_test)

In [25]:
r2_score(y_test,
        y_pred)

0.8005888213795604

In [37]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=y_test,
                        y=y_pred,
                        mode='markers'))

fig.update_layout(title='Actual vs predicted diameters',
                 xaxis=dict(title='Actual diameters'),
                 yaxis=dict(title='Predicted diameters'),width=500,
    height=500)



