# Practical: Predicting the Diameter of an Asteroid Using NN
### Waleed Alsanie

## 1. Loading and Pre-processing the Data

We want to some preprocessing of the data using pandas.

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("Asteroid_updated.csv")
df

  df = pd.read_csv("Asteroid_updated.csv")


Unnamed: 0,name,a,e,i,om,w,q,ad,per_y,data_arc,...,UB,IR,spec_B,spec_T,G,moid,class,n,per,ma
0,Ceres,2.769165,0.076009,10.594067,80.305532,73.597694,2.558684,2.979647,4.608202,8822.0,...,0.426,,C,G,0.12,1.594780,MBA,0.213885,1683.145708,77.372096
1,Pallas,2.772466,0.230337,34.836234,173.080063,310.048857,2.133865,3.411067,4.616444,72318.0,...,0.284,,B,B,0.11,1.233240,MBA,0.213503,1686.155999,59.699133
2,Juno,2.669150,0.256942,12.988919,169.852760,248.138626,1.983332,3.354967,4.360814,72684.0,...,0.433,,Sk,S,0.32,1.034540,MBA,0.226019,1592.787285,34.925016
3,Vesta,2.361418,0.088721,7.141771,103.810804,150.728541,2.151909,2.570926,3.628837,24288.0,...,0.492,,V,V,0.32,1.139480,MBA,0.271609,1325.432765,95.861936
4,Astraea,2.574249,0.191095,5.366988,141.576605,358.687607,2.082324,3.066174,4.130323,63507.0,...,0.411,,S,S,,1.095890,MBA,0.238632,1508.600458,282.366289
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839709,,2.812945,0.664688,4.695700,183.310012,234.618352,0.943214,4.682676,4.717914,17298.0,...,,,,,,0.032397,APO,0.208911,1723.217927,156.905910
839710,,2.645238,0.259376,12.574937,1.620020,339.568072,1.959126,3.331350,4.302346,16.0,...,,,,,,0.956145,MBA,0.229090,1571.431965,13.366251
839711,,2.373137,0.202053,0.732484,176.499082,198.026527,1.893638,2.852636,3.655884,5.0,...,,,,,,0.893896,MBA,0.269600,1335.311579,355.351127
839712,,2.260404,0.258348,9.661947,204.512448,148.496988,1.676433,2.844376,3.398501,10.0,...,,,,,,0.680220,MBA,0.290018,1241.302609,15.320134


In [3]:
df.columns

Index(['name', 'a', 'e', 'i', 'om', 'w', 'q', 'ad', 'per_y', 'data_arc',
       'condition_code', 'n_obs_used', 'H', 'neo', 'pha', 'diameter', 'extent',
       'albedo', 'rot_per', 'GM', 'BV', 'UB', 'IR', 'spec_B', 'spec_T', 'G',
       'moid', 'class', 'n', 'per', 'ma'],
      dtype='object')

In [4]:
df.shape

(839714, 31)

The diameter column includes some empty (NaN) values. We need to delete these rows.

In [5]:
df = df.dropna(subset=['name', 'diameter'])
df.shape

(15124, 31)

Drop columns having empty values (NaN)

In [6]:
df = df.dropna(axis=1)
df.shape

(15124, 21)

We will delete the 'neo' (Near Earth Object), 'pha' (Potential Hazardous Asteroid) and 'class' columns as they contain non-numerical values.

In [7]:
df = df.drop(['neo', 'pha', 'class'], axis=1)
df.shape

(15124, 18)

## 2. Feature Selection

Set the Asteriod name as the index of the rows. Convert the values to float (in case there are some numbers representated as strings). 

In [8]:
df = df.set_index('name')
df = df.astype(float)

Draw a head map of the correlation matrix to select some features. 

In [9]:
import matplotlib.pyplot as plt

In [10]:
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm').format(precision=2)

Unnamed: 0,a,e,i,om,w,q,ad,per_y,data_arc,condition_code,n_obs_used,H,diameter,moid,n,per,ma
a,1.0,-0.05,0.11,-0.01,0.02,0.91,0.96,0.93,0.03,0.62,-0.04,-0.34,0.4,0.91,-0.62,0.93,0.02
e,-0.05,1.0,0.15,0.01,0.0,-0.38,0.18,0.0,0.07,0.09,-0.04,0.06,-0.01,-0.36,0.18,0.0,-0.02
i,0.11,0.15,1.0,0.0,0.0,0.07,0.13,0.07,0.06,0.03,-0.05,-0.25,0.18,0.1,-0.06,0.07,0.01
om,-0.01,0.01,0.0,1.0,-0.13,-0.02,-0.01,-0.0,0.03,0.0,-0.03,-0.0,0.0,-0.02,0.03,-0.0,-0.0
w,0.02,0.0,0.0,-0.13,1.0,0.01,0.02,0.01,-0.03,0.0,0.02,-0.01,0.01,0.01,-0.02,0.01,0.01
q,0.91,-0.38,0.07,-0.02,0.01,1.0,0.76,0.81,0.01,0.45,-0.03,-0.35,0.4,1.0,-0.66,0.81,0.04
ad,0.96,0.18,0.13,-0.01,0.02,0.76,1.0,0.92,0.04,0.67,-0.05,-0.3,0.36,0.77,-0.53,0.92,0.01
per_y,0.93,0.0,0.07,-0.0,0.01,0.81,0.92,1.0,0.01,0.73,-0.04,-0.22,0.37,0.81,-0.36,1.0,0.0
data_arc,0.03,0.07,0.06,0.03,-0.03,0.01,0.04,0.01,1.0,-0.02,0.71,-0.77,0.54,0.01,-0.06,0.01,0.01
condition_code,0.62,0.09,0.03,0.0,0.0,0.45,0.67,0.73,-0.02,1.0,-0.05,-0.06,0.18,0.46,-0.06,0.73,-0.02


We will select a, q, ad, per_y, data_arc, n_obs_used, moid and per as the features to train our model.

## 3. Training

We will use **Pytorch** package to train our ANN. ANN package normally have a data structure called **Tensors**. These can be thought of as multidimensional arrays with performance optimisation. They support mathemtical operations, like gradient computation, that are necessary for training ANN. 
<br>
<br>
We will first install **Pytorch**.

In [11]:
!pip install torch



We will import the ANN module and the optimisation module of **Pytorch** as well as **Pytorch** itself. We will also import numpy to store the Dataframe data into a numpy array and then move it to a tensor.

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

Select the highest 8 correlated features and the diameter as the target value we want to predict.

In [14]:
features_np = df[['a', 'q', 'ad', 'per_y', 'data_arc', 'n_obs_used', 'moid', 'per']].values
target_np = df[['diameter']].values
features_np, target_np

(array([[2.76916515e+00, 2.55868360e+00, 2.97964671e+00, ...,
         1.00200000e+03, 1.59478000e+00, 1.68314571e+03],
        [2.77246592e+00, 2.13386493e+00, 3.41106691e+00, ...,
         8.49000000e+03, 1.23324000e+00, 1.68615600e+03],
        [2.66914952e+00, 1.98333205e+00, 3.35496699e+00, ...,
         7.10400000e+03, 1.03454000e+00, 1.59278729e+03],
        ...,
        [2.52500672e+00, 2.35410186e+00, 2.69591158e+00, ...,
         9.50000000e+01, 1.33716000e+00, 1.46552163e+03],
        [3.10269235e+00, 2.34337182e+00, 3.86201289e+00, ...,
         8.20000000e+01, 1.36204000e+00, 1.99621125e+03],
        [3.10748415e+00, 2.57470600e+00, 3.64026230e+00, ...,
         1.05000000e+02, 1.72469000e+00, 2.00083746e+03]]),
 array([[939.4  ],
        [545.   ],
        [246.596],
        ...,
        [  2.155],
        [  3.609],
        [  3.655]]))

Split the data into 70 percent training and 30 percent testing.

In [16]:
train_portion = int(features_np.shape[0] * 0.70)
train_features_np = features_np[0:train_portion, :]
test_features_np = features_np[train_portion:, :]
train_target_np = target_np[0: train_portion, :]
test_target_np = target_np[train_portion:, :]
#
train_features_np.shape, test_features_np.shape, train_target_np.shape, test_target_np.shape

((10586, 8), (4538, 8), (10586, 1), (4538, 1))

Create tensors from Numpy arrays.

In [17]:
train_features_tn = torch.tensor(train_features_np)
test_features_tn = torch.tensor(test_features_np)
train_target_tn = torch.tensor(train_target_np)
test_target_tn = torch.tensor(test_target_np)
#
train_features_tn.shape, test_features_tn.shape, train_target_tn.shape, test_target_tn.shape

(torch.Size([10586, 8]),
 torch.Size([4538, 8]),
 torch.Size([10586, 1]),
 torch.Size([4538, 1]))

Define the ANN artichecture as follows:
<br>
>1. 8 inputs.
>2. 16 neurons in the hidden layer with Leaky ReLU activation functions.
>3. 1 linear output.

In [18]:
nn_arch = nn.Sequential(nn.Linear(8, 16, dtype=torch.float64), nn.LeakyReLU(), nn.Linear(16, 1, dtype=torch.float64))
nn_arch

Sequential(
  (0): Linear(in_features=8, out_features=16, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=16, out_features=1, bias=True)
)

Train the network

In [21]:
n_epochs = 10000
opt = optim.Adam(nn_arch.parameters(), lr=1e-3)
best_loss = float('inf')

for epoch in range(1, n_epochs + 1):
    # feed the training features to the network.
    nn_output = nn_arch(train_features_tn)
    # Compute the loss with respect to the ground truth.
    loss = nn.MSELoss()(nn_output, train_target_tn)

    # Reset the gradient
    opt.zero_grad()
    # Backpropagate and update the weights
    loss.backward()
    opt.step()


    if loss.item() < best_loss:
        torch.save({
        'epoch': epoch,
        'model_state_dict': nn_arch.state_dict(),
        'optimizer_state_dict': opt.state_dict(),
        'loss': loss.item(),
        }, 'best_checkpoint.pt')
        best_loss = loss.item()

    if epoch == 1 or epoch % 10 == 0:
        print(f"Epoch {epoch}, Training loss {loss.item():.4f}")

Epoch 1, Training loss 502.6311
Epoch 10, Training loss 1701.5556
Epoch 20, Training loss 533.8069
Epoch 30, Training loss 563.2246
Epoch 40, Training loss 560.8805
Epoch 50, Training loss 526.4258
Epoch 60, Training loss 511.3374
Epoch 70, Training loss 505.5851
Epoch 80, Training loss 503.1863
Epoch 90, Training loss 502.5813
Epoch 100, Training loss 502.6621
Epoch 110, Training loss 502.5913
Epoch 120, Training loss 502.5376
Epoch 130, Training loss 502.5263
Epoch 140, Training loss 502.5038
Epoch 150, Training loss 502.4868
Epoch 160, Training loss 502.4685
Epoch 170, Training loss 502.4499
Epoch 180, Training loss 502.4312
Epoch 190, Training loss 502.4118
Epoch 200, Training loss 502.3913
Epoch 210, Training loss 502.3709
Epoch 220, Training loss 502.3498
Epoch 230, Training loss 502.3280
Epoch 240, Training loss 502.3062
Epoch 250, Training loss 502.2830
Epoch 260, Training loss 502.2595
Epoch 270, Training loss 502.2353
Epoch 280, Training loss 502.2112
Epoch 290, Training loss

Epoch 2380, Training loss 489.8721
Epoch 2390, Training loss 489.7975
Epoch 2400, Training loss 489.7227
Epoch 2410, Training loss 489.6480
Epoch 2420, Training loss 489.5737
Epoch 2430, Training loss 489.5012
Epoch 2440, Training loss 489.4792
Epoch 2450, Training loss 491.8908
Epoch 2460, Training loss 558.4195
Epoch 2470, Training loss 496.1207
Epoch 2480, Training loss 495.0084
Epoch 2490, Training loss 489.4211
Epoch 2500, Training loss 489.8700
Epoch 2510, Training loss 489.0400
Epoch 2520, Training loss 488.8915
Epoch 2530, Training loss 488.8337
Epoch 2540, Training loss 488.7287
Epoch 2550, Training loss 488.6444
Epoch 2560, Training loss 488.5691
Epoch 2570, Training loss 488.4951
Epoch 2580, Training loss 488.4208
Epoch 2590, Training loss 488.3476
Epoch 2600, Training loss 488.2737
Epoch 2610, Training loss 488.1999
Epoch 2620, Training loss 488.1260
Epoch 2630, Training loss 488.0578
Epoch 2640, Training loss 488.7390
Epoch 2650, Training loss 588.2588
Epoch 2660, Training

Epoch 4750, Training loss 474.8914
Epoch 4760, Training loss 474.8710
Epoch 4770, Training loss 474.7153
Epoch 4780, Training loss 474.6739
Epoch 4790, Training loss 474.6074
Epoch 4800, Training loss 474.5499
Epoch 4810, Training loss 474.4950
Epoch 4820, Training loss 474.4402
Epoch 4830, Training loss 474.3837
Epoch 4840, Training loss 474.3288
Epoch 4850, Training loss 474.2769
Epoch 4860, Training loss 474.2232
Epoch 4870, Training loss 474.1610
Epoch 4880, Training loss 474.1035
Epoch 4890, Training loss 474.0472
Epoch 4900, Training loss 473.9988
Epoch 4910, Training loss 474.2832
Epoch 4920, Training loss 507.2540
Epoch 4930, Training loss 499.9514
Epoch 4940, Training loss 475.3518
Epoch 4950, Training loss 478.7574
Epoch 4960, Training loss 475.0955
Epoch 4970, Training loss 473.6420
Epoch 4980, Training loss 473.8371
Epoch 4990, Training loss 473.5355
Epoch 5000, Training loss 473.5043
Epoch 5010, Training loss 473.4205
Epoch 5020, Training loss 473.3692
Epoch 5030, Training

Epoch 7120, Training loss 463.7152
Epoch 7130, Training loss 463.6752
Epoch 7140, Training loss 463.6351
Epoch 7150, Training loss 463.5951
Epoch 7160, Training loss 463.5622
Epoch 7170, Training loss 464.2395
Epoch 7180, Training loss 551.6522
Epoch 7190, Training loss 495.3797
Epoch 7200, Training loss 478.0332
Epoch 7210, Training loss 466.0843
Epoch 7220, Training loss 463.3998
Epoch 7230, Training loss 463.5686
Epoch 7240, Training loss 463.5250
Epoch 7250, Training loss 463.2513
Epoch 7260, Training loss 463.2336
Epoch 7270, Training loss 463.1667
Epoch 7280, Training loss 463.1299
Epoch 7290, Training loss 463.0902
Epoch 7300, Training loss 463.0507
Epoch 7310, Training loss 463.0128
Epoch 7320, Training loss 462.9748
Epoch 7330, Training loss 462.9369
Epoch 7340, Training loss 462.8989
Epoch 7350, Training loss 462.8608
Epoch 7360, Training loss 462.8226
Epoch 7370, Training loss 462.7843
Epoch 7380, Training loss 462.7459
Epoch 7390, Training loss 462.7074
Epoch 7400, Training

Epoch 9470, Training loss 456.0733
Epoch 9480, Training loss 456.0440
Epoch 9490, Training loss 456.0153
Epoch 9500, Training loss 455.9869
Epoch 9510, Training loss 455.9585
Epoch 9520, Training loss 455.9297
Epoch 9530, Training loss 455.9010
Epoch 9540, Training loss 455.8723
Epoch 9550, Training loss 455.8488
Epoch 9560, Training loss 456.7629
Epoch 9570, Training loss 596.1927
Epoch 9580, Training loss 456.5527
Epoch 9590, Training loss 456.6831
Epoch 9600, Training loss 455.8610
Epoch 9610, Training loss 455.7357
Epoch 9620, Training loss 455.6982
Epoch 9630, Training loss 455.6898
Epoch 9640, Training loss 455.6769
Epoch 9650, Training loss 455.6402
Epoch 9660, Training loss 455.5860
Epoch 9670, Training loss 455.5490
Epoch 9680, Training loss 455.5240
Epoch 9690, Training loss 455.4968
Epoch 9700, Training loss 455.4702
Epoch 9710, Training loss 455.4437
Epoch 9720, Training loss 455.4172
Epoch 9730, Training loss 455.3906
Epoch 9740, Training loss 455.3640
Epoch 9750, Training

Testing the model.

In [22]:
nn_arch.eval()
test_output = nn_arch(test_features_tn)
loss = nn.MSELoss()(test_output, test_target_tn)
print(loss.item())
test_output

141.290215365063


tensor([[  2.7739],
        [  4.3585],
        [  9.0754],
        ...,
        [-20.8843],
        [-26.0388],
        [-12.2846]], dtype=torch.float64, grad_fn=<AddmmBackward0>)