*Importing libraries* that will prove useful in the analyzing, visualizing and classificaton process.

In [1]:
import numpy as np
import pandas as pd
import math, matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

### Loading the dataset

In [17]:
df = pd.read_csv('archive/neo_v2.csv', delimiter=',')

### Data Analysis

In [18]:
df.head()

Unnamed: 0,id,name,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbiting_body,sentry_object,absolute_magnitude,hazardous
0,2162635,162635 (2000 SS164),1.198271,2.679415,13569.249224,54839740.0,Earth,False,16.73,False
1,2277475,277475 (2005 WK4),0.2658,0.594347,73588.726663,61438130.0,Earth,False,20.0,True
2,2512244,512244 (2015 YE18),0.72203,1.614507,114258.692129,49798720.0,Earth,False,17.83,False
3,3596030,(2012 BV13),0.096506,0.215794,24764.303138,25434970.0,Earth,False,22.2,False
4,3667127,(2014 GE35),0.255009,0.570217,42737.733765,46275570.0,Earth,False,20.09,True


In [39]:
df.shape

(90836, 10)

In [19]:
df.columns

Index(['id', 'name', 'est_diameter_min', 'est_diameter_max',
       'relative_velocity', 'miss_distance', 'orbiting_body', 'sentry_object',
       'absolute_magnitude', 'hazardous'],
      dtype='object')

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90836 entries, 0 to 90835
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  90836 non-null  int64  
 1   name                90836 non-null  object 
 2   est_diameter_min    90836 non-null  float64
 3   est_diameter_max    90836 non-null  float64
 4   relative_velocity   90836 non-null  float64
 5   miss_distance       90836 non-null  float64
 6   orbiting_body       90836 non-null  object 
 7   sentry_object       90836 non-null  bool   
 8   absolute_magnitude  90836 non-null  float64
 9   hazardous           90836 non-null  bool   
dtypes: bool(2), float64(5), int64(1), object(2)
memory usage: 5.7+ MB


#### Checking for Null values

In [121]:
pd.set_option('display.max_rows',900)
df_null= df[df.isnull()==True]
print(df_null)

       id name  est_diameter_min  est_diameter_max  relative_velocity  \
0     NaN  NaN               NaN               NaN                NaN   
1     NaN  NaN               NaN               NaN                NaN   
2     NaN  NaN               NaN               NaN                NaN   
3     NaN  NaN               NaN               NaN                NaN   
4     NaN  NaN               NaN               NaN                NaN   
...    ..  ...               ...               ...                ...   
90831 NaN  NaN               NaN               NaN                NaN   
90832 NaN  NaN               NaN               NaN                NaN   
90833 NaN  NaN               NaN               NaN                NaN   
90834 NaN  NaN               NaN               NaN                NaN   
90835 NaN  NaN               NaN               NaN                NaN   

       miss_distance orbiting_body sentry_object  absolute_magnitude hazardous  
0                NaN           NaN        

In [41]:
df_null.shape

(90836, 10)

In [102]:
df.hazardous.value_counts()

False    81996
True      8840
Name: hazardous, dtype: int64

In [99]:
df.isnull().sum()

id                    0
name                  0
est_diameter_min      0
est_diameter_max      0
relative_velocity     0
miss_distance         0
orbiting_body         0
sentry_object         0
absolute_magnitude    0
hazardous             0
dtype: int64

The descriptions above indicate that there are no NULL values present in our dataset.

We can also draw some interesting observations from the table of statistical expressions on the numerical columns.

In [106]:
df.describe()

Unnamed: 0,id,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude
count,90836.0,90836.0,90836.0,90836.0,90836.0,90836.0
mean,14382880.0,0.127432,0.284947,48066.918918,37066550.0,23.527103
std,20872020.0,0.298511,0.667491,25293.296961,22352040.0,2.894086
min,2000433.0,0.000609,0.001362,203.346433,6745.533,9.23
25%,3448110.0,0.019256,0.043057,28619.020645,17210820.0,21.34
50%,3748362.0,0.048368,0.108153,44190.11789,37846580.0,23.7
75%,3884023.0,0.143402,0.320656,62923.604633,56549000.0,25.7
max,54275910.0,37.89265,84.730541,236990.128088,74798650.0,33.2


In [28]:
repeated_objects = df[df['id']==2469219]
repeated_objects.head()

Unnamed: 0,id,name,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbiting_body,sentry_object,absolute_magnitude,hazardous
912,2469219,469219 Kamo`oalewa (2016 HO3),0.036187,0.080917,15334.830488,21053500.0,Earth,False,24.33,False
2169,2469219,469219 Kamo`oalewa (2016 HO3),0.036187,0.080917,15860.941084,22976780.0,Earth,False,24.33,False
3972,2469219,469219 Kamo`oalewa (2016 HO3),0.036187,0.080917,15382.529101,20321250.0,Earth,False,24.33,False
5128,2469219,469219 Kamo`oalewa (2016 HO3),0.036187,0.080917,15285.830876,23457150.0,Earth,False,24.33,False
6988,2469219,469219 Kamo`oalewa (2016 HO3),0.036187,0.080917,15464.296154,19547350.0,Earth,False,24.33,False
