In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/asteroid-dataset/dataset.csv


<div style="font-size:29px;"><b>Small Description of what i am about to do</b></div>

In [69]:
# the dataset was imported from Kaggle and it is the NASA JPL Asteroid Dataset
# it's a pretty huge dataset for Asteroids (it's the biggest one i worked with so far)
# The dataset contains various features related to asteroids, such as their size, orbital parameters, and other characteristics
# GOAL: classifying asteroids into different classes which will help us to identify Outer Main Belt (OMBs)
# and Mars-crossing Asteroids (MCAs), Main Belt Asteroids (MBAs) etc..

<div style="font-size:29px;">Necessary Imports</div>

In [70]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

<div style="font-size:29px;">Importing the Dataset</div>

In [6]:
asteroiddataset = pd.read_csv('/kaggle/input/asteroid-dataset/dataset.csv', low_memory=False)

In [7]:
asteroiddataset.head()

Unnamed: 0,id,spkid,full_name,pdes,name,prefix,neo,pha,H,diameter,...,sigma_i,sigma_om,sigma_w,sigma_ma,sigma_ad,sigma_n,sigma_tp,sigma_per,class,rms
0,a0000001,2000001,1 Ceres,1,Ceres,,N,N,3.4,939.4,...,4.6089e-09,6.1688e-08,6.6248e-08,7.8207e-09,1.1113e-11,1.1965e-12,3.7829e-08,9.4159e-09,MBA,0.43301
1,a0000002,2000002,2 Pallas,2,Pallas,,N,N,4.2,545.0,...,3.4694e-06,6.2724e-06,9.1282e-06,8.8591e-06,4.9613e-09,4.6536e-10,4.0787e-05,3.6807e-06,MBA,0.35936
2,a0000003,2000003,3 Juno,3,Juno,,N,N,5.33,246.596,...,3.2231e-06,1.6646e-05,1.7721e-05,8.1104e-06,4.3639e-09,4.4134e-10,3.5288e-05,3.1072e-06,MBA,0.33848
3,a0000004,2000004,4 Vesta,4,Vesta,,N,N,3.0,525.4,...,2.1706e-07,3.8808e-07,1.7893e-07,1.2068e-06,1.6486e-09,2.6125e-10,4.1037e-06,1.2749e-06,MBA,0.3998
4,a0000005,2000005,5 Astraea,5,Astraea,,N,N,6.9,106.699,...,2.7408e-06,2.8949e-05,2.9842e-05,8.3038e-06,4.729e-09,5.5227e-10,3.4743e-05,3.4905e-06,MBA,0.52191


<div style="font-size:29px;">Dropping rows with NaN values in the those columns</div>

In [60]:
# Check for NaN values in any column
asteroiddataset.isna().any()

id                False
spkid             False
full_name         False
pdes              False
name               True
prefix             True
neo               False
pha               False
H                  True
diameter          False
albedo            False
diameter_sigma     True
orbit_id          False
epoch             False
epoch_mjd         False
epoch_cal         False
equinox           False
e                 False
a                 False
q                 False
i                 False
om                False
w                 False
ma                False
ad                False
n                 False
tp                False
tp_cal            False
per               False
per_y             False
moid              False
moid_ld           False
sigma_e           False
sigma_a           False
sigma_q           False
sigma_i           False
sigma_om          False
sigma_w           False
sigma_ma          False
sigma_ad          False
sigma_n           False
sigma_tp        

In [61]:
asteroiddataset.dropna(subset=['diameter','albedo','H','diameter_sigma'],inplace=True)

<div style="font-size:29px;">Converting str values to numeric</div>

In [35]:
asteroiddataset['neo'].unique()
asteroiddataset['neo'] = asteroiddataset['neo'].map({'N': 0, 'Y': 1})

In [36]:
asteroiddataset['pha'].unique()
asteroiddataset['pha'] = asteroiddataset['pha'].map({'N': 0, 'Y': 1})

<div style="font-size:29px;">Feature Selection</div>

In [63]:
# Drop columns by specifying their names in a list
columns_to_drop = asteroiddataset.columns[0:6].tolist() + ['orbit_id', 'equinox','class']
X = asteroiddataset.drop(columns=columns_to_drop)
X.head()

Unnamed: 0,neo,pha,H,diameter,albedo,diameter_sigma,epoch,epoch_mjd,epoch_cal,e,...,sigma_q,sigma_i,sigma_om,sigma_w,sigma_ma,sigma_ad,sigma_n,sigma_tp,sigma_per,rms
0,0,0,3.4,939.4,0.09,0.2,2458600.5,58600,20190427.0,0.076009,...,1.9569e-11,4.6089e-09,6.1688e-08,6.6248e-08,7.8207e-09,1.1113e-11,1.1965e-12,3.7829e-08,9.4159e-09,0.43301
1,0,0,4.2,545.0,0.101,18.0,2459000.5,59000,20200531.0,0.229972,...,8.8322e-08,3.4694e-06,6.2724e-06,9.1282e-06,8.8591e-06,4.9613e-09,4.6536e-10,4.0787e-05,3.6807e-06,0.35936
2,0,0,5.33,246.596,0.214,10.594,2459000.5,59000,20200531.0,0.256936,...,8.1392e-08,3.2231e-06,1.6646e-05,1.7721e-05,8.1104e-06,4.3639e-09,4.4134e-10,3.5288e-05,3.1072e-06,0.33848
3,0,0,3.0,525.4,0.4228,0.2,2458600.5,58600,20190427.0,0.088721,...,1.9286e-09,2.1706e-07,3.8808e-07,1.7893e-07,1.2068e-06,1.6486e-09,2.6125e-10,4.1037e-06,1.2749e-06,0.3998
4,0,0,6.9,106.699,0.274,3.14,2459000.5,59000,20200531.0,0.190913,...,6.0924e-08,2.7408e-06,2.8949e-05,2.9842e-05,8.3038e-06,4.729e-09,5.5227e-10,3.4743e-05,3.4905e-06,0.52191


<div style="font-size: 29px;">Narrowing down the target</div>

In [39]:
asteroiddataset['class'].unique()

array(['MBA', 'OMB', 'MCA', 'AMO', 'IMB', 'TJN', 'CEN', 'APO', 'ATE',
       'AST', 'TNO'], dtype=object)

In [64]:
y = asteroiddataset['class'].values
X = X.values

In [52]:
X.shape

(135100, 36)

In [53]:
y.shape

(135100,)

<div style="font-size: 35px;"><b>KNN</b></div>

In [72]:
knn = KNeighborsClassifier(n_neighbors=368)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=127, stratify=y)
knn.fit(X_train, y_train)

In [73]:
knn.score(X_test, y_test)

0.9380254303524792