In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.decomposition import PCA

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv('/content/drive/MyDrive/NASA.csv')

In [4]:
df

Unnamed: 0,id,name,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbiting_body,sentry_object,absolute_magnitude,hazardous
0,2162635,162635 (2000 SS164),1.198271,2.679415,13569.249224,5.483974e+07,Earth,False,16.73,False
1,2277475,277475 (2005 WK4),0.265800,0.594347,73588.726663,6.143813e+07,Earth,False,20.00,True
2,2512244,512244 (2015 YE18),0.722030,1.614507,114258.692129,4.979872e+07,Earth,False,17.83,False
3,3596030,(2012 BV13),0.096506,0.215794,24764.303138,2.543497e+07,Earth,False,22.20,False
4,3667127,(2014 GE35),0.255009,0.570217,42737.733765,4.627557e+07,Earth,False,20.09,True
...,...,...,...,...,...,...,...,...,...,...
90831,3763337,(2016 VX1),0.026580,0.059435,52078.886692,1.230039e+07,Earth,False,25.00,False
90832,3837603,(2019 AD3),0.016771,0.037501,46114.605073,5.432121e+07,Earth,False,26.00,False
90833,54017201,(2020 JP3),0.031956,0.071456,7566.807732,2.840077e+07,Earth,False,24.60,False
90834,54115824,(2021 CN5),0.007321,0.016370,69199.154484,6.869206e+07,Earth,False,27.80,False


In [5]:
#as orbiting body has only earth as a value it has no significance in training, hence we drop it. similarly id and name also do not have any significance
X = df.drop(columns=['id','name','orbiting_body','sentry_object'])
X['hazardous'] = X['hazardous'].astype(int)

In [6]:
X_true = X[X['hazardous']==True].sample(8840*10, replace=True)
X_false = X[X['hazardous']==False]
X = pd.concat([X_true, X_false])

In [7]:
X

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous
31214,0.628861,1.406177,103105.223917,6.010386e+07,18.13,1
48431,0.183889,0.411188,36052.916491,1.086220e+07,20.80,1
18906,0.253837,0.567597,44641.319962,2.603931e+07,20.10,1
47880,0.334622,0.748238,24399.525492,1.819126e+07,19.50,1
70460,0.127220,0.284472,66808.178894,1.422613e+07,21.60,1
...,...,...,...,...,...,...
90831,0.026580,0.059435,52078.886692,1.230039e+07,25.00,0
90832,0.016771,0.037501,46114.605073,5.432121e+07,26.00,0
90833,0.031956,0.071456,7566.807732,2.840077e+07,24.60,0
90834,0.007321,0.016370,69199.154484,6.869206e+07,27.80,0


In [8]:
df.isna().sum()

id                    0
name                  0
est_diameter_min      0
est_diameter_max      0
relative_velocity     0
miss_distance         0
orbiting_body         0
sentry_object         0
absolute_magnitude    0
hazardous             0
dtype: int64

In [9]:
X.corr()

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous
est_diameter_min,1.0,1.0,0.248475,0.108739,-0.636127,0.296766
est_diameter_max,1.0,1.0,0.248475,0.108739,-0.636127,0.296766
relative_velocity,0.248475,0.248475,1.0,0.28753,-0.372689,0.297886
miss_distance,0.108739,0.108739,0.28753,1.0,-0.196863,0.070137
absolute_magnitude,-0.636127,-0.636127,-0.372689,-0.196863,1.0,-0.634147
hazardous,0.296766,0.296766,0.297886,0.070137,-0.634147,1.0


In [10]:
Y = X['hazardous']
X = X.drop(columns=['hazardous'])

In [11]:
from sklearn.model_selection import train_test_split
  
X_train, X_test, y_train, y_test = train_test_split(X, Y,test_size = 0.30, random_state = 105)

In [12]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001]} 
  
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
grid.fit(X_train[:1000], y_train[:1000])

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ....................C=0.1, gamma=1;, score=0.515 total time=   0.1s
[CV 2/5] END ....................C=0.1, gamma=1;, score=0.510 total time=   0.0s
[CV 3/5] END ....................C=0.1, gamma=1;, score=0.510 total time=   0.0s
[CV 4/5] END ....................C=0.1, gamma=1;, score=0.510 total time=   0.0s
[CV 5/5] END ....................C=0.1, gamma=1;, score=0.510 total time=   0.1s
[CV 1/5] END ..................C=0.1, gamma=0.1;, score=0.515 total time=   0.0s
[CV 2/5] END ..................C=0.1, gamma=0.1;, score=0.510 total time=   0.0s
[CV 3/5] END ..................C=0.1, gamma=0.1;, score=0.510 total time=   0.0s
[CV 4/5] END ..................C=0.1, gamma=0.1;, score=0.510 total time=   0.1s
[CV 5/5] END ..................C=0.1, gamma=0.1;, score=0.510 total time=   0.0s
[CV 1/5] END .................C=0.1, gamma=0.01;, score=0.515 total time=   0.0s
[CV 2/5] END .................C=0.1, gamma=0.01

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
             verbose=3)

In [13]:
grid.best_params_

{'C': 0.1, 'gamma': 1}

In [14]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

clf = make_pipeline(StandardScaler(), SVC(kernel='rbf', random_state=42))
clf.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(random_state=42))])

In [15]:
predicts = clf.predict(X_test)

In [16]:
pd.DataFrame(predicts).value_counts()

1    32782
0    18337
dtype: int64

In [17]:
print(classification_report(y_test,predicts))

              precision    recall  f1-score   support

           0       0.99      0.74      0.85     24470
           1       0.81      0.99      0.89     26649

    accuracy                           0.87     51119
   macro avg       0.90      0.87      0.87     51119
weighted avg       0.90      0.87      0.87     51119



In [18]:
C = np.logspace(-4, 4, 50)
penalty = ['l2']
parameters = dict(C=C,penalty=penalty)
grid = GridSearchCV(LogisticRegression(), parameters, refit = True, verbose = 3)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END ..............C=0.0001, penalty=l2;, score=0.573 total time=   0.4s
[CV 2/5] END ..............C=0.0001, penalty=l2;, score=0.571 total time=   0.5s
[CV 3/5] END ..............C=0.0001, penalty=l2;, score=0.570 total time=   0.4s
[CV 4/5] END ..............C=0.0001, penalty=l2;, score=0.578 total time=   0.5s
[CV 5/5] END ..............C=0.0001, penalty=l2;, score=0.573 total time=   0.4s
[CV 1/5] END C=0.00014563484775012445, penalty=l2;, score=0.573 total time=   0.4s
[CV 2/5] END C=0.00014563484775012445, penalty=l2;, score=0.571 total time=   0.4s
[CV 3/5] END C=0.00014563484775012445, penalty=l2;, score=0.570 total time=   0.4s
[CV 4/5] END C=0.00014563484775012445, penalty=l2;, score=0.578 total time=   0.4s
[CV 5/5] END C=0.00014563484775012445, penalty=l2;, score=0.573 total time=   0.4s
[CV 1/5] END C=0.00021209508879201905, penalty=l2;, score=0.573 total time=   0.4s
[CV 2/5] END C=0.000212095088792019

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': array([1.00000000e-04, 1.45634848e-04, 2.12095089e-04, 3.08884360e-04,
       4.49843267e-04, 6.55128557e-04, 9.54095476e-04, 1.38949549e-03,
       2.02358965e-03, 2.94705170e-03, 4.29193426e-03, 6.25055193e-03,
       9.10298178e-03, 1.32571137e-02, 1.93069773e-02, 2.81176870e-02,
       4.09491506e-02, 5.96362332e-02, 8.68511374e-0...
       3.72759372e+00, 5.42867544e+00, 7.90604321e+00, 1.15139540e+01,
       1.67683294e+01, 2.44205309e+01, 3.55648031e+01, 5.17947468e+01,
       7.54312006e+01, 1.09854114e+02, 1.59985872e+02, 2.32995181e+02,
       3.39322177e+02, 4.94171336e+02, 7.19685673e+02, 1.04811313e+03,
       1.52641797e+03, 2.22299648e+03, 3.23745754e+03, 4.71486636e+03,
       6.86648845e+03, 1.00000000e+04]),
                         'penalty': ['l2']},
             verbose=3)

In [19]:
grid.best_params_

{'C': 0.0001, 'penalty': 'l2'}

In [20]:
clf = LogisticRegression(penalty='l2', C=0.0001)
clf.fit(X_train,y_train)
predicts = clf.predict(X_test)
print(confusion_matrix(y_test,predicts))
print(accuracy_score(y_test,predicts))

[[ 7776 16694]
 [ 4744 21905]]
0.580625599092314


In [21]:
pd.DataFrame(predicts).value_counts()

1    38599
0    12520
dtype: int64

In [22]:
print(classification_report(y_test,predicts))

              precision    recall  f1-score   support

           0       0.62      0.32      0.42     24470
           1       0.57      0.82      0.67     26649

    accuracy                           0.58     51119
   macro avg       0.59      0.57      0.55     51119
weighted avg       0.59      0.58      0.55     51119



In [23]:
X

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude
31214,0.628861,1.406177,103105.223917,6.010386e+07,18.13
48431,0.183889,0.411188,36052.916491,1.086220e+07,20.80
18906,0.253837,0.567597,44641.319962,2.603931e+07,20.10
47880,0.334622,0.748238,24399.525492,1.819126e+07,19.50
70460,0.127220,0.284472,66808.178894,1.422613e+07,21.60
...,...,...,...,...,...
90831,0.026580,0.059435,52078.886692,1.230039e+07,25.00
90832,0.016771,0.037501,46114.605073,5.432121e+07,26.00
90833,0.031956,0.071456,7566.807732,2.840077e+07,24.60
90834,0.007321,0.016370,69199.154484,6.869206e+07,27.80


In [24]:
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X_train)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])

principalComponentsTest = pca.fit_transform(X_test)
principalDfTest = pd.DataFrame(data = principalComponentsTest
             , columns = ['principal component 1', 'principal component 2'])

In [25]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001]} 
  
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
grid.fit(X_train[:1000], y_train[:1000])

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ....................C=0.1, gamma=1;, score=0.515 total time=   0.1s
[CV 2/5] END ....................C=0.1, gamma=1;, score=0.510 total time=   0.0s
[CV 3/5] END ....................C=0.1, gamma=1;, score=0.510 total time=   0.1s
[CV 4/5] END ....................C=0.1, gamma=1;, score=0.510 total time=   0.0s
[CV 5/5] END ....................C=0.1, gamma=1;, score=0.510 total time=   0.0s
[CV 1/5] END ..................C=0.1, gamma=0.1;, score=0.515 total time=   0.0s
[CV 2/5] END ..................C=0.1, gamma=0.1;, score=0.510 total time=   0.0s
[CV 3/5] END ..................C=0.1, gamma=0.1;, score=0.510 total time=   0.0s
[CV 4/5] END ..................C=0.1, gamma=0.1;, score=0.510 total time=   0.1s
[CV 5/5] END ..................C=0.1, gamma=0.1;, score=0.510 total time=   0.0s
[CV 1/5] END .................C=0.1, gamma=0.01;, score=0.515 total time=   0.0s
[CV 2/5] END .................C=0.1, gamma=0.01

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
             verbose=3)

In [26]:
grid.best_params_

{'C': 0.1, 'gamma': 1}

In [27]:
clf = LogisticRegression()
clf.fit(principalDf,y_train)
predicts = clf.predict(principalDfTest)

In [28]:
print(classification_report(y_test,predicts))

              precision    recall  f1-score   support

           0       0.58      0.68      0.63     24470
           1       0.65      0.55      0.60     26649

    accuracy                           0.62     51119
   macro avg       0.62      0.62      0.61     51119
weighted avg       0.62      0.62      0.61     51119



In [29]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

clf = make_pipeline(StandardScaler(), SVC(kernel='rbf', random_state=42))
clf.fit(principalDf, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(random_state=42))])

In [30]:
predicts = clf.predict(principalDfTest)

In [31]:
print(classification_report(y_test,predicts))

              precision    recall  f1-score   support

           0       0.67      0.50      0.58     24470
           1       0.63      0.78      0.70     26649

    accuracy                           0.65     51119
   macro avg       0.65      0.64      0.64     51119
weighted avg       0.65      0.65      0.64     51119

