In [1]:
import pandas as pd
import numpy as np

# Data Cleaning

In [2]:
# Load the raw table and clean it in a single chained pass:
#   1. drop columns in which every value is null,
#   2. drop any row that still contains a null.
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)
# Preview the first rows of the cleaned frame as the cell output.
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [3]:
# Separate the label column from the predictors.
# Every column other than 'koi_disposition' is used as an input feature.
data = df.drop(columns=['koi_disposition'])
target = df['koi_disposition']
# Keep the predictor column names available for later reference.
features = data.columns

In [4]:
# Partition predictors and labels into a training set and a held-out test set.
# random_state is pinned so the same rows land in each split on every run;
# the split proportions are left at train_test_split's defaults.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    random_state=1,
)

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training split only, so nothing
# about the held-out rows influences the scaling, and scale that split.
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)

# Apply the ranges learned from the training data to the test split.
X_test_scaled = X_scaler.transform(X_test)

# Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier.
# - solver='liblinear' accepts both the 'l1' and 'l2' penalties.
# - max_iter=100 caps the number of solver iterations.
# - random_state seeds the data shuffling that scikit-learn documents for the
#   liblinear solver, so repeated runs (and any search that clones this
#   estimator) produce the same fitted coefficients.
lr_classifier = LogisticRegression(solver='liblinear', max_iter=100, random_state=1)

In [11]:
# Train the baseline logistic-regression classifier on the scaled training split.
# The call is the cell's last expression, so the fitted estimator's repr is displayed.
lr_classifier.fit(X_train_scaled, y_train)

LogisticRegression(solver='liblinear')

In [12]:
# Report the baseline model's score on the data it was trained on and on the
# held-out test split, one result per line.
print(
    f"Training Data Score: {lr_classifier.score(X_train_scaled, y_train)}",
    f"Testing Data Score: {lr_classifier.score(X_test_scaled, y_test)}",
    sep="\n",
)

Training Data Score: 0.8380698073621972
Testing Data Score: 0.8558352402745996


# Utilizing Grid Search to find best parameters

In [13]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates: five values of C crossed with both penalty
# types, i.e. ten combinations in total.
param_grid = dict(
    C=[100, 10, 1.0, 0.1, 0.01],
    penalty=['l1', 'l2'],
)

# Wrap the baseline classifier in a cross-validated grid search.
# verbose=3 logs every individual fit while the search runs.
grid = GridSearchCV(
    estimator=lr_classifier,
    param_grid=param_grid,
    verbose=3,
)

In [16]:
# Run the cross-validated grid search on the scaled training split.
# Each parameter combination is fitted once per fold, and every fit is logged
# because the search was created with verbose=3, so expect lengthy output.
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ..............................C=100, penalty=l1; total time=  15.3s
[CV 2/5] END ..............................C=100, penalty=l1; total time=  24.2s
[CV 3/5] END ..............................C=100, penalty=l1; total time=  28.1s
[CV 4/5] END ..............................C=100, penalty=l1; total time=  16.6s
[CV 5/5] END ..............................C=100, penalty=l1; total time=  14.7s
[CV 1/5] END ..............................C=100, penalty=l2; total time=   0.3s
[CV 2/5] END ..............................C=100, penalty=l2; total time=   0.3s
[CV 3/5] END ..............................C=100, penalty=l2; total time=   0.2s
[CV 4/5] END ..............................C=100, penalty=l2; total time=   0.3s
[CV 5/5] END ..............................C=100, penalty=l2; total time=   0.2s
[CV 1/5] END ...............................C=10, penalty=l1; total time=   3.0s
[CV 2/5] END ...............................C=10

GridSearchCV(estimator=LogisticRegression(solver='liblinear'),
             param_grid={'C': [100, 10, 1.0, 0.1, 0.01],
                         'penalty': ['l1', 'l2']},
             verbose=3)

In [17]:
# List the parameter combination that scored best in the grid search's
# cross-validation on this dataset.
print(grid.best_params_)

{'C': 100, 'penalty': 'l1'}


In [18]:
# List the cross-validated score of the best parameter combination.
# This is measured on folds of the training data, not on the held-out test split.
print(grid.best_score_)

0.8767854154083496


In [19]:
# Predict labels for the scaled held-out test split with the tuned grid-search model.
predictions = grid.predict(X_test_scaled)
# Echo the array as the cell output for a quick visual check.
predictions

array(['CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE', ...,
       'FALSE POSITIVE', 'FALSE POSITIVE', 'CANDIDATE'], dtype=object)

In [20]:
# Summarise per-class precision, recall and F1 for the tuned model's
# predictions against the true labels of the held-out test split.
from sklearn.metrics import classification_report

print(classification_report(y_true=y_test, y_pred=predictions))

                precision    recall  f1-score   support

     CANDIDATE       0.87      0.67      0.75       404
     CONFIRMED       0.74      0.89      0.81       435
FALSE POSITIVE       0.99      1.00      0.99       909

      accuracy                           0.89      1748
     macro avg       0.87      0.85      0.85      1748
  weighted avg       0.90      0.89      0.89      1748



In [21]:
# Persist the tuned model to disk.
import joblib

filename = 'lr_model.sav'
# GridSearchCV fits clones of the estimator it is given, so `lr_classifier`
# still holds only the baseline fit. `grid.best_estimator_` is the model refit
# with the best parameters found by the search, i.e. the one whose predictions
# were evaluated above, so that is the object worth saving.
# NOTE(review): the model expects inputs scaled by `X_scaler`; persist the
# scaler as well if the model will be used outside this notebook.
joblib.dump(grid.best_estimator_, filename)

['lr_model.sav']

In [None]:
# Example (left disabled): reload the persisted model and score it on the
# scaled held-out split. score() needs both the features and the true labels.
#loaded_model = joblib.load('lr_model.sav')
#loaded_model.score(X_test_scaled, y_test)