In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV

## Preprocess the Data
* Use MinMaxScaler to scale the numerical data.
* Separate the data into training and testing data.

In [19]:
# Read the training data
training_df = pd.read_csv("Data/Cleaned_Data.csv") 
training_df.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_time0bk,koi_duration,ra,dec,koi_disposition
0,0,0,0,0,170.53875,2.9575,291.93423,48.141651,CONFIRMED
1,0,0,0,0,162.51384,4.507,291.93423,48.141651,CONFIRMED
2,0,1,0,0,175.850252,1.7822,297.00482,48.134129,FALSE POSITIVE
3,0,1,0,0,170.307565,2.40641,285.53461,48.28521,FALSE POSITIVE
4,0,0,0,0,171.59555,1.6545,288.75488,48.2262,CONFIRMED


In [20]:
target_feature = "koi_disposition"

In [28]:
# Split the data
X = training_df.drop(columns=[target_feature])
y = training_df[[target_feature]].values.ravel() 
print(X.shape, y.shape)

(9564, 8) (9564,)


In [29]:
# Create a Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [30]:
# Scale the training data to better train
def scale_data(X, X_train, X_test):
    X_scaler = MinMaxScaler().fit(X)
    # apply the scale to training and testing data sets
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    #
    return X_train_scaled, X_test_scaled, X_scaler

X_train, X_test, X_scaler = scale_data(X, X_train, X_test)

In [62]:
classifier = LogisticRegression(
    max_iter=100,
    verbose=1,
)
classifier

LogisticRegression(verbose=1)

In [63]:
classifier.fit(
    X_train,
    y_train,
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


LogisticRegression(verbose=1)

In [64]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.7741530740276035
Testing Data Score: 0.76620660811376


In [None]:
grid = GridSearchCV(
    model, 
    {'C': [1, 5, 10],
     'max_iter': [500, 1000, 5000]
              }, verbose=2)

grid.fit(X_train_new,y_train)

## Tune Model Parameters
* Use GridSearch to tune model parameters.
* Tune and compare at least two different classifiers.

## Reporting
* Create a README that reports a comparison of each model's performance as well as a summary about your findings and any assumptions you can make based on your model.
* * Is your model good enough to predict new exoplanets? 
* * Why or why not? 
* * What would make your model be better at predicting new exoplanets?