In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
import joblib

## Preprocess the Data
* Use MinMaxScaler to scale the numerical data.
* Separate the data into training and testing data.

In [2]:
# Read the training data
training_df = pd.read_csv("Data/Cleaned_Data.csv") 
training_df.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_time0bk,koi_duration,ra,dec,koi_disposition
0,0,0,0,0,170.53875,2.9575,291.93423,48.141651,CONFIRMED
1,0,0,0,0,162.51384,4.507,291.93423,48.141651,CONFIRMED
2,0,1,0,0,175.850252,1.7822,297.00482,48.134129,FALSE POSITIVE
3,0,1,0,0,170.307565,2.40641,285.53461,48.28521,FALSE POSITIVE
4,0,0,0,0,171.59555,1.6545,288.75488,48.2262,CONFIRMED


In [3]:
target_feature = "koi_disposition"

In [4]:
# Split the data
X = training_df.drop(columns=[target_feature])
y = training_df[[target_feature]].values.ravel() 
print(X.shape, y.shape)

(9564, 8) (9564,)


In [5]:
# Create a Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [6]:
# Scale the training data to better train
def scale_data(X, X_train, X_test):
    X_scaler = MinMaxScaler().fit(X)
    # apply the scale to training and testing data sets
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    #
    return X_train_scaled, X_test_scaled, X_scaler

X_train, X_test, X_scaler = scale_data(X, X_train, X_test)

In [26]:
classifier = LogisticRegression(
    penalty='l1',
    dual=False,
    tol=0.01,
    C=1.0,
    fit_intercept=True,
    intercept_scaling=1,
    solver='liblinear',
    max_iter=500,
    multi_class='auto',
    verbose=2
)
classifier

LogisticRegression(max_iter=500, penalty='l1', solver='liblinear', tol=0.01,
                   verbose=2)

In [27]:
classifier.fit(
    X_train,
    y_train,
)

[LibLinear]

LogisticRegression(max_iter=500, penalty='l1', solver='liblinear', tol=0.01,
                   verbose=2)

In [28]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.7759654259026907
Testing Data Score: 0.767043078209954


In [29]:
joblib.dump(classifier, "Logistic_Regression_Classifier.sav")

['Logistic_Regression_Classifier.sav']

## Tune Model Parameters
* Use GridSearch to tune model parameters.

In [None]:
grid = GridSearchCV(
    classifier, 
    {
        "penalty": ["l1", "l2", "elasticnet", "none"],
        "tol": [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1],
        "C": [0.1, 0.5, 1, 5, 10, 50],
        "solver":["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
        "max_iter": [100, 500, 1000]
    }, 
    verbose=1
)
grid.fit(X_train,y_train)

In [25]:
print(f"Grid best score: {grid.best_score_}")
print(f"Grid best penalty: {grid.best_estimator_.penalty}")
print(f"Grid best tol: {grid.best_estimator_.tol}")
print(f"Grid best C: {grid.best_estimator_.C}")
print(f"Grid best solver: {grid.best_estimator_.solver}")
print(f"Grid best max_Iter: {grid.best_estimator_.max_iter}")

print(f"Training Data Score: {grid.score(X_train, y_train)}")
print(f"Testing Data Score: {grid.score(X_test, y_test)}")

Grid best score: 0.7775007167883992
Grid best penalty: l1
Grid best tol: 0.01
Grid best C: 1
Grid best solver: liblinear
Grid best max_Iter: 500
Training Data Score: 0.7766624843161857
Testing Data Score: 0.767461313258051
