In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
import joblib

## Preprocess the Data
* Use MinMaxScaler to scale the numerical data.
* Separate the data into training and testing data.

In [2]:
# Read the training data
training_df = pd.read_csv("Data/Cleaned_Data.csv") 
training_df.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_steff,koi_slogg,koi_srad,ra,dec,koi_disposition
0,0,0,0,0,9.488036,170.53875,0.146,2.9575,615.8,2.26,793.0,93.59,35.8,5455.0,4.467,0.927,291.93423,48.141651,CONFIRMED
1,0,0,0,0,54.418383,162.51384,0.586,4.507,874.8,2.83,443.0,9.11,25.8,5455.0,4.467,0.927,291.93423,48.141651,CONFIRMED
2,0,1,0,0,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638.0,39.3,76.3,5853.0,4.544,0.868,297.00482,48.134129,FALSE POSITIVE
3,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395.0,891.96,505.6,5805.0,4.564,0.791,285.53461,48.28521,FALSE POSITIVE
4,0,0,0,0,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406.0,926.16,40.9,6031.0,4.438,1.046,288.75488,48.2262,CONFIRMED


In [3]:
target_feature = "koi_disposition"

In [4]:
# Split the data
X = training_df.drop(columns=[target_feature])
y = training_df[[target_feature]].values.ravel() 
print(X.shape, y.shape)

(9201, 18) (9201,)


In [5]:
# Create a Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [6]:
# Scale the training data to better train
def scale_data(X, X_train, X_test):
    '''
        return the scaled training and testing data with the scaler
    '''
    X_scaler = MinMaxScaler().fit(X)
    # apply the scale to training and testing data sets
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    #
    return X_train_scaled, X_test_scaled, X_scaler

X_train, X_test, X_scaler = scale_data(X, X_train, X_test)

In [13]:
classifier = LogisticRegression(
    penalty="l1",
    dual=False,
    tol=0.000001,
    C=10,
    fit_intercept=True,
    intercept_scaling=1,
    solver="liblinear",
    max_iter=500,
    multi_class="auto",
    verbose=2
)
classifier

LogisticRegression(C=10, max_iter=500, penalty='l1', solver='liblinear',
                   tol=1e-06, verbose=2)

In [14]:
classifier.fit(
    X_train,
    y_train,
)

[LibLinear]

LogisticRegression(C=10, max_iter=500, penalty='l1', solver='liblinear',
                   tol=1e-06, verbose=2)

In [15]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8123188405797102
Testing Data Score: 0.8096479791395046


In [16]:
joblib.dump(classifier, "Logistic_Regression_Classifier.sav")

['Logistic_Regression_Classifier.sav']

## Tune Model Parameters
* Use GridSearch to tune model parameters.

In [None]:
grid = GridSearchCV(
    classifier, 
    {
        "penalty": ["l1", "l2"],
        "tol": [0.000001, 0.001, 0.1],
        "C": [0.1, 1, 10],
        "solver":["lbfgs", "liblinear", "sag", "saga"],
        "max_iter": [100, 500, 1000]
    }, 
    verbose=1
)
grid.fit(X_train,y_train)

In [12]:
print(f"Grid best score: {grid.best_score_}")
print(f"Grid best penalty: {grid.best_estimator_.penalty}")
print(f"Grid best tol: {grid.best_estimator_.tol}")
print(f"Grid best C: {grid.best_estimator_.C}")
print(f"Grid best solver: {grid.best_estimator_.solver}")
print(f"Grid best max_Iter: {grid.best_estimator_.max_iter}")

print(f"Training Data Score: {grid.score(X_train, y_train)}")
print(f"Testing Data Score: {grid.score(X_test, y_test)}")

Grid best score: 0.8089855072463769
Grid best penalty: l1
Grid best tol: 1e-06
Grid best C: 10
Grid best solver: liblinear
Grid best max_Iter: 500
Training Data Score: 0.8123188405797102
Testing Data Score: 0.8096479791395046
