In [None]:
import warnings
# Ignore every warning for the rest of the session. This is a process-wide
# setting, so deprecation and convergence warnings from the libraries used
# below are hidden as well.
warnings.filterwarnings("ignore")

In [None]:
import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from scipy.stats import mode
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, classification_report, cohen_kappa_score, f1_score
from yellowbrick.classifier import ClassificationReport, ClassPredictionError, ConfusionMatrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from keras import Sequential
from keras.layers import Dense

In [None]:
# Read the cleaned table and discard the "Unnamed: 0" column
# (presumably an index that was written into the CSV -- confirm against the export step).
df = pd.read_csv("/content/cleanedcumalative.csv").drop(columns=["Unnamed: 0"])
print(df.shape)
df.head()

(9110, 47)


Unnamed: 0,rowid,kepid,kepoi_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,CONFIRMED,CANDIDATE,0.969,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,3,10811496,K00753.01,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,4,10848459,K00754.01,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,5,10854555,K00755.01,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [None]:
# Work on a copy so the frame loaded in the previous cell is never modified.
subset_df = df.copy()

# Candidate feature columns. If TO_USE has not been defined earlier in the
# session (e.g. after Restart & Run All), fall back to every column of the
# frame instead of failing with a NameError.
# NOTE(review): the fallback keeps every numeric column except 'kepid';
# curate TO_USE explicitly if some of those should not be model inputs.
try:
    candidate_columns = list(TO_USE)
except NameError:
    candidate_columns = list(subset_df.columns)

# Keep numeric columns only and drop the 'kepid' identifier. A real numeric
# check is used because string columns are not always reported as `object`.
TO_USE = [
    col for col in candidate_columns
    if col != 'kepid' and pd.api.types.is_numeric_dtype(subset_df[col])
]
if not TO_USE:
    raise ValueError("No numeric feature columns left in TO_USE")

# NOTE(review): the scaler is fitted on all rows, i.e. before any train/test
# split, so fold statistics are shared with the held-out data.
X = StandardScaler().fit_transform(subset_df[TO_USE].values)

# Binary target: 1 when the disposition is CONFIRMED, 0 for anything else.
y = subset_df["koi_disposition"].eq('CONFIRMED').astype(int).values

In [None]:
def performance(test, pred):
    """Print a set of classification metrics for one set of predictions.

    Args:
        test: True labels.
        pred: Predicted labels, aligned with ``test``.

    Prints the F1 score, Cohen's kappa, the balanced accuracy (shown under
    the label "Accuracy Score"), the confusion matrix and scikit-learn's
    text classification report. Nothing is returned.
    """
    # Compute everything first so nothing is printed if a metric fails.
    matrix = confusion_matrix(test, pred)
    f1_value = f1_score(test, pred)
    text_report = classification_report(test, pred)
    balanced_acc = balanced_accuracy_score(test, pred)
    kappa_value = cohen_kappa_score(test, pred)

    summary_lines = (
        f"F1 Score: {f1_value}",
        f"Kappa Score: {kappa_value}",
        f"Accuracy Score: {balanced_acc}",
        f"Confusion Matrix:\n{matrix}",
        text_report,
    )
    for line in summary_lines:
        print(line)

In [None]:
def createNNModel(input_dim=None):
    """Build and compile the dense binary classifier used in the ensemble.

    Args:
        input_dim: Number of input features. When omitted, the width of the
            notebook-level feature matrix ``X`` is used, which matches the
            previous zero-argument behaviour.

    Returns:
        A compiled ``Sequential`` model with hidden layers of 256, 128 and 128
        ReLU units and a single sigmoid output, trained with Adam on binary
        cross-entropy and reporting accuracy.
    """
    # Imported locally from the same package as Sequential/Dense so the input
    # layer always matches the Keras implementation used for the other layers.
    from keras import Input

    if input_dim is None:
        input_dim = X.shape[1]

    # An explicit Input layer replaces the deprecated `input_dim=` layer kwarg.
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(256, activation='relu'),
        Dense(128, activation='relu'),
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Labels for the ensemble members, in the same order as `models` below
# (the two lists are paired by position).
model_names = ["adaboost", "rf", "svc", "nn"]

models = [
    AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=3),  # keyword used here instead of 'base_estimator'
        n_estimators=80,
        learning_rate=0.9500000000000002,
        algorithm='SAMME',
        random_state=0,
    ),
    RandomForestClassifier(random_state=0, n_jobs=-1),
    SVC(C=1.8500000000000008, tol=1, random_state=0),
    createNNModel(),
]

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from scipy.stats import mode

def trainEvaluate(model, model_name, fold, X_train, y_train, X_test, y_test):
    """Fit one model on the training split and return its 0/1 test predictions.

    Args:
        model: Object exposing ``fit`` and ``predict``.
        model_name: Label of the model; ``"nn"`` selects the neural-network
            fit settings (20 epochs, silent).
        fold: Fold number; accepted for the caller's convenience, not used.
        X_train, y_train: Training features and labels.
        X_test: Features to predict.
        y_test: True test labels; not used here.

    Returns:
        1-D integer array where each raw prediction >= 0.5 becomes 1, else 0.
    """
    fit_options = {"epochs": 20, "verbose": 0} if model_name == "nn" else {}
    model.fit(X_train, y_train, **fit_options)

    raw_output = model.predict(X_test)
    # One threshold serves both probability outputs and hard 0/1 labels.
    return (raw_output >= 0.5).astype(int).flatten()

def finalPrediction(preds):
    """Majority-vote the per-model predictions into one label per sample.

    Args:
        preds: Sequence of equally long prediction vectors, one per model.

    Returns:
        1-D array holding, for every sample, the most frequent label across
        the models (as computed by ``scipy.stats.mode`` along axis 0).
    """
    votes = np.asarray(preds)
    winners = mode(votes, axis=0).mode
    # Flatten so the result is 1-D regardless of the shape SciPy returns.
    return np.ravel(winners)

def _resetNNModel(model):
    """Return a compiled Keras model to an untrained state, in place.

    Keras ``fit`` continues from the current weights, so without this reset a
    network reused across folds would already have trained on rows that are
    in the current test fold.

    Assumes the model was defined with a known input shape, so that its clone
    is built and carries a matching list of weights.
    """
    # clone_model rebuilds the same architecture with newly initialised weights.
    model.set_weights(keras.models.clone_model(model).get_weights())
    # Re-compiling replaces the optimizer, dropping state from earlier folds.
    model.compile_from_config(model.get_compile_config())


def _scoreSplits(splitter, models, model_names):
    """Evaluate the voting ensemble on every split and return the fold F1 scores."""
    f1_scores = []
    for k_ctr, (train, test) in enumerate(splitter.split(X, y), start=1):
        # Fit the imputer on the training rows only, then apply it to both parts.
        imputer = SimpleImputer(strategy='mean')
        X_train_imputed = imputer.fit_transform(X[train])
        X_test_imputed = imputer.transform(X[test])

        pred_per_split = []
        for idx, model in enumerate(models):
            current_model_name = model_names[idx]
            if current_model_name == "nn":
                # Other estimators are fitted via plain fit(); only the network is reset explicitly.
                _resetNNModel(model)
            pred_per_split.append(
                trainEvaluate(model, current_model_name, k_ctr,
                              X_train_imputed, y[train], X_test_imputed, y[test])
            )

        final_preds = finalPrediction(pred_per_split)
        current_f1 = f1_score(y[test], final_preds)
        print(f"F1 Score in fold {k_ctr} = {current_f1}")
        f1_scores.append(current_f1)
    return f1_scores


def crossValidation(K, models, model_names):
    """Cross-validate the majority-vote ensemble with plain and stratified K-fold.

    Uses the notebook-level feature matrix ``X`` and label vector ``y``. For
    each fold, missing values are mean-imputed (imputer fitted on the training
    part), every model is trained on the training part, the predictions are
    combined by majority vote, and the F1 score on the test part is printed.
    The average F1 is printed after each of the two passes.

    Args:
        K: Number of folds for both passes.
        models: Estimators to train; they are fitted in place, so after the
            call each holds the fit from the last stratified fold.
        model_names: Labels paired with ``models`` by position; the label
            ``"nn"`` marks a Keras model, which is re-initialised before
            every fold.

    Returns:
        None. Results are printed.
    """
    plain_scores = _scoreSplits(
        KFold(n_splits=K, shuffle=True, random_state=0), models, model_names
    )
    print(f"Average {K}-Fold F1 Score = {np.mean(plain_scores)}\n")

    stratified_scores = _scoreSplits(
        StratifiedKFold(n_splits=K, shuffle=True, random_state=0), models, model_names
    )
    print(f"Average Stratified {K}-Fold F1 Score = {np.mean(stratified_scores)}")

# %%
# Run the ensemble through 10-fold plain and stratified cross-validation;
# per-fold and average F1 scores are printed below.
crossValidation(10, models, model_names)

[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
F1 Score in fold 1 = 0.8434237995824635
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
F1 Score in fold 2 = 0.8253275109170306
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
F1 Score in fold 3 = 0.8701030927835052
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
F1 Score in fold 4 = 0.8623853211009175
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
F1 Score in fold 5 = 0.87248322147651
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
F1 Score in fold 6 = 0.8879492600422833
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
F1 Score in fold 7 = 0.862144420131291
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
F1 Score in fold 8 = 0.8495575221238938
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
F1 Score in fold 