In [3]:
# Mount Google Drive when the notebook is executed inside Google Colab;
# on any other runtime just report that the mount is skipped.
import sys

if 'google.colab' not in sys.modules:
    print("Not running in Google Colab. Skipping Google Drive mount.")
else:
    # The Colab-only package is imported here so that other runtimes never need it.
    from google.colab import drive  # type: ignore
    drive.mount('/content/drive')

Not running in Google Colab. Skipping Google Drive mount.


Note: Read the README file first for an overview of the code.

# Importing Libraries

In [4]:
%pip install tensorflow

#Tensorflow Libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import metrics


import numpy as np
import pandas as pd #For converting csv to DataFrame

#For Plotting Purpose
import matplotlib.pyplot as plt
import seaborn as sn

#For Transforming data and evaluation of Models
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from imblearn.over_sampling import RandomOverSampler

#Machine Learning Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier

#Hyperparameter tuning of Models
from sklearn.model_selection import GridSearchCV

#For calculating time required for Training and Testing of Models
import time

#Hide Warnings
# NOTE: the 'ignore' filter below is installed without a category or module,
# so it silences every warning raised after this cell runs, not only the
# warnings coming from this cell.
import warnings
warnings.filterwarnings('ignore')

# Also switch off pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None  # default='warn'

Note: you may need to restart the kernel to use updated packages.


ModuleNotFoundError: No module named 'matplotlib'

# Loading the Data

In [5]:
# Reading the Training and Testing Data
# The CSV files are looked up in the folder named by the EXO_DATA_DIR
# environment variable; when it is not set, the original local download
# folder is used, so existing setups keep working unchanged.

import os

# Folder that holds exoTrain.csv and exoTest.csv (override with EXO_DATA_DIR).
data_dir = os.environ.get('EXO_DATA_DIR', 'C:/Users/sarthak/Downloads/archive')
train_path = os.path.join(data_dir, 'exoTrain.csv')
test_path = os.path.join(data_dir, 'exoTest.csv')

# Collect the files that are actually missing so the message can name them.
missing_paths = [path for path in (train_path, test_path) if not os.path.exists(path)]
if missing_paths:
    print("File(s) not found. Please update the paths above to the correct location on your system.")
    print("Missing:", ", ".join(missing_paths))
else:
    exoTrain = pd.read_csv(train_path)
    exoTest = pd.read_csv(test_path)
    print(exoTrain.head(5))

   LABEL   FLUX.1   FLUX.2   FLUX.3   FLUX.4   FLUX.5   FLUX.6  FLUX.7  \
0      2    93.85    83.81    20.10   -26.98   -39.56  -124.71 -135.18   
1      2   -38.88   -33.83   -58.54   -40.09   -79.31   -72.81  -86.55   
2      2   532.64   535.92   513.73   496.92   456.45   466.00  464.50   
3      2   326.52   347.39   302.35   298.13   317.74   312.70  322.33   
4      2 -1107.21 -1112.59 -1118.95 -1095.10 -1057.55 -1034.48 -998.34   

    FLUX.8  FLUX.9  ...  FLUX.3188  FLUX.3189  FLUX.3190  FLUX.3191  \
0   -96.27  -79.89  ...     -78.07    -102.15    -102.15      25.13   
1   -85.33  -83.97  ...      -3.28     -32.21     -32.21     -24.89   
2   486.39  436.56  ...     -71.69      13.31      13.31     -29.89   
3   311.31  312.42  ...       5.71      -3.73      -3.73      30.05   
4 -1022.71 -989.57  ...    -594.37    -401.66    -401.66    -357.24   

   FLUX.3192  FLUX.3193  FLUX.3194  FLUX.3195  FLUX.3196  FLUX.3197  
0      48.57      92.54      39.32      61.42       5.08  

In [6]:
# Show how many samples carry each LABEL value in both datasets
# LABEL 1 ---> Not Exoplanet
# LABEL 2 ---> Exoplanet

if 'exoTrain' not in globals() or 'exoTest' not in globals():
    print("exoTrain and/or exoTest are not defined. Please check if the data files exist and are loaded correctly.")
else:
    # Heading and counts are emitted on separate lines, training set first.
    print("Train label counts:", exoTrain['LABEL'].value_counts(), sep="\n")
    print("Test label counts:", exoTest['LABEL'].value_counts(), sep="\n")

Train label counts:
LABEL
1    5050
2      37
Name: count, dtype: int64
Test label counts:
LABEL
1    565
2      5
Name: count, dtype: int64


# EDA (EXPLORATORY DATA ANALYSIS)

In [7]:
def flux_graph(dataset, row, dataframe, planet):
  """Draw the flux readings of a single sample as a line chart.

  dataset   -- the flux data; read with `.columns`/`.iloc` when `dataframe`
               is truthy, otherwise indexed directly with `row`
  row       -- which sample to plot (also shown in the y-axis label)
  dataframe -- truthy when `dataset` should be read positionally via `.iloc`
  planet    -- text used as the chart title
  """
  figure = plt.figure(figsize=(10, 5))
  axes = figure.add_subplot()
  axes.set_title(planet, fontsize=22)
  axes.set_xlabel('time', fontsize=17)
  axes.set_ylabel(f'flux_{row}', fontsize=17)
  axes.grid(False)

  # Pull out the one series of readings that will be drawn.
  if not dataframe:
    series = dataset[row]
  else:
    series = dataset[list(dataset.columns)].iloc[row]

  # The x axis is simply the 1-based position of every reading.
  positions = range(1, len(series) + 1)
  axes.plot(positions, series)
  axes.tick_params(colors='black', labelsize=14)
  plt.show()


In [8]:
def display_flux(dataframe, dataset):
    """Plot one example flux curve for each class.

    The first LABEL == 2 row and the first LABEL == 1 row of the global
    `exoTrain` are looked up, and the rows of `dataset` with the same index
    are drawn with `flux_graph`. `dataframe` is forwarded to `flux_graph`
    unchanged.
    """
    examples = (
        (2, 'At least One Exoplanet'),
        (1, 'No Exoplanet'),
    )
    for label, title in examples:
        # head(1) keeps at most one row, so each class yields at most one chart.
        for row in exoTrain[exoTrain['LABEL'] == label].head(1).index:
            flux_graph(dataset, row, dataframe, planet = title)

In [10]:
import matplotlib.pyplot as plt

# Draw the example flux curves; only the flux columns are passed (LABEL is filtered out).
if 'exoTrain' not in globals():
    print("exoTrain is not defined. Please ensure the data files exist and are loaded correctly.")
else:
    display_flux(True, dataset = exoTrain.loc[:, exoTrain.columns != 'LABEL'])

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Pair plot of the first 5 columns of exoTrain, coloured by LABEL
if 'exoTrain' not in globals():
    print("exoTrain is not defined. Please ensure the data files exist and are loaded correctly.")
else:
    subset_data = exoTrain.iloc[:, :5]
    sn.pairplot(subset_data, hue='LABEL')

In [None]:
# Density plot of FLUX.1 split by LABEL, drawn from the first 5 columns of exoTrain
if 'exoTrain' not in globals():
    print("exoTrain is not defined. Please ensure the data files exist and are loaded correctly.")
else:
    subset_data = exoTrain.iloc[:, :5]
    sn.kdeplot(data = subset_data, hue = 'LABEL', x='FLUX.1')

# Data Preprocessing

In [None]:
def handle_outliers(dataset, num_iterations):
    """Replace each row's extreme values with that row's mean.

    On every iteration the largest and the smallest value of each row are
    overwritten with the row mean computed at the start of that iteration,
    so `num_iterations` passes dampen up to that many peaks and troughs
    per row.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Numeric data, one sample per row. It is not modified.
    num_iterations : int
        Number of max/min replacement passes.

    Returns
    -------
    pandas.DataFrame
        A new float frame with the same index and columns as `dataset`.
    """
    # Work on a private float array. The previous `frame.iloc[i][j] = value`
    # form assigned into a temporary row object, so the frame itself was not
    # reliably updated, and it also used index labels as row positions.
    values = dataset.to_numpy(dtype=float, copy=True)
    if values.size == 0:
        # Nothing to clip; argmax/argmin would fail on a zero-width row.
        return dataset.copy()

    row_positions = np.arange(values.shape[0])
    for _ in range(num_iterations):
        # All three statistics are taken before any replacement in this pass.
        row_mean = values.mean(axis=1)
        max_idx = values.argmax(axis=1)
        min_idx = values.argmin(axis=1)
        values[row_positions, max_idx] = row_mean
        values[row_positions, min_idx] = row_mean

    return pd.DataFrame(values, index=dataset.index, columns=dataset.columns)

In [None]:
#Re-encoding the labels as binary targets: (2 ---> 1) and (every other value ---> 0)
def change_labels(y_train, y_test):
    """Return binary copies of the training and testing label series.

    A label equal to 2 becomes 1; any other label becomes 0. The inputs are
    only read through `.apply`, and the result is the pair
    (converted y_train, converted y_test).
    """
    def to_binary(label):
        if label == 2:
            return 1
        return 0

    return y_train.apply(to_binary), y_test.apply(to_binary)

In [None]:
# Handling the Imbalance of datasets by oversampling
from imblearn.over_sampling import RandomOverSampler
def RVS(x_train, y_train):
    """Over-sample the training data with a `RandomOverSampler` seeded with 42.

    Returns whatever `fit_resample` yields for the given features and labels,
    i.e. the resampled features followed by the resampled labels.
    """
    oversampler = RandomOverSampler(random_state = 42)
    resampled_x, resampled_y = oversampler.fit_resample(x_train, y_train)
    return resampled_x, resampled_y

In [None]:
# Building the model-ready training and testing splits
def pre_processing():
    """Split the global `exoTrain`/`exoTest` frames into features and labels.

    Only the training features go through `handle_outliers` (5 iterations)
    and `RVS`; the test features are returned as loaded. Both label series
    are converted with `change_labels` as the last step.

    Returns (x_train, y_train, x_test, y_test).
    """
    train_features = exoTrain.loc[:, exoTrain.columns != 'LABEL']
    train_labels = exoTrain.loc[:, 'LABEL']
    test_features = exoTest.loc[:, exoTest.columns != 'LABEL']
    test_labels = exoTest.loc[:, 'LABEL']

    # Outlier handling first, then over-sampling of the cleaned training data.
    train_features = handle_outliers(train_features, 5)
    train_features, train_labels = RVS(train_features, train_labels)

    # Label re-encoding: 2 becomes 1, every other label becomes 0.
    train_labels, test_labels = change_labels(train_labels, test_labels)

    return train_features, train_labels, test_features, test_labels

In [None]:
# Drawing the Confusion matrix
def plot_confusion_matrix(y_test, y_pred):
    """Show a confusion matrix heat map and return the matrix.

    The matrix is computed with `normalize='true'` and labelled with the
    classes 0 and 1 (rows: real values, columns: predicted values).
    """
    matrix = confusion_matrix(y_test, y_pred, normalize='true')

    frame = pd.DataFrame(matrix, index=[0, 1], columns=[0, 1])
    frame.columns.name = 'Predicted Values'
    frame.index.name = 'Real Values'

    plt.figure(figsize=(10, 10))
    sn.heatmap(frame, annot=True, cmap="BuGn")
    plt.show()

    return matrix

In [None]:
# Print prediction metrics
def display_predictions(y_test, y_pred, y_class_pred, matrix):
  """Print accuracy, precision, recall, F1 and ROC AUC for a binary classifier.

  Parameters
  ----------
  y_test : array-like
      True binary labels; 1 is treated as the positive class.
  y_pred : array-like
      Continuous scores for the positive class. Used for ROC AUC only.
  y_class_pred : array-like
      Hard 0/1 predictions. Used for accuracy, precision, recall and F1.
  matrix : array-like
      Accepted so existing callers keep working, but no longer read. The
      matrix produced by `plot_confusion_matrix` is row-normalised, so it
      cannot yield count-based precision or accuracy, and the cell layout
      is [[TN, FP], [FN, TP]], not the [[TP, FP], [FN, TN]] layout the
      previous implementation assumed.
  """
  # Flatten so (n, 1) model outputs and pandas Series are handled alike.
  y_true = np.asarray(y_test).ravel()
  y_hat = np.asarray(y_class_pred).ravel()
  y_score = np.asarray(y_pred).ravel()

  accuracy = accuracy_score(y_true, y_hat)
  # zero_division=0 reports 0 instead of failing when a class is never predicted.
  precision = precision_score(y_true, y_hat, zero_division=0)
  rec = recall_score(y_true, y_hat, zero_division=0)
  f1 = f1_score(y_true, y_hat, zero_division=0)
  # ROC AUC needs both classes in the true labels; report NaN otherwise.
  auc = roc_auc_score(y_true, y_score) if np.unique(y_true).size > 1 else float('nan')

  print('\t\t Prediction Metrics\n')
  print("Accuracy:\t", "{:0.4f}".format(accuracy))
  print("Precision:\t", "{:0.4f}".format(precision))
  print("Recall:\t\t", "{:0.4f}".format(rec))
  print("F1 Score:\t", "{:0.4f}".format(f1))
  print("ROC AUC:\t", "{:0.4f}".format(auc))

# Modelling

In [None]:
# Faster Random Forest with RandomizedSearchCV and fewer parameter combinations
from sklearn.model_selection import RandomizedSearchCV
def find_best_model(model):
    """Search, fit and evaluate one of the classical classifiers.

    The data comes from `pre_processing()`. A cross-validated search that
    optimises recall is run on the training split, and the refit best
    estimator is evaluated on the test split. The elapsed search time, the
    best estimator, the confusion matrix and the classification report are
    printed; nothing is returned.

    Parameters
    ----------
    model : str
        One of 'rf' (Random Forest), 'lr' (Logistic Regression),
        'knn' (K Nearest Neighbours) or 'lightgbm' (LightGBM).

    Raises
    ------
    ValueError
        If `model` is not one of the supported keys.
    """
    # Fail before the slow preprocessing; previously an unknown key only
    # surfaced later as an unbound `search` variable.
    supported = ('rf', 'lr', 'knn', 'lightgbm')
    if model not in supported:
        raise ValueError(f"Unknown model '{model}'. Expected one of: {', '.join(supported)}.")

    # Splitting Dataset
    x_train, y_train, x_test, y_test = pre_processing()
    # Seeded like the estimators below so repeated runs see the same folds.
    x_train, y_train = shuffle(x_train, y_train, random_state=42)
    # NOTE(review): the training data is over-sampled inside pre_processing()
    # before the folds are formed, so cross-validation scores may be optimistic.

    if model == 'rf':
        print('Finding the best Random Forest Model (Fast Mode)')
        param_dist = {
            'n_estimators': [100],
            'max_features': ['sqrt'],
            'max_depth' : [4],
            'criterion' :['gini']
        }
        md = RandomForestClassifier(random_state=42)
        search = RandomizedSearchCV(estimator=md, param_distributions=param_dist, n_iter=1, cv=2, scoring='recall', random_state=42, n_jobs=-1)

    elif model == 'lr':
        param_grid = {
            'penalty': ['l2'],
            'C':[1],
        }
        md = LogisticRegression(random_state=42)
        search = GridSearchCV(estimator=md, param_grid=param_grid, cv=2, scoring='recall', n_jobs=-1)

    elif model == 'knn':
        param_grid = {
            'weights' : ['uniform'],
            'n_neighbors':[3],
            'p':[2]
        }
        md = KNeighborsClassifier()
        search = GridSearchCV(estimator=md, param_grid=param_grid, cv=2, scoring='recall', n_jobs=-1)

    else:  # 'lightgbm'
        param_grid = {
            'learning_rate': [0.1],
            'n_estimators': [100],
            'max_depth': [6],
        }
        md = lgb.LGBMClassifier(random_state=42)
        search = GridSearchCV(estimator=md, param_grid=param_grid, cv=2, scoring='recall', n_jobs=-1)

    start = time.time()
    search.fit(x_train, y_train)
    end = time.time()

    print('Total Time Required : \n', end-start)

    # With the default refit=True the search has already refit the best
    # estimator on the full training data, so no second fit is needed.
    clf = search.best_estimator_
    print('Best Estimator is : \n')
    print(clf)
    y_pred = clf.predict(x_test)

    print('The Confusion Matrix is: \n')
    print(plot_confusion_matrix(y_test, y_pred))

    print('Classification Report is: \n')
    print(classification_report(y_test, y_pred))

In [None]:
# Example: Run a model and get the classifier and predictions for visualization
# Modify find_best_model to return clf, x_train, x_test, y_test, y_pred

def find_best_model_return(model):
    """Search and fit one classical classifier and return it with its data.

    Parameters
    ----------
    model : str
        One of 'rf', 'lr', 'knn' or 'lightgbm'.

    Returns
    -------
    tuple
        (clf, x_train, x_test, y_test, y_pred): the refit best estimator, the
        shuffled training features, the test features and labels, and the
        predictions of `clf` on the test features.

    Raises
    ------
    ValueError
        If `model` is not one of the supported keys.
    """
    # Reject unknown keys up front instead of failing on an unbound `search`
    # after the slow preprocessing has already run.
    supported = ('rf', 'lr', 'knn', 'lightgbm')
    if model not in supported:
        raise ValueError(f"Unknown model '{model}'. Expected one of: {', '.join(supported)}.")

    x_train, y_train, x_test, y_test = pre_processing()
    # Seeded like the estimators below so repeated runs see the same folds.
    x_train, y_train = shuffle(x_train, y_train, random_state=42)

    if model == 'rf':
        param_dist = {
            'n_estimators': [100],
            'max_features': ['sqrt'],
            'max_depth' : [4],
            'criterion' :['gini']
        }
        md = RandomForestClassifier(random_state=42)
        search = RandomizedSearchCV(estimator=md, param_distributions=param_dist, n_iter=1, cv=2, scoring='recall', random_state=42, n_jobs=-1)
    elif model == 'lr':
        param_grid = {
            'penalty': ['l2'],
            'C':[1],
        }
        md = LogisticRegression(random_state=42)
        search = GridSearchCV(estimator=md, param_grid=param_grid, cv=2, scoring='recall', n_jobs=-1)
    elif model == 'knn':
        param_grid = {
            'weights' : ['uniform'],
            'n_neighbors':[3],
            'p':[2]
        }
        md = KNeighborsClassifier()
        search = GridSearchCV(estimator=md, param_grid=param_grid, cv=2, scoring='recall', n_jobs=-1)
    else:  # 'lightgbm'
        param_grid = {
            'learning_rate': [0.1],
            'n_estimators': [100],
            'max_depth': [6],
        }
        md = lgb.LGBMClassifier(random_state=42)
        search = GridSearchCV(estimator=md, param_grid=param_grid, cv=2, scoring='recall', n_jobs=-1)

    search.fit(x_train, y_train)
    # With the default refit=True, best_estimator_ is already fitted on the
    # full training data, so it is used as-is.
    clf = search.best_estimator_
    y_pred = clf.predict(x_test)
    return clf, x_train, x_test, y_test, y_pred

# Example usage:
# Runs preprocessing, the search and the prediction step for the 'rf' branch
# and keeps the results as notebook-level variables for the plots below.
clf, x_train, x_test, y_test, y_pred = find_best_model_return('rf')

# --- Visualization and Feature Importance for Tree-Based Models ---
importances = None
if hasattr(clf, 'feature_importances_'):
    importances = clf.feature_importances_
    # Use the training frame's column names when it has them, generic names otherwise.
    feature_names = x_train.columns if hasattr(x_train, 'columns') else [f'Feature {i}' for i in range(len(importances))]
    # Feature positions ordered from the largest importance to the smallest.
    indices = np.argsort(importances)[::-1]
    plt.figure(figsize=(10, 6))
    plt.title('Feature Importances')
    # NOTE(review): one bar and one rotated tick label is drawn per feature;
    # with thousands of flux columns the axis is presumably unreadable —
    # consider restricting the plot to the top-N features.
    plt.bar(range(len(importances)), importances[indices], align='center')
    plt.xticks(range(len(importances)), [feature_names[i] for i in indices], rotation=90)
    plt.tight_layout()
    plt.show()
else:
    print('Feature importances are not available for this model.')

# --- Visualization: Confusion Matrix Heatmap ---
from sklearn.metrics import confusion_matrix
# NOTE(review): seaborn is bound to the alias `sns` here; other cells of this
# notebook call it through `sn`.
import seaborn as sns
# No `normalize` argument is passed here, and the cells are annotated with
# the integer format 'd'.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# --- Visualization: ROC Curve ---
from sklearn.metrics import roc_curve, auc
# Prefer the second probability column as the score; fall back to the
# decision function for estimators without predict_proba.
if hasattr(clf, "predict_proba"):
    y_score = clf.predict_proba(x_test)[:, 1]
else:
    y_score = clf.decision_function(x_test)
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Dashed diagonal from (0, 0) to (1, 1) drawn as a reference line.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

## Random Forest

In [None]:
# Search, fit and evaluate the Random Forest branch of find_best_model.
find_best_model('rf')

## Logistic Regression

In [None]:
# Search, fit and evaluate the Logistic Regression branch of find_best_model.
find_best_model('lr')

## K Nearest Neighbours

In [None]:
# Search, fit and evaluate the K Nearest Neighbours branch of find_best_model.
find_best_model('knn')

## LIGHT GRADIENT BOOSTING 

In [None]:
# Search, fit and evaluate the LightGBM branch of find_best_model.
find_best_model('lightgbm')

## Proposed CNN Model

In [None]:
def cnn_model():
    """Build, train and evaluate the 1-D convolutional network.

    The data comes from `pre_processing()`. The network is trained with
    early stopping on the validation loss, then evaluated on the training
    and the testing data: for each, the confusion matrix is plotted and the
    prediction metrics are printed. Nothing is returned.
    """
    # Data preparation
    x_train, y_train, x_test, y_test = pre_processing()
    x_train, y_train = shuffle(x_train, y_train) # shuffle the data to avoid stagnant 0.0000e+00 val_accuracy

    # Taken from the data instead of hard-coding the number of flux columns.
    n_features = x_train.shape[1]

    # Architecture
    model = keras.Sequential()
    model.add(keras.Input(shape=(n_features,)))
    # Conv1D expects a trailing channel axis.
    model.add(layers.Reshape((n_features, 1)))
    # NOTE(review): this Normalization layer is never adapt()-ed, so it
    # presumably keeps its initial statistics — confirm whether
    # adapt(x_train) was intended.
    model.add(layers.Normalization())
    # Only the model input declares a shape; the later layers infer theirs.
    model.add(layers.Conv1D(filters=11, kernel_size=2, activation='relu', kernel_regularizer='l2'))
    model.add(layers.BatchNormalization())
    model.add(layers.Conv1D(filters=7, kernel_size=2, activation='relu', kernel_regularizer='l2'))
    model.add(layers.BatchNormalization())
    model.add(layers.MaxPooling1D(pool_size=2, strides=2))
    model.add(layers.Dropout(0.4))
    model.add(layers.Flatten())
    model.add(layers.Dense(50, activation="relu"))
    model.add(layers.Dropout(0.3))
    model.add(layers.Dense(30, activation="relu"))
    model.add(layers.Dropout(0.3))
    model.add(layers.Dense(12, activation="relu"))
    model.add(layers.Dense(1, activation="sigmoid"))

    # Representation of architecture (summary() prints by itself)
    model.summary()

    # Compile model
    lr_schedule = optimizers.schedules.ExponentialDecay(initial_learning_rate=1e-2, decay_steps=10000, decay_rate=0.8)

    # Adam and EarlyStopping are reached through the already imported
    # `optimizers` and `keras` modules; the bare names were never imported.
    model.compile(optimizer=optimizers.Adam(learning_rate=lr_schedule), loss='binary_crossentropy', metrics=[metrics.Recall()])

    # Fit model
    early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)

    model.fit(x_train, y_train, validation_split=0.2, batch_size=64, callbacks=[early_stop], epochs=30, verbose=2)

    def _report(title, features, labels):
        # Predict once, threshold at 0.5, then plot the matrix and print the metrics.
        print(title)
        y_pred = model.predict(features).ravel()
        y_class_pred = (y_pred > 0.5).astype("int32")
        matrix = plot_confusion_matrix(labels, y_class_pred)
        display_predictions(labels, y_pred, y_class_pred, matrix)

    _report('Training Prediction:', x_train, y_train)
    _report('Testing Prediction:', x_test, y_test)
      


We may not get the best results on the first run, but we are able to achieve them after several runs.

In [None]:
# --- Advanced Visualization: 3D Scatter Plot of Exoplanet Features ---
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
if 'exoTrain' in globals():
    # Use the first three feature columns as axes. LABEL is skipped because
    # it is the class shown through the point colour, not a feature.
    features_3d = [col for col in exoTrain.columns if col != 'LABEL'][:3]
    if len(features_3d) < 3:
        print("exoTrain needs at least three feature columns for a 3D scatter plot.")
    else:
        fig = plt.figure(figsize=(10, 7))
        ax = fig.add_subplot(111, projection='3d')
        xs, ys, zs = (exoTrain[col] for col in features_3d)
        # Colour by class when the label column exists, one colour otherwise.
        labels = exoTrain['LABEL'] if 'LABEL' in exoTrain.columns else np.zeros(len(xs))
        scatter = ax.scatter(xs, ys, zs, c=labels, cmap='viridis', s=20)
        ax.set_xlabel(features_3d[0])
        ax.set_ylabel(features_3d[1])
        ax.set_zlabel(features_3d[2])
        ax.set_title('3D Scatter Plot of Exoplanet Data')
        legend1 = ax.legend(*scatter.legend_elements(), title="LABEL")
        ax.add_artist(legend1)
        plt.show()
else:
    print("exoTrain is not defined. Please ensure the data files exist and are loaded correctly.")

In [None]:
# --- Advanced Visualization: Interactive 3D Plot with Plotly ---
import plotly.express as px
if 'exoTrain' in globals():
    # Use the first three feature columns as axes. LABEL is skipped because
    # it is the class shown through the point colour, not a feature.
    features_3d = [col for col in exoTrain.columns if col != 'LABEL'][:3]
    if len(features_3d) < 3:
        print("exoTrain needs at least three feature columns for a 3D scatter plot.")
    else:
        fig = px.scatter_3d(exoTrain, x=features_3d[0], y=features_3d[1], z=features_3d[2],
                            color='LABEL' if 'LABEL' in exoTrain.columns else None,
                            title='Interactive 3D Scatter Plot of Exoplanet Data',
                            labels={features_3d[0]: features_3d[0], features_3d[1]: features_3d[1], features_3d[2]: features_3d[2]})
        fig.show()
else:
    print("exoTrain is not defined. Please ensure the data files exist and are loaded correctly.")

In [None]:
# --- Advanced Visualization: 3D Surface Plot of Feature Correlations ---
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
if 'exoTrain' in globals():
    # Use the first two feature columns as the x/y plane. LABEL is skipped
    # because it is already the height (z) of the surface.
    features_2d = [col for col in exoTrain.columns if col != 'LABEL'][:2]
    if len(features_2d) < 2:
        print("exoTrain needs at least two feature columns for a 3D surface plot.")
    else:
        z = exoTrain['LABEL'] if 'LABEL' in exoTrain.columns else np.zeros(len(exoTrain))
        x = exoTrain[features_2d[0]]
        y = exoTrain[features_2d[1]]
        fig = plt.figure(figsize=(10, 7))
        ax = fig.add_subplot(111, projection='3d')
        surf = ax.plot_trisurf(x, y, z, cmap='viridis', edgecolor='none')
        ax.set_xlabel(features_2d[0])
        ax.set_ylabel(features_2d[1])
        ax.set_zlabel('LABEL')
        ax.set_title('3D Surface Plot of Feature Correlations')
        fig.colorbar(surf, ax=ax, shrink=0.5, aspect=5)
        plt.show()
else:
    print("exoTrain is not defined. Please ensure the data files exist and are loaded correctly.")

In [None]:
# Build, train and evaluate the CNN defined in cnn_model().
cnn_model()

# Summary of Exoplanet Detection Project

This notebook demonstrates the detection of exoplanets using both traditional machine learning models and a Convolutional Neural Network (CNN). The workflow includes:

- **Data Loading & Exploration:** Importing and visualizing the dataset, including flux time series for exoplanet and non-exoplanet samples.
- **Preprocessing:** Handling outliers, balancing classes with oversampling, and label encoding.
- **Model Training:** Training and evaluating Random Forest, Logistic Regression, K-Nearest Neighbors, LightGBM, and a custom CNN model. Hyperparameter tuning is performed for each model.
- **Evaluation & Visualization:** Confusion matrices, classification reports, ROC curves, and feature importance plots are provided for model interpretation.
- **Key Insights:**
  - Tree-based models (Random Forest, LightGBM) provide feature importances for interpretability.
  - The CNN model leverages the sequential nature of flux data for improved performance.
  - Model performance is evaluated using accuracy, recall, precision, F1-score, and ROC AUC.

**Next Steps:**
- Experiment with additional feature engineering or advanced deep learning architectures.
- Further optimize hyperparameters for best results.
- Explore model ensembling for improved accuracy.

_Refer to the README for more details on dataset and methodology._
