In [None]:
%pip install pandas numpy matplotlib

# Import our primary tools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# A setting to make our plots look nicer
plt.style.use('seaborn-v0_8-darkgrid')

print("Libraries imported successfully!")

In [None]:
# Load the dataset
# You will need to change the filename to match the one you downloaded.
# Try to read with default options, but skip bad lines if parsing fails
file_path = '../data/cumulative_2025.10.04_00.14.27.csv'
koi_data = pd.read_csv(file_path, on_bad_lines='skip', engine='python')

print("Data loaded successfully!")

In [None]:
# Display the first 5 rows of the table
koi_data.head()

In [None]:
# Check the values in our target column
# Make sure cell 4 (which reloads koi_data with the correct header) has been run before this cell
koi_data['koi_disposition'].value_counts()

In [None]:
# Reload the CSV with the correct header row (skip comment lines)
# The actual data header is likely at line 51 (0-based index), so header=51
koi_data = pd.read_csv(file_path, comment='#', header=0, engine='python')

# Let's plot Orbital Period vs. Planet Radius for CONFIRMED exoplanets
confirmed_planets = koi_data[koi_data['koi_disposition'] == 'CONFIRMED']

# Create the plot
plt.figure(figsize=(10, 6))
plt.scatter(confirmed_planets['koi_period'], confirmed_planets['koi_prad'], alpha=0.5, s=10)

# Use a log scale for better visibility of the distribution
plt.xscale('log')
plt.yscale('log')

# Add labels and a title
plt.title('NASA Kepler Confirmed Exoplanets')
plt.xlabel('Orbital Period (days)')
plt.ylabel('Planet Radius (Earth Radii)')

# Show the plot
plt.show()

In [None]:
# Select a subset of promising features
features = [
    'koi_period',        # Orbital Period (days)
    'koi_duration',      # Transit Duration (hours)
    'koi_depth',         # Transit Depth (parts per million)
    'koi_prad',          # Planetary Radius (Earth radii)
    'koi_steff',         # Stellar Effective Temperature (Kelvin)
    'koi_slogg',         # Stellar Surface Gravity (log10(cm/s^2))
    'koi_srad',          # Stellar Radius (Solar radii)
    'koi_disposition'    # The target label we want to predict
]

# Create a new DataFrame with only these features
df_clean = koi_data[features].copy()

df_clean.info()

In [None]:
# Filter out the 'CANDIDATE' entries
df_clean = df_clean[df_clean['koi_disposition'] != 'CANDIDATE']

# Create a numerical mapping for our target
disposition_map = {'CONFIRMED': 1, 'FALSE POSITIVE': 0}
df_clean['koi_disposition'] = df_clean['koi_disposition'].map(disposition_map)

# Check the new distribution
df_clean['koi_disposition'].value_counts()

In [None]:
# Drop rows with any missing values
df_clean = df_clean.dropna()

# Verify that there are no more missing values
df_clean.info()

In [None]:
if 'df_clean' not in locals():
	print("Variable 'df_clean' is not defined. Please run the previous data cleaning cells first.")
else:
	# 'X' contains all our feature columns
	X = df_clean.drop('koi_disposition', axis=1)

	# 'y' contains the target column
	y = df_clean['koi_disposition']

In [None]:
from sklearn.model_selection import train_test_split

# Check if X and y are defined
if 'X' not in locals() or 'y' not in locals():
	print("Variables X and y are not defined. Please run the cell that defines X and y (cell 8) first.")
else:
	# Split the data into training and testing sets
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

	# stratify=y ensures that the proportion of confirmed planets and false positives is the same in both the train and test sets.
	# random_state=42 ensures we get the same split every time we run the code.

	print(f"Training data shape: {X_train.shape}")
	print(f"Testing data shape: {X_test.shape}")

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Check if X_train and y_train are defined
if 'X_train' not in locals() or 'y_train' not in locals():
	print("Variables X_train and y_train are not defined. Please run the cell that splits the data (cell 9) first.")
else:
	# Create the model instance
	# n_estimators is the number of "trees" in our forest. 100 is a good starting point.
	model = RandomForestClassifier(n_estimators=100, random_state=42)

	# Train the model on the training data
	model.fit(X_train, y_train)

	print("Model training complete!")

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Use the trained model to make predictions on the test set
predictions = model.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("\n---------------------------------------\n")

# Print a detailed classification report
print("Classification Report:")
print(classification_report(y_test, predictions, target_names=['False Positive', 'Confirmed Planet']))

In [28]:
%pip install joblib

import joblib
import os

# Create a directory to store the model if it doesn't exist
if not os.path.exists('../model'):
    os.makedirs('../model')

# Save the trained model to a file
joblib.dump(upgraded_model, '../model/exoplanet_model.joblib')

print("Model saved successfully to 'model/exoplanet_model.joblib'")

Note: you may need to restart the kernel to use updated packages.
Model saved successfully to 'model/exoplanet_model.joblib'


In [29]:
import pandas as pd
import numpy as np

# Load the datasets
# Make sure to update the filenames to match what you have in your 'data' folder
df_koi = pd.read_csv('../data/cumulative_2025.10.04_00.14.27.csv', comment='#')
df_k2 = pd.read_csv('../data/k2pandc_2025.10.04_22.22.53.csv', comment='#')
df_tess = pd.read_csv('../data/TOI_2025.10.04_22.22.47.csv', comment='#')

print("Kepler (KOI) Shape:", df_koi.shape)
print("K2 Shape:", df_k2.shape)
print("TESS (TOI) Shape:", df_tess.shape)

Kepler (KOI) Shape: (9564, 49)
K2 Shape: (4004, 94)
TESS (TOI) Shape: (7703, 65)


In [30]:
# --- 1. Process Kepler (KOI) Data ---
# Select and rename columns
df_koi_clean = df_koi[['koi_disposition', 'koi_period', 'koi_duration', 'koi_depth', 'koi_prad', 'koi_steff', 'koi_slogg', 'koi_srad']].copy()
df_koi_clean = df_koi_clean.rename(columns={
    'koi_disposition': 'disposition', 'koi_period': 'period', 'koi_duration': 'duration', 'koi_depth': 'depth',
    'koi_prad': 'planet_radius', 'koi_steff': 'stellar_temp', 'koi_slogg': 'stellar_gravity', 'koi_srad': 'stellar_radius'
})
# Filter for our target classes and map them to 1s and 0s
df_koi_clean = df_koi_clean[df_koi_clean['disposition'].isin(['CONFIRMED', 'FALSE POSITIVE'])]
df_koi_clean['disposition'] = df_koi_clean['disposition'].map({'CONFIRMED': 1, 'FALSE POSITIVE': 0})


# --- 2. Process K2 Data ---
# Check actual column names in df_k2
print("K2 columns:", df_k2.columns.tolist())

# Adjust column names below to match your actual df_k2 columns
# Example: If your columns are ['k2_disp', 'k2_orbper', ...], update accordingly
# For demonstration, let's use placeholder names; replace with your actual column names
k2_columns = [
    'k2_disp',        # disposition
    'k2_orbper',      # period
    'k2_trandur',     # duration
    'k2_trandep',     # depth
    'k2_rade',        # planet_radius
    'k2_teff',        # stellar_temp
    'k2_logg',        # stellar_gravity
    'k2_rad'          # stellar_radius
]

# Only proceed if all columns exist
missing_cols = [col for col in k2_columns if col not in df_k2.columns]
if missing_cols:
    print("Missing columns in df_k2:", missing_cols)
    # Create an empty DataFrame with the expected columns and correct names
    df_k2_clean = pd.DataFrame(columns=[
        'disposition', 'period', 'duration', 'depth',
        'planet_radius', 'stellar_temp', 'stellar_gravity', 'stellar_radius'
    ])
else:
    df_k2_clean = df_k2[k2_columns].copy()
    df_k2_clean = df_k2_clean.rename(columns={
        'k2_disp': 'disposition', 'k2_orbper': 'period', 'k2_trandur': 'duration', 'k2_trandep': 'depth',
        'k2_rade': 'planet_radius', 'k2_teff': 'stellar_temp', 'k2_logg': 'stellar_gravity', 'k2_rad': 'stellar_radius'
    })
    df_k2_clean = df_k2_clean[df_k2_clean['disposition'].isin(['CONFIRMED', 'FALSE POSITIVE'])]
    df_k2_clean['disposition'] = df_k2_clean['disposition'].map({'CONFIRMED': 1, 'FALSE POSITIVE': 0})


# --- 3. Process TESS (TOI) Data ---
df_tess_clean = df_tess[['tfopwg_disp', 'pl_orbper', 'pl_trandurh', 'pl_trandep', 'pl_rade', 'st_teff', 'st_logg', 'st_rad']].copy()
df_tess_clean = df_tess_clean.rename(columns={
    'tfopwg_disp': 'disposition', 'pl_orbper': 'period', 'pl_trandurh': 'duration', 'pl_trandep': 'depth',
    'pl_rade': 'planet_radius', 'st_teff': 'stellar_temp', 'st_logg': 'stellar_gravity', 'st_rad': 'stellar_radius'
})
# For TESS, the labels are 'CP' (Confirmed Planet) and 'FP' (False Positive)
df_tess_clean = df_tess_clean[df_tess_clean['disposition'].isin(['CP', 'FP'])]
df_tess_clean['disposition'] = df_tess_clean['disposition'].map({'CP': 1, 'FP': 0})


# --- 4. Combine into a Master DataFrame ---
df_master = pd.concat([df_koi_clean, df_k2_clean, df_tess_clean], ignore_index=True)

# --- 5. Final Cleanup ---
# Drop any rows with missing data
df_master = df_master.dropna()

print("Shape of the final master dataset:", df_master.shape)
print("\nDistribution of classes in the master dataset:")
print(df_master['disposition'].value_counts())

K2 columns: ['pl_name', 'hostname', 'default_flag', 'disposition', 'disp_refname', 'sy_snum', 'sy_pnum', 'discoverymethod', 'disc_year', 'disc_facility', 'soltype', 'pl_controv_flag', 'pl_refname', 'pl_orbper', 'pl_orbpererr1', 'pl_orbpererr2', 'pl_orbperlim', 'pl_orbsmax', 'pl_orbsmaxerr1', 'pl_orbsmaxerr2', 'pl_orbsmaxlim', 'pl_rade', 'pl_radeerr1', 'pl_radeerr2', 'pl_radelim', 'pl_radj', 'pl_radjerr1', 'pl_radjerr2', 'pl_radjlim', 'pl_bmasse', 'pl_bmasseerr1', 'pl_bmasseerr2', 'pl_bmasselim', 'pl_bmassj', 'pl_bmassjerr1', 'pl_bmassjerr2', 'pl_bmassjlim', 'pl_bmassprov', 'pl_orbeccen', 'pl_orbeccenerr1', 'pl_orbeccenerr2', 'pl_orbeccenlim', 'pl_insol', 'pl_insolerr1', 'pl_insolerr2', 'pl_insollim', 'pl_eqt', 'pl_eqterr1', 'pl_eqterr2', 'pl_eqtlim', 'ttv_flag', 'st_refname', 'st_spectype', 'st_teff', 'st_tefferr1', 'st_tefferr2', 'st_tefflim', 'st_rad', 'st_raderr1', 'st_raderr2', 'st_radlim', 'st_mass', 'st_masserr1', 'st_masserr2', 'st_masslim', 'st_met', 'st_meterr1', 'st_meterr2',

  df_master = pd.concat([df_koi_clean, df_k2_clean, df_tess_clean], ignore_index=True)


In [31]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import os

# --- 1. Define Features (X) and Target (y) ---
# Ensure the target is numeric (0/1) and drop any rows with invalid targets
df_master['disposition'] = pd.to_numeric(df_master['disposition'], errors='coerce')
df_master = df_master.dropna(subset=['disposition'])
df_master['disposition'] = df_master['disposition'].astype(int)

X = df_master.drop('disposition', axis=1)
y = df_master['disposition']

# --- 2. Split Data into Training and Testing Sets ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

print("New training data shape:", X_train.shape)
print("New testing data shape:", X_test.shape)

# --- 3. Create and Train the New Model ---
# We'll use the same Random Forest model, but it will learn from the richer dataset
upgraded_model = RandomForestClassifier(n_estimators=100, random_state=42)
upgraded_model.fit(X_train, y_train)

# --- 4. Evaluate the Upgraded Model ---
predictions = upgraded_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print(f"\nUpgraded Model Accuracy: {accuracy * 100:.2f}%")
print("\nNew Classification Report:")
print(classification_report(y_test, predictions, target_names=['False Positive', 'Confirmed Planet']))

# --- 5. Save the Upgraded Model ---
# This will overwrite your old model file with the new, more powerful one
joblib.dump(upgraded_model, '../model/exoplanet_model.joblib')

print("\nUpgraded model saved successfully to 'model/exoplanet_model.joblib'")

New training data shape: (6269, 7)
New testing data shape: (2687, 7)

Upgraded Model Accuracy: 86.90%

New Classification Report:
                  precision    recall  f1-score   support

  False Positive       0.90      0.89      0.89      1664
Confirmed Planet       0.82      0.84      0.83      1023

        accuracy                           0.87      2687
       macro avg       0.86      0.86      0.86      2687
    weighted avg       0.87      0.87      0.87      2687


Upgraded model saved successfully to 'model/exoplanet_model.joblib'
