In [10]:
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('grand_dAAAAAAaataset_final_scaled.csv')

In [3]:
 df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15875 entries, 0 to 15874
Data columns (total 53 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   koi_disposition                 15875 non-null  object 
 1   koi_score                       15875 non-null  float64
 2   koi_period                      15875 non-null  float64
 3   koi_eccen                       15875 non-null  float64
 4   koi_impact                      15875 non-null  float64
 5   koi_duration                    15875 non-null  float64
 6   koi_depth                       15875 non-null  float64
 7   koi_ror                         15875 non-null  float64
 8   koi_srho                        15875 non-null  float64
 9   koi_incl                        15875 non-null  float64
 10  koi_teq                         15830 non-null  float64
 11  koi_insol                       15861 non-null  float64
 12  koi_dor                         

In [4]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [5]:
y = le.fit_transform(df["koi_disposition"])
X = df.drop("koi_disposition", axis=1)

In [27]:
for idx, label in enumerate(le.classes_):
    print(f"{idx}: {label}")

0: APC
1: CANDIDATE
2: CONFIRMED
3: FALSE POSITIVE


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15875 entries, 0 to 15874
Data columns (total 53 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   koi_disposition                 15875 non-null  object 
 1   koi_score                       15875 non-null  float64
 2   koi_period                      15875 non-null  float64
 3   koi_eccen                       15875 non-null  float64
 4   koi_impact                      15875 non-null  float64
 5   koi_duration                    15875 non-null  float64
 6   koi_depth                       15875 non-null  float64
 7   koi_ror                         15875 non-null  float64
 8   koi_srho                        15875 non-null  float64
 9   koi_incl                        15875 non-null  float64
 10  koi_teq                         15830 non-null  float64
 11  koi_insol                       15861 non-null  float64
 12  koi_dor                         

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [11]:
cat_model = CatBoostClassifier(
    iterations=800, learning_rate=0.05, depth=8,
    loss_function='MultiClass', verbose=0
)

lgbm_model = LGBMClassifier(
    n_estimators=800, learning_rate=0.05, max_depth=8,
    objective='multiclass', random_state=42
)

models = {
    "CatBoost": cat_model,
    "LightGBM": lgbm_model
}

In [12]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = {}
for name, model in models.items():
    print(f"\n🔹 Cross-validating {name}...")
    acc_scores = cross_val_score(model, X_train, y_train, cv=skf, scoring='accuracy')
    f1_scores = cross_val_score(model, X_train, y_train, cv=skf, scoring='f1_macro')
    results[name] = {
        "Accuracy Mean": acc_scores.mean(),
        "F1 Macro Mean": f1_scores.mean()
    }
    print(f"Accuracy: {acc_scores.mean():.4f} | F1-Macro: {f1_scores.mean():.4f}")


🔹 Cross-validating CatBoost...
Accuracy: 0.7900 | F1-Macro: 0.6265

🔹 Cross-validating LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001153 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12501
[LightGBM] [Info] Number of data points in the train set: 10160, number of used features: 51
[LightGBM] [Info] Start training from score -3.729045
[LightGBM] [Info] Start training from score -0.988734
[LightGBM] [Info] Start training from score -1.384721
[LightGBM] [Info] Start training from score -1.039749
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004521 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12501
[LightGBM] [Info] Number of data points in the train set: 10160, number of used features: 51
[LightGBM] [Info] Start training from score -3.733152
[LightGBM] [Info] Start training from score -0.988

In [14]:
best_model_name = max(results, key=lambda x: results[x]['F1 Macro Mean'])
best_model = models[best_model_name]
print(f"\n Best Model based on CV: {best_model_name}")


 Best Model based on CV: LightGBM


In [15]:
#fit the best model on the entire training set
best_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000882 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12501
[LightGBM] [Info] Number of data points in the train set: 12700, number of used features: 51
[LightGBM] [Info] Start training from score -3.732330
[LightGBM] [Info] Start training from score -0.988523
[LightGBM] [Info] Start training from score -1.384406
[LightGBM] [Info] Start training from score -1.039972


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,8
,learning_rate,0.05
,n_estimators,800
,subsample_for_bin,200000
,objective,'multiclass'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [18]:
y_pred = best_model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.45      0.07      0.11        76
           1       0.73      0.82      0.77      1182
           2       0.83      0.77      0.80       795
           3       0.87      0.85      0.86      1122

    accuracy                           0.80      3175
   macro avg       0.72      0.63      0.64      3175
weighted avg       0.80      0.80      0.79      3175


Confusion Matrix:
[[  5  51   6  14]
 [  3 973  87 119]
 [  1 172 611  11]
 [  2 143  28 949]]


In [19]:
decoded_preds = le.inverse_transform(y_pred.astype(int))
print(decoded_preds)

['CONFIRMED' 'CONFIRMED' 'FALSE POSITIVE' ... 'FALSE POSITIVE'
 'FALSE POSITIVE' 'CANDIDATE']


In [None]:
#Feature Importance Visualization
importance = best_model.feature_importances_
features = X.columns
fi = pd.DataFrame({"Feature": features, "Importance": importance}).sort_values(by="Importance", ascending=False)
plt.figure(figsize=(10, 6))
plt.barh(fi["Feature"].head(15)[::-1], fi["Importance"].head(15)[::-1])
plt.title(f"Top 15 Feature Importances - {best_model_name}")
plt.show()

In [22]:
# Create a new DataFrame df2 without the relative error columns
relative_error_cols = [
    'Relative_Error_PLANET_ORBPER',
    'Relative_Error_PLANET_RADE',
    'Relative_Error_PLANET_TRANDEP',
    'Relative_Error_PLANET_TRANDURH',
    'Relative_Error_PLANET_TRANMID',
    'Relative_Error_STAR_DIST',
    'Relative_Error_STAR_LOGG',
    'Relative_Error_STAR_PMDEC',
    'Relative_Error_STAR_PMRA',
    'Relative_Error_STAR_RAD',
    'Relative_Error_STAR_TEFF',
    'Relative_Error_STAR_TMAG'
]
df2 = df.drop(columns=relative_error_cols)


In [23]:
# Label encode target for df2
le2 = LabelEncoder()
y2 = le2.fit_transform(df2["koi_disposition"])
X2 = df2.drop("koi_disposition", axis=1)


In [24]:
# Train/test split for df2
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.2, stratify=y2, random_state=42
)


In [25]:
# Reuse the same models for df2
results2 = {}
skf2 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for name, model in models.items():
    print(f"\n🔹 Cross-validating {name} (df2)...")
    acc_scores = cross_val_score(model, X2_train, y2_train, cv=skf2, scoring='accuracy')
    f1_scores = cross_val_score(model, X2_train, y2_train, cv=skf2, scoring='f1_macro')
    results2[name] = {
        "Accuracy Mean": acc_scores.mean(),
        "F1 Macro Mean": f1_scores.mean()
    }
    print(f"Accuracy: {acc_scores.mean():.4f} | F1-Macro: {f1_scores.mean():.4f}")



🔹 Cross-validating CatBoost (df2)...
Accuracy: 0.7872 | F1-Macro: 0.6126

🔹 Cross-validating LightGBM (df2)...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001494 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9441
[LightGBM] [Info] Number of data points in the train set: 10160, number of used features: 39
[LightGBM] [Info] Start training from score -3.729045
[LightGBM] [Info] Start training from score -0.988734
[LightGBM] [Info] Start training from score -1.384721
[LightGBM] [Info] Start training from score -1.039749
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001741 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9441
[LightGBM] [Info] Number of data points in the train set: 10160, number of used features: 39
[LightGBM] [Info] Start training from score -3.733152
[LightGBM] [Info] Start training from sc

In [26]:
import joblib
joblib.dump(best_model, f"{best_model_name}_exoplanet_model.pkl")

['LightGBM_exoplanet_model.pkl']

In [28]:
import joblib
print(list(X.columns))

['koi_score', 'koi_period', 'koi_eccen', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_ror', 'koi_srho', 'koi_incl', 'koi_teq', 'koi_insol', 'koi_dor', 'koi_model_snr', 'koi_count', 'koi_num_transits', 'koi_steff', 'koi_slogg', 'koi_smet', 'koi_smass', 'ra', 'dec', 'koi_kepmag', 'koi_gmag', 'koi_rmag', 'koi_imag', 'koi_zmag', 'koi_jmag', 'koi_hmag', 'koi_kmag', 'loc_rowid', 'st_pmra', 'st_pmdec', 'pl_tranmid', 'koi_prad', 'koi_tmag', 'st_dist', 'koi_srad', 'Relative_Error_PLANET_ORBPER', 'Relative_Error_PLANET_RADE', 'Relative_Error_PLANET_TRANDEP', 'Relative_Error_PLANET_TRANDURH', 'Relative_Error_PLANET_TRANMID', 'Relative_Error_STAR_DIST', 'Relative_Error_STAR_LOGG', 'Relative_Error_STAR_PMDEC', 'Relative_Error_STAR_PMRA', 'Relative_Error_STAR_RAD', 'Relative_Error_STAR_TEFF', 'Relative_Error_STAR_TMAG', 'koi_fittype_LS+MCMC', 'koi_fittype_MCMC', 'koi_fittype_none']
