In [15]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118


Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.7.1%2Bcu118-cp313-cp313-win_amd64.whl.metadata (27 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.22.1%2Bcu118-cp313-cp313-win_amd64.whl.metadata (6.3 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.7.1%2Bcu118-cp313-cp313-win_amd64.whl.metadata (6.8 kB)
Collecting filelock (from torch)
  Downloading https://download.pytorch.org/whl/filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading https://download.pytorch.org/whl/sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading https://download.pytorch.org/whl/networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Collecting fsspec (from torch)
  Downloading https://download.pytorch.org/whl/fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB

In [1]:
import torch
print(torch.__version__)
print("CUDA available:", torch.cuda.is_available())

2.8.0+cu128
CUDA available: True


In [2]:
import torch, platform
print("Python:", platform.python_version())
print("Torch:", torch.__version__)
print("Built with CUDA:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))

Python: 3.13.7
Torch: 2.8.0+cu128
Built with CUDA: 12.8
CUDA available: True
Device: NVIDIA GeForce RTX 5090


In [3]:
import torch

device = torch.device("cuda")  
torch.set_default_device(device)  


In [4]:
import pandas as pd
df = pd.read_csv("cumulative_2025.10.05_02.00.07.csv")
print(df.shape)

(9564, 49)


In [5]:
print(df["koi_disposition"].value_counts(dropna=False))


koi_disposition
FALSE POSITIVE    4839
CONFIRMED         2746
CANDIDATE         1979
Name: count, dtype: int64


In [6]:
df_filtered = df[df["koi_disposition"].isin(["CONFIRMED", "CANDIDATE"])].copy()
df_filtered["koi_disposition"] = df_filtered["koi_disposition"].map({
    "CONFIRMED": 0,  # Non-planet (negative class)
    "CANDIDATE": 1   # Planet candidate (positive class)
})
print(df_filtered["koi_disposition"].value_counts())


koi_disposition
0    2746
1    1979
Name: count, dtype: int64


In [7]:
df_clean = df_filtered.copy()
drop_cols = [
    "rowid", 
    "kepid", 
    "kepoi_name", 
    "kepler_name",
    "koi_pdisposition", 
    "koi_tce_delivname", 
    "koi_vet_date",
    "koi_vet_stat", 
    "kepler_disposition"
]
for c in drop_cols:
    if c in df_clean.columns:
        df_clean.drop(columns=c, inplace=True)

# Drop columns that are >40 % missing
missing_frac = df_clean.isna().mean()
too_missing = missing_frac[missing_frac > 0.4].index
df_clean.drop(columns=too_missing, inplace=True)

print(f"Dropped {len(too_missing)} high-missing columns")
print("Remaining columns:", len(df_clean.columns))

# Quick look at remaining data
print(df_clean.head(3))
print(df_clean.isna().mean().sort_values(ascending=False).head(10))

Dropped 2 high-missing columns
Remaining columns: 42
   koi_disposition  koi_score  koi_fpflag_nt  koi_fpflag_ss  koi_fpflag_co  \
0                0      1.000              0              0              0   
1                0      0.969              0              0              0   
2                1      0.000              0              0              0   

   koi_fpflag_ec  koi_period  koi_period_err1  koi_period_err2  koi_time0bk  \
0              0    9.488036         0.000028        -0.000028   170.538750   
1              0   54.418383         0.000248        -0.000248   162.513840   
2              0   19.899140         0.000015        -0.000015   175.850252   

   ...  koi_steff_err2  koi_slogg  koi_slogg_err1  koi_slogg_err2  koi_srad  \
0  ...           -81.0      4.467           0.064          -0.096     0.927   
1  ...           -81.0      4.467           0.064          -0.096     0.927   
2  ...          -176.0      4.544           0.044          -0.176     0.868   



In [8]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Separate features / target
target_col = "koi_disposition"
X = df_clean.drop(columns=[target_col])
y = df_clean[target_col]

# --- Identify numeric vs categorical ---
num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
cat_cols = [c for c in X.columns if not pd.api.types.is_numeric_dtype(X[c])]

print(f"Numeric cols: {len(num_cols)}, Categorical cols: {len(cat_cols)}")

# --- Preprocessor ---
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols)
])

# --- Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

print("Shapes -> train:", X_train.shape, "test:", X_test.shape)


Numeric cols: 41, Categorical cols: 0
Shapes -> train: (3307, 41) test: (1418, 41)


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import time

# --- Random Forest baseline ---
rf = Pipeline([
    ("prep", preprocessor),
    ("clf", RandomForestClassifier(
        n_estimators=200,
        random_state=42,
        n_jobs=-1
    ))
])

t0 = time.time()
rf.fit(X_train, y_train)
fit_time = time.time() - t0

# --- Predictions ---
y_pred = rf.predict(X_test)

# --- Metrics ---
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# specificity = TN / (TN + FP)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
spec = tn / (tn + fp)

print(f"Fit time: {fit_time:.2f} sec")
print(f"Accuracy: {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall (Sensitivity): {rec:.3f}")
print(f"Specificity: {spec:.3f}")
print(f"F1 Score: {f1:.3f}")


Fit time: 0.24 sec
Accuracy: 0.878
Precision: 0.860
Recall (Sensitivity): 0.847
Specificity: 0.900
F1 Score: 0.853


In [10]:
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate
import numpy as np

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=42)

scoring = {
    "accuracy": "accuracy",
    "precision": "precision",
    "recall": "recall",
    "f1": "f1",
}

cv_results = cross_validate(
    rf,          # the Pipeline from the last step
    X, y,        # use the full filtered dataset here
    cv=cv,
    scoring=scoring,
    n_jobs=-1,
    return_train_score=False
)

for m in ["accuracy", "precision", "recall", "f1"]:
    mean = np.mean(cv_results[f"test_{m}"])
    std  = np.std(cv_results[f"test_{m}"])
    print(f"{m:9s}: {mean:.3f} ± {std:.3f}")


accuracy : 0.876 ± 0.013
precision: 0.863 ± 0.024
recall   : 0.838 ± 0.024
f1       : 0.850 ± 0.016


In [11]:
from sklearn.ensemble import ExtraTreesClassifier

et = Pipeline([
    ("prep", preprocessor),
    ("clf", ExtraTreesClassifier(
        n_estimators=300,
        random_state=42,
        n_jobs=-1
    ))
])

cv_results_et = cross_validate(
    et,
    X, y,
    cv=cv,
    scoring=scoring,
    n_jobs=-1,
    return_train_score=False
)

for m in ["accuracy", "precision", "recall", "f1"]:
    mean = np.mean(cv_results_et[f"test_{m}"])
    std  = np.std(cv_results_et[f"test_{m}"])
    print(f"{m:9s}: {mean:.3f} ± {std:.3f}")


accuracy : 0.871 ± 0.012
precision: 0.861 ± 0.019
recall   : 0.824 ± 0.024
f1       : 0.842 ± 0.016


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

# --- Define base learners and meta-learner ---
stack = Pipeline([
    ("prep", preprocessor),
    ("clf", StackingClassifier(
        estimators=[
            ("rf", RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)),
            ("et", ExtraTreesClassifier(n_estimators=200, n_jobs=-1, random_state=42))
        ],
        final_estimator=LogisticRegression(max_iter=200, random_state=42),
        n_jobs=-1
    ))
])

cv_results_stack = cross_validate(
    stack,
    X, y,
    cv=cv,
    scoring=scoring,
    n_jobs=-1,
    return_train_score=False
)

for m in ["accuracy", "precision", "recall", "f1"]:
    mean = np.mean(cv_results_stack[f"test_{m}"])
    std  = np.std(cv_results_stack[f"test_{m}"])
    print(f"{m:9s}: {mean:.3f} ± {std:.3f}")


accuracy : 0.876 ± 0.014
precision: 0.866 ± 0.022
recall   : 0.835 ± 0.026
f1       : 0.850 ± 0.017


In [13]:
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import numpy as np
import time

# Baseline RF inside the same preprocessing pipeline
rf_pipe = Pipeline([
    ("prep", preprocessor),
    ("clf", RandomForestClassifier(random_state=42, n_jobs=-1))
])

# A compact but effective grid — keeps runtime reasonable
param_grid = {
    "clf__n_estimators": [200, 400, 800],
    "clf__max_depth": [None, 10, 20, 30],
    "clf__min_samples_split": [2, 5, 10],
    "clf__min_samples_leaf": [1, 2, 4],
    "clf__max_features": ["sqrt", "log2"]
}

grid = GridSearchCV(
    estimator=rf_pipe,
    param_grid=param_grid,
    scoring="f1",          # optimize for F1 like in the paper
    cv=cv,                 # 10x5 repeated stratified CV
    n_jobs=-1,
    refit=True,
    verbose=1
)

t0 = time.time()
grid.fit(X, y)
fit_time = time.time() - t0

print("\nBest params:")
for k, v in grid.best_params_.items():
    print(f"  {k}: {v}")
print(f"GridSearch fit time: {fit_time:.1f}s")

# Cross-validated performance of the tuned model (fresh CV on the whole data)
tuned_model = grid.best_estimator_

cv_results_tuned = cross_validate(
    tuned_model,
    X, y,
    cv=cv,
    scoring=scoring,
    n_jobs=-1,
    return_train_score=False
)

print("\nTuned RandomForest (CV means ± std):")
for m in ["accuracy", "precision", "recall", "f1"]:
    mean = np.mean(cv_results_tuned[f"test_{m}"])
    std  = np.std(cv_results_tuned[f"test_{m}"])
    print(f"{m:9s}: {mean:.3f} ± {std:.3f}")


Fitting 50 folds for each of 216 candidates, totalling 10800 fits

Best params:
  clf__max_depth: 30
  clf__max_features: sqrt
  clf__min_samples_leaf: 1
  clf__min_samples_split: 5
  clf__n_estimators: 400
GridSearch fit time: 3836.6s

Tuned RandomForest (CV means ± std):
accuracy : 0.878 ± 0.014
precision: 0.863 ± 0.022
recall   : 0.844 ± 0.025
f1       : 0.853 ± 0.017


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_auc_score, average_precision_score, classification_report
)
import numpy as np

# If you still have X_train/X_test from earlier, reuse them; if not, recreate:
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Build the tuned RF from your best params
bp = grid.best_params_
rf = RandomForestClassifier(
    n_estimators=bp["clf__n_estimators"],
    max_depth=bp["clf__max_depth"],
    min_samples_split=bp["clf__min_samples_split"],
    min_samples_leaf=bp["clf__min_samples_leaf"],
    max_features=bp["clf__max_features"],
    random_state=42,
    n_jobs=-1
)

final_pipe = Pipeline([("prep", preprocessor), ("clf", rf)])
final_pipe.fit(X_tr, y_tr)

# Evaluate on the holdout
y_hat = final_pipe.predict(X_te)
p1 = final_pipe.predict_proba(X_te)[:,1]

acc  = accuracy_score(y_te, y_hat)
prec = precision_score(y_te, y_hat)
rec  = recall_score(y_te, y_hat)
f1   = f1_score(y_te, y_hat)
tn, fp, fn, tp = confusion_matrix(y_te, y_hat).ravel()
spec = tn / (tn + fp)
roc  = roc_auc_score(y_te, p1)
pr   = average_precision_score(y_te, p1)

print(f"Holdout — Acc:{acc:.3f} Prec:{prec:.3f} Rec:{rec:.3f} Spec:{spec:.3f} F1:{f1:.3f} ROC-AUC:{roc:.3f} PR-AUC:{pr:.3f}")
print("\nClassification report:\n", classification_report(y_te, y_hat, digits=3))


Holdout — Acc:0.877 Prec:0.858 Rec:0.845 Spec:0.899 F1:0.852 ROC-AUC:0.938 PR-AUC:0.921

Classification report:
               precision    recall  f1-score   support

           0      0.890     0.899     0.894       824
           1      0.858     0.845     0.852       594

    accuracy                          0.877      1418
   macro avg      0.874     0.872     0.873      1418
weighted avg      0.876     0.877     0.876      1418



In [15]:
import joblib, json, os
import pandas as pd

os.makedirs("models", exist_ok=True)
joblib.dump(final_pipe, "models/rf_pipeline.joblib", compress=3)
print("Saved models/rf_pipeline.joblib")

num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
cat_cols = [c for c in X.columns if not pd.api.types.is_numeric_dtype(X[c])]
schema = {"feature_cols": X.columns.tolist(), "num_cols": num_cols, "cat_cols": cat_cols}
with open("models/koi_schema.json","w") as f: json.dump(schema, f, indent=2)

cat_values = {}
for c in cat_cols:
    vals = X[c].dropna().astype(str).value_counts().index.tolist()
    cat_values[c] = vals[:50]
with open("models/categorical_values.json","w") as f: json.dump(cat_values, f, indent=2)
print("Saved schema and categorical values in models/")


Saved models/rf_pipeline.joblib
Saved schema and categorical values in models/
