## Project ini binary classification, karena targetnya cuma dua kelas:
#### 0 → major/common planets (TypeFlag = 0)
#### 1 → minor/rare planets (TypeFlag != 0)
---

## Import Dataset

In [None]:
import pandas as pd 

df = pd.read_csv("dataset/oec.csv")

## EDA

In [2]:
print(f'Cek 5 data teratas \n {df.head()} \n')

print(f'Cek Tipe Data \n {df.info()} \n')

print(f'Cek Missing Values \n {df.isnull().sum()} \n')

Cek 5 data teratas 
   PlanetIdentifier  TypeFlag  PlanetaryMassJpt  RadiusJpt  PeriodDays  \
0      HD 143761 b         0            1.0450        NaN   39.845800   
1      HD 143761 c         0            0.0790        NaN  102.540000   
2      KOI-1843.03         0            0.0014      0.054    0.176891   
3      KOI-1843.01         0               NaN      0.114    4.194525   
4      KOI-1843.02         0               NaN      0.071    6.356006   

   SemiMajorAxisAU  Eccentricity  PeriastronDeg  LongitudeDeg  \
0           0.2196         0.037          270.6           NaN   
1           0.4123         0.050          190.0           NaN   
2           0.0048           NaN            NaN           NaN   
3           0.0390           NaN            NaN           NaN   
4           0.0520           NaN            NaN           NaN   

   AscendingNodeDeg  ...  LastUpdated  RightAscension  Declination  \
0               NaN  ...     16/07/11        16 01 03    +33 18 13   
1        

## Cek distribusi TypeFlag

In [3]:
df['TargetBinary'] = (df['TypeFlag'] != 0).astype(int)


df['TargetBinary'].value_counts()
df['TargetBinary'].value_counts(normalize=True)


TargetBinary
0    0.947545
1    0.052455
Name: proportion, dtype: float64

## Tentuin fitur dulu

In [4]:
features = [
    'PlanetaryMassJpt',
    'RadiusJpt',
    'PeriodDays',
    'SemiMajorAxisAU',
    'Eccentricity',
    'HostStarMassSlrMass',
    'HostStarRadiusSlrRad'
]


## Bikin target binary

In [5]:
df['TargetBinary'] = (df['TypeFlag'] != 0).astype(int)


## Baru define X dan y

In [6]:
X = df[features]
y = df['TargetBinary']


## --- IMPUTASI DULU ---

In [7]:

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(X)

# --- MASUKIN LAGI KE DATAFRAME BIAR BISA NAMA KOLOM ---
import pandas as pd
X = pd.DataFrame(X, columns=features)

## --- FEATURE ENGINEERING DITARO DI SINI ---


In [8]:
import numpy as np

X['MassRadiusRatio'] = X['PlanetaryMassJpt'] / X['RadiusJpt']
X['DensityApprox'] = X['PlanetaryMassJpt'] / (X['RadiusJpt']**3)
X['OrbitalEnergy'] = X['PlanetaryMassJpt'] / X['SemiMajorAxisAU']
X['ScaledPeriod'] = np.log1p(X['PeriodDays'])

# Fitur tambahan
X['MassSqrt'] = np.sqrt(X['PlanetaryMassJpt'])
X['RadiusSqrt'] = np.sqrt(X['RadiusJpt'])
X['PeriodLog'] = np.log1p(X['PeriodDays'])
X['SemiMajorAxisLog'] = np.log1p(X['SemiMajorAxisAU'])
X['EccentricitySquared'] = X['Eccentricity']**2


## Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


## Apply SMOTE di training

In [10]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_train_res.value_counts())

Before SMOTE: TargetBinary
0    2717
1     150
Name: count, dtype: int64
After SMOTE: TargetBinary
0    2717
1    2717
Name: count, dtype: int64


## Modeling

In [11]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, balanced_accuracy_score

# Build CatBoost model
# scale_pos_weight udah di-handle SMOTE, jadi gak perlu class_weights
model = CatBoostClassifier(
    iterations=500,
    depth=5,
    learning_rate=0.05,
    l2_leaf_reg=3,
    subsample=0.8,
    colsample_bylevel=0.8,
    random_seed=42,
    verbose=0
)

# Training
model.fit(X_train_res, y_train_res, eval_set=(X_test, y_test))

<catboost.core.CatBoostClassifier at 0x166b04830>

## Probabilitas & threshold tuning minor class

In [12]:
y_proba = model.predict_proba(X_test)[:, 1]

best = (0, 0)
for t in [i/100 for i in range(10, 90)]:
    pred = (y_proba > t).astype(int)
    f1 = f1_score(y_test, pred, pos_label=1)  # fokus minor
    if f1 > best[1]:
        best = (t, f1)

best_threshold = best[0]
print("Best threshold (minor class):", best_threshold)
print("F1 minor class:", best[1])

Best threshold (minor class): 0.33
F1 minor class: 0.33783783783783783


## Final prediction pakai threshold

In [13]:
y_pred = (y_proba > best_threshold).astype(int)

## Evaluasi

In [14]:
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Balanced Accuracy:", balanced_accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.87      0.92       679
           1       0.23      0.66      0.34        38

    accuracy                           0.86       717
   macro avg       0.60      0.77      0.63       717
weighted avg       0.94      0.86      0.89       717

Confusion Matrix:
 [[594  85]
 [ 13  25]]
Balanced Accuracy: 0.766355321292923


## Feature importance (CatBoost)

In [15]:
fi = model.get_feature_importance(prettified=True)
print(fi)

              Feature Id  Importances
0    EccentricitySquared    16.608270
1           Eccentricity    12.443700
2          OrbitalEnergy    10.590295
3        SemiMajorAxisAU     8.020382
4    HostStarMassSlrMass     7.193458
5       SemiMajorAxisLog     6.568921
6   HostStarRadiusSlrRad     5.672240
7           ScaledPeriod     4.044083
8       PlanetaryMassJpt     3.985872
9          DensityApprox     3.930123
10             RadiusJpt     3.888934
11              MassSqrt     3.863270
12            RadiusSqrt     3.442286
13             PeriodLog     3.410249
14       MassRadiusRatio     3.318830
15            PeriodDays     3.019085


## test model

### Misal ini contoh data baru

In [16]:
import pandas as pd
import numpy as np

new_data = pd.DataFrame([{
    'PlanetaryMassJpt': 0.5,
    'RadiusJpt': 0.9,
    'PeriodDays': 50,
    'SemiMajorAxisAU': 0.3,
    'Eccentricity': 0.05,
    'HostStarMassSlrMass': 1.0,
    'HostStarRadiusSlrRad': 1.0
}])

### Feature engineering sama seperti training

In [17]:
new_data['EccentricitySquared'] = new_data['Eccentricity'] ** 2
new_data['OrbitalEnergy'] = new_data['HostStarMassSlrMass'] / new_data['SemiMajorAxisAU']  # contoh sederhana
new_data['SemiMajorAxisLog'] = np.log1p(new_data['SemiMajorAxisAU'])
new_data['ScaledPeriod'] = new_data['PeriodDays'] / new_data['PeriodDays'].max()
new_data['RadiusSqrt'] = np.sqrt(new_data['RadiusJpt'])
new_data['MassSqrt'] = np.sqrt(new_data['PlanetaryMassJpt'])
new_data['MassRadiusRatio'] = new_data['PlanetaryMassJpt'] / new_data['RadiusJpt']
new_data['DensityApprox'] = new_data['PlanetaryMassJpt'] / new_data['RadiusJpt']**3
new_data['PeriodLog'] = np.log1p(new_data['PeriodDays'])
new_data['EccentricitySquared'] = new_data['Eccentricity']**2


### Pilih fitur sesuai model

In [18]:
features = [
    'EccentricitySquared', 'Eccentricity', 'OrbitalEnergy', 'SemiMajorAxisAU',
    'HostStarMassSlrMass', 'SemiMajorAxisLog', 'HostStarRadiusSlrRad',
    'ScaledPeriod', 'PlanetaryMassJpt', 'DensityApprox', 'RadiusJpt',
    'MassSqrt', 'RadiusSqrt', 'PeriodLog', 'MassRadiusRatio', 'PeriodDays'
]

X_new = new_data[features]

### Prediksi pakai model yang udah dilatih

In [19]:
y_proba = model.predict_proba(X_new)[:, 1]
threshold = best_threshold  # dari threshold tuning
y_pred = (y_proba > threshold).astype(int)

### Tampilkan hasil

In [20]:
for i, row in new_data.iterrows():
    print(f"Planet {i+1}:")
    print(f"  Predicted class: {y_pred[i]}")
    print(f"  Probability (minor class): {y_proba[i]:.2f}")

Planet 1:
  Predicted class: 1
  Probability (minor class): 0.40


#### Artinya: model prediksi planet itu minor class dengan probabilitas 40%. Threshold yang dipakai sebelumnya adalah 0.33, makanya 0.40 > 0.33 → diprediksi 1.


---
#### Predicted class = 1 → planet ini termasuk minor/rare class (TypeFlag ≠ 0)
#### Predicted class = 0 → planet ini termasuk major/common class (TypeFlag = 0)

## Simpan model & threshold pakai pickle

In [23]:
import pickle
pickle.dump(model, open("models/catboost_model.pkl", "wb"))
pickle.dump(best_threshold, open("models/best_threshold.pkl", "wb"))
