In [1]:
import pandas as pd
import warnings

# Silence every warning for the rest of the notebook.
# A single catch-all "ignore" filter is sufficient: filterwarnings('ignore')
# and simplefilter('ignore') register the same filter, so only one is kept.
# NOTE(review): blanket suppression also hides deprecation and convergence
# warnings from pandas / scikit-learn; consider narrowing it by category.
warnings.simplefilter('ignore')

In [2]:
# Read the raw table, discard columns that are entirely null, then discard
# every row that still contains at least one null value.
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [3]:
# Target: the koi_disposition label of each row.
y = df['koi_disposition']

# Features: every remaining column. `columns=` already names the axis being
# dropped from, so the redundant `axis=1` argument is omitted.
X = df.drop(columns=["koi_disposition"])

In [4]:
from sklearn.model_selection import train_test_split

# 60% of the rows are used for training; random_state is fixed at 42.
split_options = {"random_state": 42, "train_size": 0.6}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [5]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels only, then
# apply that same mapping to both splits.
encoder = LabelEncoder()
encoder.fit(y_train)

y_train_encoded, y_test_encoded = (
    encoder.transform(labels) for labels in (y_train, y_test)
)
y_train_encoded

array([2, 1, 1, ..., 2, 2, 2])

In [6]:
# Side-by-side view of the original test labels and their integer codes.
code = pd.DataFrame(dict(Actual=y_test, Encoded=y_test_encoded))
code.head(6)

Unnamed: 0,Actual,Encoded
4982,FALSE POSITIVE,2
4866,CANDIDATE,0
2934,FALSE POSITIVE,2
5007,FALSE POSITIVE,2
3869,FALSE POSITIVE,2
233,CONFIRMED,1


In [7]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training features only, then reuse the fitted scaler
# to transform both the training and the test features.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)


In [8]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic: without a fixed random_state every re-run
# yields slightly different scores and feature importances. Seeding with 42
# (the same value the train/test split uses) makes the notebook reproducible
# under Restart & Run All.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train_scaled, y_train_encoded)

In [9]:
# Score the fitted forest on each split, then report both numbers.
train_score = rf.score(X_train_scaled, y_train_encoded)
test_score = rf.score(X_test_scaled, y_test_encoded)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 1.0
Testing Data Score: 0.8984626385412943


In [10]:
from sklearn.metrics import classification_report

predictions = rf.predict(X_test_scaled)

# Display names, listed in the order of the integer codes 0, 1, 2.
class_names = ['Candidate', 'Confirmed', 'False Positive']
report = classification_report(y_test_encoded, predictions, target_names=class_names)
print(report)


                precision    recall  f1-score   support

     Candidate       0.82      0.76      0.79       672
     Confirmed       0.83      0.84      0.84       764
False Positive       0.97      1.00      0.98      1361

      accuracy                           0.90      2797
     macro avg       0.87      0.87      0.87      2797
  weighted avg       0.90      0.90      0.90      2797



In [11]:
predictions = rf.predict(X_test_scaled)

predictions_df = pd.DataFrame({"Actual": y_test_encoded, "Prediction": predictions})

# Per-class counts for the actual and the predicted labels. A class that is
# missing from one of the two columns would produce NaN, so it is counted as 0.
results = predictions_df.apply(pd.Series.value_counts).fillna(0).astype(int)

# Derive each row's label from its encoded class (the row index) instead of a
# hand-ordered list: the row order of the value_counts table depends on the
# class frequencies, so positional labels can end up on the wrong row.
code = [str(encoder.classes_[class_id]).title() for class_id in results.index]
results['Code'] = code
results

Unnamed: 0,Actual,Prediction,Code
2,1361,1397,False Positive
1,764,777,Confirmed
0,672,623,Candidate


In [12]:
# Pair every importance value with its column name and list the pairs from
# the largest importance to the smallest.
feature_names = X.columns
ranked_importances = list(zip(rf.feature_importances_, feature_names))
ranked_importances.sort(reverse=True)
ranked_importances

[(0.10501523322195407, 'koi_fpflag_co'),
 (0.10147943072762046, 'koi_fpflag_nt'),
 (0.07707145850627667, 'koi_fpflag_ss'),
 (0.05278193834181785, 'koi_model_snr'),
 (0.03923214763933334, 'koi_prad'),
 (0.034288354982730064, 'koi_prad_err2'),
 (0.03352219127170376, 'koi_duration_err1'),
 (0.032979351367336096, 'koi_fpflag_ec'),
 (0.031456242340727185, 'koi_steff_err2'),
 (0.031099008261017108, 'koi_prad_err1'),
 (0.029098104707664853, 'koi_steff_err1'),
 (0.027568764119805432, 'koi_duration_err2'),
 (0.02557882795023595, 'koi_time0bk_err2'),
 (0.02357139802091029, 'koi_time0bk_err1'),
 (0.023011313346165182, 'koi_duration'),
 (0.02295832657173301, 'koi_period'),
 (0.019604096430955724, 'koi_depth'),
 (0.019330767746291624, 'koi_impact'),
 (0.0177944019566704, 'koi_period_err2'),
 (0.017734998398748227, 'koi_insol_err1'),
 (0.01768390951674988, 'koi_period_err1'),
 (0.016170242179051976, 'koi_teq'),
 (0.014777289385873088, 'koi_insol_err2'),
 (0.014426094193163949, 'koi_depth_err1'),
 (0

In [13]:
import joblib

# Persist the fitted forest to disk; the cell displays the list of file
# names that joblib reports as written.
filename = 'random_forest.sav'
joblib.dump(value=rf, filename=filename)

['random_forest.sav']