In [2]:
import pandas as pd

## Preprocess the Data
* Preprocess the dataset prior to fitting the model.
* Perform feature selection and remove unnecessary features.

In [3]:
# Load the raw Kepler exoplanet search results and preview the first rows
raw_data_path = "Data/Kepler_Exoplanet_Search_Results.csv"
raw_df = pd.read_csv(raw_data_path)
raw_df.head()

Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,3,10811496,K00753.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,4,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [4]:
# Report each column's dtype, whether it contains any nulls, and its null count
def get_null_info(df):
    """Summarize dtype and missing-value statistics for every column of ``df``.

    Returns a DataFrame indexed by the column names of ``df`` with three
    columns: ``type`` (the column dtype), ``has_null`` (whether any value in
    the column is missing) and ``null_count`` (how many values are missing).
    """
    # Build the null mask once and derive both statistics from it
    missing_mask = df.isna()
    summary = pd.concat(
        [df.dtypes, missing_mask.any(), missing_mask.sum()],
        axis=1,
        keys=["type", "has_null", "null_count"],
    )
    return summary

# Display the per-column null summary for the raw data
get_null_info(raw_df)

Unnamed: 0,type,has_null,null_count
rowid,int64,False,0
kepid,int64,False,0
kepoi_name,object,False,0
kepler_name,object,True,7270
koi_disposition,object,False,0
koi_pdisposition,object,False,0
koi_score,float64,True,1510
koi_fpflag_nt,int64,False,0
koi_fpflag_ss,int64,False,0
koi_fpflag_co,int64,False,0


In [5]:
# Select the features used to train the model
target_feature = "koi_disposition"

# Every column of the raw data is listed below; each trailing comment records
# the column's dtype, whether it has nulls, and its null count:
#    column                 # type      has_null  null_count
# Only columns without nulls are left uncommented, so the selection needs no
# imputation or row dropping.
selected_features = [
#    "rowid",               # int64     False     0
#    "kepid",               # int64     False     0
#    "kepoi_name",          # object    False     0
#    "kepler_name",         # object    True      7270
#    "koi_disposition",     # object    False     0
#    "koi_pdisposition",    # object    False     0
#    "koi_score",           # float64   True      1510
     "koi_fpflag_nt",       # int64     False     0
     "koi_fpflag_ss",       # int64     False     0
     "koi_fpflag_co",       # int64     False     0
     "koi_fpflag_ec",       # int64     False     0
#    "koi_period",          # float64   False     0
#    "koi_period_err1",     # float64   True      454
#    "koi_period_err2",     # float64   True      454
     "koi_time0bk",         # float64   False     0
#    "koi_time0bk_err1",    # float64   True      454
#    "koi_time0bk_err2",    # float64   True      454
#    "koi_impact",          # float64   True      363
#    "koi_impact_err1",     # float64   True      454
#    "koi_impact_err2",     # float64   True      454
     "koi_duration",        # float64   False     0
#    "koi_duration_err1",   # float64   True      454
#    "koi_duration_err2",   # float64   True      454
#    "koi_depth",           # float64   True      363
#    "koi_depth_err1",      # float64   True      454
#    "koi_depth_err2",      # float64   True      454
#    "koi_prad",            # float64   True      363
#    "koi_prad_err1",       # float64   True      363
#    "koi_prad_err2",       # float64   True      363
#    "koi_teq",             # float64   True      363
#    "koi_teq_err1",        # float64   True      9564
#    "koi_teq_err2",        # float64   True      9564
#    "koi_insol",           # float64   True      321
#    "koi_insol_err1",      # float64   True      321
#    "koi_insol_err2",      # float64   True      321
#    "koi_model_snr",       # float64   True      363
#    "koi_tce_plnt_num",    # float64   True      346
#    "koi_tce_delivname",   # object    True      346
#    "koi_steff",           # float64   True      363
#    "koi_steff_err1",      # float64   True      468
#    "koi_steff_err2",      # float64   True      483
#    "koi_slogg",           # float64   True      363
#    "koi_slogg_err1",      # float64   True      468
#    "koi_slogg_err2",      # float64   True      468
#    "koi_srad",            # float64   True      363
#    "koi_srad_err1",       # float64   True      468
#    "koi_srad_err2",       # float64   True      468
     "ra",                  # float64   False     0
     "dec",                 # float64   False     0
#    "koi_kepmag",          # float64   True      1
]

# Always keep the label column next to the features, even when the target
# feature is commented out of the list above while experimenting
if target_feature not in selected_features:
    selected_features.append(target_feature)

training_df = raw_df[selected_features]

# Enforce the "no nulls" assumption instead of trusting the hand-copied counts
# in the comments above; fail fast if a selected column contains missing values
null_counts = training_df.isna().sum()
assert not null_counts.any(), (
    "selected columns contain nulls:\n" + null_counts[null_counts > 0].to_string()
)

training_df

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_time0bk,koi_duration,ra,dec,koi_disposition
0,0,0,0,0,170.538750,2.95750,291.93423,48.141651,CONFIRMED
1,0,0,0,0,162.513840,4.50700,291.93423,48.141651,CONFIRMED
2,0,1,0,0,175.850252,1.78220,297.00482,48.134129,FALSE POSITIVE
3,0,1,0,0,170.307565,2.40641,285.53461,48.285210,FALSE POSITIVE
4,0,0,0,0,171.595550,1.65450,288.75488,48.226200,CONFIRMED
...,...,...,...,...,...,...,...,...,...
9559,0,0,0,1,132.016100,4.80600,298.74921,46.973351,FALSE POSITIVE
9560,0,1,1,0,131.705093,3.22210,297.18875,47.093819,FALSE POSITIVE
9561,0,0,0,0,133.001270,3.11400,286.50937,47.163219,CANDIDATE
9562,0,0,1,0,132.181750,0.86500,294.16489,47.176281,FALSE POSITIVE


In [6]:
# List the distinct target classes, then summarize the numeric columns
class_labels = training_df[target_feature].unique().tolist()
print(class_labels)
training_df.describe()

['CONFIRMED', 'FALSE POSITIVE', 'CANDIDATE']


Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_time0bk,koi_duration,ra,dec
count,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0
mean,0.188206,0.231598,0.194898,0.120033,166.183251,5.621606,292.060163,43.810433
std,0.390897,0.421875,0.396143,0.325018,67.91896,6.471554,4.766657,3.601243
min,0.0,0.0,0.0,0.0,120.515914,0.052,279.85272,36.577381
25%,0.0,0.0,0.0,0.0,132.761718,2.43775,288.66077,40.777173
50%,0.0,0.0,0.0,0.0,137.224595,3.7926,292.261125,43.677504
75%,0.0,0.0,0.0,0.0,170.694603,6.2765,295.85916,46.714611
max,1.0,1.0,1.0,1.0,1472.522306,138.54,301.72076,52.33601


In [7]:
# Persist the selected columns without writing the DataFrame index
cleaned_data_path = "Data/Cleaned_Data.csv"
training_df.to_csv(cleaned_data_path, index=False)