In [1]:
import pandas as pd

## Preprocess the Data
* Preprocess the dataset prior to fitting the model.
* Perform feature selection and remove unnecessary features.

In [2]:
# Load the unprocessed Kepler search-results CSV into a DataFrame
raw_df = pd.read_csv(filepath_or_buffer="Data/Kepler_Exoplanet_Search_Results.csv")
# Preview the first five rows as a quick sanity check of the load
raw_df.head(n=5)

Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,3,10811496,K00753.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,4,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [3]:
# Check each column for null values and count them
def get_null_info(df):
    """Summarise the dtype and missing-value status of every column in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with three columns: ``type`` (the
        column's dtype), ``has_null`` (whether any value is missing) and
        ``null_count`` (how many values are missing).
    """
    # Build the null mask once and reuse it for both the flag and the count
    missing = df.isna()
    per_column_stats = [df.dtypes, missing.any(), missing.sum()]
    summary = pd.concat(per_column_stats, axis=1)
    summary.columns = ["type", "has_null", "null_count"]
    return summary

# Show the dtype, null flag, and null count for every column of the raw data
get_null_info(raw_df)

Unnamed: 0,type,has_null,null_count
rowid,int64,False,0
kepid,int64,False,0
kepoi_name,object,False,0
kepler_name,object,True,7270
koi_disposition,object,False,0
koi_pdisposition,object,False,0
koi_score,float64,True,1510
koi_fpflag_nt,int64,False,0
koi_fpflag_ss,int64,False,0
koi_fpflag_co,int64,False,0


In [4]:
# Select the features used to train the model
target_feature = "koi_disposition"

# Each entry is annotated with its dtype, has_null flag, and null count
# (the columns reported by get_null_info(raw_df)). Commented-out entries are
# excluded. Note that several selected features still contain nulls; rows
# with missing values are dropped in a later cell.
selected_features = [
#    "rowid",               # int64     False     0
#    "kepid",               # int64     False     0
#    "kepoi_name",          # object    False     0
#    "kepler_name",         # object    True      7270
#    "koi_disposition",     # object    False     0
#    "koi_pdisposition",    # object    False     0
#    "koi_score",           # float64   True      1510
     "koi_fpflag_nt",       # int64     False     0
     "koi_fpflag_ss",       # int64     False     0
     "koi_fpflag_co",       # int64     False     0
     "koi_fpflag_ec",       # int64     False     0
     "koi_period",          # float64   False     0
#    "koi_period_err1",     # float64   True      454
#    "koi_period_err2",     # float64   True      454
     "koi_time0bk",         # float64   False     0
#    "koi_time0bk_err1",    # float64   True      454
#    "koi_time0bk_err2",    # float64   True      454
     "koi_impact",          # float64   True      363
#    "koi_impact_err1",     # float64   True      454
#    "koi_impact_err2",     # float64   True      454
     "koi_duration",        # float64   False     0
#    "koi_duration_err1",   # float64   True      454
#    "koi_duration_err2",   # float64   True      454
     "koi_depth",           # float64   True      363
#    "koi_depth_err1",      # float64   True      454
#    "koi_depth_err2",      # float64   True      454
     "koi_prad",            # float64   True      363
#    "koi_prad_err1",       # float64   True      363
#    "koi_prad_err2",       # float64   True      363
     "koi_teq",             # float64   True      363
#    "koi_teq_err1",        # float64   True      9564
#    "koi_teq_err2",        # float64   True      9564
     "koi_insol",           # float64   True      321
#    "koi_insol_err1",      # float64   True      321
#    "koi_insol_err2",      # float64   True      321
     "koi_model_snr",       # float64   True      363
#    "koi_tce_plnt_num",    # float64   True      346
#    "koi_tce_delivname",   # object    True      346
     "koi_steff",           # float64   True      363
#    "koi_steff_err1",      # float64   True      468
#    "koi_steff_err2",      # float64   True      483
     "koi_slogg",           # float64   True      363
#    "koi_slogg_err1",      # float64   True      468
#    "koi_slogg_err2",      # float64   True      468
     "koi_srad",            # float64   True      363
#    "koi_srad_err1",       # float64   True      468
#    "koi_srad_err2",       # float64   True      468
     "ra",                  # float64   False     0
     "dec",                 # float64   False     0
#    "koi_kepmag",          # float64   True      1
]

# Make sure the target column is always kept, even when it is commented out
# of the list above. NOTE: this appends to selected_features in place, so the
# list contains the target column from here on.
if target_feature not in selected_features:
    selected_features.append(target_feature)

# Keep only the chosen columns (features followed by the target), in list order
training_df = raw_df[selected_features]
training_df

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_steff,koi_slogg,koi_srad,ra,dec,koi_disposition
0,0,0,0,0,9.488036,170.538750,0.146,2.95750,615.8,2.26,793.0,93.59,35.8,5455.0,4.467,0.927,291.93423,48.141651,CONFIRMED
1,0,0,0,0,54.418383,162.513840,0.586,4.50700,874.8,2.83,443.0,9.11,25.8,5455.0,4.467,0.927,291.93423,48.141651,CONFIRMED
2,0,1,0,0,19.899140,175.850252,0.969,1.78220,10829.0,14.60,638.0,39.30,76.3,5853.0,4.544,0.868,297.00482,48.134129,FALSE POSITIVE
3,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395.0,891.96,505.6,5805.0,4.564,0.791,285.53461,48.285210,FALSE POSITIVE
4,0,0,0,0,2.525592,171.595550,0.701,1.65450,603.3,2.75,1406.0,926.16,40.9,6031.0,4.438,1.046,288.75488,48.226200,CONFIRMED
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9559,0,0,0,1,8.589871,132.016100,0.765,4.80600,87.7,1.11,929.0,176.40,8.4,5638.0,4.296,1.088,298.74921,46.973351,FALSE POSITIVE
9560,0,1,1,0,0.527699,131.705093,1.252,3.22210,1579.2,29.35,2088.0,4500.53,453.3,5638.0,4.529,0.903,297.18875,47.093819,FALSE POSITIVE
9561,0,0,0,0,1.739849,133.001270,0.043,3.11400,48.5,0.72,1608.0,1585.81,10.6,6119.0,4.444,1.031,286.50937,47.163219,CANDIDATE
9562,0,0,1,0,0.681402,132.181750,0.147,0.86500,103.6,1.07,2218.0,5713.41,12.3,6173.0,4.447,1.041,294.16489,47.176281,FALSE POSITIVE


In [5]:
# First discard columns that hold no data at all, then discard every row
# that still has at least one missing value
training_df = (
    training_df
    .dropna(axis="columns", how="all")
    .dropna(axis="index", how="any")
)
# Re-check the per-column null statistics after cleaning
get_null_info(training_df)

Unnamed: 0,type,has_null,null_count
koi_fpflag_nt,int64,False,0
koi_fpflag_ss,int64,False,0
koi_fpflag_co,int64,False,0
koi_fpflag_ec,int64,False,0
koi_period,float64,False,0
koi_time0bk,float64,False,0
koi_impact,float64,False,0
koi_duration,float64,False,0
koi_depth,float64,False,0
koi_prad,float64,False,0


In [6]:
# List the distinct values found in the target column, in order of appearance
print(pd.unique(training_df[target_feature]).tolist())
# Summary statistics for the cleaned training data
training_df.describe()

['CONFIRMED', 'FALSE POSITIVE', 'CANDIDATE']


Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_steff,koi_slogg,koi_srad,ra,dec
count,9201.0,9201.0,9201.0,9201.0,9201.0,9201.0,9201.0,9201.0,9201.0,9201.0,9201.0,9201.0,9201.0,9201.0,9201.0,9201.0,9201.0,9201.0
mean,0.170742,0.238126,0.200848,0.1239,74.307951,165.461519,0.735105,5.654692,23791.34,102.891778,1085.385828,7716.611,259.895001,5706.82328,4.310157,1.728712,292.062986,43.808212
std,0.376304,0.42596,0.400656,0.329485,1360.538847,67.315287,3.348832,6.499703,82242.68,3077.639126,856.351161,159545.5,795.806615,796.857947,0.432606,6.127185,4.760401,3.602567
min,0.0,0.0,0.0,0.0,0.241843,120.515914,0.0,0.052,0.0,0.08,25.0,0.0,0.0,2661.0,0.047,0.109,279.85272,36.577381
25%,0.0,0.0,0.0,0.0,2.682768,132.74598,0.197,2.4536,159.9,1.4,539.0,20.01,12.0,5310.0,4.218,0.829,288.68259,40.77697
50%,0.0,0.0,0.0,0.0,9.296746,137.02059,0.537,3.82433,421.1,2.39,878.0,140.73,23.0,5767.0,4.438,1.0,292.26291,43.680962
75%,0.0,0.0,0.0,0.0,36.856776,170.36915,0.889,6.289,1473.4,14.93,1379.0,853.5,78.0,6112.0,4.543,1.345,295.86533,46.703129
max,1.0,1.0,1.0,1.0,129995.7784,1472.522306,100.806,138.54,1541400.0,200346.0,14667.0,10947550.0,9054.7,15896.0,5.364,229.908,301.72076,52.33601


In [7]:
# Persist the cleaned training data; the row index is left out of the file
training_df.to_csv(path_or_buf="Data/Cleaned_Data.csv", index=False)