In [4]:
import pandas as pd

In [28]:
# Loading raw data
# Text after a '#' is treated as a comment by the parser, and rows that cannot
# be parsed are skipped instead of raising (on_bad_lines='skip' requires the
# python engine for full support), so the row count below may be lower than
# the number of data lines in the file.
df_all = pd.read_csv(
    "data_files/nasa_exo_data.csv",
    comment='#',
    on_bad_lines='skip',
    engine='python',
)

# Inspecting
print("Total enteries in raw data:", len(df_all))
# Fixed seed so the preview shows the same rows on every Restart & Run All;
# the sample size is capped so a short table does not raise ValueError.
df_all.sample(min(10, len(df_all)), random_state=0)

Total enteries in raw data: 38509


Unnamed: 0,pl_name,pl_orbsmax,pl_orbsmaxerr1,pl_orbsmaxerr2,pl_orbsmaxlim,pl_eqt,pl_eqterr1,pl_eqterr2,pl_eqtlim,st_teff,st_tefferr1,st_tefferr2,st_tefflim,st_rad,st_raderr1,st_raderr2,st_radlim
29864,Kepler-636 b,,,,0.0,,,,0.0,5179.48,103.73,-103.73,0.0,,,,
30310,Kepler-67 b,0.11804,0.00112,-0.00112,0.0,,,,,5238.0,60.0,-60.0,0.0,0.794,0.057,-0.051,0.0
25299,Kepler-370 b,0.054,,,0.0,,,,,5852.0,200.0,-200.0,0.0,0.9,0.416,-0.416,0.0
23615,Kepler-32 b,0.0514,,,0.0,513.0,,,0.0,3727.0,104.0,-64.0,0.0,0.5,0.06,-0.06,0.0
30399,Kepler-678 b,,,,,,,,,5520.0,72.95,-108.11,0.0,0.91,0.117,-0.057,0.0
37409,WASP-171 b,0.0504,0.00083,-0.00083,0.0,1642.0,51.0,-35.0,0.0,5965.0,100.0,-100.0,0.0,1.637,0.091,-0.046,0.0
8794,Kepler-114 d,,,,,,,,,,,,,0.715,0.029,-0.029,0.0
37340,WASP-16 b,,,,,,,,,,,,,,,,
25785,Kepler-390 b,,,,,,,,,5270.0,74.21,-65.93,0.0,0.8,0.025,-0.027,0.0
8702,Kepler-1133 b,0.0959,,,0.0,720.0,,,0.0,5627.0,161.0,-144.0,0.0,0.808,0.351,-0.063,0.0


In [37]:
# CLEANING DATA
# Builds, from the raw table `df_all`:
#   df_clean    -- rows with all required values present, reduced to the used columns
#   df_filtered -- df_clean restricted to the accepted orbital-distance / temperature range
#   df_unique   -- df_filtered with one row kept per planet name
# The cell rebuilds every frame from `df_all`, so re-running it does not
# append the Solar System rows a second time.

# Removing incomplete rows & retaining necessary columns
required_cols = ['pl_eqt', 'st_teff', 'pl_orbsmax', 'st_rad']
df_clean = df_all.dropna(subset=required_cols).reset_index(drop=True)
df_clean = df_clean[['pl_name'] + required_cols]

# Removing extraneous values
# Bounds on pl_orbsmax (exclusive on both ends).
orbsmax_min, orbsmax_max = 0.05, 40
# Rows that are simultaneously far out (pl_orbsmax > 5) and hot (pl_eqt > 1000) are discarded.
far_orbit_cutoff, hot_eqt_cutoff = 5, 1000

in_orbit_range = (df_clean["pl_orbsmax"] > orbsmax_min) & (df_clean["pl_orbsmax"] < orbsmax_max)
df_filtered = df_clean[in_orbit_range]
far_and_hot = (df_filtered['pl_orbsmax'] > far_orbit_cutoff) & (df_filtered['pl_eqt'] > hot_eqt_cutoff)
df_filtered = df_filtered[~far_and_hot].reset_index(drop=True)

# OPTIONAL -- removing redundant rows since many planets are repeated, but it will reduce the training data size
df_unique = df_filtered.drop_duplicates(subset='pl_name', keep='first').reset_index(drop=True)

# OPTIONAL -- add Solar System planetary values
# These rows are appended after filtering and de-duplication, so none of the
# filters above are applied to them.
solar_df = pd.DataFrame({
    "pl_name": ["Mercury","Venus","Earth","Mars","Jupiter","Saturn","Uranus","Neptune"],
    "pl_orbsmax": [0.387,0.723,1.000,1.524,5.204,9.583,19.191,30.07],
    "pl_eqt": [440,230,255,210,112,81,59,47],
    "st_teff": [5772]*8,
    "st_rad": [1]*8
})
# ignore_index=True renumbers the combined rows 0..n-1. Without it the eight
# appended rows keep labels 0-7, which duplicate labels already present in the
# frames and make label-based access (e.g. .loc[0]) return two rows.
df_filtered = pd.concat([df_filtered, solar_df], ignore_index=True)
df_unique = pd.concat([df_unique, solar_df], ignore_index=True)


print("Training Data Size:", len(df_filtered))
print("Total Clean Unique Enteries:", len(df_unique))


Training Data Size: 11715
Total Clean Unique Enteries: 3015


In [38]:
# Persist the filtered training table as CSV; the row index is left out of the file.
clean_csv_path = 'data_files/clean_data.csv'
df_filtered.to_csv(clean_csv_path, index=False)