In [8]:
import pandas as pd

# Read the confirmed-exoplanet table from the CSV file (relative path, so it is
# resolved against the notebook's current working directory).
df = pd.read_csv(filepath_or_buffer="Confirmed Exoplanet (Planetary System).csv")

# Trailing expression: display the column labels so later cells can be checked
# against the names that actually exist in the file.
df.columns


Index(['pl_name', 'hostname', 'default_flag', 'sy_snum', 'sy_pnum',
       'discoverymethod', 'disc_year', 'disc_facility', 'soltype',
       'pl_controv_flag', 'pl_refname', 'pl_orbper', 'pl_orbpererr1',
       'pl_orbpererr2', 'pl_orbperlim', 'pl_orbsmax', 'pl_orbsmaxerr1',
       'pl_orbsmaxerr2', 'pl_orbsmaxlim', 'pl_rade', 'pl_radeerr1',
       'pl_radeerr2', 'pl_radelim', 'pl_radj', 'pl_radjerr1', 'pl_radjerr2',
       'pl_radjlim', 'pl_bmasse', 'pl_bmasseerr1', 'pl_bmasseerr2',
       'pl_bmasselim', 'pl_bmassj', 'pl_bmassjerr1', 'pl_bmassjerr2',
       'pl_bmassjlim', 'pl_bmassprov', 'pl_orbeccen', 'pl_orbeccenerr1',
       'pl_orbeccenerr2', 'pl_orbeccenlim', 'pl_insol', 'pl_insolerr1',
       'pl_insolerr2', 'pl_insollim', 'pl_eqt', 'pl_eqterr1', 'pl_eqterr2',
       'pl_eqtlim', 'ttv_flag', 'st_refname', 'st_spectype', 'st_teff',
       'st_tefferr1', 'st_tefferr2', 'st_tefflim', 'st_rad', 'st_raderr1',
       'st_raderr2', 'st_radlim', 'st_mass', 'st_masserr1', 'st_masserr2

In [9]:
# Mapping required features to actual dataset column names.
# Keys are descriptive feature names; values are the column headers looked up in `df`.
required_cols = {
    "planet_radius": "pl_rade",
    "planet_mass": "pl_bmasse",
    "surface_temperature": "pl_eqt",
    "orbital_period": "pl_orbper",
    "distance_from_star": "pl_orbsmax",
    "host_star_type": "st_spectype",
    "host_star_temperature": "st_teff",
    "host_star_metallicity": "st_met",
}

# Check availability: split the mapping into columns present in `df` and columns absent.
available = {k: v for k, v in required_cols.items() if v in df.columns}
missing_cols = {k: v for k, v in required_cols.items() if v not in df.columns}
if missing_cols:
    # Report absent features instead of dropping them silently, so a typo in the
    # mapping or a changed CSV schema is visible before the data is cleaned and saved.
    print("Warning: required columns not found in dataset:", missing_cols)

# Extract available columns (in the order they appear in `required_cols`).
extract_cols = list(available.values())
extracted_df = df[extract_cols]

print("\nExtracted Feature Dataset (first 5 rows):")
print(extracted_df.head())


Extracted Feature Dataset (first 5 rows):
   pl_rade   pl_bmasse  pl_eqt  pl_orbper  pl_orbsmax st_spectype  st_teff  \
0      NaN  6165.60000     NaN  326.03000       1.290      G8 III   4742.0   
1      NaN  5434.70000     NaN        NaN       1.210         NaN      NaN   
2      NaN  4914.89849     NaN  323.21000       1.178      G8 III   4874.0   
3      NaN  4684.81420     NaN  516.21997       1.530         NaN   4213.0   
4      NaN  3337.07000     NaN  516.22000       1.540      K4 III   4340.0   

   st_met  
0   -0.35  
1     NaN  
2   -0.26  
3   -0.02  
4    0.04  


In [12]:
# Keep only fully populated rows: a row is discarded as soon as any one of the
# extracted feature columns holds a missing value. `extracted_df` itself is left
# untouched; the result is a new frame.
clean_df = extracted_df.dropna(axis=0, how="any")

# Report how many rows the filter removed.
print("\nRows before cleaning:", extracted_df.shape[0])
print("Rows after cleaning:", clean_df.shape[0])



Rows before cleaning: 38170
Rows after cleaning: 591


In [16]:
# Write the cleaned feature table to disk. The row index is left out of the file
# (index=False), so only the feature columns are written.
output_path = "clean_exoplanet_features.csv"
clean_df.to_csv(path_or_buf=output_path, index=False)

# Confirm the destination and preview the first five saved rows.
print(f"\nClean dataset saved successfully to: {output_path}")
print(clean_df.head(n=5))


Clean dataset saved successfully to: clean_exoplanet_features.csv
     pl_rade   pl_bmasse  pl_eqt     pl_orbper  pl_orbsmax st_spectype  \
191    2.230    16.30000   546.0  4.168550e+01      0.2410           G   
236   12.442  2002.31896   434.0  4.020000e+08   7506.0000        M3 V   
247   10.870   874.00000   600.0  1.324060e+01      0.1055        K1 V   
250   16.030   740.51000  1657.0  2.994330e+00      0.0436        F6 V   
266    9.920   415.70400  1700.0  4.035190e+00      0.0510        G0 V   

     st_teff  st_met  
191   5766.0   -0.15  
236   3406.0    0.00  
247   5075.0    0.26  
250   6440.0   -0.03  
266   5945.0    0.01  
