In [None]:
import pandas as pd

# Load the raw exoplanet table.
#
# - comment="#": text after a '#' is not parsed, so '#'-prefixed header/metadata
#   lines in the export are ignored.
# - on_bad_lines="skip": rows with too many fields are dropped silently instead
#   of raising, so the row count printed below can be lower than the number of
#   data lines in the file.
# - engine="python": use pandas' Python parser rather than the default C one.
# NOTE(review): the column names used later (pl_orbsmax, st_teff, ...) look like
# a NASA Exoplanet Archive export -- confirm the data source.
df_all = pd.read_csv(
    "raw_data/exo_data.csv",
    comment="#",
    on_bad_lines="skip",
    engine="python",
)

print("Total enteries in raw data:", df_all.shape[0])



In [None]:
# Keep only rows that have a value in all three of pl_orbsmax, st_rad and
# st_teff; a row missing any one of them is dropped.
df_clean = df_all.dropna(subset=['pl_orbsmax', 'st_rad', 'st_teff'])

# Collapse repeated entries for the same planet: the first row seen for each
# pl_name (in file order) is kept, later ones are discarded. The index is then
# rebuilt as a plain 0..n-1 range.
df_final = df_clean.drop_duplicates(subset='pl_name', keep='first').reset_index(drop=True)

# index=False: the index was just reset to a meaningless 0..n-1 range, so
# writing it would only add an unnamed leading column to the file (read back as
# "Unnamed: 0"). This also keeps the export consistent with the sample CSV
# written later in this notebook.
# NOTE(review): if anything reads this file with index_col=0, update that reader.
df_final.to_csv("raw_data/clean_raw_data.csv", index=False)

print("Planets with complete data:", len(df_final))

In [None]:
# Build the labelled candidate pools from which the training sample is drawn.
#
# Each pool is defined purely by two columns -- pl_orbsmax and st_teff -- and
# gets a "type" column holding its label. No planet mass/radius is consulted,
# so the labels describe the orbit/host-star regime only.
# NOTE(review): thresholds presumably are AU (pl_orbsmax) and K (st_teff) --
# confirm against the data source's column definitions.
# NOTE(review): the pools do not cover every row. In particular hot and cold use
# strict inequalities on st_teff, so a star at exactly 5900 can only land in the
# temperate pool -- confirm this is intended.

# hot: pl_orbsmax below 0.5 AND st_teff above 5900
hot = df_final.loc[
    df_final["pl_orbsmax"].lt(0.5) & df_final["st_teff"].gt(5900)
].assign(type="hot")

# temperate: pl_orbsmax within [0.8, 1.2] AND st_teff within [5500, 5900]
# (both ranges inclusive at both ends)
temperate = df_final.loc[
    df_final["pl_orbsmax"].between(0.8, 1.2)
    & df_final["st_teff"].between(5500, 5900)
].assign(type="temperate")

# cold: pl_orbsmax above 2.0 AND st_teff below 5900
cold = df_final.loc[
    df_final["pl_orbsmax"].gt(2.0) & df_final["st_teff"].lt(5900)
].assign(type="cold")

# Draw a reproducible random subset from each pool. The caps differ per pool
# (hot: up to 700, temperate: up to 20, cold: up to 50); the requested size is
# clamped to the pool length, so a pool smaller than its cap is taken in full.
# random_state=42 fixes the draw so re-running yields the same rows.
hot_take = min(len(hot), 700)
temperate_take = min(len(temperate), 20)
cold_take = min(len(cold), 50)

hot_sample = hot.sample(n=hot_take, random_state=42)
temperate_sample = temperate.sample(n=temperate_take, random_state=42)
cold_sample = cold.sample(n=cold_take, random_state=42)

# Stack the three subsets (hot, then temperate, then cold) into the final
# training sample and write it out without the index column.
df_sample = pd.concat(
    [
        hot_sample,
        temperate_sample,
        cold_sample,
    ]
)
df_sample.to_csv("raw_data/exoplanet_sample.csv", index=False)

# Report pool sizes BEFORE sampling, one number per line:
# total classified rows, then hot, temperate and cold individually.
for pool_size in (
    len(cold) + len(hot) + len(temperate),
    len(hot),
    len(temperate),
    len(cold),
):
    print(pool_size)