In [1]:
import pandas as pd
import numpy as np
from pycaret.regression import *

In [2]:
# Read the cleaned dataset from disk into a DataFrame.
data = pd.read_csv(filepath_or_buffer="data_files/clean_data.csv")

<h4>Note For Feature Engineering:</h4>
<ul>
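    <li>The engineered features encode the terms of the equilibrium-temperature formula, <code>T_eq = T_star * sqrt(R_star / (2a)) * (1 - A)^(1/4)</code>, with the stellar radius <code>R_star</code> left out, so that the model can learn this relationship.</li>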
    <li>Albedo varies from planet to planet and is not available, so we assume an average value of 0.3 for every planet.</li>
</ul>

In [3]:
# Feature Engineering
#
# Every feature is a term (or the log10 of a term) of the expression
#     T_star * sqrt(1 / (2 * a_planet)) * (1 - ALBEDO) ** 0.25
# i.e. the equilibrium-temperature relation without the stellar radius.

# Assumed average albedo, applied to every row (per-planet values are not used).
ALBEDO = 0.3

T_star = data["st_teff"].values
a_planet = data["pl_orbsmax"].values

# Shared sub-expressions, computed once so the linear and log features
# cannot drift apart if the formula is edited later.
inv_two_a = 1 / (2 * a_planet)
albedo_factor = (1 - ALBEDO) ** 0.25
T_eq_approx = T_star * np.sqrt(inv_two_a) * albedo_factor

Feat1 = T_star.copy()          # independent copy of the st_teff values
Feat2 = T_eq_approx            # full expression, linear scale
Feat3 = np.log10(T_star)
Feat4 = np.log10(inv_two_a)
Feat5 = np.log10(T_eq_approx)  # full expression, log10 scale

In [4]:
# Assemble the engineered features and the regression target into one frame.
training_df = pd.DataFrame({
    "feat1": Feat1,
    "feat2": Feat2,
    "feat3": Feat3,
    "feat4": Feat4,
    "feat5": Feat5,
    "Eqt": data["pl_eqt"]
})

# Pin the PyCaret session id so the train/test split and all model results are
# reproducible on "Restart & Run All". 7873 is the id reported by the recorded
# run of this notebook; without it PyCaret draws a new random id on every run.
initial_model = setup(training_df, target = 'Eqt', session_id = 7873)

Unnamed: 0,Description,Value
0,Session id,7873
1,Target,Eqt
2,Target type,Regression
3,Original data shape,"(11715, 6)"
4,Transformed data shape,"(11715, 6)"
5,Transformed train set shape,"(8200, 6)"
6,Transformed test set shape,"(3515, 6)"
7,Numeric features,5
8,Preprocess,True
9,Imputation type,simple


In [5]:
# Compare the available regressors and keep the result as best_model.
# In the recorded run shown below, Extra Trees Regressor is listed first in
# the comparison table.
best_model = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,37.8114,9889.2624,98.8164,0.8917,0.1034,0.0454,0.074
rf,Random Forest Regressor,46.075,10373.9496,101.2788,0.8864,0.1062,0.0562,0.214
lightgbm,Light Gradient Boosting Machine,65.8864,13215.9397,114.5969,0.8554,0.1263,0.0848,0.247
gbr,Gradient Boosting Regressor,67.8607,13759.6743,116.8371,0.8493,0.1317,0.0862,0.095
knn,K Neighbors Regressor,65.0935,14750.5346,121.1259,0.8383,0.1311,0.0821,0.005
dt,Decision Tree Regressor,45.8171,14852.1126,121.3326,0.8371,0.1272,0.0556,0.006
lr,Linear Regression,73.4496,16712.7479,128.8001,0.8177,0.1427,0.0944,0.183
ridge,Ridge Regression,73.8176,16711.4525,128.7681,0.8177,0.1428,0.095,0.003
lar,Least Angle Regression,73.4488,16712.6711,128.7998,0.8177,0.1427,0.0944,0.003
lasso,Lasso Regression,76.0006,17145.7991,130.4948,0.8131,0.1601,0.0983,0.004


In [None]:
# Hyperparameter-tune the selected model.
# NOTE: tuned_model is not referenced by the later cells shown in this
# notebook; they continue to work with best_model.
tuned_model = tune_model(estimator=best_model)

In [7]:
# Finalize the selected model.
# The return value is not bound to a name in this cell, so it is only
# available as the cell's displayed output.
# NOTE(review): PyCaret documents finalize_model as returning the refit model
# rather than updating its argument — confirm before assuming best_model
# itself has been finalized.
finalize_model(best_model)

In [8]:
# Open PyCaret's interactive plot selector ("Plot Type" toggle buttons) for
# the selected model.
evaluate_model(estimator=best_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [9]:
# Persist the *finalized* pipeline rather than best_model.
# finalize_model returns a new pipeline refit on the full dataset (train +
# hold-out); it must be captured and passed to save_model, otherwise the file
# on disk contains the model fitted on the training split only.
final_model = finalize_model(best_model)
save_model(final_model, "main_model")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['feat1', 'feat2', 'feat3', 'feat4',
                                              'feat5'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=[],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('trained_model',
                  ExtraTreesRegressor(n_jobs=-1, random_state=7873))]),
 'main_model.pkl')