# Importing the libraries and setting the paths

In [236]:
from utils import open_nea_table, download_nea_table
from mass_year import mass_vs_year_plot, planet_discovery_stat
import pandas as pd
import numpy as np
import os
import sys
from datetime import date
import math

from mr_ml_utils import a_from_P, Teq_from_teff_v_plx_a, Teq_from_teff_v_plx_P, teq_hellper_function

If the NEA table needs to be updated, first run `download_nea_table`.

In [237]:
# Load the NEA table (presumably the copy previously fetched by
# `download_nea_table` -- confirm in utils) and preview its first three rows.
nea_full_table = open_nea_table()
nea_full_table.head(n=3)

  nea_full_table = pd.read_csv(table_directory)


Unnamed: 0,pl_name,pl_letter,hostname,hd_name,hip_name,tic_id,gaia_id,default_flag,pl_refname,sy_refname,...,sy_jmagerr1,sy_jmagerr2,sy_jmagstr,sy_hmag,sy_hmagerr1,sy_hmagerr2,sy_hmagstr,sy_kmag,sy_kmagerr1,sy_kmagerr2
0,OGLE-TR-10 b,b,OGLE-TR-10,,,TIC 130150682,Gaia DR2 4056443366649948160,1,<a refstr=TORRES_ET_AL__2008 href=https://ui.a...,<a refstr=STASSUN_ET_AL__2019 href=https://ui....,...,,,13.692,13.314,0.121,-0.121,13.314&plusmn;0.121,12.856,,
1,BD-08 2823 c,c,BD-08 2823,,HIP 49067,TIC 33355302,Gaia DR2 3770419611540574080,1,<a refstr=HEBRARD_ET_AL__2010 href=https://ui....,<a refstr=STASSUN_ET_AL__2019 href=https://ui....,...,0.02,-0.02,7.96&plusmn;0.02,7.498,0.047,-0.047,7.498&plusmn;0.047,7.323,0.021,-0.021
2,HR 8799 c,c,HR 8799,HD 218396,HIP 114189,TIC 245368902,Gaia DR2 2832463659640297472,1,<a refstr=MAROIS_ET_AL__2008 href=https://ui.a...,<a refstr=STASSUN_ET_AL__2019 href=https://ui....,...,0.027,-0.027,5.383&plusmn;0.027,5.28,0.018,-0.018,5.280&plusmn;0.018,5.24,0.018,-0.018


# Selecting columns that are relevant for the ML analysis

In [238]:
# Columns kept for the ML analysis
relevant_columns = [
    'pl_rade',     # planet radius
    'pl_orbsmax',  # planet semimajor axis
    'pl_bmasse',   # planet mass
    'pl_orbper',   # planet orbital period
    'sy_vmag',     # system visual magnitude
    'sy_plx',      # system parallax
    'st_teff',     # stellar effective temperature
    'st_mass',     # stellar mass
]
nea_relevant = nea_full_table[relevant_columns]


### Equilibrium temperature is important for the radius of hot planets. 

Selecting planets with Mass and Radius measurements for ML

In [239]:
# Keep only planets whose mass and radius are both positive; rows where either
# value is NaN fail the comparison and are dropped as well. The index is then
# renumbered from zero and the old labels are discarded.
nea_with_M_and_R = (
    nea_relevant
    .loc[lambda table: table.pl_bmasse.gt(0) & table.pl_rade.gt(0)]
    .reset_index(drop=True)
)

print(f'There are {len(nea_with_M_and_R)} planets with mass and radius measurements')

nea_with_M_and_R.describe()

There are 1113 planets with mass and radius measurements


Unnamed: 0,pl_rade,pl_orbsmax,pl_bmasse,pl_orbper,sy_vmag,sy_plx,st_teff,st_mass
count,1113.0,878.0,1113.0,1098.0,1110.0,1085.0,1032.0,1100.0
mean,8.967707,17.160037,420.861732,379053.0,12.274946,9.295143,5483.824806,1.003882
std,6.083019,272.583363,963.110129,12135090.0,2.171725,17.976954,1361.8372,0.345527
min,0.31,0.006,0.07,0.240104,5.56911,-0.149186,575.0,0.01
25%,2.63,0.037958,10.1,3.003277,11.0125,1.93929,5016.0,0.82
50%,10.447,0.053735,126.49634,4.603155,12.247,3.51679,5620.0,1.0
75%,13.787,0.100075,379.0,11.89179,13.688,7.96147,6009.0,1.18
max,33.6,7506.0,9534.9,402000000.0,20.1544,153.081,27730.0,2.78


There seem to be star(s) with negative parallaxes. They must be removed from the sample.

In [240]:
# Keep strictly positive parallaxes only; missing (NaN) parallaxes also fail
# the comparison and are therefore dropped here.
nea_with_M_and_R = nea_with_M_and_R.loc[nea_with_M_and_R['sy_plx'].gt(0)]

To calculate Teq, one needs the stellar luminosity (and, as such, Teff, Vmag, and Plx) and the semimajor axis (or the orbital period and Mstar).

In [241]:
# Count the missing (NaN) entries in every column
nea_with_M_and_R.isna().sum(axis='index')

pl_rade         0
pl_orbsmax    234
pl_bmasse       0
pl_orbper      13
sy_vmag         0
sy_plx          0
st_teff        78
st_mass        11
dtype: int64

There are 234 missing entries for the semimajor axis. This parameter can be estimated from the stellar mass and the orbital period.

Next, all rows containing NaN values will be removed, unless the NaN is in the "pl_orbsmax" column.

In [242]:
# Every column except 'pl_orbsmax' must be filled for a row to be kept
required_columns = nea_with_M_and_R.columns.drop('pl_orbsmax')
nea_with_M_and_R_cleaned = nea_with_M_and_R.dropna(subset=required_columns)
# Verify: only 'pl_orbsmax' may still contain NaN values
nea_with_M_and_R_cleaned.isna().sum()


pl_rade         0
pl_orbsmax    188
pl_bmasse       0
pl_orbper       0
sy_vmag         0
sy_plx          0
st_teff         0
st_mass         0
dtype: int64

In [243]:
# Report the sample size and preview the first three rows
print(f'There are {len(nea_with_M_and_R_cleaned)} planets for the ML analysis')
nea_with_M_and_R_cleaned.head(n=3)

There are 999 planets for the ML analysis


Unnamed: 0,pl_rade,pl_orbsmax,pl_bmasse,pl_orbper,sy_vmag,sy_plx,st_teff,st_mass
0,14.011,0.0434,197.046,3.101278,16.006,0.72138,5950.0,1.14
2,2.8,0.106,508.5,12.3335,15.04,0.836127,5800.0,1.03
3,2.2,0.05,1303.0,5.90124,16.36,3.05948,3900.0,0.58


Calculating Teq from "a" or "P" 

In [244]:
def teq_hellper_function(x, pl_orbsmax, pl_orbper, st_teff, sy_vmag, sy_plx, st_mass, Teq):
    """
    Fill in the equilibrium temperature (Teq) of one record, preferring the semimajor axis.

    When the semimajor axis stored under `pl_orbsmax` is greater than zero, Teq is
    obtained from `Teq_from_teff_v_plx_a`. Otherwise (zero, negative, or NaN, since
    NaN never compares greater than zero) it is obtained from `Teq_from_teff_v_plx_P`,
    which takes the orbital period and the stellar mass instead.

    NOTE(review): a function with this same name is also imported from `mr_ml_utils`
    at the top of the notebook; this definition shadows that import.

    Parameters:
        x: Mapping-like record (e.g. a DataFrame row) indexed by the keys below.
            It is modified in place.
        pl_orbsmax (str): Key of the planet's semimajor axis.
        pl_orbper (str): Key of the planet's orbital period.
        st_teff (str): Key of the stellar effective temperature.
        sy_vmag (str): Key of the system's visual magnitude.
        sy_plx (str): Key of the system's parallax.
        st_mass (str): Key of the stellar mass.
        Teq (str): Key under which the resulting equilibrium temperature is written.

    Returns:
        The same object `x`, with `x[Teq]` set to the computed value rounded
        with `np.round`.
    """
    semimajor_axis = x[pl_orbsmax]

    if semimajor_axis > 0:
        # Preferred route: the semimajor axis is available
        raw_teq = Teq_from_teff_v_plx_a(x[st_teff], x[sy_vmag], x[sy_plx], semimajor_axis)
    else:
        # Fallback route: orbital period together with the stellar mass.
        # NOTE(review): in the notebook output the row with a missing semimajor
        # axis (index 5) ends up with Teq = 5466, far above the rows computed from
        # "a" -- verify the units expected by Teq_from_teff_v_plx_P.
        raw_teq = Teq_from_teff_v_plx_P(x[st_teff], x[sy_vmag], x[sy_plx], x[pl_orbper], x[st_mass])

    # Store first, then round the stored value, so that any conversion applied by
    # the container on assignment happens before rounding.
    x[Teq] = raw_teq
    x[Teq] = np.round(x[Teq])

    return x


In [245]:
# Show the cleaned table through the notebook's rich display rather than print()
nea_with_M_and_R_cleaned

      pl_rade  pl_orbsmax   pl_bmasse  pl_orbper  sy_vmag     sy_plx  st_teff  \
0      14.011     0.04340   197.04600   3.101278   16.006   0.721380   5950.0   
2       2.800     0.10600   508.50000  12.333500   15.040   0.836127   5800.0   
3       2.200     0.05000  1303.00000   5.901240   16.360   3.059480   3900.0   
4      16.870     0.05350   271.74465   3.899052   12.006   1.731260   5941.0   
5      21.633         NaN   352.79130   2.175180   11.965   1.296790   6100.0   
...       ...         ...         ...        ...      ...        ...      ...   
1108    1.240     0.02933     2.14000   2.769530    9.630  99.916400   3803.0   
1109    4.570     0.03550    13.90000   3.336650   12.332  33.960100   3600.0   
1110   18.046     0.03080  1430.17500   2.485533   14.718   0.586731   6933.0   
1111   11.781     0.07229   953.49000   6.180235   11.237   2.663910   6202.0   
1112   11.321     0.09564   310.83618  11.910100   13.004   4.209610   5144.0   

      st_mass  
0        1.

Use `teq_hellper_function` to determine Teq from "a" or "P", depending on their availability. Priority is given to "a".

In [246]:
# Add new columns: Teq
nea_with_M_and_R_cleaned['Teq'] = 0

nea_with_M_and_R_cleaned = nea_with_M_and_R_cleaned.apply(lambda x: teq_hellper_function(x, 'pl_orbsmax', 'pl_orbper', 'st_teff', 'sy_vmag', 'sy_plx', 'st_mass', 'Teq'), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nea_with_M_and_R_cleaned['Teq'] = 0


In [247]:
# Final table: keep only the radius, mass, and Teq columns
final_columns = ['pl_rade', 'pl_bmasse', 'Teq']
nea_MR_final_table = nea_with_M_and_R_cleaned[final_columns]
nea_MR_final_table.head(n=2)

Unnamed: 0,pl_rade,pl_bmasse,Teq
0,14.011,197.046,1194.0
2,2.8,508.5,891.0


In [248]:
# Show the final table through the notebook's rich display rather than print()
nea_MR_final_table

      pl_rade   pl_bmasse     Teq
0      14.011   197.04600  1194.0
2       2.800   508.50000   891.0
3       2.200  1303.00000   662.0
4      16.870   271.74465  1744.0
5      21.633   352.79130  5466.0
...       ...         ...     ...
1108    1.240     2.14000   743.0
1109    4.570    13.90000   699.0
1110   18.046  1430.17500  2075.0
1111   11.781   953.49000  1433.0
1112   11.321   310.83618   695.0

[999 rows x 3 columns]
