# Combined Feature Analysis
This notebook attempts to perform replication of some code snippets and experimental analysis. The insight gained from this notebook can help us to better understand the given features of the data and also inform us on methods we can potentially exploit for feature engineering.

Most likely this work will be employed in the second phase of the project.

In [1]:
import numpy as np
import pandas as pd
from astropy.io import ascii
import matplotlib.pyplot as plt

In [6]:
# Import Dataset
stlr = ascii.read("../data/raw/misc/q1_q17_dr25_stellar.txt").to_pandas()
robo = ascii.read("../data/raw/tces/kplr_dr25_inj1_tces.txt").to_pandas()
df = ascii.read("../data/raw/plti/kplr_dr25_inj1_plti.txt").to_pandas()
df["kepid"] = df.KIC_ID.astype(int)
print(len(df))

# Remove Non-Negative Depth
m = df.i_depth > 0.0
df = df[m]
print(len(df))

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


146294
146262


  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
# Column Headers
cdpp_cols = ["rrmscdpp01p5", "rrmscdpp02p0", "rrmscdpp02p5", "rrmscdpp03p0",
             "rrmscdpp03p5", "rrmscdpp04p5", "rrmscdpp05p0", "rrmscdpp06p0",
             "rrmscdpp07p5", "rrmscdpp09p0", "rrmscdpp10p5", "rrmscdpp12p0",
             "rrmscdpp12p5", "rrmscdpp15p0"]

pars = stlr[cdpp_cols + ["kepid", "limbdark_coeff1", "limbdark_coeff2", 
                         "limbdark_coeff3", "limbdark_coeff4", "dataspan", 
                         "dutycycle"]]

# Join Tables
joined = pd.merge(df, stlr, on="kepid", suffixes=("", "_stlr"))
joined = pd.merge(joined, robo, how="left", suffixes=("", "_robo"), on="TCE_ID")

In [9]:
# Filter Out Values
m = joined.i_period < 100
m &= joined.EB_injection < 1.0
m &= joined.Offset_from_source < 1.0

m = (4200 <= joined.teff) & (joined.teff <= 6100)
m &= joined.radius <= 1.15

# Only include stars with sufficient data coverage.
m &= joined.dataspan > 365.25*2.
m &= joined.dutycycle > 0.6
m &= joined.rrmscdpp07p5 <= 1000.

# Only select stars with mass estimates.
m &= np.isfinite(joined.mass)

joined = joined[m]
print("selected {0} / {1} injections".format(len(joined), len(df)))

selected 67279 / 146262 injections


In [10]:
joined.head()

Unnamed: 0,KIC_ID,Sky_Group,i_period,i_epoch,N_Transit,i_depth,i_dur,i_b,i_ror,i_dor,...,Rs,Ts,logg_robo,a,Rp/Rs,a/Rs,impact,SNR_DV,Sp,Fit_Prov
740,4443342,51,354.9091,202.7478,3.0405,675.0,14.14179,0.2961,0.0207,187.873,...,,,,,,,,,,
742,4443393,51,284.4971,76.2585,4.1412,217.0,8.84581,0.7703,0.014,162.326,...,,,,,,,,,,
743,4443419,51,450.9151,209.58,2.0083,941.0,10.1232,0.6685,0.026,266.533,...,,,,,,,,,,
744,4443452,51,244.8703,293.7472,3.9886,882.0,6.83987,0.8663,0.0275,153.195,...,,,,,,,,,,
745,4443467,51,49.1535,75.4744,22.967,499.0,6.37456,0.2511,0.018,58.259,...,0.991,5828.0,4.475,0.2686,0.023176,17.93,0.9569,8.9,14.07,1.0


In [13]:
# List Column Names
list(joined.columns.values)

['KIC_ID',
 'Sky_Group',
 'i_period',
 'i_epoch',
 'N_Transit',
 'i_depth',
 'i_dur',
 'i_b',
 'i_ror',
 'i_dor',
 'EB_injection',
 'Offset_from_source',
 'Offset_distance',
 'Expected_MES',
 'Recovered',
 'TCE_ID',
 'Measured_MES',
 'r_period',
 'r_epoch',
 'r_depth',
 'r_dur',
 'r_b',
 'r_ror',
 'r_dor',
 'Fit_Provenance',
 'kepid',
 'tm_designation',
 'teff',
 'teff_err1',
 'teff_err2',
 'logg',
 'logg_err1',
 'logg_err2',
 'feh',
 'feh_err1',
 'feh_err2',
 'mass',
 'mass_err1',
 'mass_err2',
 'radius',
 'radius_err1',
 'radius_err2',
 'dens',
 'dens_err1',
 'dens_err2',
 'prov_sec',
 'kepmag',
 'limbdark_coeff1',
 'limbdark_coeff2',
 'limbdark_coeff3',
 'limbdark_coeff4',
 'dist',
 'dist_err1',
 'dist_err2',
 'nconfp',
 'nkoi',
 'ntce',
 'datalink_dvr',
 'st_delivname',
 'st_vet_date_str',
 'ra',
 'dec',
 'st_quarters',
 'teff_prov',
 'logg_prov',
 'feh_prov',
 'jmag',
 'jmag_err',
 'hmag',
 'hmag_err',
 'kmag',
 'kmag_err',
 'dutycycle',
 'dataspan',
 'mesthres01p5',
 'mesthres02p