In [1]:
import ast

import numpy as np
import pandas as pd
import pylab as pl
import seaborn as sns
%matplotlib inline

In [2]:
# Load the raw model table into df_raw. The commented-out call below reads a
# different input file and is kept for reference (presumably an earlier
# version of the data set -- TODO confirm whether it is still needed).
# df_raw = pd.read_csv('data/raw/sdBShortP_large_BPS_set.csv')
df_raw = pd.read_csv('data/raw/sdBShortP_large_BPS_set_stability_VAR_alphace_VAR.csv')

In [3]:
# Print the full per-column summary (column names and dtypes) of the raw table.
df_raw.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52700 entries, 0 to 52699
Data columns (total 108 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   path                     object 
 1   stability                object 
 2   n_ML_phases              int64  
 3   FeH_init                 float64
 4   Z_init                   float64
 5   termination_code         object 
 6   PRLODays                 float64
 7   PoverPMax                int64  
 8   TtipMyr                  float64
 9   GalAgeMyr                float64
 10  AgeBinNum                int64  
 11  DeltaTBin                int64  
 12  P_init                   float64
 13  P_final                  float64
 14  M1_init                  float64
 15  M1_final                 float64
 16  M2_init                  float64
 17  M2_final                 float64
 18  q_init                   float64
 19  q_final                  float64
 20  M1_HeIgnition            float64
 21  M1core_HeIg

Extract the `alpha_ce` parameter from the `ce_parameters` dictionary. Both `a_ce` and `a_th` have the same value, so we only need one of them for the modelling.

In [4]:
# 'ce_parameters' holds a stringified dict per row. Parse it with
# ast.literal_eval instead of eval: literal_eval only accepts Python literals,
# so text coming from the CSV can never execute arbitrary code.
# Only the 'a_ce' entry is kept, stored in the new 'alpha_ce' column.
df_raw['alpha_ce'] = df_raw['ce_parameters'].apply(lambda raw: ast.literal_eval(raw)['a_ce'])

For fields that will be predicted, replace NaN values with 0

In [5]:
# Build the list of target columns: one '<phase>_P', '<phase>_q' and
# '<phase>_M1' column per phase, in that order.
phase_names = ['MS', 'RGB', 'ML', 'CE', 'HeCoreBurning', 'HeShellBurning', 'He-WD']
fields = ['{}_{}'.format(phase, quantity)
          for phase in phase_names
          for quantity in ('P', 'q', 'M1')]

# Map every target column to the fill value 0 and replace its NaNs in place;
# columns not listed here are left untouched.
values = dict.fromkeys(fields, 0)
df_raw.fillna(value=values, inplace=True)

Convert the True/False fields to 1/0.

In [6]:
# Recode each phase flag column element-wise: truthy values become 1,
# falsy values become 0.
phase_flag_columns = ('MS', 'RGB', 'ML', 'HeCoreBurning', 'HeShellBurning', 'He-WD')
for field in phase_flag_columns:
    df_raw[field] = df_raw[field].apply(lambda flag: int(bool(flag)))

## Check for error models

The error checks are already done by NNaPS, but we need to decide which models we want to exclude. Here we will exclude all systems that have He ignition or He core burning errors, and flag systems with a possible ML issue although that is not so important.

In [7]:
# 'error_flags' holds a stringified collection of error codes per row. Parse it
# once per row with ast.literal_eval (safe for CSV text, unlike eval, and
# avoids evaluating the same string twice).
error_codes = df_raw['error_flags'].apply(ast.literal_eval)

# 1 when the row carries error code 4 or 5, else 0.
# NOTE(review): presumably 4 and 5 are the He-ignition / He-core-burning error
# codes mentioned in the accompanying text -- confirm against NNaPS.
he_error = error_codes.apply(lambda codes: 1 if 4 in codes or 5 in codes else 0)
print('#Errors: ', len(he_error[he_error > 0]))

# Keep only rows without those errors. The explicit .copy() makes df_raw an
# independent frame, so the later column assignments (df_raw.loc[...] = ...,
# df_raw['product'] = ...) do not operate on a slice of the unfiltered frame
# and cannot trigger SettingWithCopyWarning.
df_raw = df_raw[he_error == 0].copy()

#Errors:  3239


## Check that ML/CE is before He burning

There are systems that will undergo mass loss and a CE phase after the He core burning phase. For our purposes, the systems that undergo the CE or ML phase late will be treated as no-interaction systems; they are labeled `late-interaction` in the `stability` column.

In [8]:
# Rows flagged with both a CE phase and He core burning are collected as
# "late CE" systems; keep their index labels for the relabelling step.
late_ce_mask = df_raw['CE'] & df_raw['HeCoreBurning']
late_ce_ind = df_raw.loc[late_ce_mask].index
print('#Late CE: ', len(late_ce_ind))

#Late CE:  2533


In [9]:
# Overwrite the 'stability' label of the rows in late_ce_ind with 'late-interaction'.
df_raw.loc[late_ce_ind, 'stability'] = 'late-interaction'

To check for systems that have a late ML phase without the CE phase, we can check the core mass at the start of ML and compare it to the core mass at the HeCoreBurning phase. If the latter is lower than the ML core mass, then the ML phase is late.

In [10]:
# A row counts as "late ML" when its core mass at He ignition is below its core
# mass at the end of mass loss, it is flagged as He core burning, and it has no
# CE flag.
# NOTE(review): the accompanying text speaks of the core mass at the *start* of
# ML, whereas this comparison uses 'M1core_MLend' -- confirm which is intended.
core_smaller_at_he_ignition = df_raw['M1core_HeIgnition'] < df_raw['M1core_MLend']
late_ml_mask = core_smaller_at_he_ignition & df_raw['HeCoreBurning'] & ~df_raw['CE']
late_ml_ind = df_raw.loc[late_ml_mask].index
print('#Late ML: ', len(late_ml_ind))

#Late ML:  5121


In [11]:
# Overwrite the 'stability' label of the rows in late_ml_ind with 'late-interaction'.
df_raw.loc[late_ml_ind, 'stability'] = 'late-interaction'

## Add the final product for the stable models and the late interaction models

Note: sdAs and sdBs are considered the same and are all called sdBs.

In [12]:
# Initialise the 'product' column with the placeholder 'UK' for every row
# (presumably short for "unknown" -- confirm).
df_raw['product'] = 'UK'

In [13]:
# Only systems whose stability is 'stable' or 'late-interaction' receive a
# final-product label; compute that mask once instead of once per selection.
has_product = df_raw['stability'].isin(['stable', 'late-interaction'])

# 'HeCoreBurning' was recoded to 0/1 integers earlier in the notebook. On an
# integer column `~` is a bitwise complement (1 -> -2, 0 -> -1), not a logical
# NOT, so cast to bool explicitly before negating it below.
he_core_burning = df_raw['HeCoreBurning'].astype(bool)

# True for any row flagged as sdA, sdB or sdO.
is_subdwarf = df_raw['sdA'] | df_raw['sdB'] | df_raw['sdO']

sda_ind = df_raw[df_raw['sdA'] & has_product].index
sdb_ind = df_raw[df_raw['sdB'] & has_product].index
sdo_ind = df_raw[df_raw['sdO'] & has_product].index

# He core burning systems that are not flagged as any subdwarf type.
hb_ind = df_raw[he_core_burning & ~is_subdwarf & has_product].index

# Systems that never show He core burning.
he_ind = df_raw[~he_core_burning & has_product].index

# check that there are no intersections (numbers in the message are positions
# in this list: 0=sdA, 1=sdB, 2=sdO, 3=HB, 4=He)
index_groups = [sda_ind, sdb_ind, sdo_ind, hb_ind, he_ind]
for i, a in enumerate(index_groups):
    for j, b in enumerate(index_groups):
        if i != j and len(a.intersection(b)) > 0:
            print('Error! {} and {} intersect'.format(i, j))

In [14]:
# Write the product label for each index set, in a fixed order.
# sdA and sdB rows both receive the label 'sdB'.
product_labels = [
    (sda_ind, 'sdB'),
    (sdb_ind, 'sdB'),
    (sdo_ind, 'sdO'),
    (hb_ind, 'HB'),
    (he_ind, 'He-WD'),
]
for row_index, label in product_labels:
    df_raw.loc[row_index, 'product'] = label

## Some statistics

In [15]:
# Number of rows currently in df_raw.
len(df_raw)

49461

In [16]:
# Number of rows per 'stability' label.
df_raw['stability'].value_counts()

stable              17893
CE                  11320
contact              7730
late-interaction     7654
merger               4864
Name: stability, dtype: int64

In [17]:
# Number of rows per 'product' label.
df_raw['product'].value_counts()

UK       23914
HB       13707
He-WD     9196
sdB       2644
Name: product, dtype: int64

In [19]:
# Write the processed table to CSV; index=False leaves the row index out of the file.
df_raw.to_csv('data/processed/sdBShortP_large_BPS_set_stability_VAR_alphace_VAR_processed.csv', index=False)