In this notebook for each eclipsing binary system we extract the features with feets and join them with the periods. We add a new feature, the difference of the minima, and also perform a new curation, pre-process the data, and prepare the data for input to the classification model.

# FEATURES

Note: feets fails on light curves with fewer than 20 observations, so shorter time series are skipped during extraction.

In [1]:
import numpy as np
import feets 
import pandas as pd
from scipy import stats
from os import listdir
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

# Feature extraction with feets

In [2]:
# Every entry of the light-curve directory is treated as one source file.
fuentes = listdir('TimeSeries_vivace/')
# Accumulators; `mat` receives one feature row per source in the extraction loop.
matriz, mat = [], []

In [3]:
# Minimum number of observations a light curve needs; feets fails on shorter
# series (see the note at the top of the notebook).
MIN_OBSERVACIONES = 20

# The feature space only depends on which data vectors are available, not on
# the light curve itself, so it is built once instead of once per file.
fs = feets.FeatureSpace(data=['magnitude', 'error', 'time'])

for i in fuentes:
    ruta = 'TimeSeries_vivace/' + i

    data = np.genfromtxt(ruta)
    if data.shape[0] < MIN_OBSERVACIONES:
        # Too few observations for feets: skip this source.
        continue

    # Columns of the light-curve file: time, Ks magnitude, Ks error.
    time = data[:, 0]
    Ks = data[:, 1]
    eKs = data[:, 2]

    # Same positional order as the original call: time, magnitude, error.
    features, values = fs.extract(time, Ks, eKs)

    # The file name is the numeric source id; store it as the first value
    # of the row, followed by the feature values.
    id_vvv = np.array([float(i)])
    features_values = np.concatenate((id_vvv, values))

    # Column labels for the row: the id string followed by the feature names.
    # Only the labels from the last processed file survive the loop.
    id_vvv_name = np.array([i])
    features_name = np.concatenate((id_vvv_name, features))

    mat.append(features_values)


# Data Frame

In [16]:
# features_name[0] holds the id of whichever file was processed last, which
# depends on the directory listing order. Label the first column 'id_vvv'
# directly instead of renaming one hard-coded id afterwards.
df1 = pd.DataFrame(mat, columns=['id_vvv', *features_name[1:]])

In [17]:
# Preview the first row of the feets feature table, then show its (rows, columns) shape.
display(df1.head(1))
df1.shape

Unnamed: 0,id_vvv,Amplitude,AndersonDarling,Autocor_length,Beyond1Std,CAR_mean,CAR_sigma,CAR_tau,Con,Eta_e,...,Rcs,Skew,SlottedA_length,SmallKurtosis,Std,StetsonK,StetsonK_AC,StructureFunction_index_21,StructureFunction_index_31,StructureFunction_index_32
0,515882800000.0,0.146903,1.0,1.0,0.25,29.341143,10.0,0.5,0.0,,...,0.103894,0.885269,1.0,0.303034,0.074933,0.804019,0.730023,1.741213,2.369615,1.408185


(728, 64)

In [18]:
# Write the extracted features to CSV without the index column; the file is read back below.
df1.to_csv('outputs/feets_output.csv', index=False)

**Levanto la salida de feets**

In [19]:
# Reload the saved feets features from disk, replacing the in-memory df1.
df1 = pd.read_csv('outputs/feets_output.csv')

In [20]:
# Preview the first two rows of the reloaded table, then show its shape.
display(df1.head(2))
df1.shape

Unnamed: 0,id_vvv,Amplitude,AndersonDarling,Autocor_length,Beyond1Std,CAR_mean,CAR_sigma,CAR_tau,Con,Eta_e,...,Rcs,Skew,SlottedA_length,SmallKurtosis,Std,StetsonK,StetsonK_AC,StructureFunction_index_21,StructureFunction_index_31,StructureFunction_index_32
0,515882800000.0,0.146903,1.0,1.0,0.25,29.341143,10.0,0.5,0.0,,...,0.103894,0.885269,1.0,0.303034,0.074933,0.804019,0.730023,1.741213,2.369615,1.408185
1,515882100000.0,0.196734,1.0,1.0,0.182927,28.686937,10.0,0.5,0.0,,...,0.134247,2.01024,1.0,4.797852,0.097485,0.715358,0.784014,1.5074,1.989706,1.573024


(728, 64)

In [21]:
# Inspect the first row as a Series (one entry per column).
df1.iloc[0]

id_vvv                        5.158828e+11
Amplitude                     1.469026e-01
AndersonDarling               1.000000e+00
Autocor_length                1.000000e+00
Beyond1Std                    2.500000e-01
                                  ...     
StetsonK                      8.040189e-01
StetsonK_AC                   7.300228e-01
StructureFunction_index_21    1.741213e+00
StructureFunction_index_31    2.369615e+00
StructureFunction_index_32    1.408185e+00
Name: 0, Length: 64, dtype: float64

**Filas Duplicadas**

In [22]:
# Index the table by source id. Guarded so that re-running the cell does not
# raise KeyError once 'id_vvv' has already been moved into the index.
if "id_vvv" in df1.columns:
    df1 = df1.set_index("id_vvv")

# A source counts as duplicated when its id repeats an earlier id in the index.
ids_duplicados = df1.index.astype(str).duplicated()
print('cantidad de fuentes duplicadas', int(ids_duplicados.sum()))

cantidad de fuentes duplicadas 0


In [23]:
# List the index values (id_vvv) flagged as duplicates; the index is empty when there are none.
display('fuentes duplicadas', df1[df1.index.astype(str).duplicated()].index)

'fuentes duplicadas'

Float64Index([], dtype='float64', name='id_vvv')

In [24]:
# Keep the first occurrence of every source id and drop later repeats.
es_repetida = df1.index.astype(str).duplicated(keep="first")
df_cl = df1.loc[~es_repetida].copy()

In [25]:
# First index label of the de-duplicated table (the index holds id_vvv at this point).
df_cl.index[0]

515882777684.0

In [26]:
# Move id_vvv from the index back to a regular column. Guarded so that
# re-running the cell does not add a spurious 'index' column.
if 'id_vvv' not in df_cl.columns:
    df_cl = df_cl.reset_index()

In [27]:
# Order the sources by id (in an independent copy), preview two rows and show the shape.
df_cl1 = df_cl.sort_values('id_vvv').copy()
display(df_cl1.head(2))
df_cl1.shape

Unnamed: 0,id_vvv,Amplitude,AndersonDarling,Autocor_length,Beyond1Std,CAR_mean,CAR_sigma,CAR_tau,Con,Eta_e,...,Rcs,Skew,SlottedA_length,SmallKurtosis,Std,StetsonK,StetsonK_AC,StructureFunction_index_21,StructureFunction_index_31,StructureFunction_index_32
215,515881800000.0,0.375037,0.982969,1.0,0.266667,33.405273,10.0,0.5,0.0,1206468.0,...,0.148049,0.43918,1.0,-0.262852,0.219407,0.799939,0.667972,1.453847,1.733139,1.27355
160,515881800000.0,0.173605,1.0,1.0,0.217391,27.47764,10.0,0.5,0.0,,...,0.107519,1.299428,1.0,1.40677,0.086935,0.743439,0.682698,1.41316,1.64907,1.210574


(728, 64)

In [28]:
# Inspect the first row of the id-sorted table as a Series.
df_cl1.iloc[0]

id_vvv                        5.158818e+11
Amplitude                     3.750372e-01
AndersonDarling               9.829693e-01
Autocor_length                1.000000e+00
Beyond1Std                    2.666667e-01
                                  ...     
StetsonK                      7.999392e-01
StetsonK_AC                   6.679719e-01
StructureFunction_index_21    1.453847e+00
StructureFunction_index_31    1.733139e+00
StructureFunction_index_32    1.273550e+00
Name: 215, Length: 64, dtype: float64

In [29]:
# Look up the Amplitude of one specific source id; the recorded result is empty,
# i.e. no row of df_cl1 carries this id.
df_cl1[df_cl1['id_vvv'] == 515881819260.0].Amplitude

Series([], Name: Amplitude, dtype: float64)

**Load EBS information**

In [30]:
# Period table produced by step 1, ordered by source id (VIVAID).
df2 = pd.read_csv('outputs/outputPaso1_fuentes.csv').sort_values('VIVAID')
display(df2.head(2))
df2.shape

Unnamed: 0,VIVAID,ra(J2000),dec(J2000),ksEMeanMagPawprint,Period_KFI,Period_PAN,Period_LSG,Period_PDM,Period_STR
0,515881800000.0,178.744776,-61.074091,16.836236,0.037107,0.059735,0.463907,0.463888,355.38876
1,515881800000.0,178.334446,-61.937927,15.853327,0.044667,0.05833,0.082251,0.082251,0.05833


(41508, 9)

In [31]:
# Look up Period_PAN for the same source id in the period table.
df2[df2['VIVAID']==515881819260.0].Period_PAN

0    0.059735
Name: Period_PAN, dtype: float64

# Join Features más Features_feets

In [32]:
# Index-aligned (left) join: every feets row keeps its features and gains the
# period-table columns of the source with the same id.
features_por_id = df_cl1.set_index('id_vvv')
periodos_por_id = df2.set_index('VIVAID')
Features_total = features_por_id.join(periodos_por_id)

In [33]:
# Move id_vvv from the index back to a regular column. Guarded so that
# re-running the cell does not add a spurious 'index' column, which would
# otherwise be carried into the exported feature tables.
if 'id_vvv' not in Features_total.columns:
    Features_total = Features_total.reset_index()
Features_total.shape[0]

728

In [34]:
# Preview the first two rows of the joined table (feets features plus period-table columns).
Features_total.head(2)

Unnamed: 0,id_vvv,Amplitude,AndersonDarling,Autocor_length,Beyond1Std,CAR_mean,CAR_sigma,CAR_tau,Con,Eta_e,...,StructureFunction_index_31,StructureFunction_index_32,ra(J2000),dec(J2000),ksEMeanMagPawprint,Period_KFI,Period_PAN,Period_LSG,Period_PDM,Period_STR
0,515881800000.0,0.375037,0.982969,1.0,0.266667,33.405273,10.0,0.5,0.0,1206468.0,...,1.733139,1.27355,178.260191,-62.108137,16.703827,0.057703,0.058487,0.15888,0.158877,0.158882
1,515881800000.0,0.173605,1.0,1.0,0.217391,27.47764,10.0,0.5,0.0,,...,1.64907,1.210574,178.360623,-61.914256,13.73882,0.345819,0.345819,0.345819,0.25695,0.345819


**Observo para una fuente si los datos de los dos dataframes son concordantes**

In [35]:
# Spot check on the first joined row: print its id, then show one feets
# feature (Amplitude) and one period-table value (Period_PAN).
print('fuente', Features_total.id_vvv.iloc[0])
display(Features_total.iloc[0].Amplitude)
Features_total.iloc[0].Period_PAN

fuente 515881822600.0


0.3750371933000007

0.0584867179582004

**Agrego nuevo feature**

In [36]:
def fase(path_fuente, T):
    '''Phase-fold a light curve stored in a text file.

    The file is read with ``np.genfromtxt``; its first three columns are
    used as time (Julian days), magnitude and magnitude error. The phase of
    each observation is the fractional part of ``|t0 - t| / T``, where
    ``t0`` is the earliest time in the file.

    Parameters
    ----------
    path_fuente : str
        Path of the light-curve file.
    T : float
        Period used for folding, in the same units as the time column.

    Returns
    -------
    tuple
        ``(pha, mag, err)``: three lists in file order holding the phase,
        the magnitude and the magnitude error of every observation.

    Raises
    ------
    ValueError
        If any phase is not finite, e.g. because ``T`` is zero or NaN or
        the time column contains NaN.
    '''
    d = np.genfromtxt(path_fuente)

    tiempo = d[:, 0]
    t0 = tiempo.min()

    # Number of elapsed cycles since t0. Division warnings are silenced
    # because non-finite results are reported explicitly just below.
    with np.errstate(divide='ignore', invalid='ignore'):
        cociente = np.abs(t0 - tiempo) / T

    if not np.all(np.isfinite(cociente)):
        raise ValueError(
            'cannot fold %r with period %r: non-finite phase '
            '(zero/NaN period or NaN times)' % (path_fuente, T)
        )

    # Fractional part, truncating toward zero exactly as int() did in the
    # original per-row loop.
    pha = cociente - np.trunc(cociente)

    # Lists are returned so callers that use len() or map() keep working.
    return (list(pha), list(d[:, 1]), list(d[:, 2]))

In [39]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()

# One value per row of Features_total, in row order.
dmin = []

for i in range(Features_total.shape[0]):
    # Load the light curve of this source; the file name is the integer id.
    ruta = 'TimeSeries_vivace/' + str(int(Features_total.id_vvv[i]))
    # Fold with twice the PDM period.
    # NOTE(review): presumably because the period search returns half the
    # orbital period for eclipsing binaries - confirm.
    P = Features_total.Period_PDM[i] * 2
    pha, mag, _ = fase(ruta, P)

    # Order the magnitudes by phase. A local name is used so the notebook-level
    # df1 (the feets feature table) is no longer overwritten by this loop.
    curva = pd.DataFrame({'fase': pha, 'mag': mag}).sort_values('fase')
    y = curva.mag.values

    # Rescale the light curve to [0, 1].
    y_minmax = min_max_scaler.fit_transform(y.reshape(-1, 1))

    # Largest scaled magnitude in each half of the phase-ordered curve.
    # (Equivalent to the original np.where lookup, which fetched the value at
    # the position of each half's maximum.)
    mitad = len(y) // 2
    maximo_primera_mitad = np.max(y_minmax[:mitad])
    maximo_segunda_mitad = np.max(y_minmax[mitad:])

    # New feature: absolute difference between the two extrema.
    dmin.append(abs(maximo_primera_mitad - maximo_segunda_mitad))

In [40]:
# Attach dmin as a new column; dmin was filled row by row in the same order as Features_total.
Features_total['dmin'] = dmin

**Cambio de dtype a float32**

In [41]:
# Cast every column to float32 on a fresh 0..n-1 index ...
Features_total_32 = Features_total.astype(np.float32).reset_index(drop=True)
# ... then put back the ids from the source frame, undoing the cast for that column.
Features_total_32['id_vvv'] = Features_total['id_vvv'].to_numpy()

**Datos para imprimir:**

In [42]:
# Identification columns only; infinities and missing values are replaced by 0.
columnas_ebs = ['id_vvv', 'ra(J2000)', 'dec(J2000)', 'ksEMeanMagPawprint']
df_ebs = (
    Features_total_32[columnas_ebs]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
display(df_ebs.shape)
df_ebs.head(2)

(728, 4)

Unnamed: 0,id_vvv,ra(J2000),dec(J2000),ksEMeanMagPawprint
0,515881800000.0,178.260193,-62.108135,16.703827
1,515881800000.0,178.360626,-61.914257,13.73882


In [43]:
# Write the identification table to CSV with a header row and no index column.
df_ebs.to_csv('outputs/outputPaso2ebs.csv', header=True, index=False)

**Datos para clasificar**

In [44]:
# Classifier input: all columns except the position/magnitude ones.
# Infinities are turned into NaN and every NaN is then replaced by 0.
columnas_descartadas = ['ra(J2000)', 'dec(J2000)', 'ksEMeanMagPawprint']
df_clasificar = (
    Features_total_32
    .drop(columns=columnas_descartadas)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
df_clasificar.shape

(728, 70)

In [45]:
# Sanity check: shows True if any NaN is left in the classifier table.
display('Existen valores NaN', df_clasificar.isnull().values.any())

'Existen valores NaN'

False

In [46]:
# Write the classifier input table to CSV with a header row and no index column.
df_clasificar.to_csv('outputs/outputPaso2clasificar.csv', header=True, index=False)

In [47]:
# Ahora
# Now: first row after the float32 cast
print('fuente', Features_total_32.id_vvv.iloc[0])
display(Features_total_32.iloc[0].Amplitude)
display(Features_total_32.iloc[0].Period_PAN)

# Before: same row in the original frame
print('fuente', Features_total.id_vvv.iloc[0])
display(Features_total.iloc[0].Amplitude)
Features_total.iloc[0].Period_PAN

fuente 515881822600.0


0.37503719329833984

0.05848671868443489

fuente 515881822600.0


0.3750371933000007

0.0584867179582004

In [48]:
# Summarize column dtypes and non-null counts of the float32 table.
Features_total_32.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 728 entries, 0 to 727
Data columns (total 73 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   id_vvv                           728 non-null    float64
 1   Amplitude                        728 non-null    float32
 2   AndersonDarling                  728 non-null    float32
 3   Autocor_length                   728 non-null    float32
 4   Beyond1Std                       728 non-null    float32
 5   CAR_mean                         728 non-null    float32
 6   CAR_sigma                        728 non-null    float32
 7   CAR_tau                          728 non-null    float32
 8   Con                              728 non-null    float32
 9   Eta_e                            25 non-null     float32
 10  FluxPercentileRatioMid20         728 non-null    float32
 11  FluxPercentileRatioMid35         728 non-null    float32
 12  FluxPercentileRatioMid