In [1]:
from __future__ import absolute_import, division, print_function
import argparse
import multiprocessing
import os
import sys
import numpy as np
import pdb
from random import shuffle
import pandas as pd
import tensorflow as tf
from astropy.io import fits
from astropy.io.fits import getheader
import tensorflow as tf
from tf_util import example_util
import matplotlib.pylab as plt

ModuleNotFoundError: No module named 'tensorflow'

# Write everything to TFRecords

Params needed:
1. bjd
2. CCFs
    * OG CCF
    * Jup CCF
    * ext CCF
    * Median CCF
3. Rescaled CCF residuals
4. Activity signal
5. mu fits
    * mu_og_fit
    * mu_jup_fit
    * mu_zero_fit
6. Other
    * fwhm
    * contrast
    * bis

Subsets needed
1. Training set (80%), split into 10 cross-validation subsets (8% each)
2. Validation set (10%)
3. Test set (10%)

In [2]:
# Pre-processed archive holding the CCFs, fit parameters and activity indicators.
# TfRecordMaker.make_examples reads this archive through the global name `outfile`.
# Alternative dataset used previously: 'HARPS2.3.1_ready_for_TF_records.npz'
outfile = 'Injection0.4ms_365d_10d_HARPS2.3.1_ready_for_TF_records.npz'
npzfile = np.load(outfile) 
# Show the names of the arrays stored in the archive.
npzfile.files

['BJD',
 'vrad_star',
 'og_ccf_list',
 'jup_shifted_CCF_data_list',
 'zero_shifted_CCF_list',
 'CCF_normalized_list',
 'cff_residual_list',
 'ccf_residual_rescaled',
 'CCF_normalized_list_cutoff',
 'CCF_residual_list_cutoff',
 'ccf_residual_rescaled_cutoff',
 'mu_og_list',
 'mu_jup_list',
 'mu_zero_list',
 'fwhm',
 'cont',
 'bis',
 'shift_by_rv']

In [3]:
# Peek at the BJD value of a single entry, wrapped in a list the same way
# make_examples passes scalar values to example_util.set_feature.
j = 0
[(npzfile['BJD'][j])]

[105.05275882408023]

In [4]:
class TfRecordMaker:
    """Build ``tf.train.Example`` protos from the pre-processed CCF archive.

    The archive is the ``.npz`` file named by the notebook-level variable
    ``outfile``; it must be defined before ``make_examples`` is called.

    Args:
        input_path: Directory of the input data. Falls back to the directory
            of the running script, or the current working directory when
            ``__file__`` is not defined (e.g. inside a notebook kernel).
        path: Output directory; same fallback as ``input_path``.
        numfits: Number of observations; a falsy value is stored as 0.
        index: Iterable of integer positions into the archive arrays selecting
            the observations to convert. Must be provided before calling
            ``make_examples``.
    """

    # (TFRecord feature name, key in the .npz archive) for the array-valued
    # features written with example_util.set_float_feature.
    _CCF_FEATURES = (
        ("OG_CCF", "og_ccf_list"),
        ("JUP_CCF", "jup_shifted_CCF_data_list"),
        ("ZERO_CCF", "zero_shifted_CCF_list"),
        ("CCF", "CCF_normalized_list"),
        ("CCF_residuals", "cff_residual_list"),
        ("Rescaled CCF_residuals", "ccf_residual_rescaled"),
        ("CCF_cutoff", "CCF_normalized_list_cutoff"),
        ("CCF_residuals_cutoff", "CCF_residual_list_cutoff"),
        ("Rescaled CCF_residuals_cutoff", "ccf_residual_rescaled_cutoff"),
    )

    # (TFRecord feature name, key in the .npz archive) for the per-observation
    # values written as one-element lists with example_util.set_feature.
    _SCALAR_FEATURES = (
        ("mu_og_fit", "mu_og_list"),
        ("mu_jup_fit", "mu_jup_list"),
        ("mu_zero_fit", "mu_zero_list"),
        ("BJD", "BJD"),
        ("fwhm", "fwhm"),
        ("contrast", "cont"),
        ("bis", "bis"),
    )

    def __init__(self, input_path, path, numfits, index=None):
        self.input_path = input_path or self._default_directory()
        self.path = path or self._default_directory()
        self.numfits = numfits or 0
        self.index = index

    @staticmethod
    def _default_directory():
        """Return the script directory, or the working directory in a notebook."""
        try:
            return os.path.dirname(os.path.realpath(__file__))
        except NameError:
            # __file__ does not exist in an interactive/notebook session.
            return os.getcwd()

    def make_examples(self):
        """Create one ``tf.train.Example`` per entry of ``self.index``.

        Returns:
            list: the examples, in the order given by ``self.index``.
        """
        print(self.index)

        # Read every required array exactly once and close the archive.
        # Indexing an open NpzFile re-reads the array from disk on each access,
        # so doing it inside the loop would repeat that work for every example.
        needed_keys = [key for _, key in self._CCF_FEATURES + self._SCALAR_FEATURES]
        needed_keys.append('vrad_star')
        with np.load(outfile) as npzfile:
            arrays = {key: npzfile[key] for key in needed_keys}

        # The activity signal is the radial velocity relative to the median of
        # the whole series; the median does not depend on the example.
        vrad_star = arrays['vrad_star']
        median_rv = np.median(vrad_star)

        examples = []
        for count, j in enumerate(self.index, start=1):
            ex = tf.train.Example()

            # Set CCF features.
            for feature_name, key in self._CCF_FEATURES:
                example_util.set_float_feature(ex, feature_name, arrays[key][j])

            # Prints what iteration we are currently at.
            if count % 500 == 0:
                print(count)

            # Set the activity signal and the remaining per-observation values.
            example_util.set_feature(ex, "activity signal", [(vrad_star[j] - median_rv)])  # in km/s
            for feature_name, key in self._SCALAR_FEATURES:
                example_util.set_feature(ex, feature_name, [(arrays[key][j])])

            examples.append(ex)
        return examples

def _write_tf_record(record_path, input_path, path, numfits, index, label):
    """Serialize the examples selected by ``index`` into one TFRecord file."""
    tf_record_maker = TfRecordMaker(input_path=input_path, path=path, numfits=numfits, index=index)
    # Build the examples before opening the writer so a failure while reading
    # the archive does not leave an empty record file behind.
    examples = tf_record_maker.make_examples()
    with tf.io.TFRecordWriter(record_path) as writer:
        for count, example in enumerate(examples, start=1):
            writer.write(example.SerializeToString())
            if count % 100 == 0:
                print("iteration for {} set: {}".format(label, count))
    print("{}: wrote {} examples to {}".format(label, len(examples), record_path))


def tf_writer(input_path, path, numfits, randseed):
    """Shuffle the observations and write the train/val/test TFRecord files.

    The shuffled indices are split into:
      * 10 cross-validation folds of 8% each (``TF_ccf_cross_val0`` .. ``9``),
      * a validation set, 80%-90% (``TF_ccf_val``),
      * a test set, 90%-100% (``TF_ccf_test``),
      * the full training set, i.e. all folds together (``TF_ccf_full_train``).

    Args:
        input_path: Input directory, forwarded to ``TfRecordMaker``.
        path: Output directory for the record files; created if missing.
        numfits: Number of observations to split. A falsy value falls back to
            the previous hard-coded count of 528. Must not exceed the number
            of entries in the archive read by ``TfRecordMaker``.
        randseed: Seed for the NumPy global random generator, so the shuffle
            (and therefore the split) is reproducible.
    """
    num_ccfs = numfits or 528

    index = np.arange(0, num_ccfs, 1)
    np.random.seed(randseed)
    np.random.shuffle(index)

    # Upper bound of each cross-validation fold as a fraction of the data.
    # The literals are kept (rather than computing k * 0.08) so the integer
    # boundaries stay exactly what they were.
    intervals = [0.08, 0.16, 0.24, 0.32, 0.40, 0.48, 0.56, 0.64, 0.72, 0.80]
    fold_edges = [0] + [int(fraction * num_ccfs) for fraction in intervals]
    train_indeces = [index[start:stop]
                     for start, stop in zip(fold_edges[:-1], fold_edges[1:])]

    # All folds together form the full training set.
    indexes_full_val = index[:fold_edges[-1]]
    val_index = index[int(0.8 * num_ccfs):int(0.9 * num_ccfs)]
    test_index = index[int(0.9 * num_ccfs):]

    # Make directory if it does not exist.
    if path:
        os.makedirs(path, exist_ok=True)

    for iteration, fold_index in enumerate(train_indeces):
        _write_tf_record(os.path.join(path, 'TF_ccf_cross_val' + str(iteration)),
                         input_path, path, numfits, fold_index,
                         "cross-validation fold " + str(iteration))

    _write_tf_record(os.path.join(path, 'TF_ccf_val'),
                     input_path, path, numfits, val_index, "validation")

    _write_tf_record(os.path.join(path, 'TF_ccf_test'),
                     input_path, path, numfits, test_index, "test")

    # Also write all the cross-validation folds together in one file.
    _write_tf_record(os.path.join(path, 'TF_ccf_full_train'),
                     input_path, path, numfits, indexes_full_val, "full training")

In [5]:
# Write the cross-validation, validation, test and full-training TFRecord files.
# NOTE(review): input_path is an absolute, machine-specific location; consider
# deriving it from a configurable base directory.
# Output directories used for earlier runs: "TF_records_bjd_fixed_March2021/",
# 'TF_records_Injection0.3ms_365.25d_10d_March2021/'
tf_writer(input_path='/Users/zdebeurs/Documents/GitHub/SOAP_2_smol2/Archive_HARPS_N_NEW DRS',
          path='TF_records_Injection0.4ms_365d_10d_March2021/',
          randseed=20,
          numfits=528)

0.08


NameError: name 'tf' is not defined

In [77]:
# Number of values in one rescaled, cut-off CCF residual (entry j of the archive).
len(npzfile['ccf_residual_rescaled_cutoff'][j])

46

## Read the contents of the TFRecord files

In [78]:
# Read one TFRecord file back and collect selected features into plain lists.
# NOTE(review): the file read below is the *test* record, yet the BJD values are
# appended to cross_val_BJD_list; val_BJD_list and test_BJD_list are not filled
# in this cell. Confirm which list later cells expect before renaming.
cross_val_BJD_list = []
val_BJD_list = []
test_BJD_list = []
bis_list = []
ccf_rescaled_list = []
ccf_list = []
rv_activity_list = []


path_name = 'TF_records_decimal_test/TF_ccf_test'
# NOTE(review): tf_record_iterator is a TF1 compatibility API; TensorFlow's docs
# point to tf.data.TFRecordDataset as the replacement. Confirm before migrating.
record_iterator = tf.compat.v1.io.tf_record_iterator(path=path_name)
for string_record in record_iterator:
        # Decode the serialized record into an Example and index its features by name.
        example = tf.train.Example()
        example.ParseFromString(string_record)
        edict = dict(example.features.feature)
        cross_val_BJD_list.append(edict['BJD'].float_list.value[0])
        bis_list.append(edict['bis'].float_list.value[0])
        
        # These are the CCFs that we used. They are normalized by subtracting the mean and dividing by the SD
        ccf_rescaled = edict['Rescaled CCF_residuals'].float_list.value
        ccf_rescaled_list.append(ccf_rescaled)
        ccf = edict['CCF_residuals'].float_list.value
        ccf_list.append(ccf)

        # This is the stellar activity signal that we used. All planets are removed from this signal.
        rv_activity_list.append(edict['activity signal'].float_list.value[0]*1000) #convert to m/s
        # NOTE(review): prints every record in full, which produces very long cell output.
        print(example)
# Number of records read from the file.
print(len(cross_val_BJD_list))

features {
  feature {
    key: "BJD"
    value {
      float_list {
        value: 377.0233154296875
      }
    }
  }
  feature {
    key: "CCF"
    value {
      float_list {
        value: 1.0023667812347412
        value: 1.000858187675476
        value: 0.9993938207626343
        value: 0.998059093952179
        value: 0.9969536662101746
        value: 0.9963647127151489
        value: 0.9961222410202026
        value: 0.9961116909980774
        value: 0.9964798092842102
        value: 0.9967395663261414
        value: 0.9969236850738525
        value: 0.9967313408851624
        value: 0.9953345060348511
        value: 0.9916188716888428
        value: 0.9837172627449036
        value: 0.9691677093505859
        value: 0.9444511532783508
        value: 0.9059409499168396
        value: 0.8505081534385681
        value: 0.7773852944374084
        value: 0.6902024149894714
        value: 0.5982587337493896
        value: 0.5149760842323303
        value: 0.45506811141967773
       