# Project Example

In [4]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense

import csv
import sys
sys.path.append('../astroML-coral/Tools')
sys.path.append('../astroML-coral/Data')
import datascrub as scrub
import mldataconfig as dtc
from astropy.io import fits
import numpy as np
import pprint
import progressbar
import gc

### Data Setup

It is assumed that the shell scripts for bulk downloads from the MAST archives have been manually converted to .txt by saving as such after removing the shebang line.
The contents of the .txt end up being referenced with exoplanet status disposition from the Kepler pipeline, and all that written to a .csv. The contents of that csv are then split into more manageable chunks.

Note: data is saved in files rather than used as objects because RAM is expensive


In [2]:
scrub.exofinder('Data/kepler_lightcurves_Q03_long.txt', 'example.csv', False)

<class 'list'>
Kepler pipeline data converted
Successfuly pulled system data from:  Data/kepler_lightcurves_Q03_long.txt


In [8]:
split1, split2, full = dtc.datasplit('example.csv', 100000)
print(split2[0])
dat = list(dtc.chunks(split2, 5000))

['8957928', 'https://mast.stsci.edu/api/v0.1/Download/file?uri=mast:Kepler/url/missions/kepler/lightcurves/0089/008957928//kplr008957928-2009350155506_llc.fits', '0']


### Getting Flux Data

Mass downloads of flux values from the MAST archive. Everything gets referenced via KeplerID

Warning: Limit queries to ~5000 objects at a time (hence further chunking in previous cell). It doesn't seem like a memory issue so best guess is download gets throttled 

In [7]:
dtc.getfluxes(dat[0], 'data1.csv')

 99% (4993 of 5000) |################### | Elapsed Time: 0:01:04 ETA:   0:00:00

In [9]:
dtc.getfluxes(dat[1], 'data2.csv')

 99% (4997 of 5000) |################### | Elapsed Time: 0:01:42 ETA:   0:00:00

In [10]:
import sys
print("Python: {}".format(sys.version))
print("tensorflow: {}".format(tf.__version__))
print("keras: {}".format(keras.__version__))
print("pandas: {}".format(pd.__version__))

Python: 3.8.8 (tags/v3.8.8:024d805, Feb 19 2021, 13:18:16) [MSC v.1928 64 bit (AMD64)]
tensorflow: 2.4.1
keras: 2.4.0
pandas: 1.2.3


In [11]:
traindat = pd.read_csv("data1.csv")
#traindat[(traindat != "Nan").all(1)]
train = traindat.to_numpy()
testdat = pd.read_csv("data2.csv")
#testdat[(testdat != "Nan").all(1)]
test = testdat.to_numpy()



train.shape
test.shape

FileNotFoundError: [Errno 2] No such file or directory: 'data1.csv'

In [109]:
def normalize(data):
    data_mean = data.mean(axis=0)
    data_std = data.std(axis=0)
    return (data - data_mean) / data_std

#train = normalize(train)
#test = normalize(test)

In [118]:
x = train[:,2:]
y = train[:,1]
test_x = test[:,2:]
test_y = test[:,1]
x = x[:, ~np.isnan(x).any(axis=0)]
x = normalize(x)

test_x =test_x[:, ~np.isnan(test_x).any(axis=0)]

test_x = normalize(test_x)
print(f"Train dataset shape: {x.shape}")
print(f"Train-res dataset shape: {y.shape}")

Train dataset shape: (4999, 4134)
Train-res dataset shape: (4999,)


In [119]:
print(f"Train dataset shape: {x.shape}")
print(f"Train-res dataset shape: {y.shape}")
print(f"Train dataset shape: {test_x.shape}")
print(f"Train-res dataset shape: {test_y.shape}")

Train dataset shape: (4999, 4134)
Train-res dataset shape: (4999,)
Train dataset shape: (11828, 4134)
Train-res dataset shape: (11828,)


In [120]:
pprint.pp(test_y[53:60])

array([3., 0., 0., 0., 0., 0., 0.])


In [131]:
model = Sequential()
model.add(Dense(64))
model.add(Dense(128, activation='relu'))
model.add(Dense(4))
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
model.fit(x, y, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x2b5064c23a0>

In [134]:
model.summary()


Model: "sequential_39"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_74 (Dense)             (None, 64)                264640    
_________________________________________________________________
dense_75 (Dense)             (None, 128)               8320      
_________________________________________________________________
dense_76 (Dense)             (None, 4)                 516       
Total params: 273,476
Trainable params: 273,476
Non-trainable params: 0
_________________________________________________________________


In [132]:
test_loss, test_acc = model.evaluate(test_x,  test_y, verbose=1)

print('\nTest accuracy:', test_acc)
probability_model = tf.keras.Sequential([model,
                                     tf.keras.layers.Softmax()])
predictions = probability_model.predict(test_x)
predictions[54]


Test accuracy: 0.9592492580413818


array([0.9260877 , 0.02330524, 0.0215577 , 0.02904934], dtype=float32)

In [133]:
trainScore = model.evaluate(x, y, verbose=0)
print("Train Score: ", trainScore)
testScore = model.evaluate(test_x, test_y, verbose=0)
print("Train Score: ", testScore)
predictions = model.predict(test_x[:3])
print("predictions:", predictions)

Train Score:  [1.002947211265564, 0.9517903327941895]
Train Score:  [0.5482151508331299, 0.9592492580413818]
predictions: [[ 3.903813   -6.681203    0.844062   -1.2480245 ]
 [ 3.993324   -6.8996596   0.90373945 -1.2765431 ]
 [ 4.163962   -7.3160605   1.0175068  -1.3308969 ]]


In [None]:
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir) # path to the SavedModel directory
tflite_model = converter.convert()

# Save the model.
with open('exoalgo.tflite', 'wb') as f:
    f.write(tflite_model)