# Project Example

In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense

import csv
import sys
sys.path.append('../astroML-coral/Tools')
sys.path.append('../astroML-coral/Data')
import datascrub as scrub
import mldataconfig as dtc
from astropy.io import fits
import numpy as np
import pprint
import progressbar
import gc

In [2]:
import sys
print("Python: {}".format(sys.version))
print("tensorflow: {}".format(tf.__version__))
print("keras: {}".format(keras.__version__))
print("pandas: {}".format(pd.__version__))

Python: 3.8.8 (tags/v3.8.8:024d805, Feb 19 2021, 13:18:16) [MSC v.1928 64 bit (AMD64)]
tensorflow: 2.4.1
keras: 2.4.0
pandas: 1.2.3


### Data Setup

It is assumed that the shell scripts for bulk downloads from the MAST archives have been manually converted to .txt by saving as such after removing the shebang line.
The contents of the .txt end up being referenced with exoplanet status disposition from the Kepler pipeline, and all that written to a .csv. The contents of that csv are then split into more manageable chunks.

Note: data is saved in files rather than used as objects because RAM is expensive


In [4]:
scrub.exofinder('Data/kepler_lightcurves_Q03_long.txt', 'example.csv', False)

<class 'list'>
Kepler pipeline data converted
Successfuly pulled system data from:  Data/kepler_lightcurves_Q03_long.txt


In [5]:
split1, split2, full = dtc.datasplit('example.csv', 10000)
print(split1[0])
dat = list(dtc.chunks(split1, 5000))

# Do what is needed with the split data then delete. Python memory management woooo!

del split1, split2, full

['757450', 'https://mast.stsci.edu/api/v0.1/Download/file?uri=mast:Kepler/url/missions/kepler/lightcurves/0007/000757450//kplr000757450-2009350155506_llc.fits', '1']


### Getting Flux Data

Mass downloads of flux values from the MAST archive. Everything gets referenced via KeplerID. Again, refer to tools scripts for details

Warning: Limit queries to ~5000 objects at a time (hence further chunking in previous cell). This takes up a lot of memory.
         In efforts to not DDOS the archive, I included two converted lists in the data.zip

In [6]:
dtc.getfluxes(dat[0], 'example1.csv')

 99% (4993 of 5000) |################### | Elapsed Time: 0:01:22 ETA:   0:00:00

In [7]:
dtc.getfluxes(dat[1], 'example2.csv')

 99% (4998 of 4999) |################### | Elapsed Time: 0:43:14 ETA:   0:00:00

Here we use pandas to read out the .csv data and save it as numpy arrays

In [8]:
traindat = pd.read_csv("example1.csv", error_bad_lines=False)
train = traindat.to_numpy()
testdat = pd.read_csv("example2.csv", error_bad_lines=False)
test = testdat.to_numpy()



train.shape
test.shape

(4998, 4372)

And this is how the data is split into pices for training and testing of the ML algo
Should probably be a function but some weird bits that should be showm

In [9]:
x = train[:,2:]
y = train[:,1]
test_x = test[:,2:]
test_y = test[:,1]
x = x[:, ~np.isnan(x).any(axis=0)]
x = dtc.normalize(x)

test_x =test_x[:, ~np.isnan(test_x).any(axis=0)]
test_x = dtc.normalize(test_x)

In [10]:
print(f"Train dataset shape: {x.shape}")
print(f"Train-res dataset shape: {y.shape}")
print(f"Train dataset shape: {test_x.shape}")
print(f"Train-res dataset shape: {test_y.shape}")

Train dataset shape: (4999, 4134)
Train-res dataset shape: (4999,)
Train dataset shape: (4998, 4134)
Train-res dataset shape: (4998,)


# Machine Learning

Here we create a keras model with an input layer, interior layer with 'relu' activation function, and an output layer with 4 members, as this is supposed to identify whether the data represents a confirmed, candidate, false positive, or no planet at all. Again, this could be implemented as external functions but the internal workings are on display for the purposes of this project

In [12]:
model = Sequential()
model.add(Dense(64))
model.add(Dense(128, activation='relu'))
model.add(Dense(4))
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
model.fit(x, y, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x21b88356b50>

In [13]:
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                264640    
_________________________________________________________________
dense_1 (Dense)              (None, 128)               8320      
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 516       
Total params: 273,476
Trainable params: 273,476
Non-trainable params: 0
_________________________________________________________________


# Analysis

While the model provides a quick first-pass analysis of the data, it still skews heavily towards no exoplanets. To rectify this, a large amount of either data processing or statistical analysis will need to be done. The model will need to be trained on a more balanced dataset (roughly equal numbers of no planet, candidate, confirmed, or false positive), or determination of what threshold probabilities indicate something about the system. As for binary detection of presence of an exoplanet, see the Flux_algo notebook.

In [18]:
test_loss, test_acc = model.evaluate(test_x,  test_y, verbose=1)


print('\nTest accuracy:', test_acc)
probability_model = tf.keras.Sequential([model,
                                     tf.keras.layers.Softmax()])
predictions = probability_model.predict(test_x)

print(test_y[31])
print(predictions[31])


Test accuracy: 0.9483793377876282
2.0
[0.9554491  0.00249979 0.02469294 0.01735814]


In [19]:
trainScore = model.evaluate(x, y, verbose=0)
print("Train Score: ", trainScore)
testScore = model.evaluate(test_x, test_y, verbose=0)
print("Train Score: ", testScore)
predictions = model.predict(test_x[:3])
print("predictions:", predictions)

Train Score:  [0.2598845958709717, 0.9531906247138977]
Train Score:  [0.28468242287635803, 0.9483793377876282]
predictions: [[ 4.589682   -0.61855555  0.9471797   0.92334205]
 [ 5.47079    -0.8964104   1.8064309   1.266984  ]
 [ 5.451804   -0.8904179   1.7880336   1.2599893 ]]


## Creating a TSFLite model

Simple generation of a tensorflow lite model that can be run on other hardware. Currently unable to make it run on the Google Coral due to the massive datasets involved.

In [21]:
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# Save the model.
with open('exoalgo.tflite', 'wb') as f:
    f.write(tflite_model)

INFO:tensorflow:Assets written to: C:\Users\BARTEK~1\AppData\Local\Temp\tmpq7y5anw0\assets
