### Imports:

In [47]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv1D, MaxPooling1D
from keras.utils import np_utils
from keras.datasets import mnist

from sklearn.model_selection import train_test_split

%matplotlib inline

### Load the Data:

In [2]:
has_planets = pd.read_csv('../clean_planet_data/non_null_planets.csv')

In [3]:
kep3 = pd.read_csv('../clean_planet_data/clean_cut_kepc3.csv')

In [4]:
k2c1 = pd.read_csv('../clean_planet_data/clean_cut_k2c1.csv')

### Preparing Training Set:
Mixing confirmed planets into data so the model can learn what they are like

In [33]:
kep3.shape

(5087, 3199)

In [15]:
has_planets.shape

(1265, 3915)

In [18]:
# cut out the extra data to compare across the same timeline
join_planets = has_planets.iloc[:,:3199]
join_planets.shape

(1265, 3199)

In [28]:
# make sure the dataframes have the same column names
join_planets.columns = kep3.columns

# label the confirmed planet systems with 1
join_planets['LABEL'] = 1

In [35]:
master_df = pd.concat([join_planets, kep3], axis = 0)
master_df.head()

Unnamed: 0,name,LABEL,FLUX.1,FLUX.2,FLUX.3,FLUX.4,FLUX.5,FLUX.6,FLUX.7,FLUX.8,...,FLUX.3188,FLUX.3189,FLUX.3190,FLUX.3191,FLUX.3192,FLUX.3193,FLUX.3194,FLUX.3195,FLUX.3196,FLUX.3197
0,KIC 10000941,1,52605.582031,52609.445312,52598.464844,52600.964844,52589.683594,52578.382812,52567.902344,52567.40625,...,52763.519531,52767.398438,52766.816406,52747.269531,52755.199219,52752.09375,52728.089844,52747.636719,52724.042969,52721.351562
1,KIC 10001368,1,34893.9375,34898.949219,34890.667969,34893.171875,34888.085938,34893.332031,34888.773438,34881.898438,...,34896.5625,34892.875,34886.449219,34889.011719,34887.082031,34890.953125,34883.921875,34878.972656,34884.617188,34884.703125
2,KIC 10001893,1,6546.915039,6553.41748,6561.237793,6550.333008,6545.250977,6557.640625,6546.570312,6544.990723,...,6558.052246,6558.196777,6559.018066,6539.165039,6556.10791,6549.238281,6547.344727,6547.125977,6543.914062,6548.813477
3,KIC 10002866,1,13770.859375,13767.226562,13774.398438,13777.831055,13790.148438,13795.668945,13798.77832,13800.320312,...,13813.326172,13809.694336,13818.818359,13810.759766,13818.966797,13821.005859,13805.796875,13812.212891,13812.047852,13798.03125
4,KIC 10004519,1,10071.524414,10073.680664,10071.700195,10065.617188,10070.639648,10065.191406,10072.930664,10071.879883,...,10066.993164,10065.714844,10065.25293,10062.077148,10068.188477,10067.022461,10065.424805,10065.816406,10065.026367,10061.316406


In [None]:
# baseline accuracy
master_df['LABEL'].value_counts()

In [99]:
base_acc = 1302 / 5050
1 - base_acc

0.7421782178217822

In [38]:
# Make X and y
X = master_df.iloc[:, 2:]
y = master_df['LABEL']

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y)

### Modeling:

In [None]:
# Things to tune over
#     number of layers
#     order of layers
#     filters
#     filter size
#     nodes
#     regularization

In [80]:
X_array = np.array(X_train)
X_test_array = np.array(X_test)

In [81]:
X_array = np.expand_dims(X_array, axis = 2)
X_test_array = np.expand_dims(X_test_array, axis = 2)

In [110]:
# Define model architecture
model = Sequential()

# Convolutional layer.
model.add(Conv1D(filters = 10,      # number of filters
                 kernel_size = (50),  # filter size
                 activation = 'relu',
                 input_shape = (3197, 1))) # dimensions of training data

# Pooling layer.
model.add(MaxPooling1D(pool_size = (5))) # by default, MaxPool will select the stride so the areas we pool will not overlap.

# More convo & pooling layers?

# regularize
#     do I really want to be regularizing the convo and pooling layers if I want to increase its sensitivity?
# model.add(Dropout(0.5))

# In order to go from a convolutional/pooling layer, we have to organize our neurons.
model.add(Flatten())

# Fully connected layer.
model.add(Dense(50, activation = 'relu'))
model.add(Dropout(0.5)) # regularization

# Output layer
model.add(Dense(1, activation = 'sigmoid'))

# Compile model
model.compile(loss = 'binary_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

In [111]:
model.fit(X_array,
          y_train,
          batch_size = 64,
          epochs = 10,
          verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

KeyboardInterrupt: 

In [95]:
# 8. Evaluate model on test data
score = model.evaluate(X_test_array, y_test, verbose = 1)
labels = model.metrics_names

score
# labels
# model.summary()



[3.2987286161715796, 0.7953400503778337]

In [None]:
# # Plot loss over time.
# plt.figure()
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='best')
# plt.show();

### Notes <hr>

In [None]:
# # What is the best way to do this?
# undersample?
# oversample?
# What is the best ratio?
# will this affect my result/ my false positive rate?

# change columns of kep3 so you can join it with the planet data
#     make sure there are no gaps in the confirmed planets data
#     or the other data
#     cut down dfs to be the right size
#     do I want to remove the confimed planets already in the kep3 set in case they are repeats?

# is it possible that some of the planets don't pass in front of their stars for the 66 days I'm looking at?
#     what proportion of exoplanets have orbits of less than 66 days?

# how am I going to deal with the nulls?
#     skip to the next nearest non-null value?
#     how many nulls in a row are there?
#     if this happens to almost all confirmed planets, its possible that the network may learn to find these "skips"
#         should I then just drop any rows with too many consecutive nans? 
#         Pretty much every one has a string of at least 40 consecutive nans
#     mean imputation may have a logical case here
#         it would make sense that the level of light is between previous value and the next one + random noise
#         what about for longer strings of missing values 
#         mean imputation could be good for avoiding creating false signals because of its smoothing effect
#         the disadvantages of mean imputation may not be important because I am basically using neural networks for pattern detection
#     maybe if it is a long string of missing values you want to skip to the next real data, 
#         mark if that had happened so I could tell if it contributes to false positives
#     drop rows with more than 150 nulls? there are only 8
#         1134    666
#         752     408
#         851     408
#         1105    408
#         441     408
#         832     285
#         871     230
#         762     230

# definitely get data on whether it is a binary star system or not
#     how to feed these single features to the neural networks to help with pattern detection?

# normalize the light curves