### Imports:

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import StandardScaler

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv1D, MaxPooling1D
from keras.utils import np_utils
from keras import optimizers

from sklearn.model_selection import train_test_split

%matplotlib inline

### Load the Data:

In [None]:
# Light curves that are labelled as confirmed planet systems (LABEL = 1) when the training set is built.
has_planets = pd.read_csv('../clean_planet_data/rand_non_null_planets.csv')

In [None]:
# Frame the confirmed-planet rows are concatenated with; its column names are used as the reference layout.
kep3 = pd.read_csv('../clean_planet_data/clean_cut_kepc3.csv')

In [None]:
# Kept out of training; used later as the "unseen" data the fitted model predicts on.
k2c1 = pd.read_csv('../clean_planet_data/clean_cut_k2c1.csv')

In [None]:
# NOTE(review): not referenced by any other cell visible in this notebook - confirm it is still needed.
c4_kep = pd.read_csv('../clean_planet_data/extracted_kep_c4_7700_backup.csv')

In [None]:
# NOTE(review): not referenced by any other cell visible in this notebook - confirm it is still needed.
all_confirmed = pd.read_csv('../clean_planet_data/all_planets_list.csv')

### Preparing Training Set:
Mixing confirmed planets into data so the model can learn what they are like

In [None]:
# Compare (rows, columns) of the three frames before they are trimmed and combined.
kep3.shape

In [None]:
k2c1.shape

In [None]:
has_planets.shape

In [None]:
# cut out the extra data to compare across the same timeline
# .copy() makes join_planets an independent frame instead of a slice of
# has_planets, so the later column rename and LABEL assignment write to this
# frame directly and do not raise SettingWithCopyWarning.
join_planets = has_planets.iloc[:, :3199].copy()
join_planets.shape

In [None]:
# make sure the dataframes have the same column names
# (positional rename: column i of join_planets takes the name of column i of kep3,
#  so both frames must have the same number of columns)
join_planets.columns = kep3.columns
# k2c1.columns = join_planets.columns

# label the confirmed planet systems with 1
# assumes 'LABEL' is already one of kep3's column names, so this overwrites an
# existing column rather than appending a new one - TODO confirm
join_planets['LABEL'] = 1
# k2c1['LABEL'] = 0

In [None]:
# Spot-check the first row of each frame to verify the column layouts now match.
join_planets.head(1)

In [None]:
kep3.head(1)

In [None]:
# Stack the confirmed-planet rows on top of the kep3 rows to form the training pool.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the
# row labels of the two source frames are carried over and can repeat.
master_df = pd.concat([join_planets, kep3], axis = 0, ignore_index = True)
master_df.head(1)

In [None]:
# Size of the combined frame.
master_df.shape

In [None]:
# baseline accuracy
# Number of rows per LABEL value in the combined frame (class balance).
val_count = master_df['LABEL'].value_counts()
val_count

In [None]:
# Baseline accuracy = share of the most common class, i.e. the accuracy of a
# model that always predicts the majority label. (The previous expression,
# 1 - n_pos / n_neg, is not that quantity.)
base_acc = val_count.max() / val_count.sum()
base_acc

In [None]:
# Split the combined frame into target (y) and model inputs (X).
# X keeps everything from the third column onwards.
# assumes the two skipped columns are non-feature columns and that LABEL is one
# of them; otherwise LABEL would leak into X - TODO confirm
y = master_df.loc[:, 'LABEL']
X = master_df.iloc[:, 2:]

In [None]:
# Hold out a test set, stratified on y so both parts keep the same class ratio.
# random_state pins the shuffle so the split, and every metric computed from it,
# is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

In [None]:
# Flip both sets so that each column holds one sample.
# Not idempotent: running this cell a second time flips them back.
X_train, X_test = X_train.T, X_test.T

In [None]:
# Peek at the first row of each transposed set (one value per sample column).
X_train.head(1)

In [None]:
X_test.head(1)

In [None]:
# Scaling
# StandardScaler standardises each column separately. Both frames were transposed
# in an earlier cell, so a column here is one sample: every light curve is rescaled
# using only its own mean and standard deviation.
ss = StandardScaler()

scaled_df = ss.fit_transform(X_train)
# Rebuild a DataFrame from the scaled array; the original column labels are kept,
# the row labels are replaced by a default integer index.
X_train = pd.DataFrame(scaled_df, columns=X_train.columns)

# NOTE(review): an earlier FIXME here asked for transform() instead of fit_transform().
# With samples laid out as columns, the statistics fitted on X_train describe the
# training samples themselves and cannot be reused for X_test (its column count will
# normally differ), so the test samples are refit on their own columns. Confirm that
# this per-sample scaling is the intended behaviour.
test_scaled_df = ss.fit_transform(X_test)
X_test = pd.DataFrame(test_scaled_df, columns=X_test.columns)

In [None]:
# Flip both sets back so that each row is one sample again.
# Not idempotent: running this cell a second time undoes the flip.
X_train, X_test = X_train.T, X_test.T

In [None]:
# Check the scaled training set after it has been transposed back.
X_train.head()

### Modeling:

In [None]:
# Things to tune over
#     number of layers
#     order of layers
#     filters
#     filter size
#     nodes
#     regularization
# # Normalize outputs from previous layer.
# model.add(BatchNormalization())

# # Leaky ReLU activation function.
# model.add(LeakyReLU())

# try changing the learning rate
#     also the decay rate

# Things to try with the data itself
#     normalizing light data
#     combining confirmed planets with a different dataset
#     spline smoothing

# try changing the size of the neurons
#     meaning input dimensions?

In [None]:
# The network is declared with input_shape = (3197, 1), so each 2-D matrix needs
# a trailing axis of length 1 appended before it can be fed to the model.
X_array = np.array(X_train)[:, :, np.newaxis]

# same conversion for the test set
X_test_array = np.array(X_test)[:, :, np.newaxis]

In [None]:
# Define model architecture:
#   Conv1D -> MaxPooling1D -> Flatten -> Dense(50) + Dropout -> Dense(1, sigmoid)
model = Sequential()

# Convolutional layer.
model.add(Conv1D(filters = 1,      # number of filters, best tuning so far is 15 (hasn't generalized well yet though)
                 kernel_size = 50,  # filter size
                 activation = 'relu',
                 input_shape = (3197, 1))) # dimensions of training data

# Experiments kept for reference (currently disabled):
# # Convolutional layer
# model.add(Conv1D(filters = 1,
#                  kernel_size = 25,
#                  activation = 'relu'))

# # Convolutional layer
# model.add(Conv1D(filters = 1,
#                  kernel_size = 100,
#                  activation = 'relu'))

# dropout
# model.add(Dropout(0.5))

# Pooling layer.
model.add(MaxPooling1D(pool_size = 10))


# regularize
# model.add(Dropout(0.5))

# In order to go from a convolutional/pooling layer, we have to organize our neurons.
model.add(Flatten())

# Fully connected layer.
model.add(Dense(50, activation = 'relu'))
model.add(Dropout(0.7)) # regularization

# Output layer: a single sigmoid unit for the binary LABEL.
model.add(Dense(1, activation = 'sigmoid'))

# Changing adam optimization parameters
# Keep a handle on the configured optimizer and hand that object to compile().
# Passing the string 'adam' instead makes Keras build its own default optimizer,
# so any lr / decay chosen here would be silently ignored.
adam = optimizers.Adam(lr = 0.001, decay = 0) # decay is thought to help the network learn good features first and then tune them

# Compile model
model.compile(loss = 'binary_crossentropy',
              optimizer = adam,
              metrics = ['accuracy'])

In [None]:
# Train the network; the held-out test split is scored after every epoch and the
# per-epoch metrics are kept on result.history.
fit_options = dict(batch_size = 64,
                   epochs = 10,
                   verbose = 1,
                   validation_data = (X_test_array, y_test))
result = model.fit(X_array, y_train, **fit_options)

In [None]:
# Plot accuracy per epoch: first curve is the training data, second is the
# validation data passed to fit().
plt.figure()
for metric_key in ('acc', 'val_acc'):
    plt.plot(result.history[metric_key])
plt.title('Model Acc')
plt.xlabel('Epoch')
plt.ylabel('Acc')
plt.legend(['Train', 'Test'], loc='best')
plt.show();

### Testing on Unseen Data:
Now that the model is fit, let's see if we can use it to detect some planets in data it has not seen before.

In [None]:
# Drop the first two columns of k2c1 and keep the rest as model input,
# mirroring how X was cut from the training frame.
# assumes those two columns are non-feature columns - TODO confirm
unseen_data = k2c1.iloc[:,2:]
unseen_data.head()

In [None]:
# scale
# NOTE(review): an earlier FIXME here asked not to fit_transform on this data.
# Because the frame is transposed first, each column passed to the scaler is one
# sample, so every light curve is standardised with its own mean and standard
# deviation - the same treatment the training samples received. Statistics fitted
# on the training columns cannot be reused here (different samples, and normally
# a different column count). Confirm that per-sample scaling is intended.
unseen_data = unseen_data.T

scaled_unseen = ss.fit_transform(unseen_data)
# Rebuild a DataFrame; column labels are kept, row labels become a default integer index.
unseen_data = pd.DataFrame(scaled_unseen, columns=unseen_data.columns)

# Back to one row per sample.
unseen_data = unseen_data.T

In [None]:
# unseen_data.head()

In [None]:
# Convert to an array and append a trailing axis of length 1 so the unseen data
# has the same 3-D layout as the arrays used for training.
array_unseen = np.array(unseen_data)[:, :, np.newaxis]

### Predict!

In [None]:
# predict_proba or classification
#     both, so we know what the model thinks and how confident it is
#   proba_unseen - sigmoid output of the network for each light curve
#   y_unseen     - hard 0/1 label, thresholded at 0.5 (the same rule
#                  predict_classes applies to a single sigmoid output; spelling it
#                  out also works on Keras versions without predict_classes)
proba_unseen = model.predict(array_unseen)
y_unseen = (proba_unseen > 0.5).astype('int32')

In [None]:
# Wrap the predicted classes in a one-column frame; its default integer index
# follows the row order of the array that was passed to the model.
yhat_unseen = pd.DataFrame(y_unseen, columns=['prediction'])
# yhat_unseen.head(5)

In [None]:
# Share of unseen light curves the model flags as planets, then the raw counts.
val_counts = yhat_unseen['prediction'].value_counts()
# .get(1, 0) avoids a KeyError when no row was predicted as class 1
print(val_counts.get(1, 0) / val_counts.sum())
# show the prediction counts (not val_count, which holds the training label counts)
val_counts

In [None]:
# Rows of the unseen set that the model classified as 1.
yhat_unseen.loc[yhat_unseen['prediction'] == 1]

In [None]:
# take a look at the unseen data
# unseen_data was cut with k2c1.iloc[:, 2:], so every remaining column is already
# part of the light curve; slicing off two more columns here would drop the first
# two flux points, so the whole row is plotted.
x_axis = list(range(unseen_data.shape[1]))
plt.scatter(x_axis, unseen_data.iloc[4028, :])
# plt.ylim(-2, 2)

### Example of a Transit:

In [None]:
# Scatter one row (position 23) of the confirmed-planet frame against its sample
# number, skipping the first two columns.
# NOTE(review): the section heading calls this a transit, the plot title a
# possible solar flare - confirm which description is intended.
flux_row = join_planets.iloc[23, 2:]
x_axis = list(range(join_planets.shape[1]-2))
plt.scatter(x_axis, flux_row)
# plt.ylim(10040, 10090)
plt.ylabel('Flux Level')
plt.xlabel('Time')
plt.title('Possible Solar Flare')

### Notes <hr>

In [None]:
# # Evaluate model on test data
# score = model.evaluate(X_test_array, y_test, verbose = 1)
# labels = model.metrics_names

# score
# # labels
# # model.summary()

In [None]:
# # What is the best way to do this?
# undersample?
# oversample?
# What is the best ratio?
# will this affect my result/ my false positive rate?

# change columns of kep3 so you can join it with the planet data
#     make sure there are no gaps in the confirmed planets data
#     or the other data
#     cut down dfs to be the right size
#     do I want to remove the confirmed planets already in the kep3 set in case they are repeats?

# is it possible that some of the planets don't pass in front of their stars for the 66 days I'm looking at?
#     what proportion of exoplanets have orbits of less than 66 days?

# how am I going to deal with the nulls?
#     skip to the next nearest non-null value?
#     how many nulls in a row are there?
#     if this happens to almost all confirmed planets, it's possible that the network may learn to find these "skips"
#         should I then just drop any rows with too many consecutive nans? 
#         Pretty much every one has a string of at least 40 consecutive nans
#     mean imputation may have a logical case here
#         it would make sense that the level of light is between previous value and the next one + random noise
#         what about for longer strings of missing values 
#         mean imputation could be good for avoiding creating false signals because of its smoothing effect
#         the disadvantages of mean imputation may not be important because I am basically using neural networks for pattern detection
#     maybe if it is a long string of missing values you want to skip to the next real data, 
#         mark if that had happened so I could tell if it contributes to false positives
#     drop rows with more than 150 nulls? there are only 8
#         1134    666
#         752     408
#         851     408
#         1105    408
#         441     408
#         832     285
#         871     230
#         762     230

# definitely get data on whether it is a binary star system or not
#     how to feed these single features to the neural networks to help with pattern detection?

# normalize the light curves

# print out confusion matrix and other classification metrics

# turn it into a gridsearch
#     have it save the results to be analyzed later
#     can I have it tune over generalizability to the unseen set?