In [1]:
import numpy as np
import pandas as pd

target_name_dict = { 'astro-ph.GA' : 0,
                    'astro-ph.SR' : 1,
                    'astro-ph.IM' : 2,
                    'astro-ph.EP' : 3,
                    'astro-ph.HE' : 4,
                    'astro-ph.CO' : 5
                }
label2target = { v:k for k,v in target_name_dict.items()}

In [2]:
df = pd.HDFStore("../data/2014astroph_p.h5", "r")
df['/df'].keys()
abstracts = df['/df']['abstract']
labels = np.array(df['/df']['label'])
df.close()

In [3]:
j = np.random.randint(len(labels))
print(j, label2target[labels[j]])
print(abstracts[j])

6721 astro-ph.IM
Searching for nearby habitable worlds with direct imaging and spectroscopy
will require a telescope large enough to provide angular resolution and
sensitivity to planets around a significant sample of stars. Segmented
telescopes are a compelling option to obtain such large apertures. However,
these telescope designs have a complex geometry (central obstruction, support
structures, segmentation) that makes high-contrast imaging more challenging. We
are developing a new high-contrast imaging testbed at STScI to provide an
integrated solution for wavefront control and starlight suppression on complex
aperture geometries. We present our approach for the testbed optical design,
which defines the surface requirements for each mirror to minimize the
amplitude-induced errors from the propagation of out-of-pupil surfaces. Our
approach guarantees that the testbed will not be limited by these Fresnel
propagation effects, but only by the aperture geometry. This approach involves
i

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [5]:
# settings
maxlen = 150
max_words = 10000 # Top 10000 words
training_samples = 6000 #

In [6]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(abstracts)
sequences = tokenizer.texts_to_sequences(abstracts)
word_index = tokenizer.word_index

print("Found %s unique tokens" % len(word_index))

word_index_reverse = dict()

for k, v in word_index.items():
    word_index_reverse[v] = k

Found 30677 unique tokens


In [7]:
data = pad_sequences(sequences=sequences, maxlen=maxlen)
indices = np.arange(abstracts.shape[0])

np.random.seed(1234)

np.random.shuffle(indices)

data = data[indices]

labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]

x_test = data[training_samples:]
y_test = labels[training_samples:]

In [8]:
#https://stackoverflow.com/questions/41971587/how-to-convert-predicted-sequence-back-to-text-in-keras
# Creating a reverse dictionary
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

# Function takes a tokenized sentence and returns the words
def sequence_to_text(list_of_indices):
    # Looking up words in dictionary
    words = [reverse_word_map.get(letter) for letter in list_of_indices]
    return(words)

# Creating texts 
my_texts = list(map(sequence_to_text, data))

# alternative way
my_texts_2 = tokenizer.sequences_to_texts(sequences=sequences)

In [23]:
j = np.random.randint(len(x_train))
print(j, label2target[y_train[j]])
print(my_texts[j])

3824 astro-ph.CO
['dark', 'matter', 'voids', 'et', 'al', '2013', 'concentrate', 'on', 'the', 'velocity', 'profiles', 'around', 'voids', 'first', 'they', 'show', 'the', 'necessity', 'of', 'four', 'parameters', 'to', 'describe', 'the', 'density', 'profiles', 'around', 'voids', 'given', 'two', 'distinct', 'void', 'populations', 'voids', 'in', 'voids', 'and', 'voids', 'in', 'clouds', 'this', 'profile', 'is', 'used', 'to', 'predict', 'peculiar', 'velocities', 'around', 'voids', 'and', 'the', 'combination', 'of', 'the', 'latter', 'with', 'void', 'density', 'profiles', 'allows', 'the', 'construction', 'of', 'model', 'void', 'galaxy', 'cross', 'correlation', 'functions', 'with', 'redshift', 'space', 'distortions', 'when', 'these', 'models', 'are', 'tuned', 'to', 'fit', 'the', 'measured', 'correlation', 'functions', 'for', 'voids', 'and', 'galaxies', 'in', 'the', 'sloan', 'digital', 'sky', 'survey', 'small', 'voids', 'are', 'found', 'to', 'be', 'of', 'the', 'void', 'in', 'cloud', 'type', 'where

In [24]:
from keras.utils.np_utils import to_categorical
y_train_one_hot = to_categorical(y_train)
y_test_one_hot = to_categorical(y_test)

# Parsing GloVe file

In [25]:
import os
glove_dir = "../glove.6B/"

embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [26]:
embeddings_dim = 100 # same dimension as the glove.6B above
embedding_vector = None
embedding_matrix = np.zeros((max_words, embeddings_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [27]:
from keras.models import Sequential
import keras.layers as layers

In [131]:
model = Sequential()
embeddings_dim = 100
model.add(layers.Embedding(max_words, embeddings_dim, input_length=maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu', input_shape=(maxlen,)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(6, activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 150, 100)          1000000   
_________________________________________________________________
flatten_7 (Flatten)          (None, 15000)             0         
_________________________________________________________________
dense_19 (Dense)             (None, 64)                960064    
_________________________________________________________________
dense_20 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_21 (Dense)             (None, 6)                 390       
Total params: 1,964,614
Trainable params: 1,964,614
Non-trainable params: 0
_________________________________________________________________


In [132]:
#model.layers[0].set_weights([embedding_matrix])
#model.layers[0].trainable= False

In [134]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
history = model.fit(x_train, y_train_one_hot,
                   epochs=5,
                   batch_size=32,
                   validation_split=0.3)
#model.save_weights('pre_trained_glove_model_2.h5')

Train on 4200 samples, validate on 1800 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [127]:
results = model.evaluate(x_test, y_test_one_hot)
class_prediction = model.predict_classes(x_test)



In [128]:
results

[1.010026819209329, 0.6385110952466747]

In [129]:
jj = np.random.randint(len(x_test))
print(x_test[jj])
print(my_texts[training_samples+jj])
print(label2target[y_test[jj]])
print("prediction: ", label2target[class_prediction[jj]])

[  14    1 3667   29   73   27  164    1  873 1077 1038 1449    3    1
  548  156  253   14    1  152  124 2233 5387    6  399    1   47   34
   66    3  336    2  215    1  386   78 1410   27   19  997  575   13
 4183   10   80  571  785    5    1  104   98   50   13   40    1 1180
  200    7  382  148  509    1 4942  647 4484   23    8 1017  516 1963
   21 6261    2    1 4484  647    9   82 4278 2216    3    1  486   80
 4942  647   40    1  486   80  711    7 2138  243  591   20  244   20
    3  243  591    4  244   20  748   14    1 1449    3 5387  183  243
  207  249 1651  207  224  244  699   14    1  104   98  183    9   80
  571  785    3  243  207  249 1651  207  224  244  988    3  243  207
  249 1651  207 8352  244 7868   14    1 1180  183]
['from', 'the', 'lss', 'data', 'one', 'can', 'use', 'the', 'baryon', 'acoustic', 'oscillation', 'bao', 'and', 'the', 'growth', 'rate', 'derived', 'from', 'the', 'redshift', 'space', 'distortion', 'rsd', 'to', 'measure', 'the', 'dark', 'en

In [135]:
abstract_testing = "the first observed interstellar object Its light-curve amplitude indicates that the object is highly elongated with an axis ratio of at least 5:1. the absence of such elongated asteroids in the Solar system the first observed interstellar object Its light-curve amplitude indicates that the object is highly elongated with an axis ratio of at least 5:1. the absence of such elongated asteroids in the Solar system the first observed interstellar object Its light-curve amplitude indicates that the object is highly elongated with an axis ratio of at least 5:1. the absence of such elongated asteroids in the Solar system"

seq_testing = tokenizer.texts_to_sequences([[ w for w in abstract_testing.split(' ')]])
data_testing = pad_sequences(sequences=seq_testing, maxlen=maxlen)
print(data_testing)
classes_testing = model.predict(data_testing)
print("prediction: ", classes_testing[0])
print("predicted category: ", label2target[np.argmax(classes_testing[0])])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    1   91   63  581  578   71  424  981   11    1  578    8  557
  3258    9   21  861  189    2   17  642    1 1550    2   70 3258 2101
     5    1   86  118    1   91   63  581  578   71  424  981   11    1
   578    8  557 3258    9   21  861  189    2   17  642    1 1550    2
    70 3258 2101    5    1   86  118    1   91   63  581  578   71  424
   981   11    1  578    8  557 3258    9   21  861  189    2   17  642
     1 1550    2   70 3258 2101    5    1   86  118]]
prediction:  [1.9235373e-05 1.3179114e-02 4.2389169e-05 9.8675913e-01 2.9308373e-07
 1.0821752e-11]
predicted category:  astro-ph.EP
