In [1]:
import numpy as np
import pandas as pd

target_name_dict = { 'astro-ph.GA' : 0,
                    'astro-ph.SR' : 1,
                    'astro-ph.IM' : 2,
                    'astro-ph.EP' : 3,
                    'astro-ph.HE' : 4,
                    'astro-ph.CO' : 5
                }
label2target = { v:k for k,v in target_name_dict.items()}

In [2]:
df = pd.HDFStore("../data/2014astroph_p.h5", "r")
df['/df'].keys()
abstracts = df['/df']['abstract']
labels = np.array(df['/df']['label'])
df.close()

In [3]:
j = np.random.randint(len(labels))
print(j, label2target[labels[j]])
print(abstracts[j])

2256 astro-ph.GA
We show that the mass fraction of GMC gas (n>100 cm^-3) in dense (n>>10^4
cm^-3) star-forming clumps, observable in dense molecular tracers
(L_HCN/L_CO(1-0)), is a sensitive probe of the strength and mechanism(s) of
stellar feedback. Using high-resolution galaxy-scale simulations with pc-scale
resolution and explicit models for feedback from radiation pressure,
photoionization heating, stellar winds, and supernovae (SNe), we make
predictions for the dense molecular gas tracers as a function of GMC and galaxy
properties and the efficiency of stellar feedback. In models with weak/no
feedback, much of the mass in GMCs collapses into dense sub-units, predicting
L_HCN/L_CO(1-0) ratios order-of-magnitude larger than observed. By contrast,
models with feedback properties taken directly from stellar evolution
calculations predict dense gas tracers in good agreement with observations.
Changing the strength or timing of SNe tends to move systems along, rather than
off, the L_HCN

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [49]:
# settings
maxlen = 150
training_samples = 2000
validation_samples = 10000
max_words = 10000 # Top 10000 words

In [27]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(abstracts)
sequences = tokenizer.texts_to_sequences(abstracts)
word_index = tokenizer.word_index

print("Found %s unique tokens" % len(word_index))

word_index_reverse = dict()

for k, v in word_index.items():
    word_index_reverse[v] = k

Found 30677 unique tokens


In [50]:
data = pad_sequences(sequences=sequences, maxlen=maxlen)
indices = np.arange(abstracts.shape[0])

np.random.seed(1234)

np.random.shuffle(indices)

data = data[indices]

labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]

x_val = data[training_samples:training_samples+validation_samples]
y_val = labels[training_samples:training_samples+validation_samples]

In [51]:
#https://stackoverflow.com/questions/41971587/how-to-convert-predicted-sequence-back-to-text-in-keras
# Creating a reverse dictionary
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

# Function takes a tokenized sentence and returns the words
def sequence_to_text(list_of_indices):
    # Looking up words in dictionary
    words = [reverse_word_map.get(letter) for letter in list_of_indices]
    return(words)

# Creating texts 
my_texts = list(map(sequence_to_text, data))

# alternative way
my_texts_2 = tokenizer.sequences_to_texts(sequences=sequences)

In [52]:
j = np.random.randint(len(x_train))
print(j, label2target[y_train[j]])
print(my_texts[indices[j]])

740 astro-ph.CO
[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'uncertainty', 'in', 'the', 'calibration', 'of', 'gravitational', 'wave', 'gw', 'detector', 'data', 'leads', 'to', 'systematic', 'errors', 'which', 'must', 'be', 'accounted', 'for', 'in', 'setting', 'limits', 'on', 'the', 'strength', 'of', 'gw', 'signals', 'when', 'cross', 'correlation', 'measurements', 'are', 'made', 'using', 'data', 'from', 'a', 'pair', 'of', 'instruments', 'as', 'in', 'searches', 'for', 'a', 'stochastic', 'gw', 'background', 'the', 'calibration', 'uncertainties', 'of', 'the', 'individual', 'instruments', 'can', 'be', 'combined', 'into', 'an', 'uncertainty', 'associated', 'with', 'the', 'pair', 'with', 'the', 'advent', 'of', 'multi', 'baseline', 'gw', 'observation', 'e', 'g', 'networks', 'consisting', 'of', 'multiple', 'detectors', 'such', 'as', 'the', 'ligo', 'observatories', 'a

In [53]:
from keras.utils.np_utils import to_categorical
y_train_one_hot = to_categorical(y_train)
y_val_one_hot = to_categorical(y_val)

# Parsing GloVe file

In [55]:
import os
glove_dir = "../glove.6B/"

embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [57]:
embeddings_dim = 100 # same dimension as the glove.6B above
embedding_vector = None
embedding_matrix = np.zeros((max_words, embeddings_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [58]:
from keras.models import Sequential
import keras.layers as layers

In [84]:
model = Sequential()
model.add(layers.Embedding(max_words, embeddings_dim, input_length=maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu', input_shape=(maxlen,)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(6, activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 150, 100)          1000000   
_________________________________________________________________
flatten_4 (Flatten)          (None, 15000)             0         
_________________________________________________________________
dense_25 (Dense)             (None, 64)                960064    
_________________________________________________________________
dense_26 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_27 (Dense)             (None, 6)                 390       
Total params: 1,964,614
Trainable params: 1,964,614
Non-trainable params: 0
_________________________________________________________________


In [85]:
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable= False

In [86]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
history = model.fit(x_train, y_train_one_hot,
                   epochs=10,
                   batch_size=32,
                   validation_data = (x_val, y_val_one_hot))
#model.save_weights('pre_trained_glove_model_2.h5')

Train on 2000 samples, validate on 6794 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [76]:
x_train_text = tokenizer.sequences_to_texts(x_train)

In [77]:
x_train_text[0]

'accurate estimate of the turbulent energy spectrum we then apply this method to the 13co map of ngc 1333 from the complete database we find the turbulent energy spectrum is a power law e k k beta in the range of scales 0 06 pc ell 1 5 pc with slope beta 1 85 pm 0 04 the estimated energy injection scale of stellar outflows in ngc 1333 is ell 0 3 pc well resolved by the observations there is no evidence of the flattening of the energy spectrum above the scale ell predicted by outflow driven simulations and analytical models the power spectrum of integrated intensity is also a nearly perfect power law in the range of scales 0 16 pc ell 7 9 pc with no feature above ell we conclude that the observed turbulence in ngc 1333 does not appear to be driven primarily by stellar outflows'

['accurate',
 'estimate',
 'of',
 'the',
 'turbulent',
 'energy',
 'spectrum',
 'we',
 'then',
 'apply',
 'this',
 'method',
 'to',
 'the',
 '13co',
 'map',
 'of',
 'ngc',
 '1333',
 'from',
 'the',
 'complete',
 'database',
 'we',
 'find',
 'the',
 'turbulent',
 'energy',
 'spectrum',
 'is',
 'a',
 'power',
 'law',
 'e',
 'k',
 'k',
 'beta',
 'in',
 'the',
 'range',
 'of',
 'scales',
 '0',
 '06',
 'pc',
 'ell',
 '1',
 '5',
 'pc',
 'with',
 'slope',
 'beta',
 '1',
 '85',
 'pm',
 '0',
 '04',
 'the',
 'estimated',
 'energy',
 'injection',
 'scale',
 'of',
 'stellar',
 'outflows',
 'in',
 'ngc',
 '1333',
 'is',
 'ell',
 '0',
 '3',
 'pc',
 'well',
 'resolved',
 'by',
 'the',
 'observations',
 'there',
 'is',
 'no',
 'evidence',
 'of',
 'the',
 'flattening',
 'of',
 'the',
 'energy',
 'spectrum',
 'above',
 'the',
 'scale',
 'ell',
 'predicted',
 'by',
 'outflow',
 'driven',
 'simulations',
 'and',
 'analytical',
 'models',
 'the',
 'power',
 'spectrum',
 'of',
 'integrated',
 'intensity',
 